diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..71dbdabf3dd9f49d690f3ec24300a1a5d8864a69 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,34 @@ +version: 2.1 + +# this allows you to use CircleCI's dynamic configuration feature +setup: true + +# the path-filtering orb is required to continue a pipeline based on +# the path of an updated fileset +orbs: + path-filtering: circleci/path-filtering@0.1.2 + +workflows: + # the always-run workflow is always triggered, regardless of the pipeline parameters. + always-run: + jobs: + # the path-filtering/filter job determines which pipeline + # parameters to update. + - path-filtering/filter: + name: check-updated-files + # 3-column, whitespace-delimited mapping. One mapping per + # line: + # + mapping: | + mmaction/.* lint_only false + requirements/.* lint_only false + tests/.* lint_only false + tools/.* lint_only false + configs/.* lint_only false + .circleci/.* lint_only false + base-revision: dev-1.x + # this is the path of the configuration we should trigger once + # path filtering and pipeline parameter value updates are + # complete. In this case, we are using the parent dynamic + # configuration itself. + config-path: .circleci/test.yml diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..53c009c9ec39b9884bec484e2af2d9302faed008 --- /dev/null +++ b/.circleci/docker/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/.circleci/test.yml b/.circleci/test.yml new file mode 100644 index 0000000000000000000000000000000000000000..933efc31bd806246f92052c8a796bb79e30bcd56 --- /dev/null +++ b/.circleci/test.yml @@ -0,0 +1,211 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 50 mmaction + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get upgrade + sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config + sudo apt-get install -y libavdevice-dev libavfilter-dev libopus-dev 
libvpx-dev libsrtp2-dev libsndfile1 + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: | + python -V + pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install mmaction dependencies + command: | + pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv >= 2.0.0' + pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x + pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmpretrain.git@dev + pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x + pip install -r requirements.txt + - run: + name: Install timm + command: | + pip install timm + - run: + name: Install transformers + command: | + pip install transformers + - when: + condition: + equal: [ "0.10.0", << parameters.torchvision >> ] + steps: + - run: python -m pip install pytorchvideo + - run: + name: Build and install + command: | + pip install -e . + - run: + name: Run unittests + command: | + coverage run --branch --source mmaction -m pytest tests/ + coverage xml + coverage report -m + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["11.1"] + cudnn: + type: integer + default: 8 + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + # docker_layer_caching: true + resource_class: gpu.nvidia.small + steps: + - checkout + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmaction:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmaction -w /mmaction --name mmaction mmaction:gpu + docker exec mmaction apt-get update + docker exec mmaction apt-get upgrade -y + docker exec mmaction apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config + docker exec mmaction apt-get install -y libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev libsndfile1 + - run: + name: Install PytorchVideo and timm + command: | + docker exec mmaction pip install timm + docker exec mmaction python -m pip install pytorchvideo + - run: + name: Install transformers + command: | + docker exec mmaction pip install transformers + - run: + name: Install mmaction dependencies + command: | + docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmengine.git@main + docker exec mmaction pip install -U openmim + docker exec mmaction mim install 'mmcv >= 2.0.0' + docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x + docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmpose.git@dev-1.x + docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + docker exec mmaction pip install git+https://github.com/open-mmlab/mmpretrain.git@dev + docker exec mmaction pip install -r requirements.txt + - run: + name: Build and install + command: | + docker exec mmaction pip install -e . 
+ - run: + name: Run unittests + command: | + docker exec mmaction pytest tests/ +workflows: + pr_stage_lint: + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - main + pr_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - main + - build_cpu: + name: minimum_version_cpu + torch: 1.8.1 + torchvision: 0.9.1 + python: 3.7.4 + requires: + - lint + - build_cpu: + name: maximum_version_cpu + torch: 1.13.0 + torchvision: 0.14.0 + python: 3.9.0 + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.1" + requires: + - hold + merge_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.1" + filters: + branches: + only: + - dev-1.x + - main diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..dcf374a00d865ee72d4fbf2245f8192091741e91 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +demo/demo_skeleton.mp4 filter=lfs diff=lfs merge=lfs -text +demo/demo_spatiotemporal_det.mp4 filter=lfs diff=lfs merge=lfs -text +demo/demo.mp4 filter=lfs diff=lfs merge=lfs -text +demo/test_video_structuralize.mp4 filter=lfs diff=lfs merge=lfs -text +resources/data_pipeline.png filter=lfs diff=lfs merge=lfs -text +resources/miaomiao_qrcode.jpg filter=lfs diff=lfs merge=lfs -text +resources/mmaction2_overview.gif filter=lfs diff=lfs merge=lfs -text +resources/qq_group_qrcode.jpg filter=lfs diff=lfs merge=lfs -text +resources/spatio-temporal-det.gif filter=lfs diff=lfs merge=lfs -text +resources/zhihu_qrcode.jpg filter=lfs diff=lfs merge=lfs -text +tests/data/rawvideo_dataset/part_0.mp4 filter=lfs diff=lfs merge=lfs -text +tests/data/rawvideo_dataset/part_1.mp4 filter=lfs diff=lfs merge=lfs -text +tests/data/test.avi filter=lfs diff=lfs merge=lfs -text +tests/data/test.mp4 filter=lfs diff=lfs merge=lfs -text +tests/data/test.wav filter=lfs diff=lfs merge=lfs -text +tools/data/skeleton/S001C001P001R001A001_rgb.avi filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b05b2f768cd121cc33150cbdebf477666bb207a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,151 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +**/*.pyc + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Auto generate documentation +docs/*/_build/ +docs/*/model_zoo/ +docs/*/dataset_zoo/ +docs/*/_model_zoo.rst +docs/*/modelzoo_statistics.md +docs/*/datasetzoo_statistics.md +docs/*/projectzoo.md +docs/*/papers/ +docs/*/api/generated/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# custom +/data +.vscode +.idea +*.pkl +*.pkl.json +*.log.json +benchlist.txt +work_dirs/ +/projects/*/work_dirs +/projects/*/data +.DS_Store + +# Pytorch +*.pth + +# Profile +*.prof + +# lmdb +*.mdb + +# unignore some data file in tests/data +!tests/data/**/*.pkl +!tests/data/**/*.pkl.json +!tests/data/**/*.log.json +!tests/data/**/*.pth + +# avoid soft links created by MIM +mmaction/tools/* + +*.ipynb + +# unignore ipython notebook files in demo +!demo/*.ipynb +!projects/stad_tutorial/*.ipynb +mmaction/.mim diff --git a/.owners.yml b/.owners.yml new file mode 100644 index 0000000000000000000000000000000000000000..626aaab1890e4f3e5183be2cfa2fdc4ed4c9289b --- /dev/null +++ b/.owners.yml @@ -0,0 +1,16 @@ +assign: + issues: enabled + pull_requests: disabled + strategy: + # random + daily-shift-based + scedule: + '*/1 * * * *' + assignees: + - hukkai + - Dai-Wenxun + - cir7 + - Dai-Wenxun + - cir7 + - hukkai + - hukkai diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91b101351e98ee45f74446e68b7787faba0f1ab2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,52 @@ +exclude: ^tests/data/ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: ["--skip", "*.ipynb", "-L", "ECT,Gool,tread,gool,mot"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.14 + hooks: + - id: mdformat + args: ["--number", "--table-width", "200"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-algo-readme + - id: check-copyright + args: ["mmaction", "tests", "demo", "tools"] diff 
--git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000000000000000000000000000000000000..121ebd1e079a81927454c66e80d10530764e040e --- /dev/null +++ b/.pylintrc @@ -0,0 +1,624 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-whitelist= + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10 + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore=CVS,configs + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". 
+disable=import-outside-toplevel + redefined-outer-name + print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape, + no-member, + invalid-name, + too-many-branches, + wrong-import-order, + too-many-arguments, + missing-function-docstring, + missing-module-docstring, + too-many-locals, + too-few-public-methods, + abstract-method, + broad-except, + too-many-nested-blocks, + too-many-instance-attributes, + missing-class-docstring, + duplicate-code, + not-callable, + protected-access, + dangerous-default-value, + no-name-in-module, + logging-fstring-interpolation, + super-init-not-called, + redefined-builtin, + attribute-defined-outside-init, + arguments-differ, + cyclic-import, + bad-super-call, + too-many-statements, + line-too-long + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'error', 'warning', 'refactor', and 'convention' +# which contain the number of messages in each category, as well as 'statement' +# which is the total number of statements analyzed. This score is used by the +# global evaluation report (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. 
+output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it work, +# install the python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. 
+spelling-store-unknown-words=no + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. 
+bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _, + x, + y, + w, + h, + a, + b + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +#variable-rgx= + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). 
+min-public-methods=2 + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". +overgeneral-exceptions=BaseException, + Exception diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..50641e1bf8a13f12a6a90a7ea335fa53e3c8341a --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,14 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.9" + +formats: + - epub + +python: + install: + - requirements: requirements/docs.txt + - requirements: requirements/readthedocs.txt diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..c1d3eb5836d14209fbea113ea6b9625f262022a9 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,8 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: + - name: "MMAction2 Contributors" +title: "OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark" +date-released: 2020-07-21 +url: "https://github.com/open-mmlab/mmaction2" +license: Apache-2.0 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..797bf40e85c5d2986ebcec9cb51aed979ca88b82 --- /dev/null +++ b/LICENSE @@ -0,0 +1,203 @@ +Copyright 2018-2019 Open-MMLab. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2019 Open-MMLab. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..13a0db319d94bf8988e73ee1d45954de4efe18d9 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include mmaction/.mim/model-index.yml +include mmaction/.mim/dataset-index.yml +recursive-include mmaction/.mim/configs *.py *.yml +recursive-include mmaction/.mim/tools *.sh *.py diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d857c748ee17fe55824a1847d709d923d1ffa417 --- /dev/null +++ b/README.md @@ -0,0 +1,387 @@ +
+
+<div align="center">
+  <b>OpenMMLab website</b>
+  <a href="https://openmmlab.com">HOT</a>
+  &nbsp;&nbsp;&nbsp;&nbsp;
+  <b>OpenMMLab platform</b>
+  <a href="https://platform.openmmlab.com">TRY IT OUT</a>
+</div>
+
+ +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) +[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) +[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/main/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) + +[๐Ÿ“˜Documentation](https://mmaction2.readthedocs.io/en/latest/) | +[๐Ÿ› ๏ธInstallation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) | +[๐Ÿ‘€Model Zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo_statistics.html) | +[๐Ÿ†•Update News](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) | +[๐Ÿš€Ongoing Projects](https://github.com/open-mmlab/mmaction2/projects) | +[๐Ÿค”Reporting Issues](https://github.com/open-mmlab/mmaction2/issues/new/choose) + +
+
+ +English | [็ฎ€ไฝ“ไธญๆ–‡](/README_zh-CN.md) + +## ๐Ÿ“„ Table of Contents + +- [๐Ÿ“„ Table of Contents](#-table-of-contents) +- [๐Ÿฅณ ๐Ÿš€ What's New](#--whats-new-) +- [๐Ÿ“– Introduction](#-introduction-) +- [๐ŸŽ Major Features](#-major-features-) +- [๐Ÿ› ๏ธ Installation](#๏ธ-installation-) +- [๐Ÿ‘€ Model Zoo](#-model-zoo-) +- [๐Ÿ‘จโ€๐Ÿซ Get Started](#-get-started-) +- [๐ŸŽซ License](#-license-) +- [๐Ÿ–Š๏ธ Citation](#๏ธ-citation-) +- [๐Ÿ™Œ Contributing](#-contributing-) +- [๐Ÿค Acknowledgement](#-acknowledgement-) +- [๐Ÿ—๏ธ Projects in OpenMMLab](#๏ธ-projects-in-openmmlab-) + +## ๐Ÿฅณ ๐Ÿš€ What's New [๐Ÿ”](#-table-of-contents) + +**The default branch has been switched to `main`(previous `1.x`) from `master`(current `0.x`), and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.** + +**Release (2023.10.12)**: v1.2.0 with the following new features: + +- Support VindLU multi-modality algorithm and the Training of ActionClip +- Support lightweight model MobileOne TSN/TSM +- Support video retrieval dataset MSVD +- Support SlowOnly K700 feature to train localization models +- Support Video and Audio Demos + +## ๐Ÿ“– Introduction [๐Ÿ”](#-table-of-contents) + +MMAction2 is an open-source toolbox for video understanding based on PyTorch. +It is a part of the [OpenMMLab](http://openmmlab.com/) project. + +
+<div align="center">
+  Action Recognition on Kinetics-400 (left) and Skeleton-based Action Recognition on NTU-RGB+D-120 (right)
+</div>
+
+<div align="center">
+  Skeleton-based Spatio-Temporal Action Detection and Action Recognition Results on Kinetics-400
+</div>
+
+<div align="center">
+  Spatio-Temporal Action Detection Results on AVA-2.1
+</div>
+ +## ๐ŸŽ Major Features [๐Ÿ”](#-table-of-contents) + +- **Modular design**: We decompose a video understanding framework into different components. One can easily construct a customized video understanding framework by combining different modules. + +- **Support five major video understanding tasks**: MMAction2 implements various algorithms for multiple video understanding tasks, including action recognition, action localization, spatio-temporal action detection, skeleton-based action detection and video retrieval. + +- **Well tested and documented**: We provide detailed documentation and API reference, as well as unit tests. + +## ๐Ÿ› ๏ธ Installation [๐Ÿ”](#-table-of-contents) + +MMAction2 depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional) and [MMPose](https://github.com/open-mmlab/mmpose) (optional). + +Please refer to [install.md](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) for detailed instructions. + +
+Quick instructions + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +conda install pytorch torchvision -c pytorch # This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. +pip install -U openmim +mim install mmengine +mim install mmcv +mim install mmdet # optional +mim install mmpose # optional +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +pip install -v -e . +``` + +
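+
+A quick sanity check of the installation (optional; the versions printed will depend on your environment):
+
+```shell
+# Confirm that MMAction2 and its core dependencies import cleanly.
+python -c "import mmaction; print(mmaction.__version__)"
+python -c "import mmengine, mmcv; print(mmengine.__version__, mmcv.__version__)"
+```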
+ +## ๐Ÿ‘€ Model Zoo [๐Ÿ”](#-table-of-contents) + +Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/latest/model_zoo/modelzoo.html). + +
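+
+Checkpoints and config files from the model zoo can also be fetched with MIM. The config name below is only an illustrative example; substitute any config listed in the model zoo:
+
+```shell
+# Download a config file and its pretrained checkpoint into the current directory.
+mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb --dest .
+```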
+
+**Supported model**
+
+- **Action Recognition**: C3D (CVPR'2014), TSN (ECCV'2016), I3D (CVPR'2017), C2D (CVPR'2018), I3D Non-Local (CVPR'2018), R(2+1)D (CVPR'2018), TRN (ECCV'2018), TSM (ICCV'2019), TSM Non-Local (ICCV'2019), SlowOnly (ICCV'2019), SlowFast (ICCV'2019), CSN (ICCV'2019), TIN (AAAI'2020), TPN (CVPR'2020), X3D (CVPR'2020), MultiModality: Audio (ArXiv'2020), TANet (ArXiv'2020), TimeSformer (ICML'2021), ActionCLIP (ArXiv'2021), VideoSwin (CVPR'2022), VideoMAE (NeurIPS'2022), MViT V2 (CVPR'2022), UniFormer V1 (ICLR'2022), UniFormer V2 (Arxiv'2022), VideoMAE V2 (CVPR'2023)
+- **Action Localization**: BSN (ECCV'2018), BMN (ICCV'2019), TCANet (CVPR'2021)
+- **Spatio-Temporal Action Detection**: ACRN (ECCV'2018), SlowOnly+Fast R-CNN (ICCV'2019), SlowFast+Fast R-CNN (ICCV'2019), LFB (CVPR'2019), VideoMAE (NeurIPS'2022)
+- **Skeleton-based Action Recognition**: ST-GCN (AAAI'2018), 2s-AGCN (CVPR'2019), PoseC3D (CVPR'2022), STGCN++ (ArXiv'2022), CTRGCN (CVPR'2021), MSG3D (CVPR'2020)
+- **Video Retrieval**: CLIP4Clip (ArXiv'2022)
+
+**Supported dataset**
+
+- **Action Recognition**: HMDB51 (ICCV'2011), UCF101 (CRCV-IR-12-01), ActivityNet (CVPR'2015), Kinetics-[400/600/700] (CVPR'2017), SthV1 (ICCV'2017), SthV2 (ICCV'2017), Diving48 (ECCV'2018), Jester (ICCV'2019), Moments in Time (TPAMI'2019), Multi-Moments in Time (ArXiv'2019), HVU (ECCV'2020), OmniSource (ECCV'2020), FineGYM (CVPR'2020), Kinetics-710 (Arxiv'2022)
+- **Action Localization**: THUMOS14 (THUMOS Challenge 2014), ActivityNet (CVPR'2015), HACS (ICCV'2019)
+- **Spatio-Temporal Action Detection**: UCF101-24* (CRCV-IR-12-01), JHMDB* (ICCV'2015), AVA (CVPR'2018), AVA-Kinetics (Arxiv'2020), MultiSports (ICCV'2021)
+- **Skeleton-based Action Recognition**: PoseC3D-FineGYM (ArXiv'2021), PoseC3D-NTURGB+D (ArXiv'2021), PoseC3D-UCF101 (ArXiv'2021), PoseC3D-HMDB51 (ArXiv'2021)
+- **Video Retrieval**: MSRVTT (CVPR'2016)
+
+ +## ๐Ÿ‘จโ€๐Ÿซ Get Started [๐Ÿ”](#-table-of-contents) + +For tutorials, we provide the following user guides for basic usage: + +- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/latest/migration.html) +- [Learn about Configs](https://mmaction2.readthedocs.io/en/latest/user_guides/config.html) +- [Prepare Datasets](https://mmaction2.readthedocs.io/en/latest/user_guides/prepare_dataset.html) +- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/latest/user_guides/inference.html) +- [Training and Testing](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html) + +
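+
+As a minimal, illustrative workflow (the config and checkpoint paths below are assumptions, not shipped defaults; pick real ones from the model zoo):
+
+```shell
+# Set these to a config file and checkpoint taken from the model zoo.
+CONFIG=configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+CHECKPOINT=checkpoints/tsn_kinetics400.pth
+
+python tools/train.py $CONFIG             # launch training with the default single-device settings
+python tools/test.py $CONFIG $CHECKPOINT  # evaluate a trained checkpoint with the same config
+```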
+Research works built on MMAction2 by users from community + +- Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) + +
+ +## ๐ŸŽซ License [๐Ÿ”](#-table-of-contents) + +This project is released under the [Apache 2.0 license](LICENSE). + +## ๐Ÿ–Š๏ธ Citation [๐Ÿ”](#-table-of-contents) + +If you find this project useful in your research, please consider cite: + +```BibTeX +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## ๐Ÿ™Œ Contributing [๐Ÿ”](#-table-of-contents) + +We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. + +## ๐Ÿค Acknowledgement [๐Ÿ”](#-table-of-contents) + +MMAction2 is an open-source project that is contributed by researchers and engineers from various colleges and companies. +We appreciate all the contributors who implement their methods or add new features and users who give valuable feedback. +We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop their new models. + +## ๐Ÿ—๏ธ Projects in OpenMMLab [๐Ÿ”](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries. +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox. 
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. +- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab. diff --git a/README_zh-CN.md b/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..862d202ed532115e5c1d7d0a1b286005bdd5ac01 --- /dev/null +++ b/README_zh-CN.md @@ -0,0 +1,398 @@ +
+ +
 
+
+ OpenMMLab ๅฎ˜็ฝ‘ + + + HOT + + +      + OpenMMLab ๅผ€ๆ”พๅนณๅฐ + + + TRY IT OUT + + +
+ +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) +[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) +[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/main/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) + +[๐Ÿ“˜ไธญๆ–‡ๆ–‡ๆกฃ](https://mmaction2.readthedocs.io/zh_CN/latest/index.html) | +[๐Ÿ› ๏ธๅฎ‰่ฃ…ๆŒ‡ๅ—](https://mmaction2.readthedocs.io/zh_CN/latest/get_started/installation.html) | +[๐Ÿ‘€ๆจกๅž‹ๅบ“](https://mmaction2.readthedocs.io/zh_CN/latest/modelzoo_statistics.html) | +[๐Ÿ†•ๆ›ดๆ–ฐๆ—ฅๅฟ—](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) | +[๐Ÿš€่ฟ›่กŒไธญ้กน็›ฎ](https://github.com/open-mmlab/mmaction2/projects) | +[๐Ÿค”ๆŠฅๅ‘Š้—ฎ้ข˜](https://github.com/open-mmlab/mmaction2/issues/new/choose) + +
+ +
+ +[English](/README.md) | ็ฎ€ไฝ“ไธญๆ–‡ + +## ๐Ÿ“„ ็›ฎๅฝ• + +- [๐Ÿ“„ ็›ฎๅฝ•](#-็›ฎๅฝ•) +- [๐Ÿฅณ ๐Ÿš€ ๆœ€ๆ–ฐ่ฟ›ๅฑ•](#--ๆœ€ๆ–ฐ่ฟ›ๅฑ•-) +- [๐Ÿ“– ็ฎ€ไป‹](#-็ฎ€ไป‹-) +- [๐ŸŽ ไธป่ฆๅŠŸ่ƒฝ](#-ไธป่ฆๅŠŸ่ƒฝ-) +- [๐Ÿ› ๏ธ ๅฎ‰่ฃ…](#๏ธ-ๅฎ‰่ฃ…-) +- [๐Ÿ‘€ ๆจกๅž‹ๅบ“](#-ๆจกๅž‹ๅบ“-) +- [๐Ÿ‘จโ€๐Ÿซ ๆ–ฐๆ‰‹ๅ…ฅ้—จ](#-ๆ–ฐๆ‰‹ๅ…ฅ้—จ-) +- [๐ŸŽซ ่ฎธๅฏ่ฏ](#-่ฎธๅฏ่ฏ-) +- [๐Ÿ–Š๏ธ ๅผ•็”จ](#๏ธ-ๅผ•็”จ-) +- [๐Ÿ™Œ ๅ‚ไธŽ่ดก็Œฎ](#-ๅ‚ไธŽ่ดก็Œฎ-) +- [๐Ÿค ่‡ด่ฐข](#-่‡ด่ฐข-) +- [๐Ÿ—๏ธ OpenMMLab ็š„ๅ…ถไป–้กน็›ฎ](#๏ธ-openmmlab-็š„ๅ…ถไป–้กน็›ฎ-) +- [โค๏ธ ๆฌข่ฟŽๅŠ ๅ…ฅ OpenMMLab ็คพๅŒบ](#๏ธ-ๆฌข่ฟŽๅŠ ๅ…ฅ-openmmlab-็คพๅŒบ-) + +## ๐Ÿฅณ ๐Ÿš€ ๆœ€ๆ–ฐ่ฟ›ๅฑ• [๐Ÿ”](#-table-of-contents) + +**้ป˜่ฎคๅˆ†ๆ”ฏๅทฒ็ปไปŽ `master` ๏ผˆๅฝ“ๅ‰็š„`0.x`๏ผ‰ ๅˆ‡ๆขๅˆฐ `main`๏ผˆไน‹ๅ‰็š„ `1.x`๏ผ‰๏ผŒๆˆ‘ไปฌๅปบ่ฎฎ็”จๆˆทๆ›ดๆ–ฐ่‡ณๆœ€ๆ–ฐ็‰ˆๆœฌ๏ผŒๅ…ถๆ”ฏๆŒๆ›ดๅคšๆจกๅž‹๏ผŒๆ›ดๅผบ็š„้ข„่ฎญ็ปƒๆƒ้‡๏ผŒไปฅๅŠๆ›ด็ฎ€ๆด็š„ไปฃ็ ๅฎž็Žฐใ€‚่ฏฆๆƒ…่ฏทๅ‚้˜…[่ฟ็งปๆŒ‡ๅ—](https://mmaction2.readthedocs.io/zh_cn/latest/migration.html)** + +**Release (2023.07.04)**: v1.1.0 ๆ”ฏๆŒไปฅไธ‹ๆ–ฐๅŠŸ่ƒฝ: + +- ๆ”ฏๆŒๅŸบไบŽ CLIP ็š„ๅคšๆจกๆ€ๆจกๅž‹: ActionCLIP(Arxiv'2021) ๅ’Œ CLIP4clip(ArXiv'2022) +- ๆ”ฏๆŒไธฐๅฏŒ็š„ project: ๆ‰‹ๅŠฟ่ฏ†ๅˆซ, ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ tutorial, ไปฅๅŠๅŸบไบŽ [MMRazor](https://github.com/open-mmlab/mmrazor) ็š„็Ÿฅ่ฏ†่’ธ้ฆ +- ๆ”ฏๆŒ HACS-segments ๆ•ฐๆฎ้›†(ICCV'2019), MultiSports ๆ•ฐๆฎ้›†(ICCV'2021), Kinetics-710 ๆ•ฐๆฎ้›†(Arxiv'2022) +- ๆ”ฏๆŒ VideoMAE V2(CVPR'2023), VideoMAE(NeurIPS'2022) ๆ”ฏๆŒๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ไปปๅŠก +- ๆ”ฏๆŒ TCANet(CVPR'2021) +- ๆ”ฏๆŒ [็บฏ Python ้ฃŽๆ ผ็š„้…็ฝฎๆ–‡ไปถ](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) ๅ’Œไฝฟ็”จ MIM ไธ€้”ฎไธ‹่ฝฝๆ•ฐๆฎ้›† + +## ๐Ÿ“– ็ฎ€ไป‹ [๐Ÿ”](#-table-of-contents) + +MMAction2 ๆ˜ฏไธ€ๆฌพๅŸบไบŽ PyTorch ๅผ€ๅ‘็š„่กŒไธบ่ฏ†ๅˆซๅผ€ๆบๅทฅๅ…ทๅŒ…๏ผŒๆ˜ฏ [open-mmlab](https://github.com/open-mmlab) ้กน็›ฎ็š„ไธ€ไธชๅญ้กน็›ฎใ€‚ + +
+ + +

Kinetics-400 ๆ•ฐๆฎ้›†่กŒไธบ่ฏ†ๅˆซ็ป“ๆžœ๏ผˆๅทฆ๏ผ‰ ๅ’Œ NTU-RGB+D-120 ๆ•ฐๆฎ้›†ๅŸบไบŽ้ชจๆžถ็š„่กŒไธบ่ฏ†ๅˆซ็ป“ๆžœ๏ผˆๅณ๏ผ‰

+
+ +
+
+

Kinetics-400 ๆ•ฐๆฎ้›†ๅŸบไบŽ้ชจ้ชผ็‚น็š„ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๅŠ่ง†้ข‘่กŒไธบ่ฏ†ๅˆซ็ป“ๆžœ

+
+
+
+

AVA-2.1 ๆ•ฐๆฎ้›†ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹็ป“ๆžœ

+
+ +## ๐ŸŽ ไธป่ฆๅŠŸ่ƒฝ [๐Ÿ”](#-table-of-contents) + +- **ๆจกๅ—ๅŒ–่ฎพ่ฎก**๏ผš ๆˆ‘ไปฌๅฐ†่ง†้ข‘็†่งฃๆก†ๆžถๆ‹†ๅˆ†ๆˆไบ†ไธๅŒๆจกๅ—๏ผŒ็”จๆˆทๅฏไปฅๅพˆๆ–นไพฟๅœฐ้€š่ฟ‡็ป„ๅˆไธๅŒ็š„ๆจกๅ—ๆฅๆž„ๅปบๅ‡บ่‡ชๅฎšไน‰็š„่ง†้ข‘็†่งฃๆก†ๆžถใ€‚ + +- **ๆ”ฏๆŒไบ”็งไธป่ฆ็š„่ง†้ข‘็†่งฃไปปๅŠก**๏ผš MMAction2 ไธบ่ง†้ข‘็†่งฃไปปๅŠกๅฎž็Žฐไบ†ๅคš็งๅคšๆ ท็š„็ฎ—ๆณ•๏ผŒๅŒ…ๆ‹ฌ่กŒไธบ่ฏ†ๅˆซ๏ผŒๆ—ถๅบๅŠจไฝœๅฎšไฝ๏ผŒๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹๏ผŒๅŸบไบŽ้ชจ้ชผ็‚น็š„่กŒไธบ่ฏ†ๅˆซ๏ผŒไปฅๅŠ่ง†้ข‘ๆฃ€็ดขใ€‚ + +- **่ฏฆๅฐฝ็š„ๅ•ๅ…ƒๆต‹่ฏ•ๅ’Œๆ–‡ๆกฃ**๏ผšๆˆ‘ไปฌๆไพ›ไบ†่ฏฆๅฐฝ็š„ๆ–‡ๆกฃๅ’Œ API ๅ‚่€ƒๆ‰‹ๅ†Œ๏ผŒไปฅๅŠๅ•ๅ…ƒๆต‹่ฏ•ใ€‚ + +## ๐Ÿ› ๏ธ ๅฎ‰่ฃ… [๐Ÿ”](#-table-of-contents) + +MMAction2ไพ่ต–ไบŽ [PyTorch](https://pytorch.org/)๏ผŒ[MMCV](https://github.com/open-mmlab/mmcv)๏ผŒ[MMEngine](https://github.com/open-mmlab/mmengine)๏ผŒ[MMDetection](https://github.com/open-mmlab/mmdetection) ๏ผˆๅฏ้€‰๏ผ‰ๅ’Œ [MMPose](https://github.com/open-mmlab/mmpose) ๏ผˆๅฏ้€‰๏ผ‰ + +ๅ…ทไฝ“ๆญฅ้ชค่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆ–‡ๆกฃ](https://mmaction2.readthedocs.io/zh_cn/latest/get_started/installation.html)ใ€‚ + +
+ๅฟซ้€Ÿๅฎ‰่ฃ…
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+conda install pytorch torchvision -c pytorch # ่ฏฅๅ‘ฝไปคๅฐ†่‡ชๅŠจๅฎ‰่ฃ…ๆœ€ๆ–ฐ็‰ˆ็š„ PyTorch ๅ’Œ cudatoolkit๏ผŒ่ฏท็กฎ่ฎคๆญคๆ˜ฏๅฆๅŒน้…ไฝ ็š„ๅฝ“ๅ‰็Žฏๅขƒใ€‚
+pip install -U openmim
+mim install mmengine
+mim install mmcv
+mim install mmdet # ๅฏ้€‰
+mim install mmpose # ๅฏ้€‰
+git clone https://github.com/open-mmlab/mmaction2.git
+cd mmaction2
+pip install -v -e .
+```
+
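+ๅฎ‰่ฃ…ๅฎŒๆˆๅŽ๏ผŒๅฏไปฅ็”จไธ‹้ข่ฟ™ๆฎต็ฎ€็Ÿญ็š„ Python ไปฃ็ ๅšไธ€ไธช็คบๆ„ๆ€ง็š„็Žฏๅขƒๆฃ€ๆŸฅ๏ผˆไป…ไธบๅ‚่€ƒ่‰ๅ›พ๏ผŒ่พ“ๅ‡บ็š„็‰ˆๆœฌๅทไผš้šๅฎž้™…ๅฎ‰่ฃ…็š„็‰ˆๆœฌ่€Œๅผ‚๏ผ‰๏ผš
+
+```python
+# ๆฃ€ๆŸฅ mmaction2 ๅŠๅ…ถไธป่ฆไพ่ต–ๆ˜ฏๅฆๅฎ‰่ฃ…ๆˆๅŠŸ
+import mmaction
+import mmcv
+import mmengine
+
+print('mmaction2:', mmaction.__version__)
+print('mmcv:', mmcv.__version__)
+print('mmengine:', mmengine.__version__)
+```
+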
+ +## ๐Ÿ‘€ ๆจกๅž‹ๅบ“ [๐Ÿ”](#-table-of-contents) + +็ป“ๆžœๅŠๆจกๅž‹ไฝไบŽ[ๆจกๅž‹ๅบ“](https://mmaction2.readthedocs.io/zh_cn/latest/modelzoo_statistics.html) + +
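+ๅฆ‚้œ€ๅœจๆœฌๅœฐ่Žทๅ–ๆจกๅž‹ๅบ“ไธญๆŸไธชๆจกๅž‹็š„้…็ฝฎๆ–‡ไปถๅ’Œ้ข„่ฎญ็ปƒๆƒ้‡๏ผŒๅฏไปฅๅ€ŸๅŠฉ MIM ็š„ Python ๆŽฅๅฃไธ‹่ฝฝใ€‚ไธ‹้ขๆ˜ฏไธ€ไธช็คบๆ„ๆ€ง็š„่‰ๅ›พ๏ผŒๅ…ถไธญ็š„้…็ฝฎๅ็งฐไป…ไธบๅ‡่ฎพ็คบไพ‹๏ผŒๅฎž้™…ๅฏ็”จ็š„ๅ็งฐ่ฏทไปฅๆจกๅž‹ๅบ“้กต้ขไธบๅ‡†๏ผš
+
+```python
+# ็”จ MIM ็š„ Python ๆŽฅๅฃไธ‹่ฝฝๆจกๅž‹ๅบ“ไธญ็š„้…็ฝฎๆ–‡ไปถๅŠๅ…ถๅฏนๅบ”ๆƒ้‡
+from mim import download
+
+# ้…็ฝฎๅ็งฐไธบๅ‡่ฎพ็คบไพ‹๏ผŒ่ฏทๆ›ฟๆขไธบๆจกๅž‹ๅบ“ไธญๅฎž้™…ๅญ˜ๅœจ็š„้…็ฝฎๅ
+download(
+    'mmaction2',
+    ['tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb'],
+    dest_root='.')
+```
+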
+
+**ๆจกๅž‹ๆ”ฏๆŒ**
+
+- **่กŒไธบ่ฏ†ๅˆซ**๏ผšC3D (CVPR'2014)ใ€TSN (ECCV'2016)ใ€I3D (CVPR'2017)ใ€C2D (CVPR'2018)ใ€I3D Non-Local (CVPR'2018)ใ€R(2+1)D (CVPR'2018)ใ€TRN (ECCV'2018)ใ€TSM (ICCV'2019)ใ€TSM Non-Local (ICCV'2019)ใ€SlowOnly (ICCV'2019)ใ€SlowFast (ICCV'2019)ใ€CSN (ICCV'2019)ใ€TIN (AAAI'2020)ใ€TPN (CVPR'2020)ใ€X3D (CVPR'2020)ใ€MultiModality: Audio (ArXiv'2020)ใ€TANet (ArXiv'2020)ใ€TimeSformer (ICML'2021)ใ€ActionCLIP (ArXiv'2021)ใ€VideoSwin (CVPR'2022)ใ€VideoMAE (NeurIPS'2022)ใ€MViT V2 (CVPR'2022)ใ€UniFormer V1 (ICLR'2022)ใ€UniFormer V2 (Arxiv'2022)ใ€VideoMAE V2 (CVPR'2023)
+- **ๆ—ถๅบๅŠจไฝœๅฎšไฝ**๏ผšBSN (ECCV'2018)ใ€BMN (ICCV'2019)ใ€TCANet (CVPR'2021)
+- **ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹**๏ผšACRN (ECCV'2018)ใ€SlowOnly+Fast R-CNN (ICCV'2019)ใ€SlowFast+Fast R-CNN (ICCV'2019)ใ€LFB (CVPR'2019)ใ€VideoMAE (NeurIPS'2022)
+- **ๅŸบไบŽ้ชจ้ชผ็‚น็š„่กŒไธบ่ฏ†ๅˆซ**๏ผšST-GCN (AAAI'2018)ใ€2s-AGCN (CVPR'2019)ใ€PoseC3D (CVPR'2022)ใ€STGCN++ (ArXiv'2022)ใ€CTRGCN (CVPR'2021)ใ€MSG3D (CVPR'2020)
+- **่ง†้ข‘ๆฃ€็ดข**๏ผšCLIP4Clip (ArXiv'2022)
+
+
+**ๆ•ฐๆฎ้›†ๆ”ฏๆŒ**
+
+- **่กŒไธบ่ฏ†ๅˆซ**๏ผšHMDB51 (ๅฎ˜็ฝ‘) (ICCV'2011)ใ€UCF101 (ๅฎ˜็ฝ‘) (CRCV-IR-12-01)ใ€ActivityNet (ๅฎ˜็ฝ‘) (CVPR'2015)ใ€Kinetics-[400/600/700] (ๅฎ˜็ฝ‘) (CVPR'2017)ใ€SthV1 (ICCV'2017)ใ€SthV2 (ๅฎ˜็ฝ‘) (ICCV'2017)ใ€Diving48 (ๅฎ˜็ฝ‘) (ECCV'2018)ใ€Jester (ๅฎ˜็ฝ‘) (ICCV'2019)ใ€Moments in Time (ๅฎ˜็ฝ‘) (TPAMI'2019)ใ€Multi-Moments in Time (ๅฎ˜็ฝ‘) (ArXiv'2019)ใ€HVU (ๅฎ˜็ฝ‘) (ECCV'2020)ใ€OmniSource (ๅฎ˜็ฝ‘) (ECCV'2020)ใ€FineGYM (ๅฎ˜็ฝ‘) (CVPR'2020)ใ€Kinetics-710 (ๅฎ˜็ฝ‘) (Arxiv'2022)
+- **ๆ—ถๅบๅŠจไฝœๅฎšไฝ**๏ผšTHUMOS14 (ๅฎ˜็ฝ‘) (THUMOS Challenge 2014)ใ€ActivityNet (ๅฎ˜็ฝ‘) (CVPR'2015)ใ€HACS (ๅฎ˜็ฝ‘) (ICCV'2019)
+- **ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹**๏ผšUCF101-24* (ๅฎ˜็ฝ‘) (CRCV-IR-12-01)ใ€JHMDB* (ๅฎ˜็ฝ‘) (ICCV'2015)ใ€AVA (ๅฎ˜็ฝ‘) (CVPR'2018)ใ€AVA-Kinetics (ๅฎ˜็ฝ‘) (Arxiv'2020)
+- **ๅŸบไบŽ้ชจๆžถ็š„่กŒไธบ่ฏ†ๅˆซ**๏ผšPoseC3D-FineGYM (ๅฎ˜็ฝ‘) (ArXiv'2021)ใ€PoseC3D-NTURGB+D (ๅฎ˜็ฝ‘) (ArXiv'2021)ใ€PoseC3D-UCF101 (ๅฎ˜็ฝ‘) (ArXiv'2021)ใ€PoseC3D-HMDB51 (ๅฎ˜็ฝ‘) (ArXiv'2021)
+- **่ง†้ข‘ๆฃ€็ดข**๏ผšMSRVTT (ๅฎ˜็ฝ‘) (CVPR'2016)
+
+ +## ๐Ÿ‘จโ€๐Ÿซ ๆ–ฐๆ‰‹ๅ…ฅ้—จ [๐Ÿ”](#-table-of-contents) + +ๆˆ‘ไปฌๆไพ›ไบ†ไธ€็ณปๅˆ—็ฎ€ๆ˜Ž็š„ๆ•™็จ‹๏ผŒๅธฎๅŠฉๆ–ฐ็”จๆˆท่ฝปๆพไธŠๆ‰‹ไฝฟ็”จ๏ผš + +- [ไปŽ MMAction2 0.X ่ฟ็งป](https://mmaction2.readthedocs.io/zh_cn/latest/migration.html) +- [ๅญฆไน ้…็ฝฎ็›ธๅ…ณ็Ÿฅ่ฏ†](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/config.html) +- [ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/prepare_dataset.html) +- [ไฝฟ็”จ็Žฐๆœ‰ๆจกๅž‹่ฟ›่กŒๆŽจ็†](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/inference.html) +- [่ฎญ็ปƒไธŽๆต‹่ฏ•](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/train_test.html) + +
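+ไธ‹้ขๆ˜ฏไธ€ไธชไฝฟ็”จ็Žฐๆœ‰ๆจกๅž‹ๅฏนๅ•ไธช่ง†้ข‘่ฟ›่กŒๆŽจ็†็š„ๆœ€ๅฐ็คบไพ‹่‰ๅ›พ๏ผŒไป…็”จไบŽ่ฏดๆ˜ŽๅŸบๆœฌ็”จๆณ•๏ผšๅ…ถไธญ็š„้…็ฝฎๆ–‡ไปถ่ทฏๅพ„ใ€ๆƒ้‡ๆ–‡ไปถไธŽ่ง†้ข‘ๆ–‡ไปถๅๅ‡ไธบๅ‡่ฎพ๏ผŒ่ฏทๆ›ฟๆขไธบๅฎž้™…ๅญ˜ๅœจ็š„ๆ–‡ไปถ๏ผ›ๅฎŒๆ•ด็”จๆณ•่ฏทไปฅไธŠๆ–น็š„ๆŽจ็†ๆ•™็จ‹ไธบๅ‡†ใ€‚
+
+```python
+# ๆœ€ๅฐๆŽจ็†็คบไพ‹๏ผˆ่‰ๅ›พ๏ผ‰๏ผš็”จ็Žฐๆœ‰้…็ฝฎๅ’Œๆƒ้‡ๅฏนๅ•ไธช่ง†้ข‘ๅš่กŒไธบ่ฏ†ๅˆซ
+from mmaction.apis import inference_recognizer, init_recognizer
+
+# ไปฅไธ‹่ทฏๅพ„ไป…ไธบๅ‡่ฎพ็คบไพ‹๏ผŒ่ฏทๆ›ฟๆขไธบๅฎž้™…ๆ–‡ไปถ
+config_file = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
+checkpoint_file = 'checkpoints/tsn_kinetics400.pth'
+video_file = 'demo/demo.mp4'
+
+# ๆž„ๅปบๆจกๅž‹ๅนถๅŠ ่ฝฝๆƒ้‡๏ผˆๆœ‰ GPU ๆ—ถๅฏๆ”นไธบ device='cuda:0'๏ผ‰
+model = init_recognizer(config_file, checkpoint_file, device='cpu')
+
+# ๅฏนๅ•ไธช่ง†้ข‘่ฟ›่กŒๆŽจ็†๏ผŒ่ฟ”ๅ›žๅŒ…ๅซ้ข„ๆต‹็ป“ๆžœ็š„ ActionDataSample
+result = inference_recognizer(model, video_file)
+print(result)
+```
+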
+ๅŸบไบŽ MMAction2 ็š„็คพๅŒบๅทฅไฝœ + +- Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) + +
+ +## ๐ŸŽซ ่ฎธๅฏ่ฏ [๐Ÿ”](#-table-of-contents) + +ๆœฌ้กน็›ฎๅŸบไบŽ [Apache 2.0 license](LICENSE) ๅ‘ๅธƒใ€‚ + +## ๐Ÿ–Š๏ธ ๅผ•็”จ [๐Ÿ”](#-table-of-contents) + +ๅฆ‚ไฝ ๅ‘็Žฐๆœฌ้กน็›ฎๅฏนไฝ ็š„็ ”็ฉถๆœ‰ๅธฎๅŠฉ๏ผŒ่ฏทๅ‚่€ƒๅฆ‚ไธ‹ bibtex ๅผ•็”จ MMAction2ใ€‚ + +```BibTeX +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## ๐Ÿ™Œ ๅ‚ไธŽ่ดก็Œฎ [๐Ÿ”](#-table-of-contents) + +ๆˆ‘ไปฌๆ„Ÿ่ฐขๆ‰€ๆœ‰็š„่ดก็Œฎ่€…ไธบๆ”น่ฟ›ๅ’Œๆๅ‡ MMAction2 ๆ‰€ไฝœๅ‡บ็š„ๅŠชๅŠ›ใ€‚่ฏทๅ‚่€ƒ[่ดก็ŒฎๆŒ‡ๅ—](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md)ๆฅไบ†่งฃๅ‚ไธŽ้กน็›ฎ่ดก็Œฎ็š„็›ธๅ…ณๆŒ‡ๅผ•ใ€‚ + +## ๐Ÿค ่‡ด่ฐข [๐Ÿ”](#-table-of-contents) + +MMAction2 ๆ˜ฏไธ€ๆฌพ็”ฑๆฅ่‡ชไธๅŒ้ซ˜ๆ กๅ’Œไผไธš็š„็ ”ๅ‘ไบบๅ‘˜ๅ…ฑๅŒๅ‚ไธŽ่ดก็Œฎ็š„ๅผ€ๆบ้กน็›ฎใ€‚ๆˆ‘ไปฌๆ„Ÿ่ฐขๆ‰€ๆœ‰ไธบ้กน็›ฎๆไพ›็ฎ—ๆณ•ๅค็Žฐๅ’Œๆ–ฐๅŠŸ่ƒฝๆ”ฏๆŒ็š„่ดก็Œฎ่€…๏ผŒไปฅๅŠๆไพ›ๅฎ่ดตๅ้ฆˆ็š„็”จๆˆทใ€‚ ๆˆ‘ไปฌๅธŒๆœ›ๆญคๅทฅๅ…ท็ฎฑๅฏไปฅๅธฎๅŠฉๅคงๅฎถๆฅๅค็Žฐๅทฒๆœ‰็š„ๆ–นๆณ•ๅ’Œๅผ€ๅ‘ๆ–ฐ็š„ๆ–นๆณ•๏ผŒไปŽ่€Œไธบ็ ”็ฉถ็คพๅŒบ่ดก็ŒฎๅŠ›้‡ใ€‚ + +## ๐Ÿ—๏ธ OpenMMLab ็š„ๅ…ถไป–้กน็›ฎ [๐Ÿ”](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab ๆทฑๅบฆๅญฆไน ๆจกๅž‹่ฎญ็ปƒๅŸบ็ก€ๅบ“ +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab ่ฎก็ฎ—ๆœบ่ง†่ง‰ๅŸบ็ก€ๅบ“ +- [MIM](https://github.com/open-mmlab/mim): MIM ๆ˜ฏ OpenMMlab ้กน็›ฎใ€็ฎ—ๆณ•ใ€ๆจกๅž‹็š„็ปŸไธ€ๅ…ฅๅฃ +- [MMEval](https://github.com/open-mmlab/mmeval): ็ปŸไธ€ๅผ€ๆ”พ็š„่ทจๆก†ๆžถ็ฎ—ๆณ•่ฏ„ๆต‹ๅบ“ +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab ๆทฑๅบฆๅญฆไน ้ข„่ฎญ็ปƒๅทฅๅ…ท็ฎฑ +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab ็›ฎๆ ‡ๆฃ€ๆต‹ๅทฅๅ…ท็ฎฑ +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab ๆ–ฐไธ€ไปฃ้€š็”จ 3D ็›ฎๆ ‡ๆฃ€ๆต‹ๅนณๅฐ +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab ๆ—‹่ฝฌๆก†ๆฃ€ๆต‹ๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO ็ณปๅˆ—ๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab ่ฏญไน‰ๅˆ†ๅ‰ฒๅทฅๅ…ท็ฎฑ +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab ๅ…จๆต็จ‹ๆ–‡ๅญ—ๆฃ€ๆต‹่ฏ†ๅˆซ็†่งฃๅทฅๅ…ทๅŒ… +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab ๅงฟๆ€ไผฐ่ฎกๅทฅๅ…ท็ฎฑ +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab ไบบไฝ“ๅ‚ๆ•ฐๅŒ–ๆจกๅž‹ๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab ่‡ช็›‘็ฃๅญฆไน ๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab ๆจกๅž‹ๅŽ‹็ผฉๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab ๅฐ‘ๆ ทๆœฌๅญฆไน ๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab ๆ–ฐไธ€ไปฃ่ง†้ข‘็†่งฃๅทฅๅ…ท็ฎฑ +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab ไธ€ไฝ“ๅŒ–่ง†้ข‘็›ฎๆ ‡ๆ„Ÿ็Ÿฅๅนณๅฐ +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab ๅ…‰ๆตไผฐ่ฎกๅทฅๅ…ท็ฎฑไธŽๆต‹่ฏ•ๅŸบๅ‡† +- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab ๆ–ฐไธ€ไปฃไบบๅทฅๆ™บ่ƒฝๅ†…ๅฎน็”Ÿๆˆ๏ผˆAIGC๏ผ‰ๅทฅๅ…ท็ฎฑ +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab ๅ›พ็‰‡่ง†้ข‘็”Ÿๆˆๆจกๅž‹ๅทฅๅ…ท็ฎฑ +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab ๆจกๅž‹้ƒจ็ฝฒๆก†ๆžถ +- [Playground](https://github.com/open-mmlab/playground): ๆ”ถ้›†ๅ’Œๅฑ•็คบ OpenMMLab ็›ธๅ…ณ็š„ๅ‰ๆฒฟใ€ๆœ‰่ถฃ็š„็คพๅŒบ้กน็›ฎ + +## โค๏ธ 
ๆฌข่ฟŽๅŠ ๅ…ฅ OpenMMLab ็คพๅŒบ [๐Ÿ”](#-table-of-contents) + +ๆ‰ซๆไธ‹ๆ–น็š„ไบŒ็ปด็ ๅฏๅ…ณๆณจ OpenMMLab ๅ›ข้˜Ÿ็š„ [็ŸฅไนŽๅฎ˜ๆ–น่ดฆๅท](https://www.zhihu.com/people/openmmlab)๏ผŒๆ‰ซๆไธ‹ๆ–นๅพฎไฟกไบŒ็ปด็ ๆทปๅŠ ๅ–ตๅ–ตๅฅฝๅ‹๏ผŒ่ฟ›ๅ…ฅ MMAction2 ๅพฎไฟกไบคๆต็คพ็พคใ€‚ใ€ๅŠ ๅฅฝๅ‹็”ณ่ฏทๆ ผๅผ๏ผš็ ”็ฉถๆ–นๅ‘+ๅœฐๅŒบ+ๅญฆๆ ก/ๅ…ฌๅธ+ๅง“ๅใ€‘ + +
+ +
+ +ๆˆ‘ไปฌไผšๅœจ OpenMMLab ็คพๅŒบไธบๅคงๅฎถ + +- ๐Ÿ“ข ๅˆ†ไบซ AI ๆก†ๆžถ็š„ๅ‰ๆฒฟๆ ธๅฟƒๆŠ€ๆœฏ +- ๐Ÿ’ป ่งฃ่ฏป PyTorch ๅธธ็”จๆจกๅ—ๆบ็  +- ๐Ÿ“ฐ ๅ‘ๅธƒ OpenMMLab ็š„็›ธๅ…ณๆ–ฐ้—ป +- ๐Ÿš€ ไป‹็ป OpenMMLab ๅผ€ๅ‘็š„ๅ‰ๆฒฟ็ฎ—ๆณ• +- ๐Ÿƒ ่Žทๅ–ๆ›ด้ซ˜ๆ•ˆ็š„้—ฎ้ข˜็ญ”็–‘ๅ’Œๆ„่งๅ้ฆˆ +- ๐Ÿ”ฅ ๆไพ›ไธŽๅ„่กŒๅ„ไธšๅผ€ๅ‘่€…ๅ……ๅˆ†ไบคๆต็š„ๅนณๅฐ + +ๅนฒ่ดงๆปกๆปก ๐Ÿ“˜๏ผŒ็ญ‰ไฝ ๆฅๆ’ฉ ๐Ÿ’—๏ผŒOpenMMLab ็คพๅŒบๆœŸๅพ…ๆ‚จ็š„ๅŠ ๅ…ฅ ๐Ÿ‘ฌ diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..25c9e4c3342f63b162455223cb63ef63eafb2ae8 --- /dev/null +++ b/configs/_base_/default_runtime.py @@ -0,0 +1,24 @@ +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/configs/_base_/models/audioonly_r50.py b/configs/_base_/models/audioonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..108cce24c978b80a53c819ccfbe68fb6b36f1aac --- /dev/null +++ b/configs/_base_/models/audioonly_r50.py @@ -0,0 +1,16 @@ +# model settings +model = dict( + type='RecognizerAudio', + backbone=dict( + type='ResNetAudio', + depth=50, + pretrained=None, + in_channels=1, + norm_eval=False), + cls_head=dict( + type='TSNAudioHead', + num_classes=400, + in_channels=1024, + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob')) diff --git a/configs/_base_/models/bmn_400x100.py b/configs/_base_/models/bmn_400x100.py new file mode 100644 index 0000000000000000000000000000000000000000..22178c00124aa9587b826444f3630cf16b0442f7 --- /dev/null +++ b/configs/_base_/models/bmn_400x100.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='BMN', + temporal_dim=100, + boundary_ratio=0.5, + num_samples=32, + num_samples_per_bin=3, + feat_dim=400, + soft_nms_alpha=0.4, + soft_nms_low_threshold=0.5, + soft_nms_high_threshold=0.9, + post_process_top_k=100) diff --git a/configs/_base_/models/bsn_pem.py b/configs/_base_/models/bsn_pem.py new file mode 100644 index 0000000000000000000000000000000000000000..7d5910c46fe935a305baea9d5fc24ac0c27800c8 --- /dev/null +++ b/configs/_base_/models/bsn_pem.py @@ -0,0 +1,13 @@ +# model settings +model = dict( + type='PEM', + pem_feat_dim=32, + pem_hidden_dim=256, + pem_u_ratio_m=1, + pem_u_ratio_l=2, + pem_high_temporal_iou_threshold=0.6, + pem_low_temporal_iou_threshold=0.2, + soft_nms_alpha=0.75, + soft_nms_low_threshold=0.65, + soft_nms_high_threshold=0.9, + post_process_top_k=100) diff --git a/configs/_base_/models/bsn_tem.py b/configs/_base_/models/bsn_tem.py new file mode 100644 index 0000000000000000000000000000000000000000..07433c95fc8dc8e7255933369f7b58ff3659c200 --- /dev/null +++ b/configs/_base_/models/bsn_tem.py @@ -0,0 +1,8 @@ +# model settings +model = dict( + type='TEM', + temporal_dim=100, + boundary_ratio=0.1, + tem_feat_dim=400, + 
tem_hidden_dim=512, + tem_match_threshold=0.5) diff --git a/configs/_base_/models/c2d_r50.py b/configs/_base_/models/c2d_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..48943d3696913f8fecbb25055e094755d5ecd34b --- /dev/null +++ b/configs/_base_/models/c2d_r50.py @@ -0,0 +1,20 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='C2D', + depth=50, + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + norm_eval=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/c3d_sports1m_pretrained.py b/configs/_base_/models/c3d_sports1m_pretrained.py new file mode 100644 index 0000000000000000000000000000000000000000..396e0910a7404be2bd5e6470b8d0244a4f4bba07 --- /dev/null +++ b/configs/_base_/models/c3d_sports1m_pretrained.py @@ -0,0 +1,28 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='C3D', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/c3d/c3d_sports1m_pretrain_20201016-dcc47ddc.pth', # noqa: E501 + style='pytorch', + conv_cfg=dict(type='Conv3d'), + norm_cfg=None, + act_cfg=dict(type='ReLU'), + dropout_ratio=0.5, + init_std=0.005), + cls_head=dict( + type='I3DHead', + num_classes=101, + in_channels=4096, + spatial_type=None, + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[104, 117, 128], + std=[1, 1, 1], + format_shape='NCTHW'), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/i3d_r50.py b/configs/_base_/models/i3d_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..6308077eb36edf3db194c7cabc5566ec931798fd --- /dev/null +++ b/configs/_base_/models/i3d_r50.py @@ -0,0 +1,30 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3d', + pretrained2d=True, + pretrained='torchvision://resnet50', + depth=50, + conv1_kernel=(5, 7, 7), + conv1_stride_t=2, + pool1_stride_t=2, + conv_cfg=dict(type='Conv3d'), + norm_eval=False, + inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)), + zero_init_residual=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +# This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332 # noqa: E501 diff --git a/configs/_base_/models/ircsn_r152.py b/configs/_base_/models/ircsn_r152.py new file mode 100644 index 0000000000000000000000000000000000000000..cbdcd615e70539a10caeeddbbba72b6d8124f416 --- /dev/null +++ b/configs/_base_/models/ircsn_r152.py @@ -0,0 +1,28 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dCSN', + pretrained2d=False, + pretrained=None, + depth=152, + with_pool2=False, + bottleneck_mode='ir', + norm_eval=False, + zero_init_residual=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + 
data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + # model training and testing settings + train_cfg=None, + test_cfg=dict(max_testing_views=10)) diff --git a/configs/_base_/models/mvit_small.py b/configs/_base_/models/mvit_small.py new file mode 100644 index 0000000000000000000000000000000000000000..44fb80d6565f6481101d057265c0f8048a31c1fd --- /dev/null +++ b/configs/_base_/models/mvit_small.py @@ -0,0 +1,14 @@ +model = dict( + type='Recognizer3D', + backbone=dict(type='MViT', arch='small', drop_path_rate=0.2), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + cls_head=dict( + type='MViTHead', + in_channels=768, + num_classes=400, + label_smooth_eps=0.1, + average_clips='prob')) diff --git a/configs/_base_/models/r2plus1d_r34.py b/configs/_base_/models/r2plus1d_r34.py new file mode 100644 index 0000000000000000000000000000000000000000..7650de7d9144e9bc455dc8569bbe3f02294a9be0 --- /dev/null +++ b/configs/_base_/models/r2plus1d_r34.py @@ -0,0 +1,31 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet2Plus1d', + depth=34, + pretrained=None, + pretrained2d=False, + norm_eval=False, + conv_cfg=dict(type='Conv2plus1d'), + norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3), + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(1, 1, 1, 1), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 2, 2, 2), + zero_init_residual=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=512, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/slowfast_r50.py b/configs/_base_/models/slowfast_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..d4510d2ff93c1840c25645cc09adb22b745a8040 --- /dev/null +++ b/configs/_base_/models/slowfast_r50.py @@ -0,0 +1,42 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowFast', + pretrained=None, + resample_rate=8, # tau + speed_ratio=8, # alpha + channel_ratio=8, # beta_inv + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_eval=False)), + cls_head=dict( + type='SlowFastHead', + in_channels=2304, # 2048+256 + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/slowonly_r50.py b/configs/_base_/models/slowonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..b9ab55a92e52ff8305528c648cf1f8015ac10276 --- /dev/null +++ b/configs/_base_/models/slowonly_r50.py @@ -0,0 +1,24 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + 
pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + lateral=False, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/swin_tiny.py b/configs/_base_/models/swin_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..6186e7c79e19894fa4268365008e8985f2db0714 --- /dev/null +++ b/configs/_base_/models/swin_tiny.py @@ -0,0 +1,28 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='SwinTransformer3D', + arch='tiny', + pretrained=None, + pretrained2d=True, + patch_size=(2, 4, 4), + window_size=(8, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + patch_norm=True), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + cls_head=dict( + type='I3DHead', + in_channels=768, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob')) diff --git a/configs/_base_/models/tanet_r50.py b/configs/_base_/models/tanet_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..a82926fa2bd9f7d31063ccde0eaff8b61ac62acc --- /dev/null +++ b/configs/_base_/models/tanet_r50.py @@ -0,0 +1,23 @@ +# model settings +model = dict( + type='Recognizer2D', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.5], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + backbone=dict( + type='TANet', + pretrained='torchvision://resnet50', + depth=50, + num_segments=8, + tam_cfg=None), + cls_head=dict( + type='TSMHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + average_clips='prob')) diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..6de4d71ff6e5b8f9083250f2b56a157e81fa6dad --- /dev/null +++ b/configs/_base_/models/tin_r50.py @@ -0,0 +1,29 @@ +# model settings + +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW') + +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTIN', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=4), + cls_head=dict( + type='TSMHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=False, + average_clips='prob'), + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + # model training and testing settings + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tpn_slowonly_r50.py b/configs/_base_/models/tpn_slowonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..5ba75c93ad4f128f6a51553adfd0a558405e289c --- /dev/null +++ b/configs/_base_/models/tpn_slowonly_r50.py @@ -0,0 +1,45 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained='torchvision://resnet50', + lateral=False, + out_indices=(2, 3), + 
conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + neck=dict( + type='TPN', + in_channels=(1024, 2048), + out_channels=1024, + spatial_modulation_cfg=dict( + in_channels=(1024, 2048), out_channels=2048), + temporal_modulation_cfg=dict(downsample_scales=(8, 8)), + upsample_cfg=dict(scale_factor=(1, 1, 1)), + downsample_cfg=dict(downsample_scale=(1, 1, 1)), + level_fusion_cfg=dict( + in_channels=(1024, 1024), + mid_channels=(1024, 1024), + out_channels=2048, + downsample_scales=((1, 1, 1), (1, 1, 1))), + aux_head_cfg=dict(out_channels=400, loss_weight=0.5)), + cls_head=dict( + type='TPNHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + # model training and testing settings + train_cfg=None, + test_cfg=dict(fcn_test=True)) diff --git a/configs/_base_/models/tpn_tsm_r50.py b/configs/_base_/models/tpn_tsm_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..9074c9cc394d149433959ee60f34db19ea81f9ad --- /dev/null +++ b/configs/_base_/models/tpn_tsm_r50.py @@ -0,0 +1,40 @@ +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + out_indices=(2, 3), + norm_eval=False, + shift_div=8), + neck=dict( + type='TPN', + in_channels=(1024, 2048), + out_channels=1024, + spatial_modulation_cfg=dict( + in_channels=(1024, 2048), out_channels=2048), + temporal_modulation_cfg=dict(downsample_scales=(8, 8)), + upsample_cfg=dict(scale_factor=(1, 1, 1)), + downsample_cfg=dict(downsample_scale=(1, 1, 1)), + level_fusion_cfg=dict( + in_channels=(1024, 1024), + mid_channels=(1024, 1024), + out_channels=2048, + downsample_scales=((1, 1, 1), (1, 1, 1))), + aux_head_cfg=dict(out_channels=174, loss_weight=0.5)), + cls_head=dict( + type='TPNHead', + num_classes=174, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + train_cfg=None, + test_cfg=dict(fcn_test=True)) diff --git a/configs/_base_/models/trn_r50.py b/configs/_base_/models/trn_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..88caf404767c06312797da45082229b370187650 --- /dev/null +++ b/configs/_base_/models/trn_r50.py @@ -0,0 +1,25 @@ +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + partial_bn=True), + cls_head=dict( + type='TRNHead', + num_classes=400, + in_channels=2048, + num_segments=8, + spatial_type='avg', + relation_type='TRNMultiScale', + hidden_dim=256, + dropout_ratio=0.8, + init_std=0.001, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW')) diff --git a/configs/_base_/models/tsm_mobilenet_v2.py b/configs/_base_/models/tsm_mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6b8fa9239ea2f67cea8dfd2eb6c16aca0964230c --- /dev/null +++ b/configs/_base_/models/tsm_mobilenet_v2.py @@ -0,0 +1,27 @@ +# model 
settings +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + +model = dict( + type='Recognizer2D', + backbone=dict( + type='MobileNetV2TSM', + shift_div=8, + num_segments=8, + is_shift=True, + pretrained='mmcls://mobilenet_v2'), + cls_head=dict( + type='TSMHead', + num_segments=8, + num_classes=400, + in_channels=1280, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True, + average_clips='prob'), + # model training and testing settings + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsm_mobileone_s4.py b/configs/_base_/models/tsm_mobileone_s4.py new file mode 100644 index 0000000000000000000000000000000000000000..27b5a410e3d35e5a61ec35107f630ebc98d26a07 --- /dev/null +++ b/configs/_base_/models/tsm_mobileone_s4.py @@ -0,0 +1,31 @@ +# model settings +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + +checkpoint = ('https://download.openmmlab.com/mmclassification/' + 'v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.pth') +model = dict( + type='Recognizer2D', + backbone=dict( + type='MobileOneTSM', + arch='s4', + shift_div=8, + num_segments=8, + is_shift=True, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, prefix='backbone')), + cls_head=dict( + type='TSMHead', + num_segments=8, + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True, + average_clips='prob'), + # model training and testing settings + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsm_r50.py b/configs/_base_/models/tsm_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..23b1eda5f10bbf0d24623d9b3eeb72da87d07654 --- /dev/null +++ b/configs/_base_/models/tsm_r50.py @@ -0,0 +1,24 @@ +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=8), + cls_head=dict( + type='TSMHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True, + average_clips='prob'), + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsn_mobileone_s0.py b/configs/_base_/models/tsn_mobileone_s0.py new file mode 100644 index 0000000000000000000000000000000000000000..5a67c9617cc3eeee3ad61e061284251453ba29fc --- /dev/null +++ b/configs/_base_/models/tsn_mobileone_s0.py @@ -0,0 +1,26 @@ +checkpoint = ('https://download.openmmlab.com/mmclassification/' + 'v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth') +model = dict( + type='Recognizer2D', + backbone=dict( + type='mmpretrain.MobileOne', + arch='s0', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, prefix='backbone'), + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=400, + in_channels=1024, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 
103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsn_r50.py b/configs/_base_/models/tsn_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..c277f9ae628cc6d833e8c6f63ae34eafa4bf2595 --- /dev/null +++ b/configs/_base_/models/tsn_r50.py @@ -0,0 +1,23 @@ +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + depth=50, + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/x3d.py b/configs/_base_/models/x3d.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a8a3b661d864f02152f73711fa3c26b9e9ab06 --- /dev/null +++ b/configs/_base_/models/x3d.py @@ -0,0 +1,20 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict(type='X3D', gamma_w=1, gamma_b=2.25, gamma_d=2.2), + cls_head=dict( + type='X3DHead', + in_channels=432, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + fc1_bias=False, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.38, 57.38, 57.38], + format_shape='NCTHW'), + # model training and testing settings + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/schedules/adam_20e.py b/configs/_base_/schedules/adam_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..45e5552591c4bad744e2ce2188b30372a24e7faa --- /dev/null +++ b/configs/_base_/schedules/adam_20e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10], + gamma=0.1) +] + +optimizer = dict( + type='Adam', lr=0.01, weight_decay=0.00001) # this lr is used for 1 gpus + +optim_wrapper = dict( + optimizer=optimizer, clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_100e.py b/configs/_base_/schedules/sgd_100e.py new file mode 100644 index 0000000000000000000000000000000000000000..43ae5ef12a30ebfe9bdcfc6898227213055fdda3 --- /dev/null +++ b/configs/_base_/schedules/sgd_100e.py @@ -0,0 +1,18 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_150e_warmup.py b/configs/_base_/schedules/sgd_150e_warmup.py new file mode 100644 index 0000000000000000000000000000000000000000..3360c6a2c68c99d642fa8cd22b98d79775b517c1 --- /dev/null +++ b/configs/_base_/schedules/sgd_150e_warmup.py @@ -0,0 +1,19 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') 
+test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=0, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_50e.py b/configs/_base_/schedules/sgd_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..a6e8d185bde5ba7f019491fd8d700f44097a0b57 --- /dev/null +++ b/configs/_base_/schedules/sgd_50e.py @@ -0,0 +1,18 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_100e.py b/configs/_base_/schedules/sgd_tsm_100e.py new file mode 100644 index 0000000000000000000000000000000000000000..52972c7655f392fb4cf2f69552b08df93b55629c --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_100e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_50e.py b/configs/_base_/schedules/sgd_tsm_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9f1b561c4c6bb5f261c7912477fe671e3bbf65 --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_50e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py new file mode 100644 index 0000000000000000000000000000000000000000..76d4387393ddc86bcd16de88dc47a4f41f24e720 --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git 
a/configs/_base_/schedules/sgd_tsm_mobilenet_v2_50e.py b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..a110189555458996f402664ee0aa2e065ac489ab --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_50e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/detection/acrn/README.md b/configs/detection/acrn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..487bd47a6d80c578dc22bdbac8b7ee9f8a2f2edf --- /dev/null +++ b/configs/detection/acrn/README.md @@ -0,0 +1,81 @@ +# ACRN + +[Actor-centric relation network](https://openaccess.thecvf.com/content_ECCV_2018/html/Chen_Sun_Actor-centric_Relation_Network_ECCV_2018_paper.html) + + + +## Abstract + + + +Current state-of-the-art approaches for spatio-temporal action localization rely on detections at the frame level and model temporal context with 3D ConvNets. Here, we go one step further and model spatio-temporal relations to capture the interactions between human actors, relevant objects and scene elements essential to differentiate similar human actions. Our approach is weakly supervised and mines the relevant elements automatically with an actor-centric relational network (ACRN). ACRN computes and accumulates pair-wise relation information from actor and global scene features, and generates relation features for action classification. It is implemented as neural networks and can be trained jointly with an existing action detection system. We show that ACRN outperforms alternative approaches which capture relation information, and that the proposed framework improves upon the state-of-the-art performance on JHMDB and AVA. A visualization of the learned relation features confirms that our approach is able to attend to the relevant relations for each action. + + + +
+ +
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.65 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) | + +### AVA2.2 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.71 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. + +For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train ACRN with SlowFast backbone on AVA2.1 in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test ACRN with SlowFast backbone on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
+ +## Citation + +```BibTeX +@inproceedings{sun2018actor, + title={Actor-centric relation network}, + author={Sun, Chen and Shrivastava, Abhinav and Vondrick, Carl and Murphy, Kevin and Sukthankar, Rahul and Schmid, Cordelia}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={318--334}, + year={2018} +} +``` diff --git a/configs/detection/acrn/metafile.yml b/configs/detection/acrn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..dee30b9ad655bc3587f05a35e0f1e924ffa152ce --- /dev/null +++ b/configs/detection/acrn/metafile.yml @@ -0,0 +1,45 @@ +Collections: + - Name: ACRN + README: configs/detection/acrn/README.md + Paper: + URL: https://arxiv.org/abs/1807.10982 + Title: "Actor-Centric Relation Network" + +Models: + - Name: slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb + Config: configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py + In Collection: ACRN + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 27.65 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth + + - Name: slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py + In Collection: ACRN + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 27.71 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..99f242e7df802fa5425c7bf5034dbce226d9170a --- /dev/null +++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py @@ -0,0 +1,172 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + pretrained=None, + resample_rate=4, + speed_ratio=4, + 
channel_ratio=8, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + fusion_kernel=7, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1)), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1e2a58289603078afb9e7364874b9e5f5ebd16 --- /dev/null +++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py @@ -0,0 +1,175 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + pretrained=None, + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + fusion_kernel=7, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1)), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2304, + num_classes=81, + 
multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'})) +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/detection/lfb/README.md 
b/configs/detection/lfb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac1ec021a503247fd2f193c6d1a7d99108788e4b --- /dev/null +++ b/configs/detection/lfb/README.md @@ -0,0 +1,128 @@ +# LFB + +[Long-term feature banks for detailed video understanding](https://openaccess.thecvf.com/content_CVPR_2019/html/Wu_Long-Term_Feature_Banks_for_Detailed_Video_Understanding_CVPR_2019_paper.html) + + + +## Abstract + + + +To understand the world, we humans constantly need to relate the present to the past, and put events in context. In this paper, we enable existing video models to do the same. We propose a long-term feature bank---supportive information extracted over the entire span of a video---to augment state-of-the-art video models that otherwise would only view short clips of 2-5 seconds. Our experiments demonstrate that augmenting 3D convolutional networks with a long-term feature bank yields state-of-the-art results on three challenging video datasets: AVA, EPIC-Kitchens, and Charades. + + + +
+ +
+
+## Results and Models
+
+### AVA2.1
+
+| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: |
+| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.11 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) |
+| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Max LFB) | Kinetics-400 | 22.15 | 8425 | [config](/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4963135b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) |
+
+Note:
+
+1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint.
+   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
+   e.g., lr=0.01 for 4 GPUs x 2 videos/GPU and lr=0.08 for 16 GPUs x 4 videos/GPU.
+2. We use `slowonly_r50_4x16x1` instead of `I3D-R50-NL` in the original paper as the backbone of LFB, but we achieve a similar improvement (ours: 20.1 -> 24.05 vs. authors': 22.1 -> 25.8).
+3. Because the long-term features are randomly sampled during testing, the test accuracy may differ slightly between runs.
+4. Before training or testing LFB, you need to infer the feature bank with [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py). For more details on inferring the feature bank, refer to the [Train](#Train) section.
+5. The ROIHead now supports single-label classification (i.e. the network outputs at most
+   one label per actor). This can be done by (a) setting `multilabel=False` during training and
+   (b) setting `test_cfg.rcnn.action_thr` for testing.
+
+## Train
+
+### a. Infer long-term feature bank for training
+
+Before training or testing LFB, you need to infer the long-term feature bank first. You can also download the pre-computed long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar) and put the extracted files under `lfb_prefix_path`; in this case, you can skip this step.
+
+Specifically, run the test on the training, validation, and testing datasets with the config file [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py) (the config file only infers the feature bank of the training dataset; you need to set `dataset_mode = 'val'` in the config file to infer the feature bank of the validation dataset), and the shared head [LFBInferHead](/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py) will generate the feature bank.
+
+A long-term feature bank file of the AVA training and validation datasets with float32 precision occupies 3.3 GB. If the features are stored with float16 precision, the feature bank occupies 1.65 GB.
+
+You can use the following commands to infer the feature banks of the AVA training and validation datasets; the feature banks will be stored in `lfb_prefix_path/lfb_train.pkl` and `lfb_prefix_path/lfb_val.pkl`.
+
+```shell
+# set `dataset_mode = 'train'` in slowonly-lfb-infer_r50_ava21-rgb.py
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth
+
+# set `dataset_mode = 'val'` in slowonly-lfb-infer_r50_ava21-rgb.py
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth
+```
+
+We use the [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) to infer the feature bank.
+
+### b. Train LFB
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the LFB model on AVA with the half-precision long-term feature bank.
+
+```shell
+python tools/train.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py \
+    --seed 0 --deterministic
+```
+
+For more details and optional arguments, refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+### a. Infer long-term feature bank for testing
+
+Before training or testing LFB, you also need to infer the long-term feature bank first. If you have already generated the feature bank files, you can skip this step.
+
+The procedure is the same as in the **Infer long-term feature bank for training** part of [Train](#Train).
+
+### b. Test LFB
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the LFB model on AVA with the half-precision long-term feature bank and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
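+
+After inference (or after downloading and extracting the archives above), you can quickly sanity-check the feature bank files before training or testing. The snippet below is a minimal sketch rather than part of the official tooling: it assumes the banks are stored as `lfb_train.pkl` / `lfb_val.pkl` under `lfb_prefix_path` as described above and were serialized with `torch.save` by `LFBInferHead`; adjust the paths or the loader if your setup differs.
+
+```python
+# Minimal sanity check for the long-term feature bank files (a sketch, not part
+# of the official tooling). Assumptions: the banks live under `lfb_prefix_path`
+# as lfb_train.pkl / lfb_val.pkl and were serialized with torch.save; adjust
+# the paths or the loader if your setup differs.
+import os.path as osp
+
+import torch
+
+lfb_prefix_path = 'data/ava/lfb_half'  # must match the value in your config
+
+for mode in ('train', 'val'):
+    lfb_file = osp.join(lfb_prefix_path, f'lfb_{mode}.pkl')
+    if not osp.exists(lfb_file):
+        print(f'{lfb_file}: MISSING')
+        continue
+    bank = torch.load(lfb_file, map_location='cpu')
+    size_mb = osp.getsize(lfb_file) / 1024**2
+    # The bank is expected to be a dict keyed by video id; fall back to a
+    # generic summary if the structure differs.
+    num_entries = len(bank) if hasattr(bank, '__len__') else 'unknown'
+    print(f'{lfb_file}: {type(bank).__name__} with {num_entries} entries, '
+          f'{size_mb:.1f} MB on disk')
+```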
+ +## Citation + + + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={6047--6056}, + year={2018} +} +``` + +```BibTeX +@inproceedings{wu2019long, + title={Long-term feature banks for detailed video understanding}, + author={Wu, Chao-Yuan and Feichtenhofer, Christoph and Fan, Haoqi and He, Kaiming and Krahenbuhl, Philipp and Girshick, Ross}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={284--293}, + year={2019} +} +``` diff --git a/configs/detection/lfb/metafile.yml b/configs/detection/lfb/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..cb28804befc3856f036e45a15b12121a78e05f34 --- /dev/null +++ b/configs/detection/lfb/metafile.yml @@ -0,0 +1,47 @@ +Collections: +- Name: LFB + README: configs/detection/lfb/README.md + Paper: + URL: https://arxiv.org/abs/1812.05038 + Title: "Long-Term Feature Banks for Detailed Video Understanding" + +Models: + - Name: slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb + Config: configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py + In Collection: LFB + Metadata: + Architecture: ResNet50 + Batch Size: 12 + Epochs: 20 + Pretrained: Kinetics-400 + Resolution: short-side 320 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.11 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth + + - Name: slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb + Config: slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py + In Collection: LFB + Metadata: + Architecture: ResNet50 + Batch Size: 12 + Epochs: 20 + Pretrained: Kinetics-400 + Resolution: short-side 320 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 22.15 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4963135b.pth diff --git a/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d704c5fcd72c817c8674485911c1f6386d1d9e00 --- /dev/null +++ b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py @@ -0,0 +1,115 @@ +# This config is used 
to generate long-term feature bank. +_base_ = '../../_base_/default_runtime.py' + +# model settings +lfb_prefix_path = 'data/ava/lfb_half' +dataset_mode = 'train' # ['train', 'val', 'test'] + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), + shared_head=dict( + type='LFBInferHead', + lfb_prefix_path=lfb_prefix_path, + dataset_mode=dataset_mode, + use_half_precision=True)), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +# dataset settings +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv' + +exclude_file_infer = ( + f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv') + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_infer = ( + f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl') + +infer_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_infer, + exclude_file=exclude_file_infer, + pipeline=infer_pipeline, + label_file=label_file, + proposal_file=proposal_file_infer, + data_prefix=dict(img=data_root), + person_det_score_thr=0.9, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_infer, + label_file=label_file, + exclude_file=exclude_file_infer, + action_thr=0.0) diff --git a/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..afcb2613cf527cef05d2321f9526368adcca9be6 --- /dev/null +++ b/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,8 @@ +_base_ = [ + 
'slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py' +] + +model = dict( + roi_head=dict( + shared_head=dict(fbo_cfg=dict(type='max')), + bbox_head=dict(in_channels=4096))) diff --git a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5872fc4b2d3d866363262558e9a5e2458bc744f0 --- /dev/null +++ b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,189 @@ +_base_ = '../../_base_/default_runtime.py' + +# model settings +lfb_prefix_path = 'data/ava/lfb_half' + +max_num_sampled_feat = 5 +window_size = 60 +lfb_channels = 2048 +dataset_modes = ('train', 'val') + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2560, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), + shared_head=dict( + type='FBOHead', + lfb_cfg=dict( + lfb_prefix_path=lfb_prefix_path, + max_num_sampled_feat=max_num_sampled_feat, + window_size=window_size, + lfb_channels=lfb_channels, + dataset_modes=dataset_modes, + device='gpu'), + fbo_cfg=dict( + type='non_local', + st_feat_channels=2048, + lt_feat_channels=lfb_channels, + latent_channels=512, + num_st_feat=1, + num_lt_feat=window_size * max_num_sampled_feat, + num_non_local_layers=2, + st_feat_dropout_ratio=0.2, + lt_feat_dropout_ratio=0.2, + pre_activate=True))), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
+ 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root), + person_det_score_thr=0.9)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + person_det_score_thr=0.85, + test_mode=True)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val, + action_thr=0.0) +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5447ce32a582137d824030f954949bb5bdfeec --- /dev/null +++ b/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py @@ -0,0 +1,116 @@ +# This config is used to generate long-term feature bank. 
+_base_ = ['../../_base_/default_runtime.py'] + +# model settings +lfb_prefix_path = 'data/ava/lfb_half' +dataset_mode = 'val' # ['train', 'val', 'test'] + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), + shared_head=dict( + type='LFBInferHead', + lfb_prefix_path=lfb_prefix_path, + dataset_mode=dataset_mode, + use_half_precision=True)), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +# dataset settings +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv' +exclude_file_infer = ( + f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv') +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' +proposal_file_infer = ( + f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl') + +file_client_args = dict( + io_backend='petrel', + path_mapping=dict({'data/ava': 's3://openmmlab/datasets/action/ava'})) + +infer_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_infer, + exclude_file=exclude_file_infer, + pipeline=infer_pipeline, + label_file=label_file, + proposal_file=proposal_file_infer, + data_prefix=dict(img=data_root), + person_det_score_thr=0.9, + test_mode=True)) + +test_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_infer, + label_file=label_file, + exclude_file=exclude_file_infer) + +test_cfg = dict(type='TestLoop') diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a17b010f019e5e9b8018108e66c32b7d51356d29 --- /dev/null +++ b/configs/detection/slowfast/README.md @@ -0,0 +1,106 @@ +# SlowFast + +[Slowfast networks for video 
recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :------------------------------: | :----------: | :---: | :-----------------------------------------: | :---------------------------------------: | :--------------------------------------: | +| 4x16x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 24.32 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.34 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.80 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) | + +### AVA2.2 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------------------------------: | :----------: | :---: | :--------------------------------------: | :------------------------------------: | :-----------------------------------: | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.90 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.41 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
+| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.65 | [config](/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
+
+### MultiSports
+
+| frame sampling strategy | gpus | backbone | pretrain | f-mAP | v-mAP@0.2 | v-mAP@0.5 | v-mAP@0.1:0.9 | gpu_mem(M) | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :-------: | :-------: | :-----------: | :--------: | :--------------------------------: | :------------------------------: | :------------------------------: |
+| 4x16x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 36.88 | 22.83 | 16.9 | 14.74 | 18618 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-af666368.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log) |
+
+1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this flag scales the learning rate according to the ratio between the actual batch size and the original batch size.
+2. **with context** means using both the RoI feature and the global pooled feature for classification; **temporal-max** means applying max pooling along the temporal dimension of the feature.
+3. The MultiSports dataset uses frame-mAP (f-mAP) and video-mAP (v-mAP) to evaluate performance. Frame-mAP is computed on the detection results of each frame, while video-mAP uses 3D IoU to evaluate tube-level results under several thresholds. You can refer to the [competition page](https://codalab.lisn.upsaclay.fr/competitions/3736#learn_the_details-evaluation) for details.
+
+For more details on data preparation, you can refer to:
+
+- [AVA](/tools/data/ava/README.md)
+- [MultiSports](/tools/data/multisports/README.md)
+
+## Train
+
+You can use the following command to train a model.
+ +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the SlowFast model on AVA2.1 in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the SlowFast model on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={ICCV}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={CVPR}, + pages={6047--6056}, + year={2018} +} +``` diff --git a/configs/detection/slowfast/metafile.yml b/configs/detection/slowfast/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..6bc3b58be41ab0a7b3fe0cbc6df7c385de204d8e --- /dev/null +++ b/configs/detection/slowfast/metafile.yml @@ -0,0 +1,141 @@ +Collections: + - Name: SlowFast + README: configs/detection/slowfast/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.32 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth + + - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 25.34 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 25.80 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 25.90 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth + + - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 26.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth + + - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + 
Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 26.65 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 8 + Pretrained: Kinetics-400 + Resolution: short-side 320 + Training Data: MultiSports + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: MultiSports + Task: Action Detection + Metrics: + f-mAP: 36.88 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-af666368.pth diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a51bfcff8a7628df46e2e53cdc22ad365c05f0bc --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,6 @@ +_base_ = ['slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(with_global=True), + bbox_head=dict(in_channels=4608))) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..452392b57d3e69455a9446d8ec8e30c2a39c71f7 --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py @@ -0,0 +1,5 @@ +_base_ = [ + 'slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py' +] + +model = dict(roi_head=dict(bbox_roi_extractor=dict(temporal_pool_mode='max'))) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d16f88ef41885fe44178dfe6caf987a2b4f1770e --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,170 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 
'slowfast_r50_4x16x1_256e_kinetics400_rgb/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + pretrained=None, + resample_rate=8, + speed_ratio=8, + channel_ratio=8, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1)), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5c1e105bb528cc7189c9b920d00d75f3b1591136 --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py @@ -0,0 +1,134 @@ +_base_ = [ + '../slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py' # noqa: E501 +] + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth') +num_classes = 66 +model = dict( + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + _delete_=True, + type='mmaction.ResNet3dSlowFast', + pretrained=None, + resample_rate=8, + speed_ratio=8, + channel_ratio=8, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1)), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict(bbox_head=dict(in_channels=2304))) + +dataset_type = 'AVADataset' +data_root = 'data/multisports/trainval' +anno_root = 'data/multisports/annotations' + +ann_file_train = f'{anno_root}/multisports_train.csv' +ann_file_val = f'{anno_root}/multisports_val.csv' +gt_file = f'{anno_root}/multisports_GT.pkl' + +proposal_file_train = f'{anno_root}/multisports_dense_proposals_train.recall_96.13.pkl' # noqa: E501 +proposal_file_val = f'{anno_root}/multisports_dense_proposals_val.recall_96.13.pkl' # noqa: E501 + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='DecordDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + pipeline=train_pipeline, + num_classes=num_classes, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root), + timestamp_start=1, + start_index=0, + use_frames=False, + fps=1, + )) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + pipeline=val_pipeline, + num_classes=num_classes, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True, + timestamp_start=1, + start_index=0, + use_frames=False, + fps=1, + )) +test_dataloader = val_dataloader + +val_evaluator = dict(type='MultiSportsMetric', ann_file=gt_file) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=8, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[6, 7], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01125, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=5, norm_type=2)) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..4f60e8dc72164b05b50ad415d29b90b40473b75c --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py @@ -0,0 +1,177 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + pretrained=None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1), + fusion_kernel=7), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + 
type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=6, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.075, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a98c2122826057113aefbda2267a0c8d8b0069af --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,172 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + pretrained=None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1), + fusion_kernel=7), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..55f9b2dcf93a190e67c3be0ab0a92651110929dc --- /dev/null +++ b/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py @@ -0,0 +1,8 @@ +_base_ = [ + 'slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py' +] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(temporal_pool_mode='max'), + bbox_head=dict(focal_alpha=3.0, focal_gamma=1.0))) diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md new file mode 100644 index 0000000000000000000000000000000000000000..89bb2671f27ef1200aa5ef894acdc1ec6d207cdd --- /dev/null +++ b/configs/detection/slowonly/README.md @@ -0,0 +1,134 @@ +# SlowOnly + +[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. 
The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
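+Each config listed below is a standard MMEngine config file. As a minimal sketch (assuming `mmengine` and this repository are installed; not part of the original README), a config can be loaded and inspected before launching training:
+
+```python
+from mmengine.config import Config
+
+# Path taken from the AVA2.1 table below.
+cfg = Config.fromfile(
+    'configs/detection/slowonly/'
+    'slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py')
+print(cfg.model.type)                   # 'FastRCNN'
+print(cfg.train_dataloader.batch_size)  # 16 samples per GPU
+
+# Fields can be overridden in place, e.g. enable automatic LR scaling:
+cfg.merge_from_dict(dict(auto_scale_lr=dict(enable=True)))
+```
+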
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :------------------------------------: | :----------: | :---: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.72 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | [config](/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.55 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.77 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.83 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) | + +### AVA2.2 (Trained on AVA-Kinetics) + +Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. 
The AVA-Kinetics validation dataset will be supported soon. + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | + +### AVA2.2 (Trained on AVA-Kinetics with tricks) + +We conduct ablation studies to show the improvements of training tricks using SlowOnly8x8 pretrained on the Kinetics700 dataset. The baseline is the last row in **AVA2.2 (Trained on AVA-Kinetics)**. 
+ +| method | frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :--------------------: | :---------------------: | :--: | :---------------: | :----------: | :---: | :--------------------------------------: | :-------------------------------------: | :------------------------------------: | +| baseline | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + context | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + temporal max pooling | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + nonlinear head | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + focal loss | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) | +| + more frames | 16x4x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) | + +### MultiSports + +| frame sampling strategy | gpus | backbone | pretrain | f-mAP | v-mAP@0.2 | v-mAP@0.5 | v-mAP@0.1:0.9 | gpu_mem(M) | config | ckpt | log | +| :---------------------: | :--: | :---------------: | :----------: | :---: | :-------: | :-------: | :-----------: | :--------: | :--------------------------------: | :------------------------------: | :------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.40 | 15.48 | 10.62 | 9.65 | 8509 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log) | + +1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this option automatically scales the learning rate according to the ratio of the actual batch size to the original batch size. +2. **+ context** indicates using both the RoI feature and the global pooled feature for classification; **+ temporal max pooling** indicates using max pooling along the temporal dimension of the feature; **+ nonlinear head** indicates using a 2-layer MLP instead of a linear classifier. +3. The MultiSports dataset uses frame-mAP (f-mAP) and video-mAP (v-mAP) to evaluate performance. Frame-mAP evaluates the detection results of each frame, while video-mAP uses 3D IoU to evaluate tube-level results under several thresholds. You can refer to the [competition page](https://codalab.lisn.upsaclay.fr/competitions/3736#learn_the_details-evaluation) for details. + +For more details on data preparation, you can refer to + +- [AVA](/tools/data/ava/README.md) +- [AVA-Kinetics](/tools/data/ava_kinetics/README.md) +- [MultiSports](/tools/data/multisports/README.md) + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the SlowOnly model on AVA2.1 deterministically, with periodic validation. 
+ +```shell +python tools/train.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the SlowOnly model on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={ICCV}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={CVPR}, + pages={6047--6056}, + year={2018} +} +``` + +```BibTeX +@article{li2020ava, + title={The ava-kinetics localized human actions video dataset}, + author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, + journal={arXiv preprint arXiv:2005.00214}, + year={2020} +} +``` diff --git a/configs/detection/slowonly/metafile.yml b/configs/detection/slowonly/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..6531b46127ab2711c8165d4fa567a96d807945e4 --- /dev/null +++ b/configs/detection/slowonly/metafile.yml @@ -0,0 +1,122 @@ +Collections: + - Name: SlowOnly + README: configs/detection/slowonly/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 20.72 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth + + - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-700 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: 
RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 22.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth + + - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 21.55 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth + + - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 23.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth + + - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet101 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.83 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth + + - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 8 + Pretrained: Kinetics-400 + Resolution: short-side 320 + Training Data: 
MultiSports + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: MultiSports + Task: Action Detection + Metrics: + f-mAP: 26.40 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth diff --git a/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..53ef4e7af9bc9e8fcf2a4e0ea3fc70b9829548bd --- /dev/null +++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,163 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVAKineticsDataset' +data_root = 'data/ava_kinetics/rawframes' +anno_root = 'data/ava_kinetics/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
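+# To rescale the LR for a different setup, pass `--auto-scale-lr` to
+# tools/train.py (see the README note); the LR is then scaled linearly by
+# (actual total batch size / 64).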
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..3fca227de9af0e191dfa07dfd8c75fb2c72f45b9 --- /dev/null +++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,162 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_' + 'kinetics400-rgb_20220901-df42dc84.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVAKineticsDataset' +data_root = 'data/ava_kinetics/rawframes' +anno_root = 'data/ava_kinetics/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..90975ac4cb9823b840176b0a4a358caff9abf48d --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,6 @@ +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'), + bbox_head=dict(in_channels=4096, mlp_head=True))) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..4a7db56b368eda6b5e650dcefd8193c01dd5e5a8 --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py @@ -0,0 +1,6 @@ +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'), + bbox_head=dict(in_channels=4096, mlp_head=True, focal_gamma=1.0))) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..4ada41f7dda4b044c0aca80d319be589dc135167 --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,6 @@ +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'), + bbox_head=dict(in_channels=4096))) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2368a898eb93ebd87e1851d476c1fe93c78d1f9f --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,6 @@ +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(with_global=True), + bbox_head=dict(in_channels=4096))) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..9b727f5e412ade6886d783ad1d363aa94d128444 --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py @@ -0,0 +1,113 @@ +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] + +model = dict( + roi_head=dict( + bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'), + bbox_head=dict(in_channels=4096, mlp_head=True, focal_gamma=1.0))) + 
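+# Explanatory note (not in the original config): `with_global=True` concatenates
+# the globally pooled scene feature with the 2048-d RoI feature of SlowOnly-R50,
+# which is why `in_channels` of the bbox head doubles to 4096 in these
+# trick configs.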
+dataset_type = 'AVAKineticsDataset' +data_root = 'data/ava_kinetics/rawframes' +anno_root = 'data/ava_kinetics/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=16, frame_interval=4), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..74c69c0feb833502cd4f7eae432d4f246fc3c816 --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,8 @@ +_base_ = ['slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py'] + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-' + 'rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_' + 'kinetics700-rgb_20221013-15b93b10.pth') + +model = dict(init_cfg=dict(type='Pretrained', checkpoint=url)) diff --git a/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..78a41e098acb473d7a4462d550c32c4d4edbbb33 --- /dev/null +++ b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -0,0 +1,8 @@ +_base_ = ['slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-' + 'rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_' + 'kinetics700-rgb_20221013-15b93b10.pth') + +model = dict(init_cfg=dict(type='Pretrained', checkpoint=url)) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..70e06432d296bdaba1165f0e9bf7a7be4130fe24 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,152 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_' + '20200926-0c730aef.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=101, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + 
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6526d980cc126943e656b4eab4cc6dfedfe07085 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,161 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/' + 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_' + '20210308-0d6e5a69.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1), + norm_cfg=dict(type='BN3d', requires_grad=True), + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian')), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..57b3ff02d92be4bef53c1456cc285287a964c04e --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,160 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' + 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' + '20210308-e8dd9e82.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1), + norm_cfg=dict(type='BN3d', requires_grad=True), + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian')), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..f42cf6538d4670073ea451aa66ff57b8e986be44 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,154 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d5a0f4198adea9a4bd088dbded971a3a33ec237b --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py @@ -0,0 +1,151 @@ +_base_ = [ + '../../_base_/default_runtime.py', +] +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') +num_classes = 66 +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=False, + in_channels=2048, + num_classes=num_classes, + multilabel=False, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/multisports/trainval' +anno_root = 'data/multisports/annotations' + +ann_file_train = f'{anno_root}/multisports_train.csv' +ann_file_val = f'{anno_root}/multisports_val.csv' +gt_file = f'{anno_root}/multisports_GT.pkl' + +proposal_file_train = f'{anno_root}/multisports_dense_proposals_train.recall_96.13.pkl' # noqa: E501 +proposal_file_val = f'{anno_root}/multisports_dense_proposals_val.recall_96.13.pkl' # noqa: E501 + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='DecordDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + pipeline=train_pipeline, + num_classes=num_classes, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root), + timestamp_start=1, + start_index=0, + use_frames=False, + fps=1, + )) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + pipeline=val_pipeline, + num_classes=num_classes, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True, + timestamp_start=1, + start_index=0, + use_frames=False, + fps=1, + )) +test_dataloader = val_dataloader + +val_evaluator = dict(type='MultiSportsMetric', ann_file=gt_file) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=8, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[6, 7], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=5, norm_type=2)) diff --git a/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e3fefd16110c16709fc435ee9f749a4520a5e16f --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,154 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly' + '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-' + 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + 
pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/videomae/README.md b/configs/detection/videomae/README.md new file mode 100644 index 0000000000000000000000000000000000000000..43b0954bbc2733789af2f377fa00d0cf0d4b8182 --- /dev/null +++ b/configs/detection/videomae/README.md @@ -0,0 +1,75 @@ +# VideoMAE + +[VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) + + + +## Abstract + + + +Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking with an extremely high ratio. This simple design makes video reconstruction a more challenging self-supervision task, thus encouraging extracting more effective video representations during this pre-training process. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables a higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets is an important issue. Notably, our VideoMAE with the vanilla ViT can achieve 87.4% on Kinetics-400, 75.4% on Something-Something V2, 91.3% on UCF101, and 62.6% on HMDB51, without using any extra data. + + + +
+ +
+ +## Results and Models + +### AVA2.2 + +Currently, we use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. + +| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--------: | :--: | :-------: | :----------: | :--: | :---------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: | +| 16x4x1 | raw | 8 | ViT Base | Kinetics-400 | 33.6 | [config](/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-3dafab75.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log) | +| 16x4x1 | raw | 8 | ViT Large | Kinetics-400 | 38.7 | [config](/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-bf93c9ea.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log) | + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the ViT base model on AVA-Kinetics with deterministic training options. + +```shell +python tools/train.py configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py \ + --cfg-options randomness.seed=0 randomness.deterministic=True +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the ViT base model on AVA-Kinetics and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
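+If you have multiple GPUs, the distributed launchers can be used instead of calling the tools directly. A minimal sketch, assuming a standard MMAction2 checkout and 8 local GPUs (`dist_train.sh` takes the config and GPU count, `dist_test.sh` additionally takes a checkpoint):
+
+```shell
+# distributed training on 8 GPUs
+bash tools/dist_train.sh configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py 8
+
+# distributed testing on 8 GPUs
+bash tools/dist_test.sh configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth 8
+```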
+ +## Citation + +```BibTeX +@inproceedings{tong2022videomae, + title={Video{MAE}: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training}, + author={Zhan Tong and Yibing Song and Jue Wang and Limin Wang}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022} +} +``` diff --git a/configs/detection/videomae/metafile.yml b/configs/detection/videomae/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..386654d06e8244a178a06793124228091acc84f6 --- /dev/null +++ b/configs/detection/videomae/metafile.yml @@ -0,0 +1,47 @@ +Collections: +- Name: VideoMAE + README: configs/detection/videomae/README.md + Paper: + URL: https://arxiv.org/abs/2203.12602 + Title: "VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training" + +Models: + - Name: vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb + Config: configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py + In Collection: VideoMAE + Metadata: + Architecture: Vision Transformer + Batch Size: 64 + Epochs: 20 + Pretrained: Kinetics-400 + Resolution: short-side 320 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 33.6 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-3dafab75.pth + + - Name: vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb + Config: configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py + In Collection: VideoMAE + Metadata: + Architecture: Vision Transformer + Batch Size: 128 + Epochs: 20 + Pretrained: Kinetics-400 + Resolution: short-side 320 + Training Data: AVA v2.2 + Training Resources: 32 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 38.7 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-bf93c9ea.pth diff --git a/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py b/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..8e6097e96dda7ca386a6979617ff1e83032a50ae --- /dev/null +++ b/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py @@ -0,0 +1,171 @@ +_base_ = ['../../_base_/default_runtime.py'] + +url = ( + 'https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/' + 'vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', 
checkpoint=url), + backbone=dict( + type='mmaction.VisionTransformer', + img_size=224, + patch_size=16, + embed_dims=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + num_frames=16, + norm_cfg=dict(type='LN', eps=1e-6), + drop_path_rate=0.2, + use_mean_pooling=False, + return_feat_map=True), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=768, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVAKineticsDataset' +data_root = 'data/ava_kinetics/rawframes' +anno_root = 'data/ava_kinetics/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=16, frame_interval=4), + dict(type='RawFrameDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=15, + eta_min=0, + by_epoch=True, + begin=5, + end=20, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=1.25e-4, weight_decay=0.05), + constructor='LearningRateDecayOptimizerConstructor', + paramwise_cfg={ + 'decay_rate': 0.75, + 'decay_type': 'layer_wise', + 'num_layers': 12 + }, + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py b/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c5f6481d18449cbc22e2b99095161e42dd0fee1f --- /dev/null +++ b/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py @@ -0,0 +1,172 @@ +_base_ = ['../../_base_/default_runtime.py'] + +url = ( + 'https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/' + 'vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-229dbb03.pth' +) + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.VisionTransformer', + img_size=224, + patch_size=16, + embed_dims=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + num_frames=16, + norm_cfg=dict(type='LN', eps=1e-6), + drop_path_rate=0.2, + use_mean_pooling=False, + return_feat_map=True), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + background_class=True, + in_channels=1024, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVAKineticsDataset' +data_root = 'data/ava_kinetics/rawframes' +anno_root = 'data/ava_kinetics/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.2.csv' +ann_file_val = f'{anno_root}/ava_val_v2.2.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv' + +label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=16, frame_interval=4), + dict(type='RawFrameDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=15, + eta_min=0, + by_epoch=True, + begin=5, + end=20, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=2.5e-4, weight_decay=0.05), + constructor='LearningRateDecayOptimizerConstructor', + paramwise_cfg={ + 'decay_rate': 0.8, + 'decay_type': 'layer_wise', + 'num_layers': 24 + }, + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..432b5286ead1b0ec175e04e956fc583ef682baff --- /dev/null +++ b/configs/localization/bmn/README.md @@ -0,0 +1,86 @@ +# BMN + +[Bmn: Boundary-matching network for temporal action proposal generation](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_BMN_Boundary-Matching_Network_for_Temporal_Action_Proposal_Generation_ICCV_2019_paper.html) + + + +## Abstract + + + +Temporal action proposal generation is an challenging and promising task which aims to locate temporal regions in real-world videos where action or event may occur. Current bottom-up proposal generation methods can generate proposals with precise boundary, but cannot efficiently generate adequately reliable confidence scores for retrieving proposals. To address these difficulties, we introduce the Boundary-Matching (BM) mechanism to evaluate confidence scores of densely distributed proposals, which denote a proposal as a matching pair of starting and ending boundaries and combine all densely distributed BM pairs into the BM confidence map. 
Based on the BM mechanism, we propose an effective, efficient and end-to-end proposal generation method, named the Boundary-Matching Network (BMN), which generates proposals with precise temporal boundaries as well as reliable confidence scores simultaneously. The two branches of BMN are jointly trained in a unified framework. We conduct experiments on two challenging datasets, THUMOS-14 and ActivityNet-1.3, where BMN shows significant performance improvements with remarkable efficiency and generalizability. Further, combined with an existing action classifier, BMN can achieve state-of-the-art temporal action detection performance. + + + +
+ +
+ +## Results and Models + +### ActivityNet feature + +| feature | gpus | pretrain | AUC | AR@1 | AR@5 | AR@10 | AR@100 | gpu_mem(M) | iter time(s) | config | ckpt | log | +| :-----------: | :--: | :------: | :---: | :---: | :---: | :---: | :----: | :--------: | :----------: | :------------------------------------------: | :----------------------------------------: | :---------------------------------------: | +| cuhk_mean_100 | 2 | None | 67.25 | 32.89 | 49.43 | 56.64 | 75.29 | 5412 | - | [config](/configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature_20220908-79f92857.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.log) | +| slowonly-k700 | 2 | None | 68.04 | 33.44 | 50.53 | 57.65 | 75.77 | - | - | [config](/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature_20230907-50b939b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.log) | + +1. The **gpus** indicates the number of gpu we used to get the checkpoint. + According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, + e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +2. For feature column, cuhk_mean_100 denotes the widely used cuhk activitynet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk). The slowonly-k700 denotes the feature extracted using MMAction2's [SlowOnly model trained on Kinetics 700](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py). You can download this feature from [ActivityNet Data Preparation](/tools/data/activitynet/README.md). +3. We evaluate the action detection performance of BMN, using [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission for ActivityNet2017 Untrimmed Video Classification Track to assign label for each action proposal. + +\*We train BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network), evaluate its proposal generation and action detection performance with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) for label assigning. + +For more details on data preparation, you can refer to [ActivityNet Data Preparation](/tools/data/activitynet/README.md). + +## Train + +Train BMN model on ActivityNet features dataset. + +```shell +bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2 +``` + +Train BMN model on ActivityNet SlowOnly-K700 features dataset. + +```shell +bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py 2 +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +Test BMN on ActivityNet feature dataset. 
+ +```shell +python3 tools/test.py configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py CHECKPOINT.PTH +``` + +For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{lin2019bmn, + title={Bmn: Boundary-matching network for temporal action proposal generation}, + author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={3889--3898}, + year={2019} +} +``` + + + +```BibTeX +@article{zhao2017cuhk, + title={Cuhk \& ethz \& siat submission to activitynet challenge 2017}, + author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others}, + journal={arXiv preprint arXiv:1710.08011}, + volume={8}, + year={2017} +} +``` diff --git a/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py b/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..e2160fbd0f5b79ec57d98c1380a9c59c74ec0d2d --- /dev/null +++ b/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py @@ -0,0 +1,110 @@ +_base_ = [ + '../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py' +] + +model = dict(feat_dim=2048) + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/k700slowonly' +data_root_val = 'data/ActivityNet/k700slowonly' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_val.json' + +train_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', )) +] + +val_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame')) +] + +test_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +max_epochs = 9 +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=1, + val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + 
+optim_wrapper = dict( + optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[ + 7, + ], + gamma=0.1) +] + +work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' +test_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict(out=f'{work_dir}/results.json', output_format='json')) +val_evaluator = test_evaluator diff --git a/configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py b/configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..bc75bba2d7b41b28af8ad7ed05d22024324a16ae --- /dev/null +++ b/configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py @@ -0,0 +1,108 @@ +_base_ = [ + '../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_val.json' + +train_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', )) +] + +val_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame')) +] + +test_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +max_epochs = 9 +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=1, + val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[ + 7, + ], + gamma=0.1) +] + +work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' +test_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict(out=f'{work_dir}/results.json', 
output_format='json')) +val_evaluator = test_evaluator diff --git a/configs/localization/bmn/metafile.yml b/configs/localization/bmn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..1a8a674a2fd7f17cc6b9ff7ca70929a91145aa40 --- /dev/null +++ b/configs/localization/bmn/metafile.yml @@ -0,0 +1,29 @@ +Collections: +- Name: BMN + README: configs/localization/bmn/README.md + Paper: + URL: https://arxiv.org/abs/1907.09702 + Title: "BMN: Boundary-Matching Network for Temporal Action Proposal Generation" + +Models: + - Name: bmn_2xb8-400x100-9e_activitynet-feature + Config: configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py + In Collection: BMN + Metadata: + Batch Size: 8 + Epochs: 9 + Training Data: ActivityNet v1.3 + Training Resources: 2 GPUs + feature: cuhk_mean_100 + Modality: RGB + Results: + - Dataset: ActivityNet v1.3 + Task: Temporal Action Localization + Metrics: + AUC: 67.25 + AR@1: 32.89 + AR@5: 49.43 + AR@10: 56.64 + AR@100: 75.29 + Training Log: https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.log + Weights: https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature_20220908-79f92857.pth diff --git a/configs/localization/bsn/README.md b/configs/localization/bsn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..60e1162bef944e200f0762541293c6a76c63978c --- /dev/null +++ b/configs/localization/bsn/README.md @@ -0,0 +1,81 @@ +# BSN + +[Bsn: Boundary sensitive network for temporal action proposal generation](https://openaccess.thecvf.com/content_ECCV_2018/html/Tianwei_Lin_BSN_Boundary_Sensitive_ECCV_2018_paper.html) + + + +## Abstract + + + +Temporal action proposal generation is an important yet challenging problem, since temporal proposals with rich action content are indispensable for analysing real-world videos with long duration and high proportion irrelevant content. This problem requires methods not only generating proposals with precise temporal boundaries, but also retrieving proposals to cover truth action instances with high recall and high overlap using relatively fewer proposals. To address these difficulties, we introduce an effective proposal generation method, named Boundary-Sensitive Network (BSN), which adopts "local to global" fashion. Locally, BSN first locates temporal boundaries with high probabilities, then directly combines these boundaries as proposals. Globally, with Boundary-Sensitive Proposal feature, BSN retrieves proposals by evaluating the confidence of whether a proposal contains an action within its region. We conduct experiments on two challenging datasets: ActivityNet-1.3 and THUMOS14, where BSN outperforms other state-of-the-art temporal action proposal generation methods with high recall and high temporal precision. Finally, further experiments demonstrate that by combining existing action classifiers, our method significantly improves the state-of-the-art temporal action detection performance. + + + +
+ +
+ +## Results and Models + +### ActivityNet feature + +| feature | gpus | pretrain | AUC | AR@1 | AR@5 | AR@10 | AR@100 | gpu_mem(M) | iter time(s) | config | ckpt | log | +| :-----------: | :--: | :------: | :---: | :---: | :---: | :---: | :----: | :-------------: | :----------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | +| cuhk_mean_100 | 1 | None | 66.26 | 32.71 | 48.43 | 55.28 | 74.27 | 43(TEM)+25(PEM) | - | [config_TEM](/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py) [config_PGM](/configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py) [config_PEM](/configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py) | [ckpt_TEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature_20220908-9da79951.pth) [ckpt_PEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature_20220908-ec2eb21d.pth) | [log_tem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.log) [log_pem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.log) | +| slowonly-k700 | 1 | None | 67.63 | 33.04 | 48.79 | 56.01 | 75.74 | - | - | [config_TEM](/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py) [config_PGM](/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py) [config_PEM](/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py) | [ckpt_TEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature_20230907-76069fda.pth) [ckpt_PEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature_20230907-44158b6d.pth) | [log_tem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.log) [log_pem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.log) | + +1. The **gpus** indicates the number of gpu we used to get the checkpoint. + According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, + e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +2. For feature column, cuhk_mean_100 denotes the widely used cuhk activitynet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk). The slowonly-k700 denotes the feature extracted using MMAction2's [SlowOnly model trained on Kinetics 700](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py). You can download this feature from [ActivityNet Data Preparation](/tools/data/activitynet/README.md). + +For more details on data preparation, you can refer to [ActivityNet Data Preparation](/tools/data/activitynet/README.md). + +## Training and Test + +The traing of the BSN model is three-stages. We take the `cuhk_mean_100` feature as an example. For `slowonly-k700` feature, just need to replace the config file with the corresponding config file with `slowonly-k700` in the file name. 
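+The three stages exchange data through files under a shared `work_dir`. As a rough sketch (directory names taken from the configs referenced below), the key artifacts produced along the way are laid out roughly as follows:
+
+```shell
+work_dirs/bsn_400x100_20e_1x16_activitynet_feature/
+├── tem_epoch_20.pth   # TEM checkpoint saved in stage 1
+├── tem_results/       # actionness / start / end curves dumped by the TEM test run
+├── pgm_proposals/     # candidate proposals produced by the PGM step
+└── pgm_features/      # Boundary-Sensitive Proposal (BSP) features consumed by the PEM
+```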
+ +First, train the Temporal evaluation module (TEM): + +```shell +python3 tools/train.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py +``` + +After training, use the TEM to generate the probability sequences (actionness, starting, and ending) for the training and validation datasets: + +```shell +python tools/test.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py \ + work_dirs/bsn_400x100_20e_1x16_activitynet_feature/tem_epoch_20.pth +``` + +The second step is to run the Proposal generation module (PGM) to generate Boundary-Sensitive Proposal (BSP) features for the training and validation datasets: + +```shell +python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py --mode train +python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py --mode test +``` + +The last step is to train (and validate) the Proposal evaluation module (PEM): + +```shell +python tools/train.py configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py +``` + +(Optional) You can use the following command to generate a formatted proposal file, which can then be fed into an action classifier (currently only SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals. + +```shell +python tools/data/activitynet/convert_proposal_format.py +``` + +## Citation + +```BibTeX +@inproceedings{lin2018bsn, + title={Bsn: Boundary sensitive network for temporal action proposal generation}, + author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={3--19}, + year={2018} +} +``` diff --git a/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py b/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..254521be2610eae40f8ce363cc30395c15606ae0 --- /dev/null +++ b/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py @@ -0,0 +1,84 @@ +_base_ = [ + '../../_base_/models/bsn_pem.py', '../../_base_/schedules/adam_20e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/k700slowonly' +data_root_val = 'data/ActivityNet/k700slowonly' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_val.json' + +work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/' +pgm_proposals_dir = f'{work_dir}/pgm_proposals/' +pgm_features_dir = f'{work_dir}/pgm_features/' + +train_pipeline = [ + dict( + type='LoadProposals', + top_k=500, + pgm_proposals_dir=pgm_proposals_dir, + pgm_features_dir=pgm_features_dir), + dict( + type='PackLocalizationInputs', + keys=('reference_temporal_iou', 'bsp_feature'), + meta_keys=()) +] +val_pipeline = [ + dict( + type='LoadProposals', + top_k=1000, + pgm_proposals_dir=pgm_proposals_dir, + pgm_features_dir=pgm_features_dir), + dict( + type='PackLocalizationInputs', + keys=('tmin', 'tmax', 'tmin_score', 'tmax_score', 'bsp_feature'), + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame')), +] +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=8, +
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict(val_interval=20) + +test_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict(out=f'{work_dir}/results.json', output_format='json')) +val_evaluator = test_evaluator diff --git a/configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py b/configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..dee7cd4b052ade14fa72c030f4866db6749740fc --- /dev/null +++ b/configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py @@ -0,0 +1,84 @@ +_base_ = [ + '../../_base_/models/bsn_pem.py', '../../_base_/schedules/adam_20e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_val.json' + +work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/' +pgm_proposals_dir = f'{work_dir}/pgm_proposals/' +pgm_features_dir = f'{work_dir}/pgm_features/' + +train_pipeline = [ + dict( + type='LoadProposals', + top_k=500, + pgm_proposals_dir=pgm_proposals_dir, + pgm_features_dir=pgm_features_dir), + dict( + type='PackLocalizationInputs', + keys=('reference_temporal_iou', 'bsp_feature'), + meta_keys=()) +] +val_pipeline = [ + dict( + type='LoadProposals', + top_k=1000, + pgm_proposals_dir=pgm_proposals_dir, + pgm_features_dir=pgm_features_dir), + dict( + type='PackLocalizationInputs', + keys=('tmin', 'tmax', 'tmin_score', 'tmax_score', 'bsp_feature'), + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame')), +] +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = 
dict(val_interval=20) + +test_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict(out=f'{work_dir}/results.json', output_format='json')) +val_evaluator = test_evaluator diff --git a/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py b/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..000f5f784aae34951ed7348b6b68c1e3b7bfce2e --- /dev/null +++ b/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py @@ -0,0 +1,32 @@ +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/k700slowonly' +data_root_val = 'data/ActivityNet/k700slowonly' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_test.json' + +work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/' +tem_results_dir = f'{work_dir}/tem_results/' +pgm_proposals_dir = f'{work_dir}/pgm_proposals/' +pgm_features_dir = f'{work_dir}/pgm_features/' + +temporal_scale = 100 +pgm_proposals_cfg = dict( + pgm_proposals_thread=8, temporal_scale=temporal_scale, peak_threshold=0.5) +pgm_features_test_cfg = dict( + pgm_features_thread=32, + top_k=1000, + num_sample_start=8, + num_sample_end=8, + num_sample_action=16, + num_sample_interp=3, + bsp_boundary_ratio=0.2) +pgm_features_train_cfg = dict( + pgm_features_thread=32, + top_k=500, + num_sample_start=8, + num_sample_end=8, + num_sample_action=16, + num_sample_interp=3, + bsp_boundary_ratio=0.2) diff --git a/configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py b/configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..5ffb61e4dc59780fb9484192a26c54c23d7c09b9 --- /dev/null +++ b/configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py @@ -0,0 +1,32 @@ +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_test.json' + +work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/' +tem_results_dir = f'{work_dir}/tem_results/' +pgm_proposals_dir = f'{work_dir}/pgm_proposals/' +pgm_features_dir = f'{work_dir}/pgm_features/' + +temporal_scale = 100 +pgm_proposals_cfg = dict( + pgm_proposals_thread=8, temporal_scale=temporal_scale, peak_threshold=0.5) +pgm_features_test_cfg = dict( + pgm_features_thread=32, + top_k=1000, + num_sample_start=8, + num_sample_end=8, + num_sample_action=16, + num_sample_interp=3, + bsp_boundary_ratio=0.2) +pgm_features_train_cfg = dict( + pgm_features_thread=32, + top_k=500, + num_sample_start=8, + num_sample_end=8, + num_sample_action=16, + num_sample_interp=3, + bsp_boundary_ratio=0.2) diff --git a/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py b/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..f674db0cfe9a1ee981397a2e55f27e496a981776 --- /dev/null +++ b/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py @@ -0,0 +1,95 @@ +_base_ = ['../../_base_/models/bsn_tem.py', '../../_base_/default_runtime.py'] + 
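+# override the TEM input dimension: the SlowOnly Kinetics-700 features loaded
+# from data/ActivityNet/k700slowonly are 2048-dimensional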
+model = dict(tem_feat_dim=2048) + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/k700slowonly' +data_root_val = 'data/ActivityNet/k700slowonly' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_trainval.json' + +train_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', )) +] +val_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', )) +] +test_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='PackLocalizationInputs', meta_keys=('video_name', )) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=20) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[7, 14], + gamma=0.1) +] + +work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/' +tem_results_dir = f'{work_dir}/tem_results/' + +test_evaluator = dict( + type='ANetMetric', + metric_type='TEM', + dump_config=dict(out=tem_results_dir, output_format='csv')) +val_evaluator = test_evaluator + +default_hooks = dict(checkpoint=dict(filename_tmpl='tem_epoch_{}.pth')) diff --git a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..ea4181bf7b7140f9e77995b101bcab86be785161 --- /dev/null +++ b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py @@ -0,0 +1,93 @@ +_base_ = ['../../_base_/models/bsn_tem.py', '../../_base_/default_runtime.py'] + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/' +ann_file_train = 'data/ActivityNet/anet_anno_train.json' +ann_file_val = 'data/ActivityNet/anet_anno_val.json' +ann_file_test = 'data/ActivityNet/anet_anno_full.json' + +train_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', )) +] +val_pipeline 
= [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', ), + meta_keys=('video_name', )) +] +test_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='PackLocalizationInputs', meta_keys=('video_name', )) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=20) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[7, 14], + gamma=0.1) +] + +work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/' +tem_results_dir = f'{work_dir}/tem_results/' + +test_evaluator = dict( + type='ANetMetric', + metric_type='TEM', + dump_config=dict(out=tem_results_dir, output_format='csv')) +val_evaluator = test_evaluator + +default_hooks = dict(checkpoint=dict(filename_tmpl='tem_epoch_{}.pth')) diff --git a/configs/localization/bsn/metafile.yml b/configs/localization/bsn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..c4f4a96657bd4248e5b1ffc174e7de771a75ba50 --- /dev/null +++ b/configs/localization/bsn/metafile.yml @@ -0,0 +1,38 @@ +Collections: +- Name: BSN + README: configs/localization/bsn/README.md + Paper: + URL: https://arxiv.org/abs/1806.02964 + Title: "BSN: Boundary Sensitive Network for Temporal Action Proposal Generation" + +Models: + - Name: bsn_400x100_1xb16_20e_activitynet_feature (cuhk_mean_100) + Config: + configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py + In Collection: BSN + Metadata: + Batch Size: 16 + Epochs: 20 + Training Data: ActivityNet v1.3 + Training Resources: 1 GPU + feature: cuhk_mean_100 + configs: + - configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py + - configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py + - configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py + Modality: RGB + Results: + - Dataset: ActivityNet v1.3 + Task: Temporal Action Localization + Metrics: + AUC: 66.26 + AR@1: 32.71 + AR@5: 48.43 + AR@10: 55.28 + AR@100: 74.27 + Training Log: + - https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.log + - https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.log + Weights: + - https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature_20220908-9da79951.pth + - 
https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature_20220908-ec2eb21d.pth diff --git a/configs/localization/drn/README.md b/configs/localization/drn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9f94de0e64d704da7e58bf58b6e8f302243484e2 --- /dev/null +++ b/configs/localization/drn/README.md @@ -0,0 +1,84 @@ +# DRN + +[Dense Regression Network for Video Grounding](https://openaccess.thecvf.com/content_CVPR_2020/papers/Zeng_Dense_Regression_Network_for_Video_Grounding_CVPR_2020_paper.pdf) + + + +## Abstract + + + +We address the problem of video grounding from natural language queries. The key challenge in this task is that one training video might only contain a few annotated starting/ending frames that can be used as positive examples for model training. Most conventional approaches directly train a binary classifier using such imbalance data, thus achieving inferior results. The key idea of this paper is to use the distances between the frame within the ground truth and the starting (ending) frame as dense supervisions to improve the video grounding accuracy. Specifically, we design a novel dense regression network (DRN) to regress the distances from each frame to the starting (ending) frame of the video segment described by the query. We also propose a simple but effective IoU regression head module to explicitly consider the localization quality of the grounding results (i.e., the IoU between the predicted location and the ground truth). Experimental results show that our approach significantly outperforms state-of-the-arts on three datasets (i.e., Charades-STA, ActivityNet-Captions, and TACoS). + + + +
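+
+To make the dense supervision described above concrete, the following is a small illustrative sketch (the function name and array layout are ours for illustration, not part of this repo's API): for frames inside an annotated segment, the regression targets are simply the distances to the segment's starting and ending frames.
+
+```python
+import numpy as np
+
+
+def dense_regression_targets(num_frames: int, t_start: int, t_end: int) -> np.ndarray:
+    """Per-frame (distance-to-start, distance-to-end) targets for one segment."""
+    targets = np.zeros((num_frames, 2), dtype=np.float32)
+    inside = np.arange(t_start, t_end + 1)
+    targets[inside, 0] = inside - t_start  # distance to the starting frame
+    targets[inside, 1] = t_end - inside    # distance to the ending frame
+    return targets
+```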
+
+
+## Results and Models
+
+### Charades STA C3D feature
+
+| feature | gpus | pretrain | Recall@Top1(IoU=0.5) | Recall@Top5(IoU=0.5) | config | ckpt | log |
+| :-----: | :--: | :------: | :------------------: | :------------------: | :----------------------------------------------: | :---------------------------------------------: | :--------------------------------------------: |
+| C3D | 2 | None | 47.04 | 84.57 | [config](/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/drn/drn_2xb16-4096-10e_c3d-feature_20230809-ec0429a6.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/drn_2xb16-4096-10e_c3d-feature.log) |
+
+For more details on data preparation, you can refer to [Charades STA Data Preparation](/tools/data/charades-sta/README.md).
+
+## Train
+
+The training of DRN has three stages. Following the official paper, the second and third stages load the best checkpoint from the previous stage.
+
+The first stage training:
+
+```shell
+bash tools/dist_train.sh configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_first.py 2
+```
+
+The second stage training:
+
+```shell
+BEST_CKPT=work_dirs/drn_2xb16-4096-10e_c3d-feature_first/SOME.PTH
+bash tools/dist_train.sh configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_second.py 2 --cfg-options load_from=${BEST_CKPT}
+```
+
+The third stage training:
+
+```shell
+BEST_CKPT=work_dirs/drn_2xb16-4096-10e_c3d-feature_second/SOME.PTH
+bash tools/dist_train.sh configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py 2 --cfg-options load_from=${BEST_CKPT}
+```
+
+## Test
+
+Test DRN on Charades STA C3D feature:
+
+```shell
+python3 tools/test.py configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py CHECKPOINT.PTH
+```
+
+For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
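+
+The three stage configs share the data settings and differ mainly in the model's stage flags and optimizer settings; the snippet below is a minimal sketch (using `mmengine.config`, with the same placeholder checkpoint path as above) of what passing `--cfg-options load_from=${BEST_CKPT}` amounts to:
+
+```python
+from mmengine.config import Config
+
+# load the third-stage config and point it at the best second-stage checkpoint
+cfg = Config.fromfile(
+    'configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py')
+cfg.load_from = 'work_dirs/drn_2xb16-4096-10e_c3d-feature_second/SOME.PTH'
+print(cfg.model['is_first_stage'], cfg.model['is_second_stage'])  # False False
+```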
+ +## Citation + +```BibTeX +@inproceedings{DRN2020CVPR, + author = {Runhao, Zeng and Haoming, Xu and Wenbing, Huang and Peihao, Chen and Mingkui, Tan and Chuang Gan}, + title = {Dense Regression Network for Video Grounding}, + booktitle = {CVPR}, + year = {2020}, +} +``` + + + +```BibTeX +@inproceedings{gao2017tall, + title={Tall: Temporal activity localization via language query}, + author={Gao, Jiyang and Sun, Chen and Yang, Zhenheng and Nevatia, Ram}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={5267--5275}, + year={2017} +} +``` diff --git a/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_first.py b/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_first.py new file mode 100644 index 0000000000000000000000000000000000000000..5997c270c90f42abf3c0c12aa8fbe1015e3061c3 --- /dev/null +++ b/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_first.py @@ -0,0 +1,115 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='DRN', + vocab_size=1301, + feature_dim=4096, + embed_dim=300, + hidden_dim=512, + bidirection=True, + first_output_dim=256, + fpn_feature_dim=512, + lstm_layers=1, + graph_node_features=1024, + fcos_pre_nms_top_n=32, + fcos_inference_thr=0.05, + fcos_prior_prob=0.01, + focal_alpha=0.25, + focal_gamma=2.0, + fpn_stride=[1, 2, 4], + fcos_nms_thr=0.6, + fcos_conv_layers=1, + fcos_num_class=2, + is_first_stage=True, + is_second_stage=False) + +# dataset settings +dataset_type = 'CharadesSTADataset' +root = 'data/CharadesSTA' +data_root = f'{root}/C3D_unit16_overlap0.5_merged/' +data_root_val = f'{root}/C3D_unit16_overlap0.5_merged/' +ann_file_train = f'{root}/Charades_sta_train.txt' +ann_file_val = f'{root}/Charades_sta_test.txt' +ann_file_test = f'{root}/Charades_sta_test.txt' + +word2id_file = f'{root}/Charades_word2id.json' +fps_file = f'{root}/Charades_fps_dict.json' +duration_file = f'{root}/Charades_duration.json' +num_frames_file = f'{root}/Charades_frames_info.json' +window_size = 16 +ft_overlap = 0.5 + +train_pipeline = [ + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', 'proposals'), + meta_keys=('vid_name', 'query_tokens', 'query_length', 'num_proposals', + 'num_frames')) +] + +val_pipeline = train_pipeline +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline, + word2id_file=word2id_file, + fps_file=fps_file, + duration_file=duration_file, + num_frames_file=num_frames_file, + window_size=window_size, + ft_overlap=ft_overlap), +) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + word2id_file=word2id_file, + fps_file=fps_file, + duration_file=duration_file, + num_frames_file=num_frames_file, + window_size=window_size, + ft_overlap=ft_overlap), +) +test_dataloader = val_dataloader + +max_epochs = 10 +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=1, + val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +val_evaluator = dict(type='RecallatTopK', topK_list=(1, 5), threshold=0.5) +test_evaluator = 
val_evaluator + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=1e-3), + clip_grad=dict(max_norm=5, norm_type=2), +) + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), +] + +find_unused_parameters = True diff --git a/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_second.py b/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_second.py new file mode 100644 index 0000000000000000000000000000000000000000..500b423f1320eac0ba3b5646ed5701c70a60eee9 --- /dev/null +++ b/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_second.py @@ -0,0 +1,110 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='DRN', + vocab_size=1301, + feature_dim=4096, + embed_dim=300, + hidden_dim=512, + bidirection=True, + first_output_dim=256, + fpn_feature_dim=512, + lstm_layers=1, + graph_node_features=1024, + fcos_pre_nms_top_n=32, + fcos_inference_thr=0.05, + fcos_prior_prob=0.01, + focal_alpha=0.25, + focal_gamma=2.0, + fpn_stride=[1, 2, 4], + fcos_nms_thr=0.6, + fcos_conv_layers=1, + fcos_num_class=2, + is_first_stage=False, + is_second_stage=True) + +# dataset settings +dataset_type = 'CharadesSTADataset' +root = 'data/CharadesSTA' +data_root = f'{root}/C3D_unit16_overlap0.5_merged/' +data_root_val = f'{root}/C3D_unit16_overlap0.5_merged/' +ann_file_train = f'{root}/Charades_sta_train.txt' +ann_file_val = f'{root}/Charades_sta_test.txt' +ann_file_test = f'{root}/Charades_sta_test.txt' + +word2id_file = f'{root}/Charades_word2id.json' +fps_file = f'{root}/Charades_fps_dict.json' +duration_file = f'{root}/Charades_duration.json' +num_frames_file = f'{root}/Charades_frames_info.json' +window_size = 16 +ft_overlap = 0.5 + +train_pipeline = [ + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', 'proposals'), + meta_keys=('vid_name', 'query_tokens', 'query_length', 'num_proposals', + 'num_frames')) +] + +val_pipeline = train_pipeline +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline, + word2id_file=word2id_file, + fps_file=fps_file, + duration_file=duration_file, + num_frames_file=num_frames_file, + window_size=window_size, + ft_overlap=ft_overlap), +) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + word2id_file=word2id_file, + fps_file=fps_file, + duration_file=duration_file, + num_frames_file=num_frames_file, + window_size=window_size, + ft_overlap=ft_overlap), +) +test_dataloader = val_dataloader + +max_epochs = 10 +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=1, + val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +val_evaluator = dict(type='RecallatTopK', topK_list=(1, 5), threshold=0.5) +test_evaluator = val_evaluator + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=1e-5), + clip_grad=dict(max_norm=5, norm_type=2)) + +find_unused_parameters = True diff --git a/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py b/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py new file mode 100644 index 
0000000000000000000000000000000000000000..75561e4cc81ddd715b69279103cc27f1affec1ab --- /dev/null +++ b/configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py @@ -0,0 +1,110 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='DRN', + vocab_size=1301, + feature_dim=4096, + embed_dim=300, + hidden_dim=512, + bidirection=True, + first_output_dim=256, + fpn_feature_dim=512, + lstm_layers=1, + graph_node_features=1024, + fcos_pre_nms_top_n=32, + fcos_inference_thr=0.05, + fcos_prior_prob=0.01, + focal_alpha=0.25, + focal_gamma=2.0, + fpn_stride=[1, 2, 4], + fcos_nms_thr=0.6, + fcos_conv_layers=1, + fcos_num_class=2, + is_first_stage=False, + is_second_stage=False) + +# dataset settings +dataset_type = 'CharadesSTADataset' +root = 'data/CharadesSTA' +data_root = f'{root}/C3D_unit16_overlap0.5_merged/' +data_root_val = f'{root}/C3D_unit16_overlap0.5_merged/' +ann_file_train = f'{root}/Charades_sta_train.txt' +ann_file_val = f'{root}/Charades_sta_test.txt' +ann_file_test = f'{root}/Charades_sta_test.txt' + +word2id_file = f'{root}/Charades_word2id.json' +fps_file = f'{root}/Charades_fps_dict.json' +duration_file = f'{root}/Charades_duration.json' +num_frames_file = f'{root}/Charades_frames_info.json' +window_size = 16 +ft_overlap = 0.5 + +train_pipeline = [ + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', 'proposals'), + meta_keys=('vid_name', 'query_tokens', 'query_length', 'num_proposals', + 'num_frames')) +] + +val_pipeline = train_pipeline +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline, + word2id_file=word2id_file, + fps_file=fps_file, + duration_file=duration_file, + num_frames_file=num_frames_file, + window_size=window_size, + ft_overlap=ft_overlap), +) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + word2id_file=word2id_file, + fps_file=fps_file, + duration_file=duration_file, + num_frames_file=num_frames_file, + window_size=window_size, + ft_overlap=ft_overlap), +) +test_dataloader = val_dataloader + +max_epochs = 10 +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=1, + val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +val_evaluator = dict(type='RecallatTopK', topK_list=(1, 5), threshold=0.5) +test_evaluator = val_evaluator + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=1e-6), + clip_grad=dict(max_norm=5, norm_type=2)) + +find_unused_parameters = True diff --git a/configs/localization/drn/metafile.yml b/configs/localization/drn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..467c8ca0484a911f8c0b2dd515dfca9d35302a7e --- /dev/null +++ b/configs/localization/drn/metafile.yml @@ -0,0 +1,26 @@ +Collections: +- Name: DRN + README: configs/localization/drn/README.md + Paper: + URL: https://openaccess.thecvf.com/content_CVPR_2020/papers/Zeng_Dense_Regression_Network_for_Video_Grounding_CVPR_2020_paper.pdf + Title: "Dense Regression Network for Video Grounding" + +Models: + - Name: drn_2xb16-4096-10e_c3d-feature_third + Config: 
configs/localization/drn/drn_2xb16-4096-10e_c3d-feature_third.py + In Collection: DRN + Metadata: + Batch Size: 16 + Epochs: 10 + Training Data: Charades STA + Training Resources: 2 GPUs + feature: C3D + Modality: RGB + Results: + - Dataset: Charades STA + Task: Video Grounding + Metrics: + Recall@Top1(IoU=0.5): 47.04 + Recall@Top5(IoU=0.5): 84.57 + Training Log: https://download.openmmlab.com/mmaction/v1.0/drn_2xb16-4096-10e_c3d-feature.log + Weights: https://download.openmmlab.com/mmaction/v1.0/localization/drn/drn_2xb16-4096-10e_c3d-feature_20230809-ec0429a6.pth diff --git a/configs/localization/tcanet/README.md b/configs/localization/tcanet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..756dbdb795e67383458f35540a04b34f89495308 --- /dev/null +++ b/configs/localization/tcanet/README.md @@ -0,0 +1,66 @@ +# TCANet + +[Temporal Context Aggregation Network for Temporal Action Proposal Refinement](https://openaccess.thecvf.com/content/CVPR2021/papers/Qing_Temporal_Context_Aggregation_Network_for_Temporal_Action_Proposal_Refinement_CVPR_2021_paper.pdf) + + + +## Abstract + + + +Temporal action proposal generation aims to estimate temporal intervals of actions in untrimmed videos, which is a challenging yet important task in the video understanding field. +The proposals generated by current methods still suffer from inaccurate temporal boundaries and inferior confidence used for retrieval owing to the lack of efficient temporal modeling and effective boundary context utilization. +In this paper, we propose Temporal Context Aggregation Network (TCANet) to generate high-quality action proposals through `local and global` temporal context aggregation and complementary as well as progressive boundary refinement. +Specifically, we first design a Local-Global Temporal Encoder (LGTE), which adopts the channel grouping strategy to efficiently encode both `local and global` temporal inter-dependencies. +Furthermore, both the boundary and internal context of proposals are adopted for frame-level and segment-level boundary regressions, respectively. +Temporal Boundary Regressor (TBR) is designed to combine these two regression granularities in an end-to-end fashion, which achieves the precise boundaries and reliable confidence of proposals through progressive refinement. Extensive experiments are conducted on three challenging datasets: HACS, ActivityNet-v1.3, and THUMOS-14, where TCANet can generate proposals with high precision and recall. By combining with the existing action classifier, TCANet can obtain remarkable temporal action detection performance compared with other methods. Not surprisingly, the proposed TCANet won the 1$^{st}$ place in the CVPR 2020 - HACS challenge leaderboard on temporal action localization task. + + + +
+
+ +## Results and Models + +### HACS dataset + +| feature | gpus | pretrain | AUC | AR@1 | AR@5 | AR@10 | AR@100 | gpu_mem(M) | iter time(s) | config | ckpt | log | +| :------: | :--: | :------: | :---: | :--: | :---: | :---: | :----: | :--------: | :----------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| SlowOnly | 2 | None | 51.39 | 3.61 | 16.92 | 21.94 | 62.80 | - | - | [config](/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature_20230621-d6bc10b0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.log) | + +For more details on data preparation, you can refer to [HACS Data Preparation](/tools/data/hacs/README.md). + +## Train + +Train TCANet model on HACS dataset with the SlowOnly feature. + +```shell +bash tools/dist_train.sh configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py 2 +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +Test TCANet model on HACS dataset with the SlowOnly feature. + +```shell +python3 tools/test.py configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py CHECKPOINT.PTH +``` + +For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + + + +```BibTeX +@inproceedings{qing2021temporal, + title={Temporal Context Aggregation Network for Temporal Action Proposal Refinement}, + author={Qing, Zhiwu and Su, Haisheng and Gan, Weihao and Wang, Dongliang and Wu, Wei and Wang, Xiang and Qiao, Yu and Yan, Junjie and Gao, Changxin and Sang, Nong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={485--494}, + year={2021} +} +``` diff --git a/configs/localization/tcanet/metafile.yml b/configs/localization/tcanet/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..4afa6946bef5e91ec94e9bb6a322aeb8fb04bea0 --- /dev/null +++ b/configs/localization/tcanet/metafile.yml @@ -0,0 +1,29 @@ +Collections: +- Name: TCANET + README: configs/localization/tcanet/README.md + Paper: + URL: https://arxiv.org/abs/2103.13141 + Title: "Temporal Context Aggregation Network for Temporal Action Proposal Refinement" + +Models: + - Name: tcanet_2xb8-2048x100-9e_hacs-feature.py + Config: configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py + In Collection: TCANET + Metadata: + Batch Size: 8 + Epochs: 9 + Training Data: HACS + Training Resources: 2 GPUs + feature: slowonly + Modality: RGB + Results: + - Dataset: HACS + Task: Temporal Action Localization + Metrics: + AUC: 51.39 + AR@1: 3.61 + AR@5: 16.92 + AR@10: 21.94 + AR@100: 62.80 + Training Log: https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.log + Weights: https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature_20230621-d6bc10b0.pth diff --git a/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py b/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..d430913faff7d42989d6933e16f35e57fbc5fbbd --- /dev/null +++ 
b/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py @@ -0,0 +1,131 @@ +_base_ = '../../_base_/default_runtime.py' + +# model settings +model = dict( + type='TCANet', + feat_dim=700, + se_sample_num=32, + action_sample_num=64, + temporal_dim=100, + window_size=9, + lgte_num=2, + soft_nms_alpha=0.4, + soft_nms_low_threshold=0.0, + soft_nms_high_threshold=0.0, + post_process_top_k=100, + feature_extraction_interval=16) + +# dataset settings +dataset_type = 'ActivityNetDataset' +data_root = 'data/HACS/slowonly_feature/' +data_root_val = 'data/HACS/slowonly_feature/' +ann_file_train = 'data/HACS/hacs_anno_train.json' +ann_file_val = 'data/HACS/hacs_anno_val.json' +ann_file_test = 'data/HACS/hacs_anno_val.json' + +train_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', 'proposals'), + meta_keys=('video_name', )) +] + +val_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict(type='GenerateLocalizationLabels'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', 'proposals'), + meta_keys=( + 'video_name', + 'duration_second', + 'duration_frame', + 'annotations', + 'feature_frame', + )) +] + +test_pipeline = [ + dict(type='LoadLocalizationFeature'), + dict( + type='PackLocalizationInputs', + keys=('gt_bbox', 'proposals'), + meta_keys=( + 'video_name', + 'duration_second', + 'duration_frame', + 'annotations', + 'feature_frame', + )) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + drop_last=True, + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +max_epochs = 9 +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=1, + val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[ + 7, + ], + gamma=0.1) +] + +work_dir = './work_dirs/tcanet_2xb8-2048x100-9e_hacs-feature/' +test_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict(out=f'{work_dir}/results.json', output_format='json')) +val_evaluator = test_evaluator diff --git a/configs/multimodal/vindlu/README.md b/configs/multimodal/vindlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b7d4c52bcd527a9e809c175c28e8086b1b2dfee --- /dev/null +++ b/configs/multimodal/vindlu/README.md @@ -0,0 +1,87 @@ +# VindLU + +[VindLU: A Recipe for Effective Video-and-Language Pretraining](https://arxiv.org/abs/2212.05051) + + + +## Abstract + + + +The last several years have witnessed remarkable progress in video-and-language (VidL) 
understanding. However, most modern VidL approaches use complex and specialized model architectures and sophisticated pretraining protocols, making the reproducibility, analysis and comparisons of these frameworks difficult. Hence, instead of proposing yet another new VidL model, this paper conducts a thorough empirical study demystifying the most important factors in the VidL model design. Among the factors that we investigate are (i) the spatiotemporal architecture design, (ii) the multimodal fusion schemes, (iii) the pretraining objectives, (iv) the choice of pretraining data, (v) pretraining and finetuning protocols, and (vi) dataset and model scaling. Our empirical study reveals that the most important design factors include: temporal modeling, video-to-text multimodal fusion, masked modeling objectives, and joint training on images and videos. Using these empirical insights, we then develop a step-by-step recipe, dubbed VindLU, for effective VidL pretraining. Our final model trained using our recipe achieves comparable or better than state-of-the-art results on several VidL tasks without relying on external CLIP pretraining. In particular, on the text-to-video retrieval task, our approach obtains 61.2% on DiDeMo, and 55.0% on ActivityNet, outperforming current SOTA by 7.8% and 6.1% respectively. Furthermore, our model also obtains state-of-the-art video question-answering results on ActivityNet-QA, MSRVTT-QA, MSRVTT-MC and TVQA. Our code and pretrained models are publicly available at: https://github.com/klauscc/VindLU. + + + +
+
+ +## Results and Models + +### Video Retrieval on MSRVTT-9k + +| frame sampling strategy | resolution | gpus | vision encoder | text encoder | pretraining | Recall@1 | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------------: | :----------: | :--------------------: | :------: | :-----------------------------------: | :---------------------------------: | :---------------------------------: | +| uniform 12 | 224x224 | 8 | BEiT-Base | Bert-Base | C5M (WebVid-2M + CC3M) | 44.0 | [config](/configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k/vindlu_beit-base_8x16_retrieval_msrvtt-9k_20230905-fc36231e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k/vindlu_beit-base_8x16_retrieval_msrvtt-9k.log) | + +### Video Question-Answering on MSRVTT-QA + +| frame sampling strategy | resolution | gpus | vision encoder | text encoder | pretraining | top1 acc | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------------: | :----------: | :--------------------: | :------: | :-----------------------------------: | :---------------------------------: | :---------------------------------: | +| uniform 12 | 224x224 | 8 | BEiT-Base | Bert-Base | C5M (WebVid-2M + CC3M) | 43.6 | [config](/configs/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa/vindlu_beit-base_8x8_vqa_msrvtt-qa_20230906-6e693e64.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa/vindlu_beit-base_8x8_vqa_msrvtt-qa.log) | + +### Multiple-Choice Question-Answering on MSRVTT-MC (Inference) + +| frame sampling strategy | resolution | gpus | vision encoder | text encoder | pretraining | top1 acc | config | ckpt | +| :---------------------: | :--------: | :--: | :------------: | :----------: | :--------------------: | :------: | :----------------------------------------------------: | :---------------------------------------------------: | +| uniform 12 | 224x224 | 8 | BEiT-Base | Bert-Base | C5M (WebVid-2M + CC3M) | 97.6 | [config](/configs/multimodal/vindlu/vindlu_beit-base_vqa-mc_msrvtt-mc.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k/vindlu_beit-base_8x16_retrieval_msrvtt-9k_20230905-fc36231e.pth) | + +1. Currently, we only support the fine-tuning stage of VindLU models based on the pretrained checkpoint provided by the [original repo](https://github.com/klauscc/VindLU). + +For more details on data preparation, you can refer to [prepare msrvtt](/tools/data/msrvtt/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train VindLU model on MSRVTT-9k dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. 
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test VindLU model on MSRVTT-9k dataset and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py \
+    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@inproceedings{cheng2023vindlu,
+  title={Vindlu: A recipe for effective video-and-language pretraining},
+  author={Cheng, Feng and Wang, Xizi and Lei, Jie and Crandall, David and Bansal, Mohit and Bertasius, Gedas},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={10739--10750},
+  year={2023}
+}
+```
diff --git a/configs/multimodal/vindlu/metafile.yml b/configs/multimodal/vindlu/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..68b30d837d0139db756a1b2d1145c22de7774a23
--- /dev/null
+++ b/configs/multimodal/vindlu/metafile.yml
@@ -0,0 +1,55 @@
+Collections:
+  - Name: VindLU
+    README: configs/multimodal/vindlu/README.md
+    Paper:
+      URL: https://arxiv.org/abs/2212.05051
+      Title: 'VindLU: A Recipe for Effective Video-and-Language Pretraining'
+
+Models:
+  - Name: vindlu_beit-base_8x16_retrieval_msrvtt-9k
+    Config: configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py
+    In Collection: VindLU
+    Metadata:
+      Architecture: BEiT-Base
+      Batch Size: 16
+      Epochs: 5
+      Training Data: MSRVTT-9k
+      Training Resources: 8 GPUs
+    Results:
+      Dataset: MSRVTT
+      Task: Video Retrieval
+      Metrics:
+        Recall@1: 44.0
+        Recall@5: 70.6
+        Recall@10: 80.0
+    Training Log: https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k/vindlu_beit-base_8x16_retrieval_msrvtt-9k.log
+    Weights: https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k/vindlu_beit-base_8x16_retrieval_msrvtt-9k_20230905-fc36231e.pth
+
+  - Name: vindlu_beit-base_8x8_vqa_msrvtt-qa
+    Config: configs/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa.py
+    In Collection: VindLU
+    Metadata:
+      Architecture: BEiT-Base
+      Batch Size: 8
+      Epochs: 10
+      Training Data: MSRVTT-qa
+      Training Resources: 8 GPUs
+    Results:
+      Dataset: MSRVTT
+      Task: Video Question-Answering
+      Metrics:
+        Top 1 Accuracy: 43.6
+    Training Log: https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa/vindlu_beit-base_8x8_vqa_msrvtt-qa.log
+    Weights: https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa/vindlu_beit-base_8x8_vqa_msrvtt-qa_20230906-6e693e64.pth
+
+  - Name: vindlu_beit-base_vqa-mc_msrvtt-mc
+    Config: configs/multimodal/vindlu/vindlu_beit-base_vqa-mc_msrvtt-mc.py
+    In Collection: VindLU
+    Metadata:
+      Architecture: BEiT-Base
+    Results:
+      Dataset: MSRVTT-MC
+      Task: Multiple-Choice Question-Answering
+      Metrics:
+        Top 1 Accuracy: 97.6
+    Weights: https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k/vindlu_beit-base_8x16_retrieval_msrvtt-9k_20230905-fc36231e.pth
diff --git a/configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py b/configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dbdc64a7acb606d5aefaa5cdc8b9a245eb5cd13
--- /dev/null
+++
b/configs/multimodal/vindlu/vindlu_beit-base_8x16_retrieval_msrvtt-9k.py @@ -0,0 +1,200 @@ +_base_ = ['../../_base_/default_runtime.py'] + +video_root = 'data/msrvtt/videos_2fps_224' +anno_file_train = 'data/msrvtt/annotations/msrvtt_ret_train9k.json' +anno_file_test = 'data/msrvtt/annotations/msrvtt_ret_test1k.json' +pretrained_ckpt_url = 'https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_c5m_pretrain.pth' # noqa: E501 + +# model settings +model = dict( + type='VindLURetrieval', + gradient_checkpointing=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained_ckpt_url), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128], + std=[128], + format_shape='NCTHW'), + tokenizer=dict( + type='VindLUTokenizer', + pretrained_model_name_or_path='bert-base-uncased'), + vision_encoder=dict( + type='BeitModel3D', + config='microsoft/beit-base-patch16-224-pt22k-ft22k', + tem_config=dict( + num_frames=12, + temporal_model_block='timesformer', + temporal_model_position='last', + temporal_model_config=dict(input_dim=768), + use_temporal_position_embedding=True), + encoder_width=768, + add_ln=True), + text_encoder=dict( + type='XBertModel', + pretrained_model_name_or_path='bert-base-uncased', + encoder_width=768, + fusion_layer=9, + add_pooling_layer=False), + proj_dim=256, + temperature=0.07, + max_txt_len=32, + topk=128) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=12, + out_of_bound_opt='repeat_last', + ), + dict(type='DecordDecode'), + dict(type='RandomResizedCrop', area_range=(0.5, 1.0)), + dict( + type='Resize', + scale=(224, 224), + keep_ratio=False, + interpolation='bicubic'), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict( + type='PackActionInputs', + algorithm_keys=( + 'text', + 'gt_video_id', + 'gt_text_id', + )) +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=12, + test_mode=True, + out_of_bound_opt='repeat_last'), + dict(type='DecordDecode'), + dict( + type='Resize', + scale=(224, 224), + keep_ratio=False, + interpolation='bicubic'), + dict(type='FormatShape', input_format='NCHW'), + dict( + type='PackActionInputs', + algorithm_keys=( + 'text', + 'gt_video_id', + 'gt_text_id', + )) +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=12, + test_mode=True, + out_of_bound_opt='repeat_last'), + dict(type='DecordDecode'), + dict( + type='Resize', + scale=(224, 224), + keep_ratio=False, + interpolation='bicubic'), + dict(type='FormatShape', input_format='NCHW'), + dict( + type='PackActionInputs', + algorithm_keys=( + 'text', + 'gt_video_id', + 'gt_text_id', + )) +] + +dataset_type = 'MSRVTTRetrieval' + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=anno_file_train, + pipeline=train_pipeline, + data_prefix=dict(video=video_root), + )) + +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=anno_file_test, + pipeline=test_pipeline, + data_prefix=dict(video=video_root), + )) + +test_dataloader = dict( + 
batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=anno_file_test, + pipeline=test_pipeline, + data_prefix=dict(video=video_root), + )) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='RetrievalValLoop') +test_cfg = dict(type='RetrievalTestLoop') + +val_evaluator = dict(type='RetrievalRecall', topk=(1, 5, 10)) +test_evaluator = dict(type='RetrievalRecall', topk=(1, 5, 10)) + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=5, + eta_min_ratio=0.01, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.02), + paramwise_cfg=dict( + bypass_duplicate=True, norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=50, norm_type=2), +) + +model_wrapper_cfg = dict(type='MMDistributedDataParallel', static_graph=True) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1, + save_best='t2i/retrieval/Recall@1', + rule='greater'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False)) + +auto_scale_lr = dict(enable=True, base_batch_size=128) + +find_unused_parameters = True + +custom_hooks = [dict(type='EmptyCacheHook', after_epoch=True)] diff --git a/configs/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa.py b/configs/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa.py new file mode 100644 index 0000000000000000000000000000000000000000..e7d324b6d072b8b021c4a888e51767ce5eb1ca6e --- /dev/null +++ b/configs/multimodal/vindlu/vindlu_beit-base_8x8_vqa_msrvtt-qa.py @@ -0,0 +1,190 @@ +_base_ = ['../../_base_/default_runtime.py'] + +video_root = 'data/msrvtt/videos_2fps_224' +anno_file_train = 'data/msrvtt/annotations/msrvtt_qa_train.json' +anno_file_val = 'data/msrvtt/annotations/msrvtt_qa_val.json' +anno_file_test = 'data/msrvtt/annotations/msrvtt_qa_test.json' +answer_list_file = 'data/msrvtt/annotations/msrvtt_qa_answer_list.json' +pretrained_ckpt_url = 'https://download.openmmlab.com/mmaction/v1.0/multimodal/vindlu/vindlu_c5m_pretrain.pth' # noqa: E501 + +# model settings +model = dict( + type='VindLUVQA', + init_cfg=dict(type='Pretrained', checkpoint=pretrained_ckpt_url), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128], + std=[128], + format_shape='NCTHW'), + tokenizer=dict( + type='VindLUTokenizer', + pretrained_model_name_or_path='bert-base-uncased', + ), + vision_encoder=dict( + type='BeitModel3D', + config='microsoft/beit-base-patch16-224-pt22k-ft22k', + tem_config=dict( + num_frames=12, + temporal_model_block='timesformer', + temporal_model_position='last', + temporal_model_config=dict(input_dim=768), + use_temporal_position_embedding=True), + encoder_width=768, + add_ln=True), + text_encoder=dict( + type='XBertModel', + pretrained_model_name_or_path='bert-base-uncased', + encoder_width=768, + fusion_layer=9, + add_pooling_layer=False), + text_decoder=dict( + type='BertDecoder', + pretrained_model_name_or_path='bert-base-uncased', + encoder_width=768, + fusion_layer=0, + num_hidden_layers=3, + add_pooling_layer=True), + proj_dim=256, + temperature=0.07, + max_question_len=25, + max_answer_len=5, + num_ans_candidates=128, + gradient_checkpointing=True, + answer_list_path=answer_list_file) + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( 
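+        # uniformly sample 12 single-frame clips per video; clips that run past
+        # the end of a short video repeat the last frame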
+ type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=12, + out_of_bound_opt='repeat_last'), + dict(type='DecordDecode'), + dict(type='RandomResizedCrop', area_range=(0.5, 1.0)), + dict( + type='Resize', + scale=(224, 224), + keep_ratio=False, + interpolation='bicubic'), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict( + type='PackActionInputs', + algorithm_keys=( + 'question', + 'question_id', + 'gt_answer', + 'gt_answer_weight', + )) +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=12, + test_mode=True, + out_of_bound_opt='repeat_last'), + dict(type='DecordDecode'), + dict( + type='Resize', + scale=(224, 224), + keep_ratio=False, + interpolation='bicubic'), + dict(type='FormatShape', input_format='NCHW'), + dict( + type='PackActionInputs', + algorithm_keys=( + 'question', + 'gt_answer', + 'question_id', + )) +] + +test_pipeline = val_pipeline + +dataset_type = 'MSRVTTVQA' + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=anno_file_train, + pipeline=train_pipeline, + data_prefix=dict(video=video_root), + )) + +val_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=anno_file_val, + pipeline=val_pipeline, + data_prefix=dict(video=video_root), + )) + +test_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=anno_file_test, + pipeline=test_pipeline, + data_prefix=dict(video=video_root), + )) + +val_evaluator = dict(type='VQAAcc') +test_evaluator = dict(type='VQAAcc') + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=10, + eta_min_ratio=0.01, + by_epoch=True, + begin=1, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict(type='AdamW', lr=1e-5, weight_decay=0.02), + paramwise_cfg=dict( + bypass_duplicate=True, norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=50, norm_type=2), +) + +model_wrapper_cfg = dict(type='MMDistributedDataParallel', static_graph=True) + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=20, ignore_last=False)) + +auto_scale_lr = dict(enable=True, base_batch_size=32) + +find_unused_parameters = True diff --git a/configs/multimodal/vindlu/vindlu_beit-base_vqa-mc_msrvtt-mc.py b/configs/multimodal/vindlu/vindlu_beit-base_vqa-mc_msrvtt-mc.py new file mode 100644 index 0000000000000000000000000000000000000000..89cbe479dd4c51865bab18148ee3706acb0fe925 --- /dev/null +++ b/configs/multimodal/vindlu/vindlu_beit-base_vqa-mc_msrvtt-mc.py @@ -0,0 +1,80 @@ +_base_ = ['../../_base_/default_runtime.py'] + +video_root = 'data/msrvtt/videos_2fps_224' +anno_file_test = 'data/msrvtt/annotations/msrvtt_mc_test.json' + +# model settings +model = dict( + type='VindLURetrievalMC', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128], + std=[128], + 
format_shape='NCTHW'), + tokenizer=dict( + type='VindLUTokenizer', + pretrained_model_name_or_path='bert-base-uncased'), + vision_encoder=dict( + type='BeitModel3D', + config='microsoft/beit-base-patch16-224-pt22k-ft22k', + tem_config=dict( + num_frames=12, + temporal_model_block='timesformer', + temporal_model_position='last', + temporal_model_config=dict(input_dim=768), + use_temporal_position_embedding=True), + encoder_width=768, + add_ln=True), + text_encoder=dict( + type='XBertModel', + pretrained_model_name_or_path='bert-base-uncased', + encoder_width=768, + fusion_layer=9, + add_pooling_layer=False), + text_decoder=dict( + type='BertDecoder', + pretrained_model_name_or_path='bert-base-uncased', + encoder_width=768, + fusion_layer=0, + num_hidden_layers=3, + add_pooling_layer=True), + proj_dim=256, + temperature=0.07, + max_txt_len=32, + gradient_checkpointing=True) + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=12, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs', algorithm_keys=('caption_options', )) +] + +dataset_type = 'MSRVTTVQAMC' + +test_dataloader = dict( + batch_size=32, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=anno_file_test, + pipeline=test_pipeline, + data_prefix=dict(video=video_root), + )) + +test_evaluator = dict(type='VQAMCACC') +test_cfg = dict(type='TestLoop') + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=20, ignore_last=False), ) diff --git a/configs/recognition/c2d/README.md b/configs/recognition/c2d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dbc265249e0227d6fbeea5b83e9e0d8f6572d214 --- /dev/null +++ b/configs/recognition/c2d/README.md @@ -0,0 +1,80 @@ +# C2D + + + +[Non-local Neural Networks](https://arxiv.org/abs/1711.07971) + +## Abstract + + + +Both convolutional and recurrent operations are building blocks that process one local neighborhood at a time. In this paper, we present non-local operations as a generic family of building blocks for capturing long-range dependencies. Inspired by the classical non-local means method in computer vision, our non-local operation computes the response at a position as a weighted sum of the features at all positions. This building block can be plugged into many computer vision architectures. On the task of video classification, even without any bells and whistles, our non-local models can compete or outperform current competition winners on both Kinetics and Charades datasets. In static image recognition, our non-local models improve object detection/segmentation and pose estimation on the COCO suite of tasks. + + + +
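+
+As a concrete illustration of the non-local operation quoted above, the following is a self-contained sketch of the embedded-Gaussian form (an illustration only, not the implementation used in this repo): each position's response is a softmax-weighted sum of the features at all positions.
+
+```python
+import torch
+from torch import nn
+
+
+class NonLocalSketch(nn.Module):
+    """Toy embedded-Gaussian non-local block over flattened space-time."""
+
+    def __init__(self, channels: int, inter_channels: int):
+        super().__init__()
+        self.theta = nn.Conv1d(channels, inter_channels, 1)
+        self.phi = nn.Conv1d(channels, inter_channels, 1)
+        self.g = nn.Conv1d(channels, inter_channels, 1)
+        self.out = nn.Conv1d(inter_channels, channels, 1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (N, C, T*H*W) features flattened over space-time
+        q, k, v = self.theta(x), self.phi(x), self.g(x)
+        attn = torch.softmax(q.transpose(1, 2) @ k, dim=-1)  # (N, THW, THW)
+        y = (attn @ v.transpose(1, 2)).transpose(1, 2)       # (N, C', THW)
+        return x + self.out(y)                               # residual connection
+```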
+ + +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :-------: | :--------: | :--: | :-------------: | :------: | :------: | :------: | :-----------------------: | :-----------------------: | :---------------: | :---: | :----: | :------------: | :----------: | :---------: | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
| ImageNet | 73.44 | 91.00 | 67.2
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 87.8
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 10 clips x 3 crop | 33G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet101
| ImageNet | 74.97 | 91.77 | x | x | 10 clips x 3 crop | 63G | 43.3M | [config](/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 73.89 | 91.21 | 71.9
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 90.0
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 10 clips x 3 crop | 19G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 16x4x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 74.97 | 91.91 | x | x | 10 clips x 3 crop | 39G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log) | + +1. The values in columns named after "reference" are the results reported in the original repo, using the same model settings. +2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train C2D model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test C2D model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
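As a quick sanity check outside the `tools/` entry points, the same config and a downloaded checkpoint can also be driven from Python. The snippet below is a minimal sketch, assuming an mmaction2 1.x installation; the checkpoint and video paths are placeholders you need to replace with local files.

```python
# Minimal single-video inference sketch (illustrative; paths are placeholders).
from mmaction.apis import inference_recognizer, init_recognizer

config = 'configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py'
checkpoint = 'checkpoints/SOME_CHECKPOINT.pth'  # e.g. the ckpt linked in the table above

model = init_recognizer(config, checkpoint, device='cuda:0')  # or 'cpu'
result = inference_recognizer(model, 'demo/demo.mp4')  # any local video file

# `result` is an ActionDataSample carrying the predicted class scores
# (named `pred_score` in recent releases, `pred_scores` in early 1.x versions).
print(result)
```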
+ +## Citation + +```BibTeX +@article{XiaolongWang2017NonlocalNN, + title={Non-local Neural Networks}, + author={Xiaolong Wang and Ross Girshick and Abhinav Gupta and Kaiming He}, + journal={arXiv: Computer Vision and Pattern Recognition}, + year={2017} +} +``` diff --git a/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..8c1515fc9f3b7257ed842ee59ee21571fa4c2b09 --- /dev/null +++ b/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py @@ -0,0 +1,7 @@ +_base_ = ['c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + pretrained=('https://download.pytorch.org/' + 'models/resnet101-cd907fc2.pth'), + depth=101)) diff --git a/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..83cb66136bdfd715f0841b026e62f45b761e9dcc --- /dev/null +++ b/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py @@ -0,0 +1,101 @@ +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) 
+test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c004aa2d9ffc9fea7625d6bd600e394e0b2216fe --- /dev/null +++ b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py @@ -0,0 +1,101 @@ +_base_ = [ + '../../_base_/models/c2d_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5ec71c9b6ee01e06d43108dc6bfe8ae11ea1b26d --- /dev/null +++ b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py @@ -0,0 +1,101 @@ +_base_ = [ + '../../_base_/models/c2d_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# 
Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/c2d/metafile.yml b/configs/recognition/c2d/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..f8530da97030a3a29e13425d324f6988c02cfa80 --- /dev/null +++ b/configs/recognition/c2d/metafile.yml @@ -0,0 +1,99 @@ +Collections: + - Name: C2D + README: configs/recognition/c2d/README.md + Paper: + URL: https://arxiv.org/abs/1711.07971 + Title: 'Non-local Neural Networks' + +Models: + - Name: c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb + Config: configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py + In Collection: C2D + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 33G + Parameters: 24.3M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.44 + Top 5 Accuracy: 91.00 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth + + - Name: c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb + Config: configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py + In Collection: C2D + Metadata: + Architecture: ResNet101 + Batch Size: 32 + Epochs: 100 + FLOPs: 63G + Parameters: 43.3M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.97 + Top 5 Accuracy: 91.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth + + - Name: c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb + Config: configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py + In Collection: C2D + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 19G + Parameters: 24.3M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.89 + Top 5 Accuracy: 91.21 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth + + - Name: c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb + Config: 
configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py + In Collection: C2D + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 39G + Parameters: 24.3M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.97 + Top 5 Accuracy: 91.91 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth diff --git a/configs/recognition/c3d/README.md b/configs/recognition/c3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c1796528c97471716888c833ba6a5715af00360b --- /dev/null +++ b/configs/recognition/c3d/README.md @@ -0,0 +1,79 @@ +# C3D + +[Learning Spatiotemporal Features with 3D Convolutional Networks](https://openaccess.thecvf.com/content_iccv_2015/html/Tran_Learning_Spatiotemporal_Features_ICCV_2015_paper.html) + + + +## Abstract + + + +We propose a simple, yet effective approach for spatiotemporal feature learning using deep 3-dimensional convolutional networks (3D ConvNets) trained on a large scale supervised video dataset. Our findings are three-fold: 1) 3D ConvNets are more suitable for spatiotemporal feature learning compared to 2D ConvNets; 2) A homogeneous architecture with small 3x3x3 convolution kernels in all layers is among the best performing architectures for 3D ConvNets; and 3) Our learned features, namely C3D (Convolutional 3D), with a simple linear classifier outperform state-of-the-art methods on 4 different benchmarks and are comparable with current best methods on the other 2 benchmarks. In addition, the features are compact: achieving 52.8% accuracy on UCF101 dataset with only 10 dimensions and also very efficient to compute due to the fast inference of ConvNets. Finally, they are conceptually very simple and easy to train and use. + + + +
+ +
+ +## Results and Models + +### UCF-101 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :----------------------------------: | :--------------------------------: | :-------------------------------: | +| 16x1x1 | 112x112 | 8 | c3d | sports1m | 83.08 | 95.93 | 10 clips x 1 crop | 38.5G | 78.4M | [config](/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log) | + +1. The author of C3D normalized UCF-101 with volume mean and used SVM to classify videos, while we normalized the dataset with RGB mean value and used a linear classifier. +2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. + +For more details on data preparation, you can refer to [UCF101](/tools/data/ucf101/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train C3D model on UCF-101 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test C3D model on UCF-101 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
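If you prefer to launch a run from Python rather than the shell entry points, the config can be loaded and adjusted programmatically with MMEngine. The sketch below is illustrative only: it assumes the UCF-101 layout described above, and it switches to official split 2 by overriding the resolved annotation paths, because the `split` variable in the config is expanded when the file is read.

```python
# Illustrative sketch: load the C3D config, switch to UCF-101 split 2, and train.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py')

# The f-strings in the config were resolved at load time, so set the fields directly.
split = 2
cfg.train_dataloader.dataset.ann_file = f'data/ucf101/ucf101_train_split_{split}_videos.txt'
cfg.val_dataloader.dataset.ann_file = f'data/ucf101/ucf101_val_split_{split}_videos.txt'
cfg.test_dataloader.dataset.ann_file = f'data/ucf101/ucf101_val_split_{split}_videos.txt'
cfg.work_dir = './work_dirs/c3d_ucf101_split2'  # required by the runner; name is arbitrary

runner = Runner.from_cfg(cfg)
runner.train()
```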
+ +## Citation + + + +```BibTeX +@ARTICLE{2014arXiv1412.0767T, +author = {Tran, Du and Bourdev, Lubomir and Fergus, Rob and Torresani, Lorenzo and Paluri, Manohar}, +title = {Learning Spatiotemporal Features with 3D Convolutional Networks}, +keywords = {Computer Science - Computer Vision and Pattern Recognition}, +year = 2014, +month = dec, +eid = {arXiv:1412.0767} +} +``` diff --git a/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py b/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..926894e2b6d730c826534a426f5269d9da0b06dc --- /dev/null +++ b/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py @@ -0,0 +1,116 @@ +_base_ = [ + '../../_base_/models/c3d_sports1m_pretrained.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos' +data_root_val = 'data/ucf101/videos' +split = 1 # official train/test splits. valid numbers: 1, 2, 3 +ann_file_train = f'data/ucf101/ucf101_train_split_{split}_videos.txt' +ann_file_val = f'data/ucf101/ucf101_val_split_{split}_videos.txt' +ann_file_test = f'data/ucf101/ucf101_val_split_{split}_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=16, frame_interval=1, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 128)), + dict(type='RandomCrop', size=112), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=1, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 128)), + dict(type='CenterCrop', crop_size=112), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=1, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 128)), + dict(type='CenterCrop', crop_size=112), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=30, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=30, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=45, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + 
begin=0, + end=45, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(interval=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (30 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=240) diff --git a/configs/recognition/c3d/metafile.yml b/configs/recognition/c3d/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..9f87925b395a990714475ec54d246923f4f41cb4 --- /dev/null +++ b/configs/recognition/c3d/metafile.yml @@ -0,0 +1,30 @@ +Collections: + - Name: C3D + README: configs/recognition/c3d/README.md + Paper: + URL: https://arxiv.org/abs/1412.0767 + Title: 'Learning Spatiotemporal Features with 3D Convolutional Networks' + +Models: + - Name: c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb + Config: configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py + In Collection: C3D + Metadata: + Architecture: c3d + Batch Size: 30 + Epochs: 45 + FLOPs: 38.5G + Parameters: 78.4M + Pretrained: sports1m + Resolution: 112x112 + Training Data: UCF101 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: UCF101 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 83.08 + Top 5 Accuracy: 95.93 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth diff --git a/configs/recognition/csn/README.md b/configs/recognition/csn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc1d4418f92700b41e59f4789d036fd9956412d7 --- /dev/null +++ b/configs/recognition/csn/README.md @@ -0,0 +1,97 @@ +# CSN + +[Video Classification With Channel-Separated Convolutional Networks](https://openaccess.thecvf.com/content_ICCV_2019/html/Tran_Video_Classification_With_Channel-Separated_Convolutional_Networks_ICCV_2019_paper.html) + + + +## Abstract + + + +Group convolution has been shown to offer great computational savings in various 2D convolutional architectures for image classification. It is natural to ask: 1) if group convolution can help to alleviate the high computational cost of video classification networks; 2) what factors matter the most in 3D group convolutional networks; and 3) what are good computation/accuracy trade-offs with 3D group convolutional networks. This paper studies the effects of different design choices in 3D group convolutional networks for video classification. We empirically demonstrate that the amount of channel interactions plays an important role in the accuracy of 3D group convolutional networks. Our experiments suggest two main findings. First, it is a good practice to factorize 3D convolutions by separating channel interactions and spatiotemporal interactions as this leads to improved accuracy and lower computational cost. Second, 3D channel-separated convolutions provide a form of regularization, yielding lower training accuracy but higher test accuracy compared to 3D convolutions. 
These two empirical findings lead us to design an architecture -- Channel-Separated Convolutional Network (CSN) -- which is simple, efficient, yet accurate. On Sports1M, Kinetics, and Something-Something, our CSNs are comparable with or better than the state-of-the-art while being 2-3 times more efficient. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :----------------------------: | :---------------------------: | :--------------------------: | +| 32x2x1 | 224x224 | 8 | ResNet152 (IR) | IG65M | 82.87 | 95.90 | 10 clips x 3 crop | 97.63G | 29.70M | [config](/configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb_20220811-c7a3cc5b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet152 (IR+BNFrozen) | IG65M | 82.84 | 95.92 | 10 clips x 3 crop | 97.63G | 29.70M | [config](/configs/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb_20220811-7d1dacde.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (IR+BNFrozen) | IG65M | 79.44 | 94.26 | 10 clips x 3 crop | 55.90G | 13.13M | [config](/configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb_20220811-44395bae.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | x | ResNet152 (IP) | None | 77.80 | 93.10 | 10 clips x 3 crop | 109.9G | 33.02M | [config](/configs/recognition/csn/ipcsn_r152_32x2x1-180e_kinetics400-rgb.py) | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-d565828d.pth) | x | +| 32x2x1 | 224x224 | x | ResNet152 (IR) | None | 76.53 | 92.28 | 10 clips x 3 crop | 97.6G | 29.70M | [config](/configs/recognition/csn/ircsn_r152_32x2x1-180e_kinetics400-rgb.py) | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-5c933ae1.pth) | x | +| 32x2x1 | 224x224 | x | ResNet152 (IP+BNFrozen) | IG65M | 82.68 | 95.69 | 10 clips x 3 crop | 109.9G | 33.02M | [config](/configs/recognition/csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py) | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-c3be9793.pth) | x | +| 32x2x1 | 224x224 | x | ResNet152 (IP+BNFrozen) | Sports1M | 79.07 | 93.82 | 10 clips x 3 crop | 
109.9G | 33.02M | [config](/configs/recognition/csn/ipcsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py) | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-3367437a.pth) | x | +| 32x2x1 | 224x224 | x | ResNet152 (IR+BNFrozen) | Sports1M | 78.57 | 93.44 | 10 clips x 3 crop | 109.9G | 33.02M | [config](/configs/recognition/csn/ircsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py) | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-b9b10241.pth) | x | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. +3. The **infer_ckpt** means those checkpoints are ported from [VMZ](https://github.com/facebookresearch/VMZ). + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train CSN model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test CSN model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
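The notes above mention `--auto-scale-lr`. When it is enabled, the learning rate is rescaled linearly by the ratio between your actual total batch size and the `base_batch_size` recorded in the config, which is 96 for these CSN configs (8 GPUs x 12 samples per GPU). The snippet below only sketches that arithmetic so you can estimate the effective learning rate before launching a run; the real logic lives inside MMEngine's runner.

```python
# Sketch of the linear LR scaling behind `--auto-scale-lr` (arithmetic only;
# the actual implementation is in MMEngine, not reproduced here).
def auto_scaled_lr(base_lr: float, base_batch_size: int,
                   num_gpus: int, samples_per_gpu: int) -> float:
    actual_batch_size = num_gpus * samples_per_gpu
    return base_lr * actual_batch_size / base_batch_size

# The ircsn_ig65m-pretrained-r152-bnfrozen config uses lr=5e-4 with
# base_batch_size = 8 GPUs x 12 samples per GPU = 96.
print(auto_scaled_lr(5e-4, base_batch_size=96, num_gpus=4, samples_per_gpu=12))  # 0.00025
```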
+ +## Citation + +```BibTeX +@inproceedings{inproceedings, +author = {Wang, Heng and Feiszli, Matt and Torresani, Lorenzo}, +year = {2019}, +month = {10}, +pages = {5551-5560}, +title = {Video Classification With Channel-Separated Convolutional Networks}, +doi = {10.1109/ICCV.2019.00565} +} +``` + + + +```BibTeX +@inproceedings{ghadiyaram2019large, + title={Large-scale weakly-supervised pre-training for video action recognition}, + author={Ghadiyaram, Deepti and Tran, Du and Mahajan, Dhruv}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={12046--12055}, + year={2019} +} +``` diff --git a/configs/recognition/csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py b/configs/recognition/csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5c34e110eea39670db6236bf1506107ad5088e51 --- /dev/null +++ b/configs/recognition/csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py @@ -0,0 +1,13 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ip', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ipcsn_from_scratch_r152_ig65m_20210617-c4b99d38.pth' # noqa: E501 + )) diff --git a/configs/recognition/csn/ipcsn_r152_32x2x1-180e_kinetics400-rgb.py b/configs/recognition/csn/ipcsn_r152_32x2x1-180e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..f350bee3a1a1840fdb59f466d0d9600361ecaf63 --- /dev/null +++ b/configs/recognition/csn/ipcsn_r152_32x2x1-180e_kinetics400-rgb.py @@ -0,0 +1,12 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict(bottleneck_mode='ip', pretrained=None), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[110.2008, 100.63983, 95.99475], + std=[58.14765, 56.46975, 55.332195], + format_shape='NCTHW')) diff --git a/configs/recognition/csn/ipcsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py b/configs/recognition/csn/ipcsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..41ec41480afe6786bf55809dd9161e9ce5a9b0eb --- /dev/null +++ b/configs/recognition/csn/ipcsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py @@ -0,0 +1,18 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ip', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ipcsn_from_scratch_r152_sports1m_20210617-7a7cc5b9.pth' # noqa: E501 + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[110.2008, 100.63983, 95.99475], + std=[58.14765, 56.46975, 55.332195], + format_shape='NCTHW')) diff --git a/configs/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py b/configs/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..381ddf791f4d952469a8e4227872ce6d8485b4cc --- /dev/null +++ b/configs/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py @@ -0,0 
+1,135 @@ +_base_ = [ + '../../_base_/models/ircsn_r152.py', '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + pretrained='https://download.openmmlab.com/mmaction/recognition/csn/' + 'ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth')) + +# dataset settings +dataset_type = 'VideoDataset' +root = './data/kinetics400/' +data_root = root + 'videos_train' +data_root_val = root + 'videos_val' +data_root_test = data_root_val + +ann_file_train = root + 'kinetics400_train_list_videos.txt' +ann_file_val = root + 'kinetics400_val_list_videos.txt' +ann_file_test = ann_file_val + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=58, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=16), + dict( + type='MultiStepLR', + begin=0, + end=58, + by_epoch=True, + milestones=[32, 48], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=5e-4, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(interval=2, max_keep_ckpts=5)) +find_unused_parameters = True + +# Default setting for 
scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (12 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=96) diff --git a/configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py b/configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ead2956d905d48937a6d61b92fcf86703a1acb7c --- /dev/null +++ b/configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py @@ -0,0 +1,8 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +model = dict( + backbone=dict( + pretrained='https://download.openmmlab.com/mmaction/recognition/csn/' + 'ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth')) diff --git a/configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py b/configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..67c50ce9d6cd18a88496807f8134509fcbea6b21 --- /dev/null +++ b/configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py @@ -0,0 +1,12 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict( + depth=50, + norm_eval=True, + bn_frozen=True, + pretrained='https://download.openmmlab.com/mmaction/recognition/csn/' + 'ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth')) diff --git a/configs/recognition/csn/ircsn_r152_32x2x1-180e_kinetics400-rgb.py b/configs/recognition/csn/ircsn_r152_32x2x1-180e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7a94caf36e28c450f47d1a7e83a36647fb109d00 --- /dev/null +++ b/configs/recognition/csn/ircsn_r152_32x2x1-180e_kinetics400-rgb.py @@ -0,0 +1,12 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict(bottleneck_mode='ir', pretrained=None), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[110.2008, 100.63983, 95.99475], + std=[58.14765, 56.46975, 55.332195], + format_shape='NCTHW')) diff --git a/configs/recognition/csn/ircsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py b/configs/recognition/csn/ircsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..b775455bfa9f627719e027fce0b39c54949ac94f --- /dev/null +++ b/configs/recognition/csn/ircsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py @@ -0,0 +1,18 @@ +_base_ = [ + 'ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ir', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_sports1m_20210617-bcc9c0dd.pth' # noqa: E501 + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[110.2008, 100.63983, 95.99475], + std=[58.14765, 56.46975, 55.332195], + format_shape='NCTHW')) diff --git a/configs/recognition/csn/metafile.yml b/configs/recognition/csn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..ea2a70a523e860f1e5c7d95b80f72cd056064ad4 --- /dev/null 
+++ b/configs/recognition/csn/metafile.yml @@ -0,0 +1,191 @@ +Collections: + - Name: CSN + README: configs/recognition/csn/README.md + Paper: + URL: https://arxiv.org/abs/1904.02811 + Title: 'Video Classification with Channel-Separated Convolutional Networks' + +Models: + - Name: ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb + Config: configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Batch Size: 12 + Epochs: 58 + FLOPs: 97.63G + Parameters: 29.70M + Pretrained: IG65M + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.87 + Top 5 Accuracy: 95.90 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb_20220811-c7a3cc5b.pth + + - Name: ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb + Config: configs/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Batch Size: 12 + Epochs: 58 + FLOPs: 97.63G + Parameters: 29.70M + Pretrained: IG65M + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.84 + Top 5 Accuracy: 95.92 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r152-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb_20220811-7d1dacde.pth + + - Name: ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb + Config: configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet50 + Batch Size: 12 + Epochs: 58 + FLOPs: 55.90G + Parameters: 13.13M + Pretrained: IG65M + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 79.44 + Top 5 Accuracy: 94.26 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb_20220811-44395bae.pth + + - Name: ipcsn_r152_32x2x1-180e_kinetics400-rgb + Config: configs/recognition/csn/ipcsn_r152_32x2x1-180e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 180 + FLOPs: 109.9G + Parameters: 33.02M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + 
Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 77.80 + Top 5 Accuracy: 93.10 + Converted From: + Weights: https://www.dropbox.com/s/3fihu6ti60047mu/ipCSN_152_kinetics_from_scratch_f129594342.pkl?dl=0 + Code: https://github.com/facebookresearch/VMZ/tree/b61b08194bc3273bef4c45fdfdd36c56c8579ff3 + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-d565828d.pth + + - Name: ircsn_r152_32x2x1-180e_kinetics400-rgb + Config: configs/recognition/csn/ircsn_r152_32x2x1-180e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 180 + FLOPs: 97.63G + Parameters: 29.70M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.53 + Top 5 Accuracy: 92.28 + Converted From: + Weights: https://www.dropbox.com/s/46gcm7up60ssx5c/irCSN_152_kinetics_from_scratch_f98268019.pkl?dl=0 + Code: https://github.com/facebookresearch/VMZ/tree/b61b08194bc3273bef4c45fdfdd36c56c8579ff3 + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-5c933ae1.pth + + - Name: ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb + Config: configs/recognition/csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 58 + FLOPs: 109.9G + Parameters: 33.02M + Pretrained: IG65M + Resolution: 224x224 + Training Data: Kinetics-400 + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.68 + Top 5 Accuracy: 95.69 + Converted From: + Weights: https://www.dropbox.com/s/zpp3p0vn2i7bibl/ipCSN_152_ft_kinetics_from_ig65m_f133090949.pkl?dl=0 + Code: https://github.com/facebookresearch/VMZ/tree/b61b08194bc3273bef4c45fdfdd36c56c8579ff3 + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-c3be9793.pth + + - Name: ipcsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb + Config: configs/recognition/csn/ipcsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 58 + FLOPs: 109.9G + Parameters: 33.02M + Pretrained: Sports1M + Resolution: 224x224 + Training Data: Kinetics-400 + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 79.07 + Top 5 Accuracy: 93.82 + Converted From: + Weights: https://www.dropbox.com/s/ir7cr0hda36knux/ipCSN_152_ft_kinetics_from_sports1m_f111279053.pkl?dl=0 + Code: https://github.com/facebookresearch/VMZ/tree/b61b08194bc3273bef4c45fdfdd36c56c8579ff3 + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-3367437a.pth + + - Name: ircsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb + Config: configs/recognition/csn/ircsn_sports1m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 58 + FLOPs: 109.9G + Parameters: 33.02M + Pretrained: Sports1M + Resolution: 224x224 + Training Data: Kinetics-400 + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 78.57 + Top 5 
Accuracy: 93.44 + Converted From: + Weights: https://www.dropbox.com/s/zuoj1aqouh6bo6k/irCSN_152_ft_kinetics_from_sports1m_f101599884.pkl?dl=0 + Code: https://github.com/facebookresearch/VMZ/tree/b61b08194bc3273bef4c45fdfdd36c56c8579ff3 + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-b9b10241.pth diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b4a17ecbfe7e9ed9dbe3ae30637d636b4516fd25 --- /dev/null +++ b/configs/recognition/i3d/README.md @@ -0,0 +1,95 @@ +# I3D + +[Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset](https://openaccess.thecvf.com/content_cvpr_2017/html/Carreira_Quo_Vadis_Action_CVPR_2017_paper.html) + +[Non-local Neural Networks](https://openaccess.thecvf.com/content_cvpr_2018/html/Wang_Non-Local_Neural_Networks_CVPR_2018_paper.html) + + + +## Abstract + + + +The paucity of videos in current action classification datasets (UCF-101 and HMDB-51) has made it difficult to identify good video architectures, as most methods obtain similar performance on existing small-scale benchmarks. This paper re-evaluates state-of-the-art architectures in light of the new Kinetics Human Action Video dataset. Kinetics has two orders of magnitude more data, with 400 human action classes and over 400 clips per class, and is collected from realistic, challenging YouTube videos. We provide an analysis on how current architectures fare on the task of action classification on this dataset and how much performance improves on the smaller benchmark datasets after pre-training on Kinetics. We also introduce a new Two-Stream Inflated 3D ConvNet (I3D) that is based on 2D ConvNet inflation: filters and pooling kernels of very deep image classification ConvNets are expanded into 3D, making it possible to learn seamless spatio-temporal feature extractors from video while leveraging successful ImageNet architecture designs and even their parameters. We show that, after pre-training on Kinetics, I3D models considerably improve upon the state-of-the-art in action classification, reaching 80.9% on HMDB-51 and 98.0% on UCF-101. + + + +
+ +
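+The snippet below is a minimal, illustrative sketch (not part of the MMAction2 API) of the 2D-to-3D weight inflation described in the abstract: an ImageNet-pretrained 2D kernel is repeated along a new temporal axis and rescaled, so that a video of identical frames initially produces the same responses as the original 2D network.
+
+```python
+import torch
+
+
+def inflate_conv_weight(weight_2d: torch.Tensor, temporal_size: int) -> torch.Tensor:
+    """Inflate a 2D conv kernel (out, in, h, w) into a 3D kernel (out, in, t, h, w).
+
+    The 2D weights are copied `temporal_size` times along the temporal axis and
+    divided by `temporal_size`, following the bootstrapping trick used by I3D.
+    """
+    weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, temporal_size, 1, 1)
+    return weight_3d / temporal_size
+
+
+# e.g. inflate a pretrained 3x3 kernel into a 3x3x3 spatio-temporal kernel
+w2d = torch.randn(64, 3, 3, 3)
+w3d = inflate_conv_weight(w2d, temporal_size=3)  # shape: (64, 3, 3, 3, 3)
+```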
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :--------------------------: | :-------------------------: | :------------------------: | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.80 | 92.07 | 10 clips x 3 crop | 59.3G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.73 | 91.80 | 10 clips x 3 crop | 59.3G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.97 | 91.33 | 10 clips x 3 crop | 56.5 | 31.7M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.47 | 91.27 | 10 clips x 3 crop | 43.5G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| dense-32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.77 | 91.35 | 10 clips x 3 crop | 43.5G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb_20220812-9f46003f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (Heavy) | ImageNet | 76.21 | 92.48 | 10 clips x 3 crop | 166.3G | 33.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train I3D model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test I3D model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{inproceedings, + author = {Carreira, J. and Zisserman, Andrew}, + year = {2017}, + month = {07}, + pages = {4724-4733}, + title = {Quo Vadis, Action Recognition? 
A New Model and the Kinetics Dataset}, + doi = {10.1109/CVPR.2017.502} +} +``` + + + +```BibTeX +@article{NonLocal2018, + author = {Xiaolong Wang and Ross Girshick and Abhinav Gupta and Kaiming He}, + title = {Non-local Neural Networks}, + journal = {CVPR}, + year = {2018} +} +``` diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..fe936aec048ae50a371e336bc2ea91d0999c66c4 --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -0,0 +1,9 @@ +_base_ = ['./i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py'] + +# model settings +model = dict( + backbone=dict( + inflate=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + with_pool2=True)) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0bc4503b010f6ddd24a3099098f73e62a12c724c --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -0,0 +1,112 @@ +_base_ = [ + '../../_base_/models/i3d_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + backbone=dict( + non_local=((0, 0, 0), (0, 1, 0, 1), (0, 1, 0, 1, 0, 1), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='dot_product'))) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c3149bc950c977e443dd1dd1a4391dbc28efff3c --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -0,0 +1,13 @@ +_base_ = [ + 'i3d_imagenet-pretrained-r50-nl-dot-product_' + + '8xb8-32x2x1-100e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict( + non_local_cfg=dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian'))) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..3f6dc9a60fd23e39db075edaf11ecb1cdfa5a978 --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -0,0 +1,13 @@ +_base_ = [ + 'i3d_imagenet-pretrained-r50-nl-dot-product_' + + '8xb8-32x2x1-100e_kinetics400-rgb.py' +] + +# model settings +model = dict( + backbone=dict( + non_local_cfg=dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='gaussian'))) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e3780990ac4573c1318f6fae9ec9890e927d12f4 --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -0,0 +1,103 @@ +_base_ = [ + '../../_base_/models/i3d_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + 
+file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
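+# Passing `--auto-scale-lr` to `tools/train.py` scales the learning rate
+# linearly by (actual batch size) / `base_batch_size`; e.g. 16 GPUs x 8 videos
+# per GPU (batch 128) would roughly double the LR configured in the schedule.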
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdc4f7b318134c14ba317118c1d21b90f2850ad --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py @@ -0,0 +1,88 @@ +_base_ = ['./i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='DenseSampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) diff --git a/configs/recognition/i3d/metafile.yml b/configs/recognition/i3d/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..7390dcf0af0cfb81a060a3ff275cb688f9169631 --- /dev/null +++ b/configs/recognition/i3d/metafile.yml @@ -0,0 +1,147 @@ +Collections: + - Name: I3D + README: configs/recognition/i3d/README.md + Paper: + URL: https://arxiv.org/abs/1705.07750 + Title: 'Quo Vadis, Action Recognition? 
A New Model and the Kinetics Dataset' + +Models: + - Name: i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb + Alias: + - i3d + Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py + In Collection: I3D + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 59.3G + Parameters: 35.4M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.80 + Top 5 Accuracy: 92.07 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth + + - Name: i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb + Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py + In Collection: I3D + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 59.3G + Parameters: 35.4M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.73 + Top 5 Accuracy: 91.80 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth + + - Name: i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb + Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py + In Collection: I3D + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 56.5G + Parameters: 31.7M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.97 + Top 5 Accuracy: 91.33 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth + + - Name: i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb + Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py + In Collection: I3D + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 43.5G + Parameters: 28.0M + Pretrained: ImageNet + Resolution: 
224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.47 + Top 5 Accuracy: 91.27 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth + + - Name: i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb + Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py + In Collection: I3D + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 43.5G + Parameters: 28.0M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.77 + Top 5 Accuracy: 91.35 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb_20220812-9f46003f.pth + + - Name: i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb + Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py + In Collection: I3D + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 166.3G + Parameters: 33.0M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.21 + Top 5 Accuracy: 92.48 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6d2d135231bc6ca1976d1afac8a8b4ff91e33f43 --- /dev/null +++ b/configs/recognition/mvit/README.md @@ -0,0 +1,99 @@ +# MViT V2 + +[MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf) + + + +## Abstract + + + +In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified architecture for image and video classification, as well as object detection. We present an improved version of MViT that incorporates decomposed relative positional embeddings and residual pooling connections. 
We instantiate this architecture in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as well as 86.1% on Kinetics-400 video classification. + + + +
+ +
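+As a rough sketch (PyTorch-style pseudocode, not MMAction2's actual implementation), the pooling attention with the residual pooling connection mentioned above can be written as below: queries, keys and values are pooled to a shorter token sequence before attention, and the pooled query is added back to the attention output. The decomposed relative positional embeddings are omitted for brevity.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def pooling_attention(q, k, v, pool):
+    """Pooling attention with a residual pooling connection (simplified).
+
+    q, k, v: (batch, num_tokens, dim) tensors; `pool` is any operator that
+    shortens the token sequence (e.g. a strided conv over the token grid).
+    """
+    q_p, k_p, v_p = pool(q), pool(k), pool(v)  # pooled Q / K / V
+    scale = q_p.shape[-1] ** -0.5
+    attn = F.softmax(q_p @ k_p.transpose(-2, -1) * scale, dim=-1)
+    return attn @ v_p + q_p  # residual pooling connection
+
+
+# toy usage: halve the token count with average pooling over the sequence axis
+pool = lambda x: F.avg_pool1d(x.transpose(1, 2), kernel_size=2).transpose(1, 2)
+x = torch.randn(2, 16, 64)
+out = pooling_attention(x, x, x, pool)  # (2, 8, 64)
+```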
+
+## Results and Models
+
+1. Models marked with * under `Inference results` are ported from the [SlowFast](https://github.com/facebookresearch/SlowFast/) repo and tested on our data, while models under `Training results` are trained with MMAction2 on our data.
+2. The values in the `reference` columns are copied from the paper, and the `reference*` values were obtained with the [SlowFast](https://github.com/facebookresearch/SlowFast/) repo trained on our data.
+3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
+4. The MaskFeat fine-tuning experiment starts from the pretrained model in [MMSelfSup](https://github.com/open-mmlab/mmselfsup/tree/main/projects/maskfeat_video), while the corresponding reference result is based on the pretrained model from [SlowFast](https://github.com/facebookresearch/SlowFast/).
+5. Because different versions of Kinetics-400 are in circulation, our training results differ from those reported in the paper.
+6. For training efficiency, we currently only provide MViT-small training results; we do not guarantee the training accuracy of the other config files and welcome you to contribute your reproduction results.
+7. We use `repeat augment` in the MViT training configs, following [SlowFast](https://github.com/facebookresearch/SlowFast/). [Repeat augment](https://arxiv.org/pdf/1901.09335.pdf) applies data augmentation several times to each decoded video, which improves generalization and relieves the IO pressure of loading videos. Note that the actual batch size is therefore `num_repeats` times the `batch_size` of `train_dataloader`; the sketch right after this list shows how the configs wire this up.
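+As a reference, the snippet below condenses the repeat-augment setup used by the training configs in this folder (e.g. `mvit-base-p244_32x3x1_kinetics400-rgb.py`); `ann_file_train`, `data_root` and `train_pipeline` denote the variables defined in those configs.
+
+```python
+repeat_sample = 2  # number of augmented views drawn from each decoded video
+train_dataloader = dict(
+    batch_size=8,  # videos decoded per GPU
+    collate_fn=dict(type='repeat_pseudo_collate'),  # collate paired with RepeatAugDataset
+    dataset=dict(
+        type='RepeatAugDataset',
+        num_repeats=repeat_sample,
+        sample_once=True,
+        ann_file=ann_file_train,              # annotation list defined in the config
+        data_prefix=dict(video=data_root),    # video root defined in the config
+        pipeline=train_pipeline))             # MViT train pipeline defined in the config
+
+# The effective per-GPU batch is repeat_sample * batch_size = 16 samples, which is
+# why base_batch_size is divided by repeat_sample when auto-scaling the LR:
+auto_scale_lr = dict(enable=False, base_batch_size=512 // repeat_sample)
+```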
+ +### Inference results + +#### Kinetics-400 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :--------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :---: | :----: | :------------------: | :----------------: | +| 16x4x1 | 224x224 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 64G | 34.5M | [config](/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth) | +| 32x3x1 | 224x224 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 225G | 51.2M | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_32x3x1_kinetics400-rgb_20221021-f392cd2d.pth) | +| 40x3x1 | 312x312 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | 2828G | 213M | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_40x3x1_kinetics400-rgb_20221021-11fe1f97.pth) | + +#### Something-Something V2 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :--------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :---: | :----: | :------------------: | :----------------: | +| uniform 16 | 224x224 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | 64G | 34.4M | [config](/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_u16_sthv2-rgb_20221021-65ecae7d.pth) | +| uniform 32 | 224x224 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | 225G | 51.1M | [config](/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_u32_sthv2-rgb_20221021-d5de5da6.pth) | +| uniform 40 | 312x312 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | 
[73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | 2828G | 213M | [config](/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_u40_sthv2-rgb_20221021-61696e07.pth) | + +### Training results + +#### Kinetics-400 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference\* top1 acc | reference\* top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :------: | :-----------: | :------: | :------: | :---------------------------: | :----------------------------: | :---------------: | :---: | :----: | :--------------: | :------------: | :-----------: | +| 16x4x1 | 224x224 | MViTv2-S | From scratch | 80.6 | 94.7 | [80.8](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 64G | 34.5M | [config](/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb_20230201-23284ff3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.log) | +| 16x4x1 | 224x224 | MViTv2-S | K400 MaskFeat | 81.8 | 95.2 | [81.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/maskfeat/README.md) | [94.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/maskfeat/README.md) | 10 clips x 1 crop | 71G | 36.4M | [config](/configs/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb_20230201-5bced1d0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.log) | + +the corresponding result without repeat augment is as follows: + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference\* top1 acc | reference\* top5 acc | testing protocol | FLOPs | params | +| :---------------------: | :--------: | :------: | :----------: | :------: | :------: | :--------------------------------------------------: | :--------------------------------------------------: | :--------------: | :---: | :----: | +| 16x4x1 | 224x224 | MViTv2-S | From scratch | 79.4 | 93.9 | [80.8](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 64G | 34.5M | + +#### Something-Something V2 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :------: | :------: | :------: | :------: | 
:---------------------------: | :----------------------------: | :--------------: | :---: | :----: | :----------------: | :--------------: | :-------------: | +| uniform 16 | 224x224 | MViTv2-S | K400 | 68.2 | 91.3 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | 64G | 34.4M | [config](/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb_20230201-4065c1b9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.log) | + +For more details on data preparation, you can refer to + +- [Kinetics](/tools/data/kinetics/README.md) +- [Something-something V2](/tools/data/sthv2/README.md) + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test MViT model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```bibtex +@inproceedings{li2021improved, + title={MViTv2: Improved multiscale vision transformers for classification and detection}, + author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph}, + booktitle={CVPR}, + year={2022} +} +``` diff --git a/configs/recognition/mvit/metafile.yml b/configs/recognition/mvit/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..505602fe8306d43a1a432e4dff4e3efcf70b6e85 --- /dev/null +++ b/configs/recognition/mvit/metafile.yml @@ -0,0 +1,183 @@ +Collections: +- Name: MViT + README: configs/recognition/mvit/README.md + Paper: + URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf + Title: "MViTv2: Improved Multiscale Vision Transformers for Classification and Detection" + +Models: + - Name: mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb_infer + Config: configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Resolution: 224x224 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.1 + Top 5 Accuracy: 94.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth + + - Name: mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb + Config: configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Batch Size: 16 + Epochs: 100 + FLOPs: 64G + Parameters: 34.5M + Resolution: 224x224 + Training 
Data: Kinetics-400 + Training Resources: 32 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 80.6 + Top 5 Accuracy: 94.7 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb_20230201-23284ff3.pth + + - Name: mvit-base-p244_32x3x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-base + Resolution: 224x224 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.1 + Top 5 Accuracy: 94.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_32x3x1_kinetics400-rgb_20221021-f392cd2d.pth + + - Name: mvit-large-p244_40x3x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-large + Resolution: 312x312 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.1 + Top 5 Accuracy: 94.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_40x3x1_kinetics400-rgb_20221021-11fe1f97.pth + + - Name: mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb_infer + Config: configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Resolution: 224x224 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 68.1 + Top 5 Accuracy: 91.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_u16_sthv2-rgb_20221021-65ecae7d.pth + + - Name: mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb + Config: configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Batch Size: 16 + Epochs: 100 + FLOPs: 64G + Parameters: 34.4M + Pretrained: Kinetics-400 + Resolution: 224x224 + Training Data: SthV2 + Training Resources: 16 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 68.2 + Top 5 Accuracy: 91.3 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb_20230201-4065c1b9.pth + + - Name: mvit-base-p244_u32_sthv2-rgb + Config: configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py + In 
Collection: MViT + Metadata: + Architecture: MViT-base + Resolution: 224x224 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 70.8 + Top 5 Accuracy: 92.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_u32_sthv2-rgb_20221021-d5de5da6.pth + + - Name: mvit-large-p244_u40_sthv2-rgb + Config: configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-large + Resolution: 312x312 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.2 + Top 5 Accuracy: 94.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_u40_sthv2-rgb_20221021-61696e07.pth + + - Name: mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb + Config: configs/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Batch Size: 32 + Epochs: 100 + FLOPs: 71G + Parameters: 36.4M + Pretrained: Kinetics-400 MaskFeat + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.8 + Top 5 Accuracy: 95.2 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb_20230201-5bced1d0.pth diff --git a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..f1723fca44810fb5ffe1c9293d88aefba97729e1 --- /dev/null +++ b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py @@ -0,0 +1,156 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + temporal_size=32, + drop_path_rate=0.3, + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), +) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=3, num_clips=1), 
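+    # 32 frames sampled at stride 3 above; decoding, RandAugment, random resized
+    # crop to 224, horizontal flip and RandomErasing follow below.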
+ dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=3, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=3, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +repeat_sample = 2 +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='repeat_pseudo_collate'), + dataset=dict( + type='RepeatAugDataset', + num_repeats=repeat_sample, + sample_once=True, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=200, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=1, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=512 // repeat_sample) diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..f4768987729c4a1fa22ffd74d59a8f467ecd7f7e --- /dev/null +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -0,0 +1,139 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + temporal_size=32, + drop_path_rate=0.3, + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=174), + dict(type='CutmixBlending', alpha=1, num_classes=174) + ]), + format_shape='NCTHW'), + cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=32), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=32, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=32, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper 
= dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=70, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..bb1152e633908f6399e47966edc887faf180905f --- /dev/null +++ b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py @@ -0,0 +1,145 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + temporal_size=40, + spatial_size=312, + drop_path_rate=0.75, + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW'), + cls_head=dict(in_channels=1152), + test_cfg=dict(max_testing_views=5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=40, frame_interval=3, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 356)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(312, 312), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=40, + frame_interval=3, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 356)), + dict(type='CenterCrop', crop_size=312), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=40, + frame_interval=3, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 312)), + dict(type='ThreeCrop', crop_size=312), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +repeat_sample = 2 +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='repeat_pseudo_collate'), + dataset=dict( + type='RepeatAugDataset', + num_repeats=repeat_sample, + sample_once=True, + ann_file=ann_file_train, + 
data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=10e-8), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=1, norm_type=2)) + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=128 // repeat_sample) diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ff99a0cfc7956766c97320cf390e7a57547d8c50 --- /dev/null +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -0,0 +1,141 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + temporal_size=40, + spatial_size=312, + drop_path_rate=0.75, + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), + cls_head=dict(in_channels=1152, num_classes=174), + test_cfg=dict(max_testing_views=5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=40), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=40, test_mode=True), + 
dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=40, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=70, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=10)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py b/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..693f6e01b3a498577ad05e5a3b84baba32a4378a --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py @@ -0,0 +1,151 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), ) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +repeat_sample = 2 +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='repeat_pseudo_collate'), + dataset=dict( + type='RepeatAugDataset', + num_repeats=repeat_sample, + sample_once=True, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + 
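+
+# How the repeated-augmentation pieces above fit together (a reading of the
+# option names used here, not a verified description of their internals):
+# `RepeatAugDataset` with `num_repeats=repeat_sample` yields `repeat_sample`
+# differently augmented clips per sampled video (`sample_once=True` appears
+# to reuse one set of sampled frame indices for all repeats), and
+# `repeat_pseudo_collate` flattens those clips into the training batch.
+# Because each video contributes `repeat_sample` clips, `auto_scale_lr` at
+# the bottom of this file divides its nominal batch size by `repeat_sample`.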
+val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=200, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=1, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512 // repeat_sample) diff --git a/configs/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.py b/configs/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d56928651c24739514671710a12297922235149c --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_k400-maskfeat-pre_8xb32-16x4x1-100e_kinetics400-rgb.py @@ -0,0 +1,158 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + drop_path_rate=0.1, + dim_mul_in_attention=False, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230131-87d60b6f.pth', # noqa + pretrained_type='maskfeat', + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), + cls_head=dict(dropout_ratio=0., init_scale=0.001)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='PytorchVideoWrapper', op='RandAugment', magnitude=7), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + 
dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +repeat_sample = 2 +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='repeat_pseudo_collate'), + dataset=dict( + type='RepeatAugDataset', + num_repeats=repeat_sample, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 9.6e-3 # for batch size 512 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + constructor='LearningRateDecayOptimizerConstructor', + paramwise_cfg={ + 'decay_rate': 0.75, + 'decay_type': 'layer_wise', + 'num_layers': 16 + }, + clip_grad=dict(max_norm=5, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1 / 600, + by_epoch=True, + begin=0, + end=20, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + eta_min_ratio=1 / 600, + by_epoch=True, + begin=20, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=20), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=512 // repeat_sample) diff --git a/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..59a3ce367d7728162214aeb0da2ab4e5a7089939 --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py @@ -0,0 +1,130 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=16, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=16, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), 
weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=100, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/omnisource/README.md b/configs/recognition/omnisource/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4ce6fc199ca7290807f234287932b6918b6122d3 --- /dev/null +++ b/configs/recognition/omnisource/README.md @@ -0,0 +1,88 @@ +# Omnisource + + + + + +## Abstract + + + +We propose to train a recognizer that can classify images and videos. The recognizer is jointly trained on image and video datasets. Compared with pre-training on the same image dataset, this method can significantly improve the video recognition performance. + + + +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | joint-training | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :-----------: | :--------: | :--: | :------: | :------------: | :------: | :------: | :---------------: | :----: | :----: | :---------------------------: | :-------------------------: | :-------------------------: | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | ImageNet | 77.30 | 93.23 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb_20230208-61c4be0d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train SlowOnly model on Kinetics-400 dataset in a deterministic option with periodic validation. 
+ +```shell +python tools/train.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +We found that the training of this Omnisource model could crash for unknown reasons. If this happens, you can resume training by adding the `--cfg-options resume=True` to the training script. + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test SlowOnly model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@article{duan2020omni, + title={Omni-sourced Webly-supervised Learning for Video Recognition}, + author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, + journal={arXiv preprint arXiv:2003.13042}, + year={2020} +} +``` diff --git a/configs/recognition/omnisource/metafile.yml b/configs/recognition/omnisource/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..edef10cd3e54df6e0ba45b757564d54d6e273eb5 --- /dev/null +++ b/configs/recognition/omnisource/metafile.yml @@ -0,0 +1,30 @@ +Collections: + - Name: Omnisource + README: configs/recognition/omnisource/README.md + Paper: + URL: https://arxiv.org/abs/2003.13042 + Title: 'Omni-sourced Webly-supervised Learning for Video Recognition' + +Models: + - Name: slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb + Config: configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 256 + FLOPs: 54.75G + Parameters: 32.45M + Pretrained: None + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 77.30 + Top 5 Accuracy: 93.23 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb_20230208-61c4be0d.pth diff --git a/configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py b/configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7c4dc25c10c10984fd09a3052a745b50b1523581 --- /dev/null +++ b/configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py @@ -0,0 +1,171 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='RecognizerOmni', + 
backbone=dict(type='OmniResNet'), + cls_head=dict( + type='OmniHead', + image_classes=1000, + video_classes=400, + in_channels=2048, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='MIX2d3d')) + +# dataset settings +image_root = 'data/imagenet/' +image_ann_train = 'meta/train.txt' + +video_root = 'data/kinetics400/videos_train' +video_root_val = 'data/kinetics400/videos_val' +video_ann_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +video_ann_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +num_images = 1281167 # number of training samples in the ImageNet dataset +num_videos = 240435 # number of training samples in the Kinetics400 dataset +batchsize_video = 16 +num_gpus = 8 +num_iter = num_videos // (batchsize_video * num_gpus) +batchsize_image = num_images // (num_iter * num_gpus) + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=batchsize_video, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='VideoDataset', + ann_file=video_ann_train, + data_prefix=dict(video=video_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='VideoDataset', + ann_file=video_ann_val, + data_prefix=dict(video=video_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='VideoDataset', + ann_file=video_ann_val, + data_prefix=dict(video=video_root_val), + pipeline=test_pipeline, + test_mode=True)) + +imagenet_pipeline = [ + dict(type='LoadRGBFromFile'), + dict(type='mmcls.RandomResizedCrop', scale=224), + dict(type='mmcls.RandomFlip', prob=0.5, direction='horizontal'), + dict(type='mmcls.PackClsInputs'), +] + +image_dataloader = dict( + batch_size=batchsize_image, + num_workers=8, + dataset=dict( + type='mmcls.ImageNet', + data_root=image_root, + ann_file=image_ann_train, + data_prefix='train', + pipeline=imagenet_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), +) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='MultiLoaderEpochBasedTrainLoop', + 
other_loaders=[image_dataloader], + max_epochs=256, + val_interval=4) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=34, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=222, + eta_min=0, + by_epoch=True, + begin=34, + end=256, + convert_to_iter_based=True) +] +""" +The learning rate is for total_batch_size = 8 x 16 (num_gpus x batch_size) +If you want to use other batch size or number of GPU settings, please update +the learning rate with the linear scaling rule. +""" +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# runtime settings +default_hooks = dict(checkpoint=dict(interval=4, max_keep_ckpts=3)) diff --git a/configs/recognition/r2plus1d/README.md b/configs/recognition/r2plus1d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a16f3af798f308e78d2c31f7b16004759ee1576a --- /dev/null +++ b/configs/recognition/r2plus1d/README.md @@ -0,0 +1,77 @@ +# R2plus1D + +[A closer look at spatiotemporal convolutions for action recognition](https://openaccess.thecvf.com/content_cvpr_2018/html/Tran_A_Closer_Look_CVPR_2018_paper.html) + + + +## Abstract + + + +In this paper we discuss several forms of spatiotemporal convolutions for video analysis and study their effects on action recognition. Our motivation stems from the observation that 2D CNNs applied to individual frames of the video have remained solid performers in action recognition. In this work we empirically demonstrate the accuracy advantages of 3D CNNs over 2D CNNs within the framework of residual learning. Furthermore, we show that factorizing the 3D convolutional filters into separate spatial and temporal components yields significantly advantages in accuracy. Our empirical study leads to the design of a new spatiotemporal convolutional block "R(2+1)D" which gives rise to CNNs that achieve results comparable or superior to the state-of-the-art on Sports-1M, Kinetics, UCF101 and HMDB51. + + + +
+ +
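+
+The factorization described above replaces a full 3D convolution with a 2D spatial convolution (1 x k x k) followed by a 1D temporal convolution (k x 1 x 1). A rough sketch of one such "(2+1)D" block, not the code used in this repository (the channel widths, in particular the intermediate width, are illustrative):
+
+```python
+import torch
+from torch import nn
+
+
+class R2Plus1DBlock(nn.Module):
+    """Factorized 3D conv: a (1, 3, 3) spatial conv, then a (3, 1, 1) temporal conv."""
+
+    def __init__(self, in_channels, out_channels, mid_channels):
+        super().__init__()
+        self.spatial = nn.Conv3d(
+            in_channels, mid_channels, kernel_size=(1, 3, 3), padding=(0, 1, 1))
+        self.bn = nn.BatchNorm3d(mid_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.temporal = nn.Conv3d(
+            mid_channels, out_channels, kernel_size=(3, 1, 1), padding=(1, 0, 0))
+
+    def forward(self, x):  # x: (N, C, T, H, W)
+        return self.temporal(self.relu(self.bn(self.spatial(x))))
+
+
+x = torch.randn(1, 3, 8, 112, 112)
+print(R2Plus1DBlock(3, 64, 45)(x).shape)  # torch.Size([1, 64, 8, 112, 112])
+```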
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :----------------------------------: | :--------------------------------: | :-------------------------------: | +| 8x8x1 | 224x224 | 8 | ResNet34 | None | 69.76 | 88.41 | 10 clips x 3 crop | 53.1G | 63.8M | [config](/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb_20220812-47cfe041.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet34 | None | 75.46 | 92.28 | 10 clips x 3 crop | 213G | 63.8M | [config](/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train R(2+1)D model on Kinetics-400 dataset in a deterministic option. + +```shell +python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test R(2+1)D model on Kinetics-400 dataset and dump the result to a pkl file. 
+ +```shell +python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{tran2018closer, + title={A closer look at spatiotemporal convolutions for action recognition}, + author={Tran, Du and Wang, Heng and Torresani, Lorenzo and Ray, Jamie and LeCun, Yann and Paluri, Manohar}, + booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition}, + pages={6450--6459}, + year={2018} +} +``` diff --git a/configs/recognition/r2plus1d/metafile.yml b/configs/recognition/r2plus1d/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..d9f98e13d073f51de5d8d47ba6ef68a1d5723571 --- /dev/null +++ b/configs/recognition/r2plus1d/metafile.yml @@ -0,0 +1,53 @@ +Collections: + - Name: R2Plus1D + README: configs/recognition/r2plus1d/README.md + Paper: + URL: https://arxiv.org/abs/1711.11248 + Title: 'A Closer Look at Spatiotemporal Convolutions for Action Recognition' + +Models: + - Name: r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb + Config: configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py + In Collection: R2Plus1D + Metadata: + Architecture: ResNet34 + Batch Size: 8 + Epochs: 180 + FLOPs: 53.1G + Parameters: 63.8M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 69.76 + Top 5 Accuracy: 88.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb_20220812-47cfe041.pth + + - Name: r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb + Config: configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py + In Collection: R2Plus1D + Metadata: + Architecture: ResNet34 + Batch Size: 8 + Epochs: 180 + FLOPs: 213G + Parameters: 63.8M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.46 + Top 5 Accuracy: 92.28 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f9f579752dbd1b1bfa12bf9938d1b4bff6864a --- /dev/null +++ b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py @@ -0,0 +1,82 @@ +_base_ = ['./r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' 
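+
+# Naming note: '32x2x1' mirrors the SampleFrames setting used below --
+# 32 frames per clip, a frame interval of 2 (so a clip spans roughly 64
+# consecutive frames), and 1 clip per video during training.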
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ca61740d1caf2152d387fc9c36df517c919730d7 --- /dev/null +++ b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py @@ -0,0 +1,113 @@ +_base_ = [ + '../../_base_/models/r2plus1d_r34.py', '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] 
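+
+# '8x8x1' in the file name mirrors the SampleFrames setting above: 8 frames
+# per clip, a frame interval of 8 (so a clip spans roughly 64 consecutive
+# frames), and 1 clip per video during training.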
+val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=180, val_begin=1, val_interval=20) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=180, + eta_min=0, + by_epoch=True, + ) +] + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md new file mode 100644 index 0000000000000000000000000000000000000000..71a1eb8bc05fb39bf8c0932230badd9eef7581ba --- /dev/null +++ b/configs/recognition/slowfast/README.md @@ -0,0 +1,80 @@ +# SlowFast + +[SlowFast Networks for Video Recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
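+
+A rough illustration of how the two pathways read the same clip, not the repository implementation (`tau` and `beta_inv` are example values standing in for the `resample_rate` and `channel_ratio` options used by the configs below):
+
+```python
+import torch
+
+# One clip in N x C x T x H x W layout.
+clip = torch.randn(1, 3, 32, 224, 224)
+
+tau, beta_inv = 8, 8  # illustrative temporal stride and channel ratio
+slow_frames = clip[:, :, ::tau]  # 1 x 3 x 4 x 224 x 224: low frame rate
+fast_frames = clip               # 1 x 3 x 32 x 224 x 224: full frame rate
+
+# At every stage the Fast pathway uses roughly 1 / beta_inv of the Slow
+# pathway's channels, which is what keeps it lightweight.
+print(slow_frames.shape, fast_frames.shape)
+```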
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------------: | :--------: | :--: | :------------------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :------------------------: | :-----------------------: | :----------------------: | +| 4x16x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 75.55 | 92.35 | 10 clips x 3 crop | 36.3G | 34.5M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 76.80 | 92.99 | 10 clips x 3 crop | 66.1G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 | None | 76.65 | 92.86 | 10 clips x 3 crop | 66.1G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet101 | None | 78.65 | 93.88 | 10 clips x 3 crop | 126G | 62.9M | [config](/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220818-9c0e09bd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log) | +| 4x16x1 | Linear+Cosine | 224x224 | 32 | ResNet101 + ResNet50 | None | 77.03 | 92.99 | 10 clips x 3 crop | 64.9G | 62.4M | [config](/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. 
If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train SlowFast model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test SlowFast model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
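+
+If several GPUs are available, testing can also be launched in a distributed way. A sketch, assuming the standard OpenMMLab `tools/dist_test.sh` launcher shipped with this project:
+
+```shell
+bash tools/dist_test.sh configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth 8 --dump result.pkl
+```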
+ +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={6202--6211}, + year={2019} +} +``` diff --git a/configs/recognition/slowfast/metafile.yml b/configs/recognition/slowfast/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..bb5de31d5009a87d3fcb6d506f3bab0d766abfb7 --- /dev/null +++ b/configs/recognition/slowfast/metafile.yml @@ -0,0 +1,124 @@ +Collections: + - Name: SlowFast + README: configs/recognition/slowfast/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb + Config: configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 256 + FLOPs: 36.3G + Parameters: 34.5M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.55 + Top 5 Accuracy: 92.35 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth + + - Name: slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb + Alias: + - slowfast + Config: configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 256 + FLOPs: 66.1G + Parameters: 34.6M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.80 + Top 5 Accuracy: 92.99 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth + + - Name: slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb + Config: configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 256 + FLOPs: 66.1G + Parameters: 34.6M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.65 + Top 5 Accuracy: 92.86 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth + + - Name: 
slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb + Config: configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet101 + Batch Size: 8 + Epochs: 256 + FLOPs: 126G + Parameters: 62.9M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 78.65 + Top 5 Accuracy: 93.88 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220901-9c0e09bd.pth + + - Name: slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb + Config: configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet101 + ResNet50 + Batch Size: 8 + Epochs: 256 + FLOPs: 64.9G + Parameters: 62.4M + Pretrained: None + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 32 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Metrics: + Top 1 Accuracy: 77.03 + Top 5 Accuracy: 92.99 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth diff --git a/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..29599c8973b2df77d6ab22351cf98785ac09dc17 --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py @@ -0,0 +1,5 @@ +_base_ = ['slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py'] + +model = dict(backbone=dict(slow_pathway=dict(depth=101))) + +optim_wrapper = dict(optimizer=dict(lr=0.1 * 4)) diff --git a/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ea04d706adc10c47dbc46c08aa794c264618deed --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py @@ -0,0 +1,4 @@ +_base_ = ['slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py'] + +model = dict( + backbone=dict(slow_pathway=dict(depth=101), fast_pathway=dict(depth=101))) diff --git a/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..8c4a1eef8bacc0124b3d62c2bbe1c6d85b4932ee --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py @@ -0,0 +1,115 @@ +_base_ = [ + '../../_base_/models/slowfast_r50.py', '../../_base_/default_runtime.py' +] + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' 
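+
+# Naming note: '4x16x1' describes the Slow pathway's effective sampling --
+# 4 frames at an interval of 16. The pipeline below samples 32 frames with
+# frame_interval=2 and the SlowFast backbone subsamples them temporally for
+# the Slow path (assuming the base model's default resample_rate of 8).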
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=256, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=34, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=256, + eta_min=0, + by_epoch=True, + begin=0, + end=256) +] + +default_hooks = dict( + checkpoint=dict(interval=4, max_keep_ckpts=3), logger=dict(interval=100)) diff --git a/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a5d8e2c0b9b3526c397be699aec84f3f720d71c6 --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py @@ -0,0 +1,8 @@ +_base_ = ['slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + resample_rate=4, # tau + speed_ratio=4, # alpha + channel_ratio=8, # beta_inv + 
slow_pathway=dict(fusion_kernel=7))) diff --git a/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..dae9f459eacdea151ccf1783303725eeda4b9d87 --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py @@ -0,0 +1,20 @@ +_base_ = ['slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py'] + +model = dict(backbone=dict(slow_pathway=dict(lateral_norm=True))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=34, + convert_to_iter_based=True), + dict( + type='MultiStepLR', + begin=0, + end=256, + by_epoch=True, + milestones=[94, 154, 196], + gamma=0.1) +] diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6003436307071e1186ec4e96a673c182d5dada99 --- /dev/null +++ b/configs/recognition/slowonly/README.md @@ -0,0 +1,95 @@ +# SlowOnly + +[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
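The tables below describe each model by its frame sampling strategy, written as `clip_len x frame_interval x num_clips` to match the `SampleFrames` settings in the configs added by this patch. As a rough, simplified sketch of what one clip looks like under the 4x16x1 and 8x8x1 settings (the real `SampleFrames` transform additionally handles random/centered offsets, short videos, and test-mode clips):

```python
import numpy as np

def sample_clip_indices(clip_len, frame_interval, num_frames, offset=0):
    """Pick `clip_len` frames spaced `frame_interval` apart for one clip.
    Simplified sketch only; MMAction2's SampleFrames handles offsets,
    looping and test-time clips differently."""
    indices = offset + np.arange(clip_len) * frame_interval
    return indices % num_frames  # wrap around for very short videos

# "4x16x1": 4 frames, 16 frames apart -> spans roughly 64 frames
print(sample_clip_indices(clip_len=4, frame_interval=16, num_frames=300))
# "8x8x1": 8 frames, 8 frames apart -> same ~64-frame span, denser sampling
print(sample_clip_indices(clip_len=8, frame_interval=8, num_frames=300))
```

Both settings cover roughly a 64-frame window; the 8x8x1 variants simply sample it twice as densely, which is why their FLOPs double while accuracy improves in the tables below.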
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------------: | :--------: | :--: | :------------------------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :----------------------: | :--------------------: | :--------------------: | +| 4x16x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 72.97 | 90.88 | 10 clips x 3 crop | 27.38G | 32.45M | [config](/configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb/slowonly_r50_4x16x1_256e_8xb16_kinetics400-rgb_20220901-f6a40d08.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb/slowonly_r50_4x16x1_256e_8xb16_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 75.15 | 92.11 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb_20220901-2132fc87.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet101 | None | 76.59 | 92.80 | 10 clips x 3 crop | 112G | 60.36M | [config](/configs/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb_20220901-e6281431.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.log) | +| 4x16x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 75.12 | 91.72 | 10 clips x 3 crop | 27.38G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.log) | +| 8x8x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 76.45 | 92.55 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb_20220901-df42dc84.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.log) | +| 4x16x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 75.07 | 91.69 | 10 clips x 3 crop | 43.23G | 39.81M | [config](/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-cf739c75.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.log) | +| 8x8x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 76.65 | 92.47 | 10 clips x 3 crop | 96.66G | 39.81M | [config](/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb_20220901-df42dc84.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.log) | + +### Kinetics-700 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :----------------------------: | :--------------------------: | :--------------------------: | +| 4x16x1 | Linear+MultiStep | 224x224 | 8x2 | ResNet50 | ImageNet | 65.52 | 86.39 | 10 clips x 3 crop | 27.38G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb_20221013-98b1b0a7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.log) | +| 8x8x1 | Linear+MultiStep | 224x224 | 8x2 | ResNet50 | ImageNet | 67.67 | 87.80 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.log) | + +### Kinetics-710 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :----------------------------: | :--------------------------: | :--------------------------: | +| 8x8x1 | Linear+MultiStep | 224x224 | 8x4 | ResNet50 | ImageNet | 72.39 | 90.60 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb_20230612-12ce977c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train SlowOnly model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test SlowOnly model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
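If you want a quick look at the `result.pkl` produced by the `--dump` option above, a minimal sketch like the following works; the per-sample field names (`pred_score`, `gt_label`) are assumptions that vary across MMAction2 versions, so check the keys it prints before relying on them:

```python
import pickle

# result.pkl comes from `python tools/test.py ... --dump result.pkl` above
with open('result.pkl', 'rb') as f:
    results = pickle.load(f)

print(f'{len(results)} samples dumped')
first = results[0]
print('fields:', list(first.keys()) if isinstance(first, dict) else type(first))

# Hypothetical top-1 check, assuming `pred_score` holds class scores and
# `gt_label` the ground-truth index; adjust to the keys printed above.
try:
    correct = sum(int(r['pred_score'].argmax()) == int(r['gt_label'])
                  for r in results)
    print('top-1 accuracy:', correct / len(results))
except Exception as err:  # field names/types differ across versions
    print('adjust the assumed keys; got:', err)
```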
+ +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={6202--6211}, + year={2019} +} +``` diff --git a/configs/recognition/slowonly/metafile.yml b/configs/recognition/slowonly/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..aca7c576305620e826a53a18c46fc3c18b31ac1e --- /dev/null +++ b/configs/recognition/slowonly/metafile.yml @@ -0,0 +1,239 @@ +Collections: + - Name: SlowOnly + README: configs/recognition/slowonly/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 256 + FLOPs: 27.38G + Parameters: 32.45M + Pretrained: None + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 72.68 + Top 5 Accuracy: 90.68 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb/slowonly_r50_4x16x1_256e_8xb16_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb/slowonly_r50_4x16x1_256e_8xb16_kinetics400-rgb_20220901-f6a40d08.pth + + - Name: slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 256 + FLOPs: 54.75G + Parameters: 32.45M + Pretrained: None + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.82 + Top 5 Accuracy: 91.80 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb_20220901-2132fc87.pth + + + - Name: slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet101 + Batch Size: 16 + Epochs: 196 + FLOPs: 112G + Parameters: 60.36M + Pretrained: None + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.28 + Top 5 Accuracy: 92.70 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb_20220901-e6281431.pth + + - Name: 
slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 150 + FLOPs: 27.38G + Parameters: 32.45M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.83 + Top 5 Accuracy: 91.60 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth + + - Name: slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 150 + FLOPs: 54.75G + Parameters: 32.45M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.96 + Top 5 Accuracy: 92.40 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb_20220901-df42dc84.pth + + + - Name: slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 150 + FLOPs: 43.23G + Parameters: 39.81M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.84 + Top 5 Accuracy: 91.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-cf739c75.pth + + - Name: slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb + Config: configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 150 + 
FLOPs: 96.66G + Parameters: 39.81M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.35 + Top 5 Accuracy: 92.18 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb_20220901-df42dc84.pth + + - Name: slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb + Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 150 + FLOPs: 27.38G + Parameters: 32.45M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 16 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 65.18 + Top 5 Accuracy: 86.05 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth + + - Name: slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb + Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 150 + FLOPs: 54.75G + Parameters: 32.45M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 16 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 66.93 + Top 5 Accuracy: 87.47 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20220901-4098e1eb.pth + + - Name: slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb + Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 150 + FLOPs: 54.75G + Parameters: 32.45M + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-710 + Training Resources: 32 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-710 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 72.39 + Top 5 Accuracy: 90.60 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb_20230612-12ce977c.pth diff --git a/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..84e2bb3d0c9a0a470f527b4571346cfc15c515b7 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py @@ -0,0 +1,120 @@ +_base_ = ['slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth' + ), + cls_head=dict(num_classes=700)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=4, frame_interval=16, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=4, + frame_interval=16, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=4, + frame_interval=16, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=150, 
val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=10, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.04, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..dc7380390401844553c82cfc2413577abf7b1182 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py @@ -0,0 +1,88 @@ +_base_ = [('slowonly_imagenet-pretrained-r50_16xb16-' + '4x16x1-steplr-150e_kinetics700-rgb.py')] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) diff --git a/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d2a43714660a0559a139323c730708696d16e541 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py @@ -0,0 +1,138 @@ +_base_ = [('slowonly_imagenet-pretrained-r50_16xb16-' + '4x16x1-steplr-150e_kinetics700-rgb.py')] + +model = dict(cls_head=dict(num_classes=710)) + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type='VideoDataset', + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type='VideoDataset', + ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type='VideoDataset', + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type='VideoDataset', + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type='VideoDataset', + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type='VideoDataset', + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() 
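# The val dataset dicts are shallow-copied and re-pointed at `test_pipeline`
# (10 clips x ThreeCrop) just below; ConcatDataset then merges the K400, K600
# and K700 lists into the joint Kinetics-710 split. `_delete_=True` keeps the
# single-dataset `dataset` settings inherited from the K700 base config from
# being merged into these ConcatDataset definitions.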
+k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type='ConcatDataset', + datasets=[k400_trainset, k600_trainset, k700_trainset], + _delete_=True) +k710_valset = dict( + type='ConcatDataset', + datasets=[k400_valset, k600_valset, k700_valset], + _delete_=True) +k710_testset = dict( + type='ConcatDataset', + datasets=[k400_testset, k600_testset, k700_testset], + _delete_=True, +) + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_testset) diff --git a/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e6fe44d0dad9232d34c09fb6629c27710486b6 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py @@ -0,0 +1,27 @@ +_base_ = ['slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth') +) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=10, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] diff --git a/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d0f5cb7b44b41921f597adf58059d016e159d667 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py @@ -0,0 +1,27 @@ +_base_ = ['slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth') +) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=10, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] diff --git a/configs/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.py 
b/configs/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..3fa39ffb5e47ede2e50e5171bc3a6141967e0415 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_r101_8xb16-8x8x1-196e_kinetics400-rgb.py @@ -0,0 +1,19 @@ +_base_ = ['slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py'] + +# model settings +model = dict(backbone=dict(depth=101, pretrained=None)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=196, val_begin=1, val_interval=5) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=34), + dict( + type='CosineAnnealingLR', + T_max=162, + eta_min=0, + by_epoch=True, + begin=34, + end=196) +] diff --git a/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py b/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c36e2f42913c9074b3fe55de74e8afec4953f2c7 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-4x16x1-steplr-150e_kinetics400-rgb.py @@ -0,0 +1,26 @@ +_base_ = ['slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py'] + +# model settings +model = dict( + backbone=dict( + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian'))) + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=10, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4)) diff --git a/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py b/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..bfa37bb4ae07b6e3034020372cca403a23965b3e --- /dev/null +++ b/configs/recognition/slowonly/slowonly_r50-in1k-pre-nl-embedded-gaussian_8xb16-8x8x1-steplr-150e_kinetics400-rgb.py @@ -0,0 +1,26 @@ +_base_ = ['slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py'] + +# model settings +model = dict( + backbone=dict( + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian'))) + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=10, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4)) diff --git a/configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py b/configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py new file mode 100644 index 0000000000000000000000000000000000000000..5e9b50e6ca05c6b0dccf3d6032974e68ee04fd04 --- /dev/null +++ 
b/configs/recognition/slowonly/slowonly_r50_8xb16-16x4x1-256e_kinetics400-flow.py @@ -0,0 +1,146 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + lateral=False, + in_channels=2, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128, 128], + std=[128, 128], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_flow.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_flow.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_flow.txt' +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=2, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=10, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=256, val_begin=1, val_interval=8) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=34), + dict( + type='CosineAnnealingLR', + T_max=222, + eta_min=0, + 
by_epoch=True, + begin=34, + end=256) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +# runtime settings +default_hooks = dict(checkpoint=dict(interval=8, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py b/configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..8928e1514fff7aab634e53479b8175b11b644a71 --- /dev/null +++ b/configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py @@ -0,0 +1,125 @@ +_base_ = [ + '../../_base_/models/slowonly_r50.py', '../../_base_/default_runtime.py' +] + +# model settings +model = dict(backbone=dict(pretrained=None)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=4, frame_interval=16, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=4, + frame_interval=16, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=4, + frame_interval=16, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=256, val_begin=1, 
val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=34), + dict( + type='CosineAnnealingLR', + T_max=222, + eta_min=0, + by_epoch=True, + begin=34, + end=256) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +# runtime settings +default_hooks = dict(checkpoint=dict(interval=4, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py b/configs/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..17747f70983c05d9ef656ac52372311f10fa8c8d --- /dev/null +++ b/configs/recognition/slowonly/slowonly_r50_8xb16-8x8x1-256e_kinetics400-rgb.py @@ -0,0 +1,87 @@ +_base_ = ['slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + 
test_mode=True)) diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..03e9d489007af4ef9d2a96f94b9a214490f0f103 --- /dev/null +++ b/configs/recognition/swin/README.md @@ -0,0 +1,93 @@ +# VideoSwin + +[Video Swin Transformer](https://openaccess.thecvf.com/content/CVPR2022/html/Liu_Video_Swin_Transformer_CVPR_2022_paper.html) + + + +## Abstract + + + +The vision community is witnessing a modeling shift from CNNs to Transformers, where pure Transformer architectures have attained top accuracy on the major video recognition benchmarks. These video models are all built on Transformer layers that globally connect patches across the spatial and temporal dimensions. In this paper, we instead advocate an inductive bias of locality in video Transformers, which leads to a better speed-accuracy trade-off compared to previous approaches which compute self-attention globally even with spatial-temporal factorization. The locality of the proposed video architecture is realized by adapting the Swin Transformer designed for the image domain, while continuing to leverage the power of pre-trained image models. Our approach achieves state-of-the-art accuracy on a broad range of video recognition benchmarks, including on action recognition (84.9 top-1 accuracy on Kinetics-400 and 85.9 top-1 accuracy on Kinetics-600 with ~20xless pre-training data and ~3xsmaller model size) and temporal modeling (69.6 top-1 accuracy on Something-Something v2). + + + +
+ +
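The config names in the tables below encode the 3D patch and window sizes; reading `p244-w877` as a (2, 4, 4) patch embedding with an (8, 7, 7) attention window is an assumption based purely on the file names, so verify it against the configs. Under that reading, a quick sketch of the token grid for the 32-frame, 224x224 inputs used here:

```python
# Assumed reading of "p244-w877": patch size (2, 4, 4), window size (8, 7, 7).
clip = (32, 224, 224)   # frames x height x width (32x2x1 sampling, 224x224 crops)
patch = (2, 4, 4)
window = (8, 7, 7)

tokens = tuple(c // p for c, p in zip(clip, patch))      # (16, 56, 56)
windows = tuple(t // w for t, w in zip(tokens, window))  # (2, 8, 8)

print('token grid:', tokens, '->', tokens[0] * tokens[1] * tokens[2], 'tokens')
print('windows per clip:', windows, '->', windows[0] * windows[1] * windows[2])
print('tokens per window:', window[0] * window[1] * window[2])  # 392
```

Self-attention is computed inside each 392-token window rather than across all ~50k tokens of the clip, which is the locality trade-off described in the abstract above.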
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------------------: | :--------------------------: | :--------------: | :---: | :----: | :--------------: | :------------: | :------------: | +| 32x2x1 | 224x224 | 8 | Swin-T | ImageNet-1k | 78.90 | 93.77 | 78.84 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 93.76 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 88G | 28.2M | [config](/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-S | ImageNet-1k | 80.54 | 94.46 | 80.58 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.45 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 166G | 49.8M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-B | ImageNet-1k | 80.57 | 94.49 | 80.55 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.66 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 282G | 88.0M | [config](/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-L | ImageNet-22k | 83.46 | 95.91 | 83.1\* | 95.9\* | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | 
+
+### Kinetics-700
+
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: |
+| 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log) |
+
+### Kinetics-710
+
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :------: | :---------: | :------: | :------: | :--------------: | :---: | :----: | :--------------------------------: | :-------------------------------: | :-------------------------------: |
+| 32x2x1 | 224x224 | 32 | Swin-S | ImageNet-1k | 76.90 | 92.96 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log) |
+
+1. The **gpus** column indicates the number of GPUs used to train the released checkpoint. If you train with a different number of GPUs or videos per GPU, the recommended way is to set `--auto-scale-lr` when calling `tools/train.py`; this option scales the learning rate according to the ratio between the actual batch size and the original batch size (see the short sketch below these notes).
+2. The values in the "reference" columns are the results obtained by testing the authors' checkpoints on our dataset with the same model settings. `*` means that the numbers are copied from the paper.
+3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
+4. Pre-trained image models can be downloaded from [Swin Transformer for ImageNet Classification](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models).
+
+For more details on data preparation, you can refer to [Kinetics](/tools/data/kinetics/README.md).
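+
+To make the effect of `--auto-scale-lr` concrete, here is a minimal sketch of the linear scaling rule it relies on. This is an illustration only, not the mmengine implementation; the variable names are made up, while `base_lr = 1e-3` and `base_batch_size = 64` (8 GPUs x 8 videos per GPU) come from the Swin configs in this folder.
+
+```python
+# Linear scaling rule: the learning rate grows in proportion to the
+# actual total batch size relative to the base batch size in the config.
+base_lr = 1e-3          # optimizer lr in the 8xb8 Swin configs
+base_batch_size = 64    # see `auto_scale_lr`: 8 GPUs x 8 videos per GPU
+
+num_gpus = 4            # hypothetical setup
+videos_per_gpu = 8
+actual_batch_size = num_gpus * videos_per_gpu
+
+scaled_lr = base_lr * actual_batch_size / base_batch_size
+print(scaled_lr)        # 0.0005
+```
+
+Note that `auto_scale_lr` is disabled (`enable=False`) in the shipped configs, so the scaling is only applied when you pass `--auto-scale-lr` explicitly.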
+ +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train VideoSwin model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test VideoSwin model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{liu2022video, + title={Video swin transformer}, + author={Liu, Ze and Ning, Jia and Cao, Yue and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Hu, Han}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={3202--3211}, + year={2022} +} +``` diff --git a/configs/recognition/swin/README_zh-CN.md b/configs/recognition/swin/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..14fc00b8f9a676224a57dda736a452b8d31d4c6e --- /dev/null +++ b/configs/recognition/swin/README_zh-CN.md @@ -0,0 +1,79 @@ +# VideoSwin + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{liu2022video, + title={Video swin transformer}, + author={Liu, Ze and Ning, Jia and Cao, Yue and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Hu, Han}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={3202--3211}, + year={2022} +} +``` + +## ๆจกๅž‹ๅบ“ + +### Kinetics-400 + +| ๅธง้‡‡ๆ ท็ญ–็•ฅ | ๅˆ†่พจ็އ | GPUๆ•ฐ้‡ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | ๅ‚่€ƒไปฃ็ ็š„ top1 ๅ‡†็กฎ็އ | ๅ‚่€ƒไปฃ็ ็š„ top5ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๅ่ฎฎ | ๆตฎ็‚น่ฟ็ฎ—ๆ•ฐ | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | log | +| :--------: | :-----: | :-----: | :------: | :----------: | :---------: | :---------: | :------------------------: | :------------------------: | :--------------: | :--------: | :----: | :---------------: | :---------------: | :--------------: | +| 32x2x1 | 224x224 | 8 | Swin-T | ImageNet-1k | 78.90 | 93.77 | 78.84 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 93.76 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 88G | 28.2M | [config](/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-S | ImageNet-1k | 80.54 | 94.46 | 80.58 
\[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.45 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 166G | 49.8M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-B | ImageNet-1k | 80.57 | 94.49 | 80.55 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.66 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 282G | 88.0M | [config](/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-L | ImageNet-22k | 83.46 | 95.91 | 83.1\* | 95.9\* | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | + +### Kinetics-700 + +| ๅธง้‡‡ๆ ท็ญ–็•ฅ | ๅˆ†่พจ็އ | GPUๆ•ฐ้‡ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๅ่ฎฎ | ๆตฎ็‚น่ฟ็ฎ—ๆ•ฐ | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | log | +| :--------: | :-----: | :-----: | :------: | :----------: | :---------: | :---------: | :--------------: | :--------: | :----: | :--------------------------------: | :--------------------------------: | :--------------------------------: | +| 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log) | + +### Kinetics-710 + +| ๅธง้‡‡ๆ ท็ญ–็•ฅ | ๅˆ†่พจ็އ | GPUๆ•ฐ้‡ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๅ่ฎฎ | ๆตฎ็‚น่ฟ็ฎ—ๆ•ฐ | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | log | +| :--------: | :-----: | :-----: | :------: | :---------: | 
:---------: | :---------: | :--------------: | :--------: | :----: | :--------------------------------: | :---------------------------------: | :--------------------------------: | +| 32x2x1 | 224x224 | 32 | Swin-S | ImageNet-1k | 76.90 | 92.96 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log) | + +1. ่ฟ™้‡Œ็š„ **GPUๆ•ฐ้‡** ๆŒ‡็š„ๆ˜ฏๅพ—ๅˆฐๆจกๅž‹ๆƒ้‡ๆ–‡ไปถๅฏนๅบ”็š„ GPU ไธชๆ•ฐใ€‚ๅฝ“็”จๆˆทไฝฟ็”จไธๅŒๆ•ฐ้‡็š„ GPU ๆˆ–่€…ๆฏๅ— GPU ๅค„็†ไธๅŒ่ง†้ข‘ไธชๆ•ฐๆ—ถ๏ผŒ้œ€่ฆๅœจ่ฟ่กŒ `tools/train.py` ๆ—ถ่ฎพ็ฝฎ `--auto-scale-lr` ๏ผŒ่ฏฅๅ‚ๆ•ฐๅฐ†ๆ นๆฎๆ‰นๅคงๅฐ็ญ‰ๆฏ”ไพ‹ๅœฐ่ฐƒ่Š‚ๅญฆไน ็އใ€‚ +2. ๅ‚่€ƒไปฃ็ ็š„ๅ‡†็กฎ็އๅˆ—ไธญ็š„็ป“ๆžœๆ˜ฏ้€š่ฟ‡ไฝฟ็”จ็›ธๅŒ็š„ๆจกๅž‹้…็ฝฎๅœจๅŽŸๆฅ็š„ไปฃ็ ๅบ“ไธŠ่ฎญ็ปƒๅพ—ๅˆฐ็š„ใ€‚ `*` ไปฃ่กจๆ•ฐๆฎๆฅๆบไบŽ่ฎบๆ–‡ใ€‚ +3. ๆˆ‘ไปฌไฝฟ็”จ็š„ Kinetics400 ้ชŒ่ฏ้›†ๅŒ…ๅซ 19796 ไธช่ง†้ข‘ใ€‚ ็”จๆˆทๅฏไปฅไปŽ[้ชŒ่ฏ้›†่ง†้ข‘](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB)ไธ‹่ฝฝ่ฟ™ไบ›่ง†้ข‘ใ€‚ ๅŒๆ—ถไนŸๆไพ›ไบ†ๅฏนๅบ”็š„[ๆ•ฐๆฎๅˆ—่กจ](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (ๆฏ่กŒๆ ผๅผไธบ๏ผš่ง†้ข‘ ID๏ผŒ่ง†้ข‘ๅธงๆ•ฐ็›ฎ๏ผŒ็ฑปๅˆซๅบๅท) ไปฅๅŠ[ๆ˜ ๅฐ„ๆ ‡็ญพ](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) ใ€‚ +4. 
้ข„่ฎญ็ปƒๆจกๅž‹ๅฏไปฅไปŽ [Swin Transformer for ImageNet Classification](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models)ไธ‹่ฝฝใ€‚ + +ๅ…ณไบŽๆ•ฐๆฎๅค„็†็š„ๆ›ดๅคš็ป†่Š‚๏ผŒ็”จๆˆทๅฏไปฅๅ‚็…ง [Kinetics](/tools/data/kinetics/README_zh-CN.md)ใ€‚ + +## ๅฆ‚ไฝ•่ฎญ็ปƒ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๆŒ‡ไปค่ฟ›่กŒๆจกๅž‹่ฎญ็ปƒใ€‚ + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +ไพ‹ๅฆ‚๏ผšไปฅไธ€ไธช็กฎๅฎšๆ€ง็š„่ฎญ็ปƒๆ–นๅผ๏ผŒ่พ…ไปฅๅฎšๆœŸ็š„้ชŒ่ฏ่ฟ‡็จ‹่ฟ›่กŒ VideoSwin ๆจกๅž‹ๅœจ Kinetics-400 ๆ•ฐๆฎ้›†ไธŠ็š„่ฎญ็ปƒใ€‚ + +```shell +python tools/train.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +ๆ›ดๅคš่ฎญ็ปƒ็ป†่Š‚๏ผŒๅฏๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md) ไธญ็š„ **่ฎญ็ปƒ** ้ƒจๅˆ†ใ€‚ + +## ๅฆ‚ไฝ•ๆต‹่ฏ• + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๆŒ‡ไปค่ฟ›่กŒๆจกๅž‹ๆต‹่ฏ•ใ€‚ + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +ไพ‹ๅฆ‚๏ผšๅœจ Kinetics-400 ๆ•ฐๆฎ้›†ไธŠๆต‹่ฏ• VideoSwin ๆจกๅž‹๏ผŒๅนถๅฐ†็ป“ๆžœๅฏผๅ‡บไธบไธ€ไธช pkl ๆ–‡ไปถใ€‚ + +```shell +python tools/test.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +ๆ›ดๅคšๆต‹่ฏ•็ป†่Š‚๏ผŒๅฏๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md) ไธญ็š„ **ๆต‹่ฏ•** ้ƒจๅˆ†ใ€‚ diff --git a/configs/recognition/swin/metafile.yml b/configs/recognition/swin/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..1c59f22536a6985dd30276ab2fb99dab6f37ae1a --- /dev/null +++ b/configs/recognition/swin/metafile.yml @@ -0,0 +1,145 @@ +Collections: + - Name: Swin + README: configs/recognition/swin/README.md + Paper: + URL: https://arxiv.org/abs/2106.13230 + Title: 'Video Swin Transformer' + +Models: + - Name: swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb + Config: configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py + In Collection: Swin + Metadata: + Architecture: Swin-T + Batch Size: 8 + Epochs: 30 + FLOPs: 88G + Parameters: 28.2M + Pretrained: ImageNet-1K + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 78.90 + Top 5 Accuracy: 93.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth + + - Name: swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb + Config: configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py + In Collection: Swin + Metadata: + Architecture: Swin-S + Batch Size: 8 + Epochs: 30 + FLOPs: 166G + Parameters: 49.8M + Pretrained: ImageNet-1K + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 80.54 + Top 5 Accuracy: 94.46 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth + + - Name: swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb + Config: configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py.py + In Collection: Swin + Metadata: + Architecture: Swin-B + Batch Size: 8 + Epochs: 30 + FLOPs: 282G + Parameters: 88.0M + Pretrained: ImageNet-1K + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 80.57 + Top 5 Accuracy: 94.49 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth + + - Name: swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb + Config: configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py + In Collection: Swin + Metadata: + Architecture: Swin-L + Batch Size: 8 + Epochs: 30 + FLOPs: 604G + Parameters: 197M + Pretrained: ImageNet-22K + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 83.46 + Top 5 Accuracy: 95.91 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth + + - Name: swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb + Config: configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py + In Collection: Swin + Metadata: + Architecture: Swin-L + Batch Size: 8 + Epochs: 30 + FLOPs: 604G + Parameters: 197M + Pretrained: ImageNet-22K + Resolution: 224x224 + Training Data: Kinetics-700 + Training Resources: 16 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.92 + Top 5 Accuracy: 92.72 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth + + - Name: swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb + Config: 
configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py + In Collection: Swin + Metadata: + Architecture: Swin-S + Batch Size: 4 + Epochs: 30 + FLOPs: 604G + Parameters: 197M + Pretrained: ImageNet-1K + Resolution: 224x224 + Training Data: Kinetics-710 + Training Resources: 32 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-710 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.90 + Top 5 Accuracy: 92.96 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth diff --git a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..41e7e40f6115e1f0ca2e3d628166f9ca8b9efa65 --- /dev/null +++ b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -0,0 +1,139 @@ +_base_ = [ + '../../_base_/models/swin_tiny.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + drop_path_rate=0.3, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_base_patch4_window7_224.pth' # noqa: E501 + ), + cls_head=dict(in_channels=1024)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, 
+ persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05), + constructor='SwinOptimWrapperConstructor', + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..697ae944fa94eaab34f58f338836681eaad9d0b1 --- /dev/null +++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py @@ -0,0 +1,94 @@ +_base_ = [ + 'swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py' +] + +model = dict(cls_head=dict(num_classes=700)) + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + 
frame_interval=2, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +optim_wrapper = dict(optimizer=dict(lr=2e-3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6ba5745e71919e4ee88c79d79ba544076f3fbb0c --- /dev/null +++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -0,0 +1,139 @@ +_base_ = [ + '../../_base_/models/swin_tiny.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + drop_path_rate=0.4, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_large_patch4_window7_224_22k.pth' # noqa: E501 + ), + cls_head=dict(in_channels=1536)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + 
dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05), + constructor='SwinOptimWrapperConstructor', + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..227fc9ca3c52fdfd3fb09100434b919c3d827f75 --- /dev/null +++ b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py @@ -0,0 +1,144 @@ +_base_ = [ + 'swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py' +] + +model = dict(cls_head=dict(num_classes=710)) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type='VideoDataset', + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type='VideoDataset', + ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type='VideoDataset', + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type='VideoDataset', + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type='VideoDataset', + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type='VideoDataset', + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() 
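+# The test splits below reuse the validation dataset settings and only swap
+# in the multi-clip `test_pipeline` defined above.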
+k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type='ConcatDataset', + datasets=[k400_trainset, k600_trainset, k700_trainset], + _delete_=True) +k710_valset = dict( + type='ConcatDataset', + datasets=[k400_valset, k600_valset, k700_valset], + _delete_=True) +k710_testset = dict( + type='ConcatDataset', + datasets=[k400_testset, k600_testset, k700_testset], + _delete_=True, +) + +train_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_testset) + +optim_wrapper = dict(optimizer=dict(lr=2e-3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..1dcc9a99e9f2c179aa684e3786b37ec379dadb69 --- /dev/null +++ b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -0,0 +1,138 @@ +_base_ = [ + '../../_base_/models/swin_tiny.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='small', + drop_path_rate=0.2, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_small_patch4_window7_224.pth' # noqa: E501 + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] 
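+# The dataloaders below follow the 8xb8 recipe: 8 videos per GPU for training
+# and validation, and single-video batches for the 4 clips x 3 crop test.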
+ +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02), + constructor='SwinOptimWrapperConstructor', + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..de26949259a40a50f097fb158b2bde23736f9dea --- /dev/null +++ b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -0,0 +1,136 @@ +_base_ = [ + '../../_base_/models/swin_tiny.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_tiny_patch4_window7_224.pth' # noqa: E501 + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02), + 
constructor='SwinOptimWrapperConstructor', + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4d88b6ea982e6e23c433e658038bf488c31f229c --- /dev/null +++ b/configs/recognition/tanet/README.md @@ -0,0 +1,86 @@ +# TANet + +[TAM: Temporal Adaptive Module for Video Recognition](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_TAM_Temporal_Adaptive_Module_for_Video_Recognition_ICCV_2021_paper.html) + + + +## Abstract + + + +Video data is with complex temporal dynamics due to various factors such as camera motion, speed variation, and different activities. To effectively capture this diverse motion pattern, this paper presents a new temporal adaptive module ({\\bf TAM}) to generate video-specific temporal kernels based on its own feature map. TAM proposes a unique two-level adaptive modeling scheme by decoupling the dynamic kernel into a location sensitive importance map and a location invariant aggregation weight. The importance map is learned in a local temporal window to capture short-term information, while the aggregation weight is generated from a global view with a focus on long-term structure. TAM is a modular block and could be integrated into 2D CNNs to yield a powerful video architecture (TANet) with a very small extra computational cost. The extensive experiments on Kinetics-400 and Something-Something datasets demonstrate that our TAM outperforms other temporal modeling methods consistently, and achieves the state-of-the-art performance under the similar complexity. + + + +
+ +
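+The abstract above describes the mechanism at a high level; the toy PyTorch sketch below only illustrates that two-branch idea (a location-sensitive importance map from a local temporal window, plus a location-invariant, video-specific aggregation kernel from a global view). The class name, layer widths and kernel size are invented for the example and do not correspond to the MMAction2 implementation of TANet.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class TAMSketch(nn.Module):
+    """Toy two-branch temporal adaptive module (illustration only)."""
+
+    def __init__(self, channels: int, num_segments: int, kernel_size: int = 3):
+        super().__init__()
+        self.num_segments = num_segments
+        self.kernel_size = kernel_size
+        # Local branch: a location-sensitive importance map learned in a
+        # small temporal window (short-term information).
+        self.local_branch = nn.Sequential(
+            nn.Conv1d(channels, channels // 4, kernel_size,
+                      padding=kernel_size // 2),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(channels // 4, channels, 1),
+            nn.Sigmoid())
+        # Global branch: a location-invariant, video-specific aggregation
+        # kernel generated from a global view (long-term structure).
+        self.global_branch = nn.Sequential(
+            nn.Linear(num_segments, num_segments * 2),
+            nn.ReLU(inplace=True),
+            nn.Linear(num_segments * 2, kernel_size),
+            nn.Softmax(dim=-1))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (N * T, C, H, W), frame-level features from a 2D backbone.
+        nt, c, h, w = x.shape
+        t = self.num_segments
+        n = nt // t
+        desc = x.view(n, t, c, h, w).mean(dim=(3, 4)).permute(0, 2, 1)  # (N, C, T)
+        importance = self.local_branch(desc)           # (N, C, T)
+        kernel = self.global_branch(desc.mean(dim=1))  # (N, K), one kernel per video
+        # Excite the features with the importance map ...
+        feat = x.view(n, t, c, h, w) * importance.permute(0, 2, 1)[:, :, :, None, None]
+        # ... then aggregate along time with the video-specific kernel,
+        # implemented as a depthwise grouped 1D convolution.
+        feat = feat.permute(0, 2, 3, 4, 1).reshape(1, n * c * h * w, t)
+        feat = F.pad(feat, (self.kernel_size // 2, self.kernel_size // 2))
+        weight = kernel.repeat_interleave(c * h * w, dim=0).unsqueeze(1)
+        out = F.conv1d(feat, weight, groups=n * c * h * w)
+        return out.reshape(n, c, h, w, t).permute(0, 4, 1, 2, 3).reshape(nt, c, h, w)
+
+
+tam = TAMSketch(channels=64, num_segments=8)
+out = tam(torch.randn(2 * 8, 64, 14, 14))
+print(out.shape)  # torch.Size([16, 64, 14, 14])
+```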
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------------------: | :---------------------------: | :--------------: | :---: | :----: | :---------------: | :-------------: | :------------: | +| dense-1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 76.25 | 92.41 | [76.22](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | [92.53](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | 8 clips x 3 crop | 43.0G | 25.6M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log) | + +### Something-Something V1 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :---------: | :---------: | :---------------: | :---: | :----: | :--------------------------------: | :------------------------------: | :-----------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 46.98/49.71 | 75.75/77.43 | 16 clips x 3 crop | 43.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 48.24/50.95 | 78.16/79.28 | 16 clips x 3 crop | 86.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The values in columns named after "reference" are the results got by testing on our dataset, using the checkpoints provided by the author with same model settings. 
The checkpoints for the reference repo can be downloaded [here](https://drive.google.com/drive/folders/1sFfmP3yrfc7IzRshEELOby7-aEoymIFL?usp=sharing).
+3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_class2ind.txt) are also available.
+
+For more details on data preparation, you can refer to
+
+- [Kinetics400](/tools/data/kinetics/README.md)
+- [Something-something V1](/tools/data/sthv1/README.md)
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the TANet model on the Kinetics-400 dataset with deterministic training and periodic validation.
+
+```shell
+python tools/train.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py \
+    --seed=0 --deterministic
+```
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the TANet model on the Kinetics-400 dataset and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
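+
+The dumped `result.pkl` can then be inspected offline. A minimal sketch, assuming the dump is a plain pickle holding one prediction entry per test video (the exact field names may differ between versions):
+
+```python
+import pickle
+
+with open('result.pkl', 'rb') as f:
+    results = pickle.load(f)
+
+print(len(results))   # number of test samples
+print(results[0])     # e.g. predicted scores / labels and the ground-truth label
+```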
+ +## Citation + +```BibTeX +@article{liu2020tam, + title={TAM: Temporal Adaptive Module for Video Recognition}, + author={Liu, Zhaoyang and Wang, Limin and Wu, Wayne and Qian, Chen and Lu, Tong}, + journal={arXiv preprint arXiv:2005.06803}, + year={2020} +} +``` diff --git a/configs/recognition/tanet/metafile.yml b/configs/recognition/tanet/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..4bd3bcd15788786aabadfe1253e6c1394488078d --- /dev/null +++ b/configs/recognition/tanet/metafile.yml @@ -0,0 +1,80 @@ +Collections: + - Name: TANet + README: configs/recognition/tanet/README.md + Paper: + URL: https://arxiv.org/abs/2005.06803 + Title: "TAM: Temporal Adaptive Module for Video Recognition" + +Models: + - Name: tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb + Config: configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py + In Collection: TANet + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 100 + FLOPs: 43.0G + Parameters: 25.6M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.25 + Top 5 Accuracy: 92.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth + + - Name: tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb + Config: configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py + In Collection: TANet + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 50 + FLOPs: 43.1G + Parameters: 25.1M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 49.71 + Top 1 Accuracy (efficient): 46.98 + Top 5 Accuracy: 77.43 + Top 5 Accuracy (efficient): 75.75 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth + + - Name: tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb + Config: configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py + In Collection: TANet + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 50 + FLOPs: 86.1G + Parameters: 25.1M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 50.95 + Top 1 Accuracy (efficient): 48.24 + Top 5 Accuracy: 79.28 + Top 5 Accuracy (efficient): 78.16 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a3dbbd05bdd6ddd398aa60f1eea7e97f29e83fce --- /dev/null +++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py @@ -0,0 +1,122 @@ +_base_ = [ + '../../_base_/models/tanet_r50.py', '../../_base_/default_runtime.py', + '../../_base_/schedules/sgd_tsm_50e.py' +] + +# model settings +model = dict( + backbone=dict(num_segments=16), + cls_head=dict(num_classes=174, num_segments=16, dropout_ratio=0.6)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + twice_sample=True, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = val_pipeline +train_dataloader = dict( + batch_size=6, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(img=data_root), + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=6, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') 
+test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=0.0075, weight_decay=0.001)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[30, 40, 45], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..94e9a514dfee0516904880bb0e045939ff714add --- /dev/null +++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py @@ -0,0 +1,120 @@ +_base_ = [ + '../../_base_/models/tanet_r50.py', '../../_base_/default_runtime.py', + '../../_base_/schedules/sgd_tsm_50e.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174, dropout_ratio=0.6)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(img=data_root), + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(weight_decay=0.001)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[30, 40, 45], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2e0c07ca0c8e2ac4a48248bef8d93301d5c5292e --- /dev/null +++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,116 @@ +_base_ = [ + '../../_base_/models/tanet_r50.py', + '../../_base_/schedules/sgd_tsm_100e.py', '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), 
+ pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict(val_interval=5) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[50, 75, 90], + gamma=0.1) +] + +default_hooks = dict( + checkpoint=dict(max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bbeb4a9d8718e356c3cb2a501ed194378a7da191 --- /dev/null +++ b/configs/recognition/timesformer/README.md @@ -0,0 +1,80 @@ +# TimeSformer + +[Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) + + + +## Abstract + + + +We present a convolution-free approach to video classification built exclusively on self-attention over space and time. Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). + + + +
+ +
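+
+The "divided attention" scheme described in the abstract runs temporal attention and spatial attention as two separate steps inside every Transformer block. Below is a rough, self-contained PyTorch sketch of that idea; it is illustrative only (the module names, head count, and the omitted MLP/positional embeddings are not the actual TimeSformer implementation in this repo).
+
+```python
+import torch
+import torch.nn as nn
+
+
+class DividedSpaceTimeBlock(nn.Module):
+    """Illustrative block: attend over frames first, then over patches."""
+
+    def __init__(self, dim=768, num_heads=8):
+        super().__init__()
+        self.temporal_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+        self.spatial_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+    def forward(self, x):
+        # x: (batch, frames, patches, dim)
+        b, t, p, d = x.shape
+        # temporal attention: each patch location attends across the T frames
+        xt = x.permute(0, 2, 1, 3).reshape(b * p, t, d)
+        h = self.norm1(xt)
+        xt = xt + self.temporal_attn(h, h, h)[0]
+        x = xt.reshape(b, p, t, d).permute(0, 2, 1, 3)
+        # spatial attention: each frame attends across its P patches
+        xs = x.reshape(b * t, p, d)
+        h = self.norm2(xs)
+        xs = xs + self.spatial_attn(h, h, h)[0]
+        return xs.reshape(b, t, p, d)
+
+
+x = torch.randn(2, 8, 196, 768)           # 8 frames of 14x14 patches, embed dim 768
+print(DividedSpaceTimeBlock()(x).shape)   # torch.Size([2, 8, 196, 768])
+```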
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :----------------------------: | :--------------------------: | :-------------------------: | +| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 1 clip x 3 crop | 196G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 | 1 clip x 3 crop | 180G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 1 clip x 3 crop | 141G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. We keep the test setting with the [original repo](https://github.com/facebookresearch/TimeSformer) (three crop x 1 clip). +3. The pretrained model `vit_base_patch16_224.pth` used by TimeSformer was converted from [vision_transformer](https://github.com/google-research/vision_transformer). + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train TimeSformer model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
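+
+As a concrete reading of note 1 above: with `--auto-scale-lr`, the learning rate from the config is rescaled by the ratio between the batch size you actually run with and the config's `auto_scale_lr.base_batch_size` (64 for these TimeSformer configs, i.e. 8 GPUs x 8 samples per GPU). The sketch below only reproduces that arithmetic; the real logic lives in MMEngine, and the function name here is made up for illustration.
+
+```python
+def auto_scaled_lr(base_lr, base_batch_size, num_gpus, samples_per_gpu):
+    """Linear scaling rule behind --auto-scale-lr (arithmetic only)."""
+    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size
+
+
+# The TimeSformer configs in this folder use lr=0.005 with base_batch_size=64.
+print(auto_scaled_lr(0.005, 64, num_gpus=4, samples_per_gpu=8))  # 0.0025
+```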
+ +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test TimeSformer model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@misc{bertasius2021spacetime, + title = {Is Space-Time Attention All You Need for Video Understanding?}, + author = {Gedas Bertasius and Heng Wang and Lorenzo Torresani}, + year = {2021}, + eprint = {2102.05095}, + archivePrefix = {arXiv}, + primaryClass = {cs.CV} +} +``` diff --git a/configs/recognition/timesformer/metafile.yml b/configs/recognition/timesformer/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..6417797b092ae44b9bbf9ad5ecbaa5e3bf9fea53 --- /dev/null +++ b/configs/recognition/timesformer/metafile.yml @@ -0,0 +1,76 @@ +Collections: + - Name: TimeSformer + README: configs/recognition/timesformer/README.md + Paper: + URL: https://arxiv.org/abs/2102.05095 + Title: 'Is Space-Time Attention All You Need for Video Understanding' + +Models: + - Name: timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb + Config: configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py + In Collection: TimeSformer + Metadata: + Architecture: TimeSformer + Batch Size: 8 + Epochs: 15 + Pretrained: ImageNet-21K + Resolution: 224x224 + FLOPs: 196G + params: 122M + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 77.69 + Top 5 Accuracy: 93.45 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth + + - Name: timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb + Config: configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py + In Collection: TimeSformer + Metadata: + Architecture: TimeSformer + Batch Size: 8 + Epochs: 15 + Pretrained: ImageNet-21K + Resolution: 224x224 + FLOPs: 180G + params: 86.11M + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.95 + Top 5 Accuracy: 93.28 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth + + - Name: timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb + Config: configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py + In Collection: TimeSformer + Metadata: + Architecture: TimeSformer + Batch Size: 8 + Epochs: 15 + Pretrained: ImageNet-21K + Resolution: 224x224 + 
FLOPs: 141G + params: 86.11M + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.93 + Top 5 Accuracy: 92.88 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth diff --git a/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py b/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..711d44a390c079483b6efda97653fea695ba344a --- /dev/null +++ b/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py @@ -0,0 +1,3 @@ +_base_ = 'timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py' + +model = dict(backbone=dict(attention_type='divided_space_time')) diff --git a/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py b/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d643b1eec14434311d67b483482cb8ecc4f46c93 --- /dev/null +++ b/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py @@ -0,0 +1,3 @@ +_base_ = 'timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py' + +model = dict(backbone=dict(attention_type='joint_space_time')) diff --git a/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py b/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e573356893555f550e833ae9e99cb11213f42bd1 --- /dev/null +++ b/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py @@ -0,0 +1,146 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='TimeSformer', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/timesformer/vit_base_patch16_224.pth', # noqa: E501 + num_frames=8, + img_size=224, + patch_size=16, + embed_dims=768, + in_channels=3, + dropout_ratio=0., + transformer_layers=None, + attention_type='space_only', + norm_cfg=dict(type='LN', eps=1e-6)), + cls_head=dict( + type='TimeSformerHead', + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=32, num_clips=1), + dict(type='DecordDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', 
size=224), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=32, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=32, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=15, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.005, momentum=0.9, weight_decay=1e-4, nesterov=True), + paramwise_cfg=dict( + custom_keys={ + '.backbone.cls_token': dict(decay_mult=0.0), + '.backbone.pos_embed': dict(decay_mult=0.0), + '.backbone.time_embed': dict(decay_mult=0.0) + }), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=15, + by_epoch=True, + milestones=[5, 10], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(interval=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5df5f0cfa3bebee3148f7d7abe795d9ac599bfd0 --- /dev/null +++ b/configs/recognition/tin/README.md @@ -0,0 +1,102 @@ +# TIN + +[Temporal Interlacing Network](https://ojs.aaai.org/index.php/AAAI/article/view/6872) + + + +## Abstract + + + +For a long time, the vision community tries to learn the spatio-temporal representation by combining convolutional neural network together with various temporal models, such as the families of Markov chain, optical flow, RNN and temporal convolution. However, these pipelines consume enormous computing resources due to the alternately learning process for spatial and temporal information. 
One natural question is whether we can embed the temporal information into the spatial one so the information in the two domains can be jointly learned once-only. In this work, we answer this question by presenting a simple yet powerful operator -- temporal interlacing network (TIN). Instead of learning the temporal features, TIN fuses the two kinds of information by interlacing spatial representations from the past to the future, and vice versa. A differentiable interlacing target can be learned to control the interlacing process. In this way, a heavy temporal model is replaced by a simple interlacing operator. We theoretically prove that with a learnable interlacing target, TIN performs equivalently to the regularized temporal convolution network (r-TCN), but gains 4% more accuracy with 6x less latency on 6 challenging benchmarks. These results push the state-of-the-art performances of video understanding by a considerable margin. Not surprising, the ensemble model of the proposed TIN won the 1st place in the ICCV19 - Multi Moments in Time challenge. + + + +
+ +
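+
+The interlacing idea from the abstract can be pictured as shifting part of each frame's feature channels toward neighbouring frames, in both directions. The sketch below uses hard one-frame shifts on fixed channel groups purely for illustration; TIN itself learns fractional, differentiable interlacing offsets rather than these fixed shifts.
+
+```python
+import torch
+
+
+def interlace_fixed(x, fold_div=8):
+    """Toy interlacing: one channel group flows past->future, another future->past."""
+    n, t, c, h, w = x.shape
+    fold = c // fold_div
+    out = torch.zeros_like(x)
+    out[:, 1:, :fold] = x[:, :-1, :fold]                    # past -> future
+    out[:, :-1, fold:2 * fold] = x[:, 1:, fold:2 * fold]    # future -> past
+    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]               # remaining channels stay put
+    return out
+
+
+x = torch.randn(2, 8, 64, 56, 56)    # (batch, frames, channels, H, W)
+print(interlace_fixed(x).shape)      # torch.Size([2, 8, 64, 56, 56])
+```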
+ +## Results and Models + +### Something-Something V1 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: | +| 1x1x8 | height 100 | 8x4 | ResNet50 | ImageNet | 39.68 | 68.55 | 44.04 | 72.72 | 8 clips x 1 crop | x | 6181 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb_20220913-9b7804d6.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.log) | + +### Something-Something V2 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: | +| 1x1x8 | height 240 | 8x4 | ResNet50 | ImageNet | 54.78 | 82.18 | 56.48 | 83.45 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb_20220913-84f9b4b0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.log) | + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | +| :---------------------: | :------------: | :--: | :------: | :-------------: | :------: | :------: | :--------------: | :---------------------: | :--------: | :-----------------------: | :---------------------: | :---------------------: | +| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.86 | 90.44 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) | + +Here, we use `finetune` to indicate that we use [TSM model](https://download.openmmlab.com/mmaction/v1.0/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth) trained on Kinetics-400 to finetune the TIN 
model on Kinetics-400. + +:::{note} + +1. The **reference topk acc** are got by training the [original repo #1aacd0c](https://github.com/deepcs233/TIN/tree/1aacd0c4c30d5e1d334bf023e55b855b59f158db) with no [AverageMeter issue](https://github.com/deepcs233/TIN/issues/4). + The [AverageMeter issue](https://github.com/deepcs233/TIN/issues/4) will lead to incorrect performance, so we fix it before running. +2. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. + According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, + e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +3. The values in columns named after "reference" are the results got by training on the original repo, using the same model settings. +4. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_class2ind.txt) are also available. + +::: + +For more details on data preparation, you can refer to + +- [Kinetics](/tools/data/kinetics/README.md) +- [Something-something V1](/tools/data/sthv1/README.md) +- [Something-something V2](/tools/data/sthv2/README.md) + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train TIN model on Something-Something V1 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py \ + --work-dir work_dirs/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb randomness.seed=0 randomness.deterministic=True +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test TIN model on Something-Something V1 dataset and dump the result to a json file. + +```shell +python tools/test.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.json +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
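+
+The "8 clips x 1 crop" testing protocol in the tables above means every validation video contributes 8 sampled clips, whose class scores are merged into a single prediction. The toy snippet below shows one common way such multi-view scores are reduced (averaging probabilities); the actual reduction in this repo is controlled by the recognizer's `average_clips` setting and may differ.
+
+```python
+import torch
+
+# scores for one video: 8 clips x 1 crop = 8 views, 174 Something-Something V1 classes
+view_scores = torch.randn(8, 174)
+
+# average class probabilities over the views, then take the arg-max as the prediction
+probs = torch.softmax(view_scores, dim=-1).mean(dim=0)
+print(probs.shape, int(probs.argmax()))
+```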
+ +## Citation + +```BibTeX +@article{shao2020temporal, + title={Temporal Interlacing Network}, + author={Hao Shao and Shengju Qian and Yu Liu}, + year={2020}, + journal={AAAI}, +} +``` diff --git a/configs/recognition/tin/metafile.yml b/configs/recognition/tin/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..d13bf78308ffd9c2a0aa1244ff7956259786aeb0 --- /dev/null +++ b/configs/recognition/tin/metafile.yml @@ -0,0 +1,73 @@ +Collections: +- Name: TIN + README: configs/recognition/tin/README.md + Paper: + URL: https://arxiv.org/abs/2001.06499 + Title: Temporal Interlacing Network +Models: +- Config: configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py + In Collection: TIN + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 40 + FLOPs: 32962097536 + Parameters: 23895566 + Pretrained: ImageNet + Resolution: height 100 + Training Data: SthV1 + Training Resources: 32 GPUs + Modality: RGB + Name: tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb + Results: + - Dataset: SthV1 + Metrics: + Top 1 Accuracy: 38.68 + Top 5 Accuracy: 68.55 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb_20220913-9b7804d6.pth +- Config: configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py + In Collection: TIN + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 40 + FLOPs: 32962097536 + Parameters: 23895566 + Pretrained: ImageNet + Resolution: height 240 + Training Data: SthV2 + Training Resources: 32 GPUs + Modality: RGB + Name: tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb + Results: + - Dataset: SthV2 + Metrics: + Top 1 Accuracy: 54.78 + Top 5 Accuracy: 82.18 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb_20220913-84f9b4b0.pth +- Config: configs/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py + In Collection: TIN + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 50 + FLOPs: 32965800320 + Parameters: 24358640 + Pretrained: TSM-Kinetics400 + Resolution: short-side 256 + Training Data: Kinetics-400 + Training Resources: 32 GPUs + Modality: RGB + Name: tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb + Results: + - Dataset: Kinetics-400 + Metrics: + Top 1 Accuracy: 71.86 + Top 5 Accuracy: 90.44 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth diff --git a/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py 
b/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..25f3b65f394842270063afebf18853637fa22d14 --- /dev/null +++ b/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb.py @@ -0,0 +1,127 @@ +_base_ = ['../../_base_/models/tin_r50.py', '../../_base_/default_runtime.py'] + +# model settings +model = dict(cls_head=dict(num_classes=174, dropout_ratio=0.8)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=6, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(img=data_root), + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=6, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=40, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0005, + ), + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + clip_grad=dict(max_norm=20, norm_type=2)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=39, + eta_min=0, + by_epoch=True, + convert_to_iter_based=True, + begin=1, + end=40) +] diff 
--git a/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py b/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..db57ce4956dc09803a955493631bde23cae5bb47 --- /dev/null +++ b/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py @@ -0,0 +1,125 @@ +_base_ = ['../../_base_/models/tin_r50.py', '../../_base_/default_runtime.py'] + +# model settings +model = dict(cls_head=dict(num_classes=174, dropout_ratio=0.8)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv2/rawframes' +data_root_val = 'data/sthv2/rawframes' +ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt' +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=6, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(img=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=6, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0005, + ), + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + clip_grad=dict(max_norm=40, norm_type=2)) + +# learning policy + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=39, + eta_min=0, + by_epoch=True, + convert_to_iter_based=True, + begin=1, + end=40) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=40, val_begin=1, val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff 
--git a/configs/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..fb88f5d24813ecb224fa8660da44e2a82de2d9ca --- /dev/null +++ b/configs/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,102 @@ +_base_ = [ + '../../_base_/models/tin_r50.py', '../../_base_/schedules/sgd_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(is_shift=True)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=6, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=6, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict(val_interval=5) + +# optimizer +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', paramwise_cfg=dict(fc_lr5=True)) +load_from = 'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth' # noqa: E501 diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..b5a11c8aa351e30dfb83558ae541a8e4b643dd55 --- /dev/null +++ b/configs/recognition/tpn/README.md @@ -0,0 +1,93 @@ +# TPN + +[Temporal Pyramid Network for Action Recognition](https://openaccess.thecvf.com/content_CVPR_2020/html/Yang_Temporal_Pyramid_Network_for_Action_Recognition_CVPR_2020_paper.html) + + + +## Abstract + + + +Visual tempo characterizes the dynamics and the temporal scale of an action. Modeling such visual tempos of different actions facilitates their recognition. Previous works often capture the visual tempo through sampling raw videos at multiple rates and constructing an input-level frame pyramid, which usually requires a costly multi-branch network to handle. In this work we propose a generic Temporal Pyramid Network (TPN) at the feature-level, which can be flexibly integrated into 2D or 3D backbone networks in a plug-and-play manner. Two essential components of TPN, the source of features and the fusion of features, form a feature hierarchy for the backbone so that it can capture action instances at various tempos. TPN also shows consistent improvements over other challenging baselines on several action recognition datasets. Specifically, when equipped with TPN, the 3D ResNet-50 with dense sampling obtains a 2% gain on the validation set of Kinetics-400. A further analysis also reveals that TPN gains most of its improvements on action classes that have large variances in their visual tempos, validating the effectiveness of TPN. + + + +
+ +
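+
+The feature-level pyramid sketched in the abstract can be thought of as: take one backbone feature map, derive slower-tempo copies of it by temporal pooling, and fuse the levels back together so the head sees several visual tempos at once. The snippet below is only a stand-in for that idea (the pooling rates, fusion by summation, and function name are invented here, not the TPN neck in this repo).
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def fuse_temporal_pyramid(feat, rates=(1, 2)):
+    """Toy temporal pyramid: pool the clip feature at several rates, then fuse."""
+    n, c, t, h, w = feat.shape
+    levels = []
+    for r in rates:
+        pooled = F.max_pool3d(feat, kernel_size=(r, 1, 1), stride=(r, 1, 1))
+        # bring every level back to the full temporal length before fusing
+        levels.append(F.interpolate(pooled, size=(t, h, w), mode='nearest'))
+    return torch.stack(levels).sum(dim=0)
+
+
+feat = torch.randn(2, 256, 8, 14, 14)      # backbone feature: (N, C, T, H, W)
+print(fuse_temporal_pyramid(feat).shape)   # torch.Size([2, 256, 8, 14, 14])
+```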
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | +| :---------------------: | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :---------------------: | :---------------: | :---------------------: | :--------: | :----------: | :--------: | :-------: | +| 8x8x1 | short-side 320 | 8x2 | ResNet50 | None | 74.20 | 91.48 | x | x | 10 clips x 3 crop | x | 6916 | [config](/configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-97d0835d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.log) | +| 8x8x1 | short-side 320 | 8 | ResNet50 | ImageNet | 76.74 | 92.57 | [75.49](https://github.com/decisionforce/TPN/blob/master/MODELZOO.md) | [92.05](https://github.com/decisionforce/TPN/blob/master/MODELZOO.md) | 10 clips x 3 crop | x | 6916 | [config](/configs/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-fed3f4c1.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb.log) | + +### Something-Something V1 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: | +| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 51.87 | 79.67 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | + +:::{note} + +1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. + According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, + e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +2. The values in columns named after "reference" are the results got by testing the checkpoint released on the original repo and codes, using the same dataset with ours. +3. 
The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_class2ind.txt) are also available. + +::: + +For more details on data preparation, you can refer to + +- [Kinetics](/tools/data/kinetics/README.md) +- [Something-something V1](/tools/data/sthv1/README.md) +- [Something-something V2](/tools/data/sthv2/README.md) + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train TPN model on Kinetics-400 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py \ + --work-dir work_dirs/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb [--validate --seed 0 --deterministic] +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test TPN model on Kinetics-400 dataset and dump the result to a json file. + +```shell +python tools/test.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
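+
+The `--dump result.pkl` run above writes a pickled Python object (typically a list with one entry per test sample). A minimal way to peek at it is shown below; the exact per-sample fields depend on the model and the MMEngine version, so treat this as a sketch rather than a documented format.
+
+```python
+import pickle
+
+with open('result.pkl', 'rb') as f:
+    results = pickle.load(f)
+
+print(len(results))   # number of test samples
+print(results[0])     # one entry; its keys/fields depend on the model
+```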
+ +## Citation + +```BibTeX +@inproceedings{yang2020tpn, + title={Temporal Pyramid Network for Action Recognition}, + author={Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2020}, +} +``` diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..0621e97c62cca02b59d8ac28b726ffa2e482aa7f --- /dev/null +++ b/configs/recognition/tpn/metafile.yml @@ -0,0 +1,73 @@ +Collections: +- Name: TPN + README: configs/recognition/tpn/README.md + Paper: + URL: https://arxiv.org/abs/2004.03548 + Title: Temporal Pyramid Network for Action Recognition +Models: +- Config: configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py + In Collection: TPN + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 150 + FLOPs: 66014576640 + Parameters: 91498336 + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 32 GPUs + Modality: RGB + Name: tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb + Results: + - Dataset: Kinetics-400 + Metrics: + Top 1 Accuracy: 74.20 + Top 5 Accuracy: 91.48 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-97d0835d.pth +- Config: configs/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.py + In Collection: TPN + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 150 + FLOPs: 66014576640 + Parameters: 91498336 + Pretrained: ImageNet + Resolution: short-side 320 + Training Data: Kinetics-400 + Training Resources: 32 GPUs + Modality: RGB + Name: tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb + Results: + - Dataset: Kinetics-400 + Metrics: + Top 1 Accuracy: 76.74 + Top 5 Accuracy: 92.57 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-fed3f4c1.pth +- Config: configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py + In Collection: TPN + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 150 + FLOPs: 54202822656 + Parameters: 82445724 + Pretrained: TSM + Resolution: height 100 + Training Data: SthV1 + Training Resources: 48 GPUs + Modality: RGB + Name: tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb + Results: + - Dataset: SthV1 + Metrics: + Top 1 Accuracy: 51.87 + Top 5 Accuracy: 79.67 + Task: Action Recognition + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log + Weights: 
(https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth diff --git a/configs/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.py b/configs/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..af165a084d55bdc5f635057a8629016f3541feae --- /dev/null +++ b/configs/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.py @@ -0,0 +1,109 @@ +_base_ = [ + '../../_base_/models/tpn_slowonly_r50.py', + '../../_base_/default_runtime.py' +] + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), + dict(type='DecordDecode'), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='ColorJitter'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='ColorJitter'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4, nesterov=True), + clip_grad=dict(max_norm=40, norm_type=2), +) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=150, + by_epoch=True, + milestones=[75, 125], + gamma=0.1) +] + 
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=5)) diff --git a/configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py b/configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f4e2559f0d0fd09e4420283825a7057b5e0310 --- /dev/null +++ b/configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py @@ -0,0 +1,6 @@ +_base_ = [ + './tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.py' +] + +# model settings +model = dict(backbone=dict(pretrained=None)) diff --git a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d65ce2c3db2d260ffdef27b9be4e98b8b3b13d05 --- /dev/null +++ b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py @@ -0,0 +1,107 @@ +_base_ = [ + '../../_base_/models/tpn_tsm_r50.py', '../../_base_/default_runtime.py' +] + +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='ColorJitter'), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(img=data_root), + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + 
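+    # 150-epoch training schedule; validate every 5 epochs starting from epoch 1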
type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=150, + by_epoch=True, + milestones=[75, 125], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/recognition/trn/README.md b/configs/recognition/trn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58d06f3952da2c8148d34909f6d5555f2bfa8bc2 --- /dev/null +++ b/configs/recognition/trn/README.md @@ -0,0 +1,83 @@ +# TRN + +[Temporal Relational Reasoning in Videos](https://openaccess.thecvf.com/content_ECCV_2018/html/Bolei_Zhou_Temporal_Relational_Reasoning_ECCV_2018_paper.html) + + + +## Abstract + + + +Temporal relational reasoning, the ability to link meaningful transformations of objects or entities over time, is a fundamental property of intelligent species. In this paper, we introduce an effective and interpretable network module, the Temporal Relation Network (TRN), designed to learn and reason about temporal dependencies between video frames at multiple time scales. We evaluate TRN-equipped networks on activity recognition tasks using three recent video datasets - Something-Something, Jester, and Charades - which fundamentally depend on temporal relational reasoning. Our results demonstrate that the proposed TRN gives convolutional neural networks a remarkable capacity to discover temporal relations in videos. Through only sparsely sampled video frames, TRN-equipped networks can accurately predict human-object interactions in the Something-Something dataset and identify various human gestures on the Jester dataset with very competitive performance. TRN-equipped networks also outperform two-stream networks and 3D convolution networks in recognizing daily activities in the Charades dataset. Further analyses show that the models learn intuitive and interpretable visual common sense knowledge in videos. + + + +
+ +
+ +## Results and Models + +### Something-Something V1 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :----------------: | :----: | :----: | :-------------------: | :-----------------: | :-----------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 31.60 / 33.65 | 60.15 / 62.22 | 16 clips x 10 crop | 42.94G | 26.64M | [config](/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb_20220815-e13db2e9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.log) | + +### Something-Something V2 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :----------------: | :----: | :----: | :-------------------: | :-----------------: | :-----------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 47.65 / 51.20 | 76.27 / 78.42 | 16 clips x 10 crop | 42.94G | 26.64M | [config](/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20220815-e01617db.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. There are two kinds of test settings for Something-Something dataset, efficient setting (center crop only) and accurate setting (three crop and `twice_sample`). +3. In the original [repository](https://github.com/zhoubolei/TRN-pytorch), the author augments data with random flipping on something-something dataset, but the augmentation method may be wrong due to the direct actions, such as `push left to right`. So, we replaced `flip` with `flip with label mapping`, and change the testing method `TenCrop`, which has five flipped crops, to `Twice Sample & ThreeCrop`. +4. We use `ResNet50` instead of `BNInception` as the backbone of TRN. When Training `TRN-ResNet50` on sthv1 dataset in the original repository, we get top1 (top5) accuracy 30.542 (58.627) vs. ours 31.81 (60.47). + +For more details on data preparation, you can refer to [Something-something V1](/tools/data/sthv1/README.md) and [Something-something V2](/tools/data/sthv2/README.md). + +## Train + +You can use the following command to train a model. 
+ +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train TRN model on sthv1 dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test TRN model on sthv1 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@article{zhou2017temporalrelation, + title = {Temporal Relational Reasoning in Videos}, + author = {Zhou, Bolei and Andonian, Alex and Oliva, Aude and Torralba, Antonio}, + journal={European Conference on Computer Vision}, + year={2018} +} +``` diff --git a/configs/recognition/trn/metafile.yml b/configs/recognition/trn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..e440a01a3166349ed2429218d6f057b5c8eb6434 --- /dev/null +++ b/configs/recognition/trn/metafile.yml @@ -0,0 +1,57 @@ +Collections: + - Name: TRN + README: configs/recognition/trn/README.md + Paper: + URL: https://arxiv.org/abs/1711.08496 + Title: 'Temporal Relational Reasoning in Videos' + +Models: + - Name: trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb + Config: configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py + In Collection: TRN + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 50 + FLOPs: 42.94G + params: 26.64M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 33.65 + Top 1 Accuracy (efficient): 31.60 + Top 5 Accuracy: 62.22 + Top 5 Accuracy (efficient): 60.15 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb_20220815-e13db2e9.pth + + - Name: trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb + Config: configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py + In Collection: TRN + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 50 + FLOPs: 42.94G + params: 26.64M + Pretrained: 224x224 + Resolution: height 240 + Training Data: SthV2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 51.20 + Top 1 Accuracy (efficient): 47.65 + Top 5 Accuracy: 78.42 + Top 5 Accuracy (efficient): 76.27 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20220815-e01617db.pth diff --git a/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py b/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..617820924a5ce204f9011c72914e66b7331916fa --- /dev/null +++ b/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py @@ -0,0 +1,94 @@ +_base_ = ['trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py'] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +file_client_args = dict(io_backend='disk') + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(img=data_root), + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline, + test_mode=True)) diff --git a/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py new file mode 100644 index 
0000000000000000000000000000000000000000..ed2c91e47af8c22b1d414d5eb22e90489919204a --- /dev/null +++ b/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py @@ -0,0 +1,123 @@ +_base_ = ['../../_base_/models/trn_r50.py', '../../_base_/default_runtime.py'] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=False), + optimizer=dict(type='SGD', lr=0.002, momentum=0.9, weight_decay=5e-4), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[30, 45], + gamma=0.1) +] + +default_hooks = 
dict(checkpoint=dict(max_keep_ckpts=3)) + +find_unused_parameters = True diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8e11574624b0254b8dc4b5153054712ecd5b91a1 --- /dev/null +++ b/configs/recognition/tsm/README.md @@ -0,0 +1,103 @@ +# TSM + +[TSM: Temporal Shift Module for Efficient Video Understanding](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_TSM_Temporal_Shift_Module_for_Efficient_Video_Understanding_ICCV_2019_paper.html) + + + +## Abstract + + + +The explosive growth in video streaming gives rise to challenges on performing video understanding at high accuracy and low computation cost. Conventional 2D CNNs are computationally cheap but cannot capture temporal relationships; 3D CNN based methods can achieve good performance but are computationally intensive, making it expensive to deploy. In this paper, we propose a generic and effective Temporal Shift Module (TSM) that enjoys both high efficiency and high performance. Specifically, it can achieve the performance of 3D CNN but maintain 2D CNN's complexity. TSM shifts part of the channels along the temporal dimension; thus facilitate information exchanged among neighboring frames. It can be inserted into 2D CNNs to achieve temporal modeling at zero computation and zero parameters. We also extended TSM to online setting, which enables real-time low-latency online video recognition and video object detection. TSM is accurate and efficient: it ranks the first place on the Something-Something leaderboard upon publication; on Jetson Nano and Galaxy Note8, it achieves a low latency of 13ms and 35ms for online video recognition. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :--------------------------: | -------------------------: | -------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 73.18 | 90.56 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 73.22 | 90.22 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb_20220831-a6db1e5d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 75.12 | 91.55 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb_20220831-042b1748.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.log) | +| 1x1x8 (dense) | 224x224 | 8 | ResNet50 | ImageNet | 73.38 | 90.78 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb_20220831-f55d3c2b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.49 | 91.15 | 8 clips x 10 crop | 61.30G | 31.68M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb_20220831-108bfde5.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.66 | 90.99 | 8 clips x 10 crop | 59.06G | 28.00M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-7e54dacf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.34 | 91.23 | 8 clips x 10 crop | 61.30G | 31.68M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-35eddb57.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | 224x224 | 8 | MobileNetV2 | ImageNet | 68.71 | 88.32 | 8 clips x 3 crop | 3.269G | 2.736M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb_20230414-401127fd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.log) | +| 1x1x16 | 224x224 | 8 | MobileOne-S4 | ImageNet | 74.38 | 91.71 | 16 clips x 10 crop | 48.65G | 13.72M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb_20230825-a7f8876b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.log) | + +### Something-something V2 + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 3 crop | 32.88G | 23.87M | 
[config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 3 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 63.70 | 88.28 | 8 clips x 3 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. +3. MoibleOne backbone supports reparameterization during inference. You can use the provided [reparameterize tool](/tools/convert/reparameterize_model.py) to convert the checkpoint and switch to the [deploy config file](/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_deploy_8xb16-1x1x16-50e_kinetics400-rgb.py). + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train TSM model on Kinetics-400 dataset in a deterministic option. + +```shell +python tools/train.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. 
+ +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test TSM model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{lin2019tsm, + title={TSM: Temporal Shift Module for Efficient Video Understanding}, + author={Lin, Ji and Gan, Chuang and Han, Song}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + year={2019} +} +``` + + + +```BibTeX +@article{Nonlocal2018, + author = {Xiaolong Wang and Ross Girshick and Abhinav Gupta and Kaiming He}, + title = {Non-local Neural Networks}, + journal = {CVPR}, + year = {2018} +} +``` diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..0be4599404ad67263e429d65f166e8e018dc5ca9 --- /dev/null +++ b/configs/recognition/tsm/metafile.yml @@ -0,0 +1,284 @@ +Collections: + - Name: TSM + README: configs/recognition/tsm/README.md + Paper: + URL: https://arxiv.org/abs/1811.08383 + Title: "TSM: Temporal Shift Module for Efficient Video Understanding" + +Models: + - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 32.88G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.18 + Top 5 Accuracy: 90.56 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth + + - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 32.88G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.22 + Top 5 Accuracy: 90.22 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb_20220831-a6db1e5d.pth + + - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.py + In Collection: TSM + 
Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 65.75G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.12 + Top 5 Accuracy: 91.55 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb_20220831-042b1748.pth + + - Name: tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 32.88G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.38 + Top 5 Accuracy: 90.78 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb_20220831-f55d3c2b.pth + + - Name: tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 61.30G + Parameters: 31.68M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.34 + Top 5 Accuracy: 91.23 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-35eddb57.pth + + - Name: tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 61.30G + Parameters: 31.68M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.49 + Top 5 Accuracy: 91.15 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb_20220831-108bfde5.pth + + - Name: tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 59.06G + Parameters: 28.00M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.66 + Top 5 Accuracy: 90.99 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-7e54dacf.pth + + - Name: tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py + In Collection: TSM + Metadata: + Architecture: MobileOne-S4 + Batch Size: 16 + Epochs: 100 + FLOPs: 48.65G + Parameters: 13.72M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.38 + Top 5 Accuracy: 91.71 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb_20230825-a7f8876b.pth + + + - Name: tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py + In Collection: TSM + Metadata: + Architecture: MobileNetV2 + Batch Size: 16 + Epochs: 100 + FLOPs: 3.269G + Parameters: 2.736M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 63.70 + Top 5 Accuracy: 88.28 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth + + - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb + Config: 
configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 100 + FLOPs: 32.88G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 62.72 + Top 5 Accuracy: 87.70 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth + + - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 50 + FLOPs: 65.75G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 64.16 + Top 5 Accuracy: 88.61 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth + + - Name: tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb + Config: configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py + In Collection: TSM + Metadata: + Architecture: ResNet101 + Batch Size: 16 + Epochs: 50 + FLOPs: 62.66G + Parameters: 42.86M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 63.70 + Top 5 Accuracy: 88.28 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..9a896265b6455bbd5874d47deda5e5475fcdf5b8 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,124 @@ +_base_ = [ + '../../_base_/models/tsm_mobilenet_v2.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + 
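+    # TSN-style sparse sampling: 8 segments, one frame each, decoded with Decord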
dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
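+# When enabled, the learning rate above is scaled linearly by
+#   (actual total batch size) / base_batch_size.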
+auto_scale_lr = dict(enable=True, base_batch_size=128) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..865f69cfb651ba13ec8a91b1e6b90b9e2425315a --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py @@ -0,0 +1,126 @@ +_base_ = [ + '../../_base_/models/tsm_mobileone_s4.py', + '../../_base_/default_runtime.py' +] + +model = dict(cls_head=dict(num_segments=16)) +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, 
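+        # drop the learning rate by `gamma` at these epochs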
+ milestones=[25, 45], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=128) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_deploy_8xb16-1x1x16-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_deploy_8xb16-1x1x16-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..077dd64ff8799bb30169c38a1458bd7d5e020490 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobileone-s4_deploy_8xb16-1x1x16-50e_kinetics400-rgb.py @@ -0,0 +1,5 @@ +_base_ = [ + './tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py', # noqa: E501 +] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0e7531f4873a0a1143fe8848fb4db04d8560e129 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -0,0 +1,6 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py'] + +# model settings +r101_checkpoint = 'torchvision://resnet101' + +model = dict(backbone=dict(pretrained=r101_checkpoint, depth=101)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7affb458486ec6989b3d434094b9291efd520169 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,12 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py'] + +# model settings +model = dict( + backbone=dict( + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='dot_product'), + )) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..b42c793fecc01d43f438fe5061bdd2c262e718e5 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,12 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py'] + +# model settings +model = dict( + backbone=dict( + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian'), + )) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py 
b/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0f223ecbcc0c612dfdddb2d346379661eaa42a37 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,12 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py'] + +# model settings +model = dict( + backbone=dict( + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='gaussian'), + )) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e9ca076cb7dce84206007a03f66a7a6d0c87df68 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_kinetics400-rgb.py @@ -0,0 +1,92 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py'] + +# model settings +model = dict(backbone=dict(num_segments=16), cls_head=dict(num_segments=16)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='DecordDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + 
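+    # each test video expands to 16 clips x 10 crops (TenCrop), so keep batch_size=1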
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..db1a43de6554d50336c2c8216ad5476f60a4f14f --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -0,0 +1,59 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py'] + +model = dict(backbone=dict(num_segments=16), cls_head=dict(num_segments=16)) + +file_client_args = dict(io_backend='disk') + +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + twice_sample=True, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +val_dataloader = dict(dataset=dict(pipeline=val_pipeline)) + +test_dataloader = dict(dataset=dict(pipeline=test_pipeline, test_mode=True)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..b505a7c0bee55eb047048fbbdd703780292272e2 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,21 @@ +_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py'] + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=10), + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[50, 90], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py
b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..cb37432b10d79d22e186953577443e9e863edc10 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,122 @@ +_base_ = ['../../_base_/models/tsm_r50.py', '../../_base_/default_runtime.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[25, 45], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, 
norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2654d15cb5e71d53f9eaeb2b40a82cbb294e7587 --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py @@ -0,0 +1,126 @@ +_base_ = ['../../_base_/models/tsm_r50.py', '../../_base_/default_runtime.py'] + +# model settings +model = dict(cls_head=dict(num_classes=174, dropout_ratio=0.5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True, + twice_sample=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = 
dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[25, 45], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0005), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..441a0b93aadedd67639c78be65ca8883234d3aba --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-dense-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,122 @@ +_base_ = ['../../_base_/models/tsm_r50.py', '../../_base_/default_runtime.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( 
+ type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[25, 45], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..824f3437c919e1a99ce1d1a828b30e908b6c78ad --- /dev/null +++ b/configs/recognition/tsn/README.md @@ -0,0 +1,110 @@ +# TSN + +[Temporal segment networks: Towards good practices for deep action recognition](https://link.springer.com/chapter/10.1007/978-3-319-46484-8_2) + + + +## Abstract + + + +Deep convolutional networks have achieved great success for visual recognition in still images. However, for action recognition in videos, the advantage over traditional methods is not so evident. This paper aims to discover the principles to design effective ConvNet architectures for action recognition in videos and learn these models given limited training samples. Our first contribution is temporal segment network (TSN), a novel framework for video-based action recognition, which is based on the idea of long-range temporal structure modeling. It combines a sparse temporal sampling strategy and video-level supervision to enable efficient and effective learning using the whole action video. The other contribution is our study on a series of good practices in learning ConvNets on video data with the help of temporal segment network. Our approach obtains the state-of-the-art performance on the datasets of HMDB51 (69.4%) and UCF101 (94.2%). We also visualize the learned ConvNet models, which qualitatively demonstrates the effectiveness of temporal segment network and the proposed good practices. + + +
+ +
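+The recipe in the abstract is compact: split each video into a few segments, sparsely sample one snippet per segment, score every snippet with a shared 2D ConvNet, and fuse the per-segment scores with a consensus function (average by default). The toy sketch below only illustrates that sampling-and-consensus idea; it is a self-contained PyTorch snippet, not MMAction2 code, and all names in it are made up for illustration.
+
+```python
+import torch
+
+
+def sample_segment_indices(num_frames: int, num_segments: int = 3) -> list:
+    """Sparse sampling: draw one frame index from each equal-length segment."""
+    seg_len = num_frames // num_segments
+    return [i * seg_len + int(torch.randint(seg_len, (1,))) for i in range(num_segments)]
+
+
+# Video-level consensus: average the per-segment class scores.
+num_segments, num_classes = 3, 400                        # e.g. Kinetics-400
+segment_scores = torch.randn(num_segments, num_classes)   # scores from the shared backbone
+video_score = segment_scores.mean(dim=0)                  # TSN's average consensus
+print(sample_segment_indices(num_frames=300), video_score.shape)
+```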
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :-------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :------------------------------: | -----------------------------: | ----------------------------: | +| 1x1x3 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.log) | +| 1x1x5 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 73.80 | 91.21 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb_20220906-65d68713.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 74.12 | 91.34 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.log) | +| dense-1x1x5 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 71.37 | 89.67 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb_20220906-dcbc6e01.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | ResNet101 | ImageNet | 75.89 | 92.07 | 25 clips x 10 crop | 195.8G | 43.32M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb_20220906-23cff032.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.log) | + +### Something-Something V2 + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :-------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :------------------------------: | -----------------------------: | -----------------------------: | +| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 35.51 | 67.09 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.91 | 68.77 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | + +### Using backbones from 3rd-party in TSN + +It's possible and convenient to use a 3rd-party backbone for TSN under the framework of MMAction2, here we provide some examples for: + +- [x] Backbones from [MMClassification](https://github.com/open-mmlab/mmclassification/) +- [x] Backbones from [MMPretrain](https://github.com/open-mmlab/mmpretrain) +- [x] Backbones from [TorchVision](https://github.com/pytorch/vision/) +- [x] Backbones from [TIMM (pytorch-image-models)](https://github.com/rwightman/pytorch-image-models) + +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :-------: | :--------: | :--: | :--------------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :---------------------------: | ---------------------------: | --------------------------: | +| 1x1x3 | MultiStep | 224x224 | 8 | ResNext101 | ImageNet | 72.95 | 90.36 | 25 clips x 10 crop | 200.3G | 42.95M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb_20221209-de2d5615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.log) | +| 1x1x3 | MultiStep | 224x224 | 8 | DenseNet161 | ImageNet | 72.07 | 90.15 | 25 clips x 10 crop | 194.6G | 
27.36M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log) | +| 1x1x3 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 77.03 | 92.61 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 79.22 | 94.20 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | MobileOne-S4 | ImageNet | 73.65 | 91.32 | 25 clips x 10 crop | 76G | 13.72M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb_20230825-2da3c1f7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.log) | + +1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. +2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). 
The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. +4. The MobileOne backbone supports reparameterization during inference. You can use the provided [reparameterize tool](/tools/convert/reparameterize_model.py) to convert the checkpoint and switch to the [deploy config file](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_deploy_8xb32-1x1x8-100e_kinetics400-rgb.py). + +For more details on data preparation, you can refer to + +- [Kinetics](/tools/data/kinetics/README.md) +- [Something-something V2](/tools/data/sthv2/README.md) + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the TSN model on the Kinetics-400 dataset with deterministic behavior. + +```shell +python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the TSN model on the Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
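+
+Besides `tools/test.py`, a trained checkpoint can also be called from Python. The snippet below is a minimal sketch using MMAction2's high-level inference API (`init_recognizer` / `inference_recognizer` from `mmaction.apis`); the checkpoint and video paths are placeholders, not shipped files.
+
+```python
+from mmaction.apis import inference_recognizer, init_recognizer
+
+config = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
+checkpoint = 'checkpoints/SOME_CHECKPOINT.pth'  # e.g. a ckpt downloaded from the tables above
+
+# Build the recognizer from the config/checkpoint and run it on a single video.
+model = init_recognizer(config, checkpoint, device='cuda:0')  # or device='cpu'
+result = inference_recognizer(model, 'path/to/video.mp4')
+# `result` is an action data sample; inspect it for the predicted scores/label.
+print(result)
+```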
+ +## Citation + +```BibTeX +@inproceedings{wang2016temporal, + title={Temporal segment networks: Towards good practices for deep action recognition}, + author={Wang, Limin and Xiong, Yuanjun and Wang, Zhe and Qiao, Yu and Lin, Dahua and Tang, Xiaoou and Van Gool, Luc}, + booktitle={European conference on computer vision}, + pages={20--36}, + year={2016}, + organization={Springer} +} +``` diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2b6a83eebd8e1f5b1a3b0400308d025b9ef4d72f --- /dev/null +++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,34 @@ +_base_ = ['../tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + type='torchvision.densenet161', pretrained=True, _delete_=True), + cls_head=dict(in_channels=2208)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2c9528ffcc6e63e1704e4d08216d4fcdbaa20f42 --- /dev/null +++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,75 @@ +_base_ = ['../tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +# dataset settings +checkpoint = ('https://download.openmmlab.com/mmclassification/' + 'v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.pth') +model = dict( + backbone=dict( + type='mmpretrain.MobileOne', + arch='s4', + out_indices=(3, ), + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, prefix='backbone'), + _delete_=True), + cls_head=dict(in_channels=2048)) + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + 
max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_deploy_8xb32-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_deploy_8xb32-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..16adcbfa7c6c02bde040fb05c48205aeb76c4bcc --- /dev/null +++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_deploy_8xb32-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,5 @@ +_base_ = [ + './tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 +] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..97870bb7059e9d59938ffc2eee8ea8bccb42a756 --- /dev/null +++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,17 @@ +_base_ = ['../tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +checkpoint = ('https://download.openmmlab.com/mmclassification/v0/resnext/' + 'resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth') + +model = dict( + backbone=dict( + type='mmcls.ResNeXt', + depth=101, + num_stages=4, + out_indices=(3, ), + groups=32, + width_per_group=4, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, prefix='backbone'), + _delete_=True)) diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6e4d47b1881ef1490d6b5640e7769f3d6e72458f --- /dev/null +++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,21 @@ +_base_ = ['../tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + type='timm.swin_base_patch4_window7_224', + pretrained=True, + _delete_=True), + cls_head=dict(in_channels=1024)) + +train_cfg = dict( + 
type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..af25654a90f57d570b56bccecf7bd95669dcd467 --- /dev/null +++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,9 @@ +_base_ = ['../tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + type='timm.swin_base_patch4_window7_224', + pretrained=True, + feature_shape='NHWC', + _delete_=True), + cls_head=dict(in_channels=1024)) diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac0474a435633bb4b3aa55def751b551fdcfa6b1 --- /dev/null +++ b/configs/recognition/tsn/metafile.yml @@ -0,0 +1,285 @@ +Collections: + - Name: TSN + README: configs/recognition/tsn/README.md + Paper: + URL: https://arxiv.org/abs/1608.00859 + Title: "Temporal Segment Networks: Towards Good Practices for Deep Action Recognition" + +Models: + - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb + Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 102.7G + Parameters: 24.33M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 72.83 + Top 5 Accuracy: 90.65 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth + + - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb + Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 102.7G + Parameters: 24.33M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.80 + Top 5 Accuracy: 91.21 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb_20220906-65d68713.pth + + - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb + Alias: + - TSN + Config: 
configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 102.7G + Parameters: 24.33M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 74.12 + Top 5 Accuracy: 91.34 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth + + - Name: tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb + Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 100 + FLOPs: 102.7G + Parameters: 24.33M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 71.37 + Top 5 Accuracy: 89.67 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb_20220906-dcbc6e01.pth + + - Name: tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb + Config: configs/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet101 + Batch Size: 32 + Epochs: 100 + FLOPs: 195.8G + Parameters: 43.32M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.89 + Top 5 Accuracy: 92.07 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb_20220906-23cff032.pth + + - Name: tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb + Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNext101 + Batch Size: 32 + Epochs: 100 + FLOPs: 200.3G + Parameters: 42.95M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 72.95 + Top 5 Accuracy: 90.36 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb_20221209-de2d5615.pth + + - Name: tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb + Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: DenseNet161 + Batch Size: 32 + Epochs: 100 + FLOPs: 194.6G + Parameters: 27.36M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 72.07 + Top 5 Accuracy: 90.15 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth + + - Name: tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb + Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: Swin-base + Batch Size: 32 + Epochs: 100 + FLOPs: 386.7G + Parameters: 87.15M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 77.03 + Top 5 Accuracy: 92.61 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth + + - Name: tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb + Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: Swin-base + Batch Size: 8 + Epochs: 50 + FLOPs: 386.7G + Parameters: 87.15M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 32 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 79.22 + Top 5 Accuracy: 94.20 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth + +
- Name: tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb + Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py + In Collection: TSN + Metadata: + Architecture: MobileOne-S4 + Batch Size: 32 + Epochs: 100 + FLOPs: 76G + Parameters: 13.72M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.65 + Top 5 Accuracy: 91.32 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb_20230825-2da3c1f7.pth + + - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb + Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 50 + FLOPs: 102.7G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 35.51 + Top 5 Accuracy: 67.09 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth + + - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb + Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py + In Collection: TSN + Metadata: + Architecture: ResNet50 + Batch Size: 32 + Epochs: 50 + FLOPs: 102.7G + Parameters: 23.87M + Pretrained: ImageNet + Resolution: 224x224 + Training Data: SthV2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 36.91 + Top 5 Accuracy: 68.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..958fdf7899999884e069be8e6c1831eb5a051b43 --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r101_8xb32-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,7 @@ +_base_ = ['tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'] + +model = dict( + backbone=dict( + pretrained=('https://download.pytorch.org/' + 'models/resnet101-cd907fc2.pth'), + depth=101)) diff --git
a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..eb7c578d4086d93ae34479047daf95e724f4acaf --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -0,0 +1,56 @@ +_base_ = ['tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py'] + +file_client_args = dict(io_backend='disk') + +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +val_dataloader = dict(dataset=dict(pipeline=val_pipeline)) + +test_dataloader = dict(dataset=dict(pipeline=test_pipeline, test_mode=True)) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0a97a9c223fa0bd27b0a7466f7800b10d18cd888 --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,102 @@ +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', 
**file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..f03a2da78e1c750ad7caecd2a56ff1bbfaa35be0 --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x5-100e_kinetics400-rgb.py @@ -0,0 +1,63 @@ +_base_ = ['tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..3357be08d08db36f970c437e0bb7169a4cab10ba --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,63 @@ +_base_ = ['tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), 
+ dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..be7e090ceb9fe4cedd760f6aa9c7cda282dbd8b7 --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py @@ -0,0 +1,108 @@ +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174, dropout_ratio=0.5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + 
batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=5) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c991c710338cbb080f7f58379df2a059af6de2ef --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb.py @@ -0,0 +1,95 @@ +_base_ = ['tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'] + +# model settings +model = dict(cls_head=dict(dropout_ratio=0.5, init_std=0.001)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=5), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + 
num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.06, momentum=0.9, weight_decay=0.0001)) diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py new file mode 100644 index 0000000000000000000000000000000000000000..d829b7dda82cc19954a7a4478cb3174419dbeade --- /dev/null +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32_5x1x3-110e_kinetics400-flow.py @@ -0,0 +1,141 @@ +_base_ = '../../_base_/default_runtime.py' + +clip_len = 5 + +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + depth=50, + in_channels=2 * clip_len, # ``in_channels`` should be 2 * clip_len + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[128, 128] * clip_len, # ``in_channels`` should be 2 * clip_len + std=[128, 128] * clip_len, # ``in_channels`` should be 2 * clip_len + format_shape='NCHW')) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_flow.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_flow.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_flow.txt' +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict( + type='SampleFrames', clip_len=clip_len, frame_interval=1, num_clips=3), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=clip_len, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=clip_len, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + 
data_prefix=dict(img=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=110, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=110, + by_epoch=True, + milestones=[70, 100], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/recognition/uniformer/README.md b/configs/recognition/uniformer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..27d8a4d893a0292669acf33693bf141772fc4235 --- /dev/null +++ b/configs/recognition/uniformer/README.md @@ -0,0 +1,67 @@ +# UniFormer + +[UniFormer: Unified Transformer for Efficient Spatiotemporal Representation Learning](https://arxiv.org/abs/2201.04676) + + + +## Abstract + + + +It is a challenging task to learn rich and multi-scale spatiotemporal semantics from high-dimensional videos, due to large local redundancy and complex global dependency between video frames. The recent advances in this research have been mainly driven by 3D convolutional neural networks and vision transformers. Although 3D convolution can efficiently aggregate local context to suppress local redundancy from a small 3D neighborhood, it lacks the capability to capture global dependency because of the limited receptive field. Alternatively, vision transformers can effectively capture long-range dependency by self-attention mechanism, while having the limitation on reducing local redundancy with blind similarity comparison among all the tokens in each layer. Based on these observations, we propose a novel Unified transFormer (UniFormer) which seamlessly integrates merits of 3D convolution and spatiotemporal self-attention in a concise transformer format, and achieves a preferable balance between computation and accuracy. Different from traditional transformers, our relation aggregator can tackle both spatiotemporal redundancy and dependency, by learning local and global token affinity respectively in shallow and deep layers. We conduct extensive experiments on the popular video benchmarks, e.g., Kinetics-400, Kinetics-600, and Something-Something V1&V2. 
With only ImageNet-1K pretraining, our UniFormer achieves 82.9%/84.8% top-1 accuracy on Kinetics-400/Kinetics-600, while requiring 10x fewer GFLOPs than other state-of-the-art methods. For Something-Something V1 and V2, our UniFormer achieves new state-of-the-art performances of 60.9% and 71.2% top-1 accuracy respectively. + + + +
+ +
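+As a quick sanity check on one of the released checkpoints, the snippet below is a minimal sketch of single-video inference through MMAction2's high-level helpers. It is an editorial illustration rather than part of the original usage docs: the `init_recognizer`/`inference_recognizer` helpers are assumed to be importable from `mmaction.apis`, and the config, checkpoint and video paths are placeholders.
+
+```python
+# Minimal sketch (assumes `mmaction.apis` provides these helpers and that the
+# paths below point to a real config, checkpoint and video on disk).
+from mmaction.apis import inference_recognizer, init_recognizer
+
+config = 'configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py'
+checkpoint = 'checkpoints/SOME_CHECKPOINT.pth'  # e.g. a ckpt linked in the table below
+video = 'demo/demo.mp4'  # placeholder video path
+
+model = init_recognizer(config, checkpoint, device='cpu')  # or 'cuda:0'
+result = inference_recognizer(model, video)
+print(result)  # an action data sample holding the predicted class scores
+```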
+
+## Results and Models
+
+### Kinetics-400
+
+| frame sampling strategy | resolution | backbone | top1 acc | top5 acc | [reference](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) top1 acc | [reference](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt |
+| :---------------------: | :------------: | :---------: | :------: | :------: | :-------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 16x4x1 | short-side 320 | UniFormer-S | 80.9 | 94.6 | 80.8 | 94.7 | 80.9 | 94.6 | 4 clips x 1 crop | 41.8G | 21.4M | [config](/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-c630a037.pth) |
+| 16x4x1 | short-side 320 | UniFormer-B | 82.0 | 95.0 | 82.0 | 95.1 | 82.0 | 95.0 | 4 clips x 1 crop | 96.7G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-157c2e66.pth) |
+| 32x4x1 | short-side 320 | UniFormer-B | 83.1 | 95.3 | 82.9 | 95.4 | 83.0 | 95.3 | 4 clips x 1 crop | 59G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth) |
+
+The models are ported from the repo [UniFormer](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) and tested on our data. Currently, we only support testing of the UniFormer models; training support will be available soon.
+
+1. The values in columns named after "reference" are the results of the original repo.
+2. The values in `top1/5 acc` are tested on the same data list as the original repo, and the label map is provided by [UniFormer](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL). The full test set, which consists of 19787 videos, is available at [Kinetics400](https://pan.baidu.com/s/1t5K0FRz3PGAT-37-3FwAfg) (BaiduYun password: g5kp).
+3. The values in columns named after "mm-Kinetics" are the testing results on the Kinetics dataset held by MMAction2, which is also used by other models in MMAction2. Due to the differences between various versions of the Kinetics dataset, there is a small gap between `top1/5 acc` and `mm-Kinetics top1/5 acc`. For a fair comparison with other models, we report both results here. Note that we only report inference results; since the training set differs between UniFormer and other models, the results are lower than those tested on the author's version.
+4. 
Since the original models for Kinetics-400/600/700 adopt different [label file](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL), we simply map the weight according to the label name. New label map for Kinetics-400/600/700 can be found [here](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/kinetics). +5. Due to some difference between [SlowFast](https://github.com/facebookresearch/SlowFast) and MMAction2, there are some gaps between their performances. + +For more details on data preparation, you can refer to [preparing_kinetics](/tools/data/kinetics/README.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test UniFormer-S model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{ + li2022uniformer, + title={UniFormer: Unified Transformer for Efficient Spatial-Temporal Representation Learning}, + author={Kunchang Li and Yali Wang and Gao Peng and Guanglu Song and Yu Liu and Hongsheng Li and Yu Qiao}, + booktitle={International Conference on Learning Representations}, + year={2022}, + url={https://openreview.net/forum?id=nBU_u6DLvoK} +} +``` diff --git a/configs/recognition/uniformer/README_zh-CN.md b/configs/recognition/uniformer/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..bc1086aab03b412a557f689728b05bec4d315d6e --- /dev/null +++ b/configs/recognition/uniformer/README_zh-CN.md @@ -0,0 +1,55 @@ +# UniFormer + +[UniFormer: Unified Transformer for Efficient Spatiotemporal Representation Learning](https://arxiv.org/abs/2201.04676) + + + +## ็ฎ€ไป‹ + +```BibTeX +@inproceedings{ + li2022uniformer, + title={UniFormer: Unified Transformer for Efficient Spatial-Temporal Representation Learning}, + author={Kunchang Li and Yali Wang and Gao Peng and Guanglu Song and Yu Liu and Hongsheng Li and Yu Qiao}, + booktitle={International Conference on Learning Representations}, + year={2022}, + url={https://openreview.net/forum?id=nBU_u6DLvoK} +} +``` + +## ๆจกๅž‹ๅบ“ + +### Kinetics-400 + +| ๅธง้‡‡ๆ ท็ญ–็•ฅ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) top1 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) top5 ๅ‡†็กฎ็އ | mm-Kinetics top1 ๅ‡†็กฎ็އ | mm-Kinetics top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๆ–นๆกˆ | FLOPs | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | +| :--------: | :------------: | :---------: | :---------: | :---------: | :---------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 16x4x1 | short-side 320 | UniFormer-S | 80.9 
| 94.6 | 80.8 | 94.7 | 80.9 | 94.6 | 4 clips x 1 crop | 41.8G | 21.4M | [config](/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-c630a037.pth) | +| 16x4x1 | short-side 320 | UniFormer-B | 82.0 | 95.0 | 82.0 | 95.1 | 82.0 | 95.0 | 4 clips x 1 crop | 96.7G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-157c2e66.pth) | +| 32x4x1 | short-side 320 | UniFormer-B | 83.1 | 95.3 | 82.9 | 95.4 | 83.0 | 95.3 | 4 clips x 1 crop | 59G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth) | + +่ฟ™ไบ›ๆจกๅž‹่ฟ็งป่‡ช [UniFormer](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md)ไป“ๅบ“๏ผŒๅนถๅœจๆˆ‘ไปฌ็š„ๆ•ฐๆฎไธŠ่ฟ›่กŒไบ†ๆต‹่ฏ•ใ€‚็›ฎๅ‰๏ผŒๆˆ‘ไปฌไป…ๆ”ฏๆŒๅฏน UniFormer ๆจกๅž‹็š„ๆต‹่ฏ•๏ผŒ่ฎญ็ปƒๅŠŸ่ƒฝๅฐ†ๅพˆๅฟซๆไพ›ใ€‚ + +1. ๅ็งฐไธบ"ๅ‚่€ƒๆ–‡็Œฎ"็š„ๅˆ—ไธญ็š„ๅ€ผๆ˜ฏๅŽŸๅง‹ไป“ๅบ“็š„็ป“ๆžœใ€‚ +2. `top1/5 ๅ‡†็กฎ็އ`ไธญ็š„ๅ€ผๆ˜ฏๆจกๅž‹ๅœจไธŽๅŽŸๅง‹ไป“ๅบ“็›ธๅŒ็š„ๆ•ฐๆฎ้›†ไธŠ็š„ๆต‹่ฏ•็ป“ๆžœ๏ผŒๅˆ†็ฑปๅ™จ็ป“ๆžœ-ๆ ‡็ญพๆ˜ ๅฐ„ไธŽ[UniFormer](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL)ไธ€่‡ดใ€‚ๆ•ฐๆฎ้›†ๆ€ปๅ…ฑๆœ‰19787ไธช่ง†้ข‘๏ผŒๅฏไปฅๅœจ[Kinetics400](https://pan.baidu.com/s/1t5K0FRz3PGAT-37-3FwAfg)๏ผˆ็™พๅบฆไบ‘ๅฏ†็ ๏ผšg5kp๏ผ‰ไธญ่Žทๅ–ใ€‚ +3. ๅ็งฐไธบ "mm-Kinetics" ็š„ๅˆ—ไธญ็š„ๅ€ผๆ˜ฏๆจกๅž‹ๅœจ MMAction2 ๆŒๆœ‰็š„ Kinetics ๆ•ฐๆฎ้›†ไธŠ็š„ๆต‹่ฏ•็ป“ๆžœ๏ผŒๅ…ถไป– MMAction2 ๆจกๅž‹ไนŸไฝฟ็”จไบ†่ฏฅๆ•ฐๆฎ้›†ใ€‚็”ฑไบŽ Kinetics ๆ•ฐๆฎ้›†็š„ๅ„ไธช็‰ˆๆœฌไน‹้—ดๅญ˜ๅœจๅทฎๅผ‚๏ผŒๅ› ๆญค `top1/5 ๅ‡†็กฎ็އ` ๅ’Œ `mm-Kinetics top1/5 ๅ‡†็กฎ็އ` ไน‹้—ดๅญ˜ๅœจไธ€ไบ›ๅทฎ่ทใ€‚ไธบไบ†ไธŽๅ…ถไป–ๆจกๅž‹่ฟ›่กŒๅ…ฌๅนณๆฏ”่พƒ๏ผŒๆˆ‘ไปฌๅœจ่ฟ™้‡ŒๆŠฅๅ‘Šไบ†ไธคไธช็ป“ๆžœใ€‚่ฏทๆณจๆ„๏ผŒๆˆ‘ไปฌๅชๆŠฅๅ‘Šไบ†ๆŽจ็†็ป“ๆžœ๏ผŒ็”ฑไบŽ UniFormer ๅ’Œๅ…ถไป–ๆจกๅž‹ไน‹้—ด็š„่ฎญ็ปƒ้›†ไธๅŒ๏ผŒ่ฏฅ็ป“ๆžœไฝŽไบŽๅœจไฝœ่€…็‰ˆๆœฌไธŠๆต‹่ฏ•็š„็ป“ๆžœใ€‚ +4. ็”ฑไบŽ Kinetics-400/600/700 ็š„ๅŽŸๅง‹ๆจกๅž‹้‡‡็”จไบ†ไธๅŒ็š„[ๆ ‡็ญพๆ–‡ไปถ](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL)๏ผŒๆˆ‘ไปฌๆ นๆฎๆ ‡็ญพๅ็งฐ็ฎ€ๅ•ๅœฐๆ˜ ๅฐ„ไบ†ๆƒ้‡ใ€‚Kinetics-400/600/700 ็š„ๆ–ฐๆ ‡็ญพๆ˜ ๅฐ„ๅฏไปฅๅœจ[่ฟ™้‡Œ](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/kinetics)ๆ‰พๅˆฐใ€‚ +5. 
็”ฑไบŽ [SlowFast](https://github.com/facebookresearch/SlowFast) ๅ’Œ MMAction2 ไน‹้—ดๅญ˜ๅœจไธ€ไบ›ๅทฎๅผ‚๏ผŒๅฎƒไปฌ็š„ๆ€ง่ƒฝๅญ˜ๅœจไธ€ไบ›ๅทฎ่ทใ€‚
+
+ๆœ‰ๅ…ณๆ•ฐๆฎๅ‡†ๅค‡็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒๆ‚จๅฏไปฅๅ‚่€ƒ[ๅ‡†ๅค‡_kinetics](/tools/data/kinetics/README_zh-CN.md)ใ€‚
+
+## ๅฆ‚ไฝ•ๆต‹่ฏ•
+
+ๆ‚จๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๆฅๆต‹่ฏ•ๆจกๅž‹๏ผš
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+็คบไพ‹๏ผšๅœจ Kinetics-400 ๆ•ฐๆฎ้›†ไธŠๆต‹่ฏ• UniFormer-S ๆจกๅž‹๏ผŒๅนถๅฐ†็ป“ๆžœ่ฝฌๅ‚จๅˆฐไธ€ไธช pkl ๆ–‡ไปถไธญใ€‚
+
+```shell
+python tools/test.py configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+ๆœ‰ๅ…ณๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒ[่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ไธญ็š„**ๆต‹่ฏ•**้ƒจๅˆ†ใ€‚ diff --git a/configs/recognition/uniformer/metafile.yml b/configs/recognition/uniformer/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..33784142eeed7cc6d198405c16b7282a940da3b0 --- /dev/null +++ b/configs/recognition/uniformer/metafile.yml @@ -0,0 +1,70 @@
+Collections:
+- Name: UniFormer
+  README: configs/recognition/uniformer/README.md
+  Paper:
+    URL: https://arxiv.org/abs/2201.04676
+    Title: "UniFormer: Unified Transformer for Efficient Spatiotemporal Representation Learning"
+
+Models:
+  - Name: uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb
+    Config: configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py
+    In Collection: UniFormer
+    Metadata:
+      Architecture: UniFormer-S
+      Pretrained: ImageNet-1K
+      Resolution: short-side 320
+      Frame: 16
+      Sampling rate: 4
+      Modality: RGB
+    Converted From:
+      Weights: https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md
+      Code: https://github.com/Sense-X/UniFormer/tree/main/video_classification
+    Results:
+      - Dataset: Kinetics-400
+        Task: Action Recognition
+        Metrics:
+          Top 1 Accuracy: 80.9
+          Top 5 Accuracy: 94.6
+    Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-c630a037.pth
+
+  - Name: uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb
+    Config: configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py
+    In Collection: UniFormer
+    Metadata:
+      Architecture: UniFormer-B
+      Pretrained: ImageNet-1K
+      Resolution: short-side 320
+      Frame: 16
+      Sampling rate: 4
+      Modality: RGB
+    Converted From:
+      Weights: https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md
+      Code: https://github.com/Sense-X/UniFormer/tree/main/video_classification
+    Results:
+      - Dataset: Kinetics-400
+        Task: Action Recognition
+        Metrics:
+          Top 1 Accuracy: 82.0
+          Top 5 Accuracy: 95.0
+    Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-157c2e66.pth
+
+  - Name: uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb
+    Config: configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py
+    In Collection: UniFormer
+    Metadata:
+      Architecture: UniFormer-B
+      Pretrained: ImageNet-1K
+      Resolution: short-side 320
+      Frame: 32
+      Sampling rate: 4
+      Modality: RGB
+    Converted From:
+      Weights: https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md
+      Code: https://github.com/Sense-X/UniFormer/tree/main/video_classification
+    Results:
+      - Dataset: Kinetics-400
+        Task: Action Recognition
+        Metrics:
+          Top 1 
Accuracy: 83.1 + Top 5 Accuracy: 95.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth diff --git a/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py b/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5164cb934cc967ae865a272cf58c680736c35045 --- /dev/null +++ b/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py @@ -0,0 +1,58 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormer', + depth=[5, 8, 20, 7], + embed_dim=[64, 128, 320, 512], + head_dim=64, + drop_path_rate=0.3), + cls_head=dict( + type='I3DHead', + dropout_ratio=0., + num_classes=400, + in_channels=512, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py b/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..015edb342f89ccefc61e4aa2862188c0a5265638 --- /dev/null +++ b/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py @@ -0,0 +1,58 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormer', + depth=[5, 8, 20, 7], + embed_dim=[64, 128, 320, 512], + head_dim=64, + drop_path_rate=0.3), + cls_head=dict( + type='I3DHead', + dropout_ratio=0., + num_classes=400, + in_channels=512, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=4, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py b/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..28ff38540b008a2cfa2ea9e3cec7f337da31c4ee --- /dev/null +++ b/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py @@ -0,0 +1,58 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormer', + depth=[3, 4, 8, 3], + embed_dim=[64, 128, 320, 512], + head_dim=64, + drop_path_rate=0.1), + cls_head=dict( + type='I3DHead', + dropout_ratio=0., + num_classes=400, + in_channels=512, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a596c703f0bfd1a5378a891cd087b62c2de4e27a --- /dev/null +++ b/configs/recognition/uniformerv2/README.md @@ -0,0 +1,110 @@ +# UniFormerV2 + +[UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer](https://arxiv.org/abs/2211.09552) + + + +## Abstract + + + +Learning discriminative spatiotemporal representation is the key problem of video understanding. Recently, Vision Transformers (ViTs) have shown their power in learning long-term video dependency with self-attention. Unfortunately, they exhibit limitations in tackling local video redundancy, due to the blind global comparison among tokens. UniFormer has successfully alleviated this issue, by unifying convolution and self-attention as a relation aggregator in the transformer format. However, this model has to require a tiresome and complicated image-pretraining phrase, before being finetuned on videos. This blocks its wide usage in practice. On the contrary, open-sourced ViTs are readily available and well-pretrained with rich image supervision. Based on these observations, we propose a generic paradigm to build a powerful family of video networks, by arming the pretrained ViTs with efficient UniFormer designs. We call this family UniFormerV2, since it inherits the concise style of the UniFormer block. 
But it contains brand-new local and global relation aggregators, which allow for preferable accuracy-computation balance by seamlessly integrating advantages from both ViTs and UniFormer. Without any bells and whistles, our UniFormerV2 gets the state-of-the-art recognition performance on 8 popular video benchmarks, including scene-related Kinetics-400/600/700 and Moments in Time, temporal-related Something-Something V1/V2, untrimmed ActivityNet and HACS. In particular, it is the first model to achieve 90% top-1 accuracy on Kinetics-400, to our best knowledge. + + + +
+ +
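+In the tables below, "uniform sampling" is the number of frames uniformly drawn from each video (u8/u16/u32), and "testing protocol" is the number of temporal clips times spatial crops averaged at test time. As an illustration only, an 8-frame, `4 clips x 3 crop` test pipeline written in the same config style as the UniFormer configs in this PR might look like the sketch below; the concrete transform arguments are assumptions, and the configs linked in each row are authoritative.
+
+```python
+# Illustrative sketch only: u8 uniform sampling with a "4 clips x 3 crop"
+# test-time protocol. See the linked configs for the settings actually used.
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True),  # u8, 4 clips
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 224)),
+    dict(type='ThreeCrop', crop_size=224),  # 3 spatial crops per clip
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='PackActionInputs')
+]
+```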
+ +## Results and Models + +### Kinetics-400 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | short-side 320 | UniFormerV2-B/16 | clip | - | - | 84.3 | 96.4 | 84.4 | 96.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | - | - | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - | +| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - | + +### Kinetics-600 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 86.1 | 97.2 | 86.4 | 97.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - | + +### Kinetics-700 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip | - | - | 75.8 | 92.8 | 75.9 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 76.3 | 92.7 | 
76.3 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - | + +### MiTv1 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - | + +### Kinetics-710 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :------: | :------: | :------: | :------------------------------------------: | :----------------------------------------: | :----------------------------------------: | +| 8 | Raw | UniFormerV2-B/16\* | clip | 78.9 | 94.2 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth) | - | + +The models with * are ported from the repo 
[UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Due to computational limitations, we only support a reliable training config for the base model (i.e. UniFormerV2-B/16).
+
+1. The values in columns named after "reference" are the results of the original repo.
+2. The values in `top1/5 acc` are tested on the same data list as the original repo, and the label map is provided by [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL).
+3. The values in columns named after "mm-Kinetics" are the testing results on the Kinetics dataset held by MMAction2, which is also used by other models in MMAction2. Due to the differences between various versions of the Kinetics dataset, there is a small gap between `top1/5 acc` and `mm-Kinetics top1/5 acc`. For a fair comparison with other models, we report both results here. Note that we only report inference results; since the training set differs between UniFormer and other models, the results are lower than those tested on the author's version.
+4. Since the original models for Kinetics-400/600/700 adopt different [label files](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL), we simply map the weights according to the label names. The new label maps for Kinetics-400/600/700 can be found [here](/tools/data/kinetics).
+5. Due to some differences between [SlowFast](https://github.com/facebookresearch/SlowFast) and MMAction2, there are some gaps between their performances.
+6. Kinetics-710 is used for pretraining, which helps improve the performance on other datasets efficiently. You can find more details in the [paper](https://arxiv.org/abs/2211.09552). We also map the weights for the Kinetics-710 checkpoints; you can find the label map [here](/tools/data/kinetics710/label_map_k710.txt).
+
+For more details on data preparation, you can refer to
+
+- [preparing_kinetics](/tools/data/kinetics/README.md)
+- [preparing_mit](/tools/data/mit/README.md)
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the UniFormerV2-B/16 model on the Kinetics-400 dataset and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@article{Li2022UniFormerV2SL,
+  title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer},
+  author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Y. 
Qiao}, + journal={ArXiv}, + year={2022}, + volume={abs/2211.09552} +} +``` diff --git a/configs/recognition/uniformerv2/README_zh-CN.md b/configs/recognition/uniformerv2/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ee1f9e84c512a9feecd096dd83c510ef5d002306 --- /dev/null +++ b/configs/recognition/uniformerv2/README_zh-CN.md @@ -0,0 +1,98 @@ +# UniFormerV2 + +[UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer](https://arxiv.org/abs/2211.09552) + + + +## ็ฎ€ไป‹ + +```BibTeX +@article{Li2022UniFormerV2SL, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Y. Qiao}, + journal={ArXiv}, + year={2022}, + volume={abs/2211.09552} +} +``` + +## ๆจกๅž‹ๅบ“ + +### Kinetics-400 + +| ๅ‡ๅŒ€้‡‡ๆ ทๅธงๆ•ฐ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | ไธŽ่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 ๅ‡†็กฎ็އ | mm-Kinetics top1 ๅ‡†็กฎ็އ | mm-Kinetics top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๆ–นๆกˆ | FLOPs | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | log | +| :----------: | :------------: | :--------------------: | :--------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | short-side 320 | UniFormerV2-B/16 | clip | - | - | 84.3 | 96.4 | 84.4 | 96.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | - | - | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - | +| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - | + +### Kinetics-600 + +| ๅ‡ๅŒ€้‡‡ๆ ทๅธงๆ•ฐ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 ๅ‡†็กฎ็އ | mm-Kinetics top1 ๅ‡†็กฎ็އ | mm-Kinetics top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๆ–นๆกˆ | FLOPs | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | log | +| :----------: | :----: | :--------------------: | :--------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 86.1 | 97.2 | 86.4 | 97.3 | 4 clips x 3 crop | 0.1T | 115M | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - | + +### Kinetics-700 + +| ๅ‡ๅŒ€้‡‡ๆ ทๅธงๆ•ฐ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 ๅ‡†็กฎ็އ | mm-Kinetics top1 ๅ‡†็กฎ็އ | mm-Kinetics top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๆ–นๆกˆ | FLOPs | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | log | +| :----------: | :----: | :--------------------: | :--------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip | - | - | 75.8 | 92.8 | 75.9 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 76.3 | 92.7 | 76.3 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - | + +### MiTv1 + +| ๅ‡ๅŒ€้‡‡ๆ ทๅธงๆ•ฐ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | [ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 ๅ‡†็กฎ็އ | 
[ๅ‚่€ƒๆ–‡็Œฎ](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๆ–นๆกˆ | FLOPs | ๅ‚ๆ•ฐ้‡ | config | ckpt | log | +| :----------: | :----: | :--------------------: | :--------------------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - | + +### Kinetics-710 + +| ๅ‡ๅŒ€้‡‡ๆ ทๅธงๆ•ฐ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | ้ข„่ฎญ็ปƒ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | config | ckpt | log | +| :----------: | :----: | :--------------------: | :----: | :---------: | :---------: | :-------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| 8 | Raw | UniFormerV2-B/16\* | clip | 78.9 | 94.2 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth) | - | + +ไปฅไธŠๅธฆๆœ‰ * ็š„ๆจกๅž‹ๆ˜ฏ่ฟ็งป่‡ช[UniFormerV2ไป“ๅบ“](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)๏ผŒๅนถๅœจๆˆ‘ไปฌ็š„ๆ•ฐๆฎไธŠ่ฟ›่กŒไบ†ๆต‹่ฏ•ใ€‚็”ฑไบŽ็ฎ—ๅŠ›้™ๅˆถ๏ผŒๆˆ‘ไปฌไป…ๆ”ฏๆŒๅŸบ็ก€ๆจกๅž‹๏ผˆๅณ UniFormerV2-B/16๏ผ‰่ฎญ็ปƒ้…็ฝฎ็š„ๅฏ้ ๆ€งใ€‚ + +1. "ๅ‚่€ƒๆ–‡็Œฎ"ๅˆ—ไธญ็š„ๆ•ฐๅ€ผๆ˜ฏๅŽŸๅง‹ไป“ๅบ“็š„็ป“ๆžœใ€‚ +2. `top1/5ๅ‡†็กฎ็އ` ไธญ็š„ๆ•ฐๅ€ผๆ˜ฏๅœจไธŽๅŽŸๅง‹ไป“ๅบ“็›ธๅŒ็š„ๆ•ฐๆฎไธŠ่ฟ›่กŒๆต‹่ฏ•ๅพ—ๅˆฐ็š„๏ผŒๅนถไธ”ๅˆ†็ฑปๅ™จ-ๆ ‡็ญพๆ˜ ๅฐ„ไธŽ [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL)ไธ€่‡ดใ€‚ +3. "mm-Kinetics" ๅˆ—ไธญ็š„ๆ•ฐๅ€ผๆ˜ฏๅœจ MMAction2 ๆŒๆœ‰็š„ Kinetics ๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒ็š„ๆต‹่ฏ•็ป“ๆžœ๏ผŒๅ…ถไป– MMAction2 ๆจกๅž‹ไนŸไฝฟ็”จไบ†่ฏฅๆ•ฐๆฎ้›†ใ€‚็”ฑไบŽๅ„ไธช็‰ˆๆœฌ็š„ Kinetics ๆ•ฐๆฎ้›†ไน‹้—ดๅญ˜ๅœจๅทฎๅผ‚๏ผŒ`top1/5ๅ‡†็กฎ็އ` ๅ’Œ `mm-Kinetics top1/5ๅ‡†็กฎ็އ` ไน‹้—ดๅญ˜ๅœจไธ€ไบ›ๅทฎๅผ‚ใ€‚ไธบไบ†ไธŽๅ…ถไป–ๆจกๅž‹่ฟ›่กŒๅ…ฌๅนณๆฏ”่พƒ๏ผŒๆˆ‘ไปฌๅœจ่ฟ™้‡ŒๆŠฅๅ‘Šไบ†ไธคไธช็ป“ๆžœใ€‚่ฏทๆณจๆ„๏ผŒๆˆ‘ไปฌๅชๆŠฅๅ‘ŠๆŽจๆ–ญ็ป“ๆžœ๏ผŒๅ› ไธบ UniFormer ๅ’Œๅ…ถไป–ๆจกๅž‹็š„่ฎญ็ปƒ้›†ไธๅŒ๏ผŒๆ‰€ไปฅ่ฏฅ็ป“ๆžœไฝŽไบŽๅœจไฝœ่€…็‰ˆๆœฌไธŠๆต‹่ฏ•็š„็ป“ๆžœใ€‚ +4. ็”ฑไบŽ Kinetics-400/600/700 ็š„ๅŽŸๅง‹ๆจกๅž‹้‡‡็”จไบ†ไธๅŒ็š„[ๆ ‡็ญพๆ–‡ไปถ](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL)๏ผŒๆˆ‘ไปฌๆ นๆฎๆ ‡็ญพๅ็งฐ็ฎ€ๅ•ๆ˜ ๅฐ„ๆƒ้‡ใ€‚Kinetics-400/600/700็š„ๆ–ฐๆ ‡็ญพๆ˜ ๅฐ„ๅฏไปฅๅœจ[่ฟ™้‡Œ](/tools/data/kinetics)ๆ‰พๅˆฐใ€‚ +5. ็”ฑไบŽ [SlowFast](https://github.com/facebookresearch/SlowFast)ๅ’Œ MMAction2 ไน‹้—ดๅญ˜ๅœจไธ€ไบ›ๅทฎๅผ‚๏ผŒๅฎƒไปฌ็š„ๆ€ง่ƒฝไน‹้—ดๅญ˜ๅœจไธ€ไบ›ๅทฎ่ทใ€‚ +6. 
ๆˆ‘ไปฌไฝฟ็”จKinetics-710่ฟ›่กŒ้ข„่ฎญ็ปƒ๏ผŒ่ฟ™ๆœ‰ๅŠฉไบŽๆ้ซ˜ๅ…ถไป–ๆ•ฐๆฎ้›†็š„ๆ€ง่ƒฝใ€‚ไฝ ๅฏไปฅๅœจ[่ฎบๆ–‡](https://arxiv.org/abs/2211.09552)ไธญๆ‰พๅˆฐๆ›ดๅคš็ป†่Š‚ใ€‚ๆˆ‘ไปฌ่ฟ˜ๆ นๆฎ Kinetics-710 ็š„ๆจกๅž‹ๆƒ้‡่ฟ›่กŒไบ†ๆƒ้‡ๆ˜ ๅฐ„๏ผŒไฝ ๅฏไปฅๅœจ[่ฟ™้‡Œ](/tools/data/kinetics710/label_map_k710.txt)ๆ‰พๅˆฐๆ ‡็ญพๆ˜ ๅฐ„ใ€‚ + +ๆœ‰ๅ…ณๆ•ฐๆฎๅ‡†ๅค‡็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒๅฏไปฅๅ‚่€ƒไปฅไธ‹้“พๆŽฅ๏ผš + +- [ๅ‡†ๅค‡ Kinetics ๆ•ฐๆฎ้›†](/tools/data/kinetics/README_zh-CN.md) +- [ๅ‡†ๅค‡ MIT ๆ•ฐๆฎ้›†](/tools/data/mit/README_zh-CN.md) + +## ๅฆ‚ไฝ•ๆต‹่ฏ• + +ๆ‚จๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๆฅๆต‹่ฏ•ๆจกๅž‹๏ผš + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +็คบไพ‹๏ผšๅœจ Kinetics-400 ๆ•ฐๆฎ้›†ไธŠๆต‹่ฏ• UniFormerV2-B/16 ๆจกๅž‹๏ผŒๅนถๅฐ†็ป“ๆžœ่ฝฌๅ‚จๅˆฐไธ€ไธชpklๆ–‡ไปถไธญใ€‚ + +```shell +python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +ๆœ‰ๅ…ณๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒ[่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ไธญ็š„**ๆต‹่ฏ•**้ƒจๅˆ†ใ€‚ diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json new file mode 100644 index 0000000000000000000000000000000000000000..055a6c7f3a972e1568d3c26d6a720b69afaa4871 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json @@ -0,0 +1 @@ +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json new file mode 100644 index 
0000000000000000000000000000000000000000..618ee5ba988dd2f4fd3c4e03029fad7195c993c0 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json @@ -0,0 +1 @@ +[0, 661, 611, 1, 694, 2, 3, 4, 637, 5, 617, 6, 7, 639, 8, 584, 9, 618, 11, 13, 14, 15, 662, 674, 589, 16, 17, 18, 19, 20, 21, 22, 23, 603, 545, 24, 25, 26, 27, 28, 579, 29, 643, 591, 30, 31, 32, 33, 34, 660, 644, 35, 36, 37, 38, 522, 629, 39, 709, 705, 40, 599, 41, 621, 595, 42, 43, 689, 502, 504, 44, 45, 696, 46, 702, 47, 48, 49, 50, 51, 682, 52, 53, 54, 55, 505, 529, 514, 652, 708, 56, 548, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 656, 521, 69, 563, 71, 72, 73, 569, 688, 74, 75, 597, 512, 76, 77, 576, 78, 79, 636, 585, 80, 641, 81, 496, 82, 83, 84, 85, 86, 87, 88, 89, 649, 91, 586, 92, 93, 547, 94, 95, 567, 96, 97, 98, 99, 102, 103, 104, 693, 105, 106, 508, 107, 692, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 677, 120, 121, 122, 506, 627, 123, 124, 125, 517, 686, 127, 564, 128, 554, 129, 524, 130, 131, 132, 133, 134, 135, 559, 138, 571, 139, 140, 680, 141, 142, 143, 622, 144, 145, 146, 147, 148, 620, 640, 149, 150, 151, 152, 646, 153, 154, 155, 679, 156, 157, 657, 158, 647, 159, 160, 161, 162, 163, 164, 588, 12, 704, 165, 673, 166, 500, 167, 168, 169, 170, 171, 577, 172, 632, 173, 681, 174, 175, 176, 177, 178, 179, 630, 180, 494, 181, 659, 495, 650, 501, 552, 543, 519, 555, 182, 672, 560, 581, 183, 184, 185, 609, 499, 561, 568, 187, 573, 188, 189, 190, 191, 186, 192, 549, 193, 194, 195, 544, 196, 197, 675, 198, 654, 199, 638, 200, 201, 648, 539, 202, 203, 526, 204, 698, 532, 550, 205, 206, 207, 208, 209, 701, 210, 211, 136, 212, 213, 655, 666, 214, 593, 513, 580, 687, 215, 216, 217, 218, 219, 220, 221, 678, 695, 223, 224, 225, 226, 227, 228, 707, 229, 531, 230, 535, 231, 658, 232, 558, 233, 234, 235, 236, 237, 605, 525, 697, 676, 238, 542, 572, 239, 240, 615, 241, 523, 665, 242, 671, 243, 515, 244, 574, 245, 246, 247, 248, 249, 250, 251, 533, 252, 562, 253, 492, 614, 498, 608, 254, 255, 256, 257, 258, 259, 260, 261, 262, 540, 263, 700, 503, 634, 556, 590, 594, 635, 683, 264, 265, 266, 507, 267, 268, 269, 270, 272, 273, 274, 619, 275, 276, 706, 596, 277, 278, 279, 280, 528, 607, 281, 282, 283, 284, 551, 557, 285, 553, 685, 286, 536, 287, 537, 288, 289, 625, 290, 291, 292, 293, 294, 691, 295, 296, 297, 598, 298, 299, 602, 301, 642, 302, 303, 304, 100, 305, 306, 307, 309, 308, 310, 311, 497, 312, 313, 314, 315, 510, 604, 320, 316, 317, 592, 318, 319, 321, 322, 323, 324, 325, 582, 326, 327, 329, 570, 330, 623, 601, 534, 331, 332, 333, 334, 703, 336, 337, 338, 339, 340, 341, 587, 342, 669, 344, 345, 518, 690, 610, 346, 538, 348, 349, 350, 351, 352, 353, 624, 354, 355, 530, 356, 357, 358, 699, 628, 493, 578, 359, 663, 653, 509, 360, 361, 363, 364, 365, 613, 366, 367, 670, 368, 369, 370, 631, 371, 372, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 374, 375, 520, 575, 606, 626, 377, 668, 378, 546, 379, 380, 381, 382, 583, 383, 384, 385, 527, 386, 387, 388, 389, 390, 516, 391, 392, 393, 667, 633, 394, 395, 396, 684, 397, 398, 399] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e18b787ef8f66a3d1d434a08b2fbbaf297c659 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json @@ -0,0 +1 @@ +[0, 661, 611, 1, 694, 2, 3, 4, 637, 5, 617, 6, 7, 447, 639, 8, 584, 9, 10, 618, 11, 13, 14, 15, 662, 674, 589, 16, 17, 453, 477, 
18, 19, 20, 21, 22, 23, 439, 603, 545, 24, 25, 26, 27, 28, 579, 29, 643, 484, 591, 30, 31, 32, 33, 34, 660, 435, 644, 35, 419, 36, 37, 38, 522, 629, 39, 705, 40, 599, 41, 621, 595, 42, 43, 689, 502, 504, 44, 436, 45, 696, 450, 46, 431, 702, 47, 48, 49, 50, 51, 682, 52, 53, 475, 54, 458, 55, 505, 529, 514, 652, 56, 548, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 403, 656, 521, 69, 563, 70, 71, 72, 73, 569, 688, 406, 74, 75, 597, 512, 76, 77, 470, 576, 78, 79, 636, 585, 418, 80, 641, 451, 81, 496, 82, 83, 84, 85, 86, 87, 88, 415, 89, 479, 649, 90, 91, 586, 92, 93, 547, 94, 95, 567, 96, 97, 405, 98, 99, 102, 103, 104, 693, 105, 106, 508, 107, 692, 108, 109, 110, 111, 112, 113, 114, 115, 409, 116, 117, 118, 677, 402, 119, 120, 121, 122, 506, 627, 123, 124, 125, 517, 686, 456, 126, 127, 564, 128, 554, 445, 129, 524, 130, 131, 132, 133, 134, 135, 137, 559, 138, 571, 139, 140, 680, 141, 142, 143, 622, 144, 422, 145, 146, 147, 148, 620, 640, 149, 150, 404, 486, 473, 151, 152, 646, 153, 154, 155, 679, 156, 157, 657, 158, 647, 159, 160, 161, 162, 163, 164, 588, 12, 704, 165, 673, 166, 500, 167, 168, 169, 170, 171, 577, 172, 632, 467, 173, 681, 174, 175, 176, 177, 178, 179, 630, 180, 494, 181, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 182, 672, 560, 466, 581, 183, 184, 185, 609, 499, 561, 568, 187, 481, 573, 188, 442, 189, 190, 191, 186, 192, 549, 193, 194, 195, 544, 196, 490, 197, 488, 437, 675, 198, 654, 199, 638, 438, 424, 200, 201, 648, 539, 202, 203, 427, 526, 204, 698, 532, 550, 205, 206, 207, 208, 209, 701, 210, 408, 211, 136, 212, 213, 454, 655, 666, 214, 429, 593, 513, 580, 687, 215, 216, 217, 421, 218, 219, 220, 221, 678, 446, 695, 222, 223, 423, 224, 225, 226, 227, 228, 707, 229, 531, 230, 535, 231, 658, 232, 558, 233, 234, 235, 236, 237, 605, 525, 485, 697, 676, 238, 542, 401, 483, 572, 239, 240, 615, 241, 471, 523, 665, 242, 671, 243, 430, 465, 515, 244, 574, 474, 491, 245, 246, 247, 248, 249, 250, 251, 533, 252, 400, 562, 253, 413, 492, 614, 498, 440, 462, 608, 254, 463, 255, 420, 476, 256, 257, 258, 259, 260, 261, 262, 540, 263, 700, 503, 634, 556, 590, 594, 635, 416, 683, 264, 265, 266, 507, 267, 268, 269, 270, 272, 273, 274, 619, 275, 276, 706, 596, 277, 278, 279, 280, 428, 528, 607, 281, 282, 283, 433, 284, 478, 551, 557, 285, 553, 685, 286, 407, 536, 287, 537, 288, 289, 625, 290, 291, 292, 293, 294, 691, 295, 452, 296, 297, 461, 598, 298, 411, 299, 300, 602, 301, 642, 302, 443, 303, 412, 304, 100, 305, 306, 482, 307, 309, 308, 310, 311, 497, 312, 313, 314, 315, 510, 432, 604, 320, 316, 317, 592, 318, 319, 321, 322, 323, 324, 325, 582, 449, 326, 455, 327, 328, 329, 570, 330, 426, 425, 457, 623, 601, 534, 464, 331, 332, 333, 334, 703, 336, 337, 441, 338, 339, 340, 341, 587, 489, 487, 342, 669, 344, 345, 518, 690, 610, 346, 414, 538, 348, 349, 350, 351, 352, 353, 624, 354, 355, 530, 356, 357, 358, 699, 628, 493, 578, 359, 663, 653, 509, 360, 361, 362, 363, 364, 459, 365, 613, 366, 367, 670, 368, 369, 370, 631, 371, 417, 372, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 374, 375, 472, 520, 575, 606, 626, 377, 668, 469, 378, 546, 444, 379, 380, 381, 382, 583, 383, 384, 385, 527, 410, 386, 387, 388, 389, 390, 516, 391, 392, 393, 667, 633, 394, 395, 396, 684, 397, 398, 399] diff --git a/configs/recognition/uniformerv2/metafile.yml b/configs/recognition/uniformerv2/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..7172768e01903e60c42d400480336a9482e6e3cb --- /dev/null +++ b/configs/recognition/uniformerv2/metafile.yml @@ -0,0 
+1,466 @@ +Collections: +- Name: UniFormerV2 + README: configs/recognition/uniformerv2/README.md + Paper: + URL: https://arxiv.org/abs/2211.09552 + Title: "UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer" + +Models: + - Name: uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Batch Size: 32 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 84.3 + Top 5 Accuracy: 96.4 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Batch Size: 32 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 85.8 + Top 5 Accuracy: 97.1 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 88.7 + Top 5 Accuracy: 98.1 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 16 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: 
https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.0 + Top 5 Accuracy: 98.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.3 + Top 5 Accuracy: 98.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth + + - Name: uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.5 + Top 5 Accuracy: 98.4 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 86.4 + Top 5 Accuracy: 97.3 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Modality: RGB 
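+    # The weights below are ported from the official UniFormerV2 release and
+    # only evaluated (not re-trained) in MMAction2.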
+ Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.0 + Top 5 Accuracy: 98.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 16 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.4 + Top 5 Accuracy: 98.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.2 + Top 5 Accuracy: 98.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth + + - Name: uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.8 + Top 5 Accuracy: 98.5 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth + + - Name: uniformerv2-base-p16-res224_clip-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_8xb32-u8_kinetics700-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.9 + Top 5 Accuracy: 92.9 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.3 + Top 5 Accuracy: 92.9 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 80.8 + Top 5 Accuracy: 95.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 16 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.2 + Top 5 Accuracy: 95.6 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: 
https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.4 + Top 5 Accuracy: 95.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth + + - Name: uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.1 + Top 5 Accuracy: 96.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-bfb9f401.pth + + - Name: uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth + + - Name: uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth + + - Name: uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py + In 
Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Kinetics-400 + Frame: 8 + Sampling method: Uniform + Training Resources: 16 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Moments in Time V1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 42.3 + Top 5 Accuracy: 71.5 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth + + - Name: uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Kinetics-400 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Moments in Time V1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 47.0 + Top 5 Accuracy: 76.1 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth + + - Name: uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Kinetics-400 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Moments in Time V1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 47.7 + Top 5 Accuracy: 76.8 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7899080d7bfe1cca72685d0d3304762e86082b8a --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py @@ -0,0 +1,166 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, 
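+        # return_list gives the ViT-B layer indices whose outputs are fed to
+        # the n_layers=4 global UniBlocks configured below (the last four blocks).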
+ return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=339, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/mit/videos/training' +data_root_val = 'data/mit/videos/validation' +ann_file_train = 'data/mit/mit_train_list_videos.txt' +ann_file_val = 'data/mit/mit_val_list_videos.txt' +ann_file_test = 'data/mit/mit_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + 
clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1 / 20, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min_ratio=1 / 20, + by_epoch=True, + begin=5, + end=24, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5b204cbff46e6b3ac3431561d61e26c0cd1965a9 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,174 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', 
**file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
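+# When enabled, MMEngine scales the learning rate linearly by
+# (actual total batch size) / base_batch_size; e.g. a total batch of 128
+# would scale the base learning rate by 128 / 256 = 0.5.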
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..4d02918194e4945de81392d19f1051ae1c038984 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py @@ -0,0 +1,174 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics600/videos_train' +data_root_val = 'data/kinetics600/videos_val' +ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt' +ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt' +ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), 
+ dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8051b3e55427912da037a2bbba5aa3271ecfa6 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,174 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), 
+ dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0d9f77cde4a5a9eae713b8c937a8f1e1e8d26f80 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,37 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0eece62693901806e7d0e63ce651da747f4cc467 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + 
type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6db1923dcd09ff4d4bea2c4294cb878b00af4d --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..da3f239e200f326084044633bacd02824b0bc7cc --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py @@ -0,0 +1,209 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', 
**file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +# dataset settings +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type='VideoDataset', + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type='VideoDataset', + ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type='VideoDataset', + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type='VideoDataset', + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type='VideoDataset', + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type='VideoDataset', + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type='ConcatDataset', + datasets=[k400_trainset, k600_trainset, k700_trainset]) +k710_valset = dict( + type='ConcatDataset', datasets=[k400_valset, k600_valset, k700_valset]) +k710_testset = dict( + type='ConcatDataset', + datasets=[k400_testset, k600_testset, k700_testset], +) + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_testset) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=5, + 
convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.5, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e90ab34c0986fd6c62c4722e8bc0022da0ebbe88 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 16 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e5320a286804a56cf5b44aea6c2aa3fba4523b7f --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 16 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + 
mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..249f02aa4c921fef0a15b39114f316cbe030812a --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 16 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 
0000000000000000000000000000000000000000..b1068b7d108330fe69c9301cc98b730b69e15d44 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..94ba4fab323a10ec97c3ba6434ac6efe01306f4d --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + 
dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd9758ce12aab2f1c4dc01fc4d03f195fd724ad --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c0f9e7036c553f939d5a612dc9c2c02c12ad0eec --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + 
type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6a80137fa323b34ca37a1b9a8515a933f3bb5046 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..f7df26fd449061176092cfe445d01caa1ddd106f --- /dev/null +++ 
b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2e0732b92be789155cee29e126b61aba9c3fa882 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,37 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2a3a90ae2e00e9f82d039de852e776dc2e45fe06 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 
32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=2, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..b341643a082f6a3e990759d46ac8d55cdc890ca5 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=2, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + 
test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..85a80bab0cc4c459ed13cb1c1c3eb7bde057706f --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=2, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..58e234bea7425a7bdb74028a72f32cb5b0641e80 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,37 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git 
a/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0c15724589252c7f2ca18cf8ee26a0bbd2b52e35 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..eb8a6907feeebbca6367ec62cd92e8faeb8168b0 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + 
+test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/videomae/README.md b/configs/recognition/videomae/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5c9e3bb6c0ba29580a39f1a4e22c9c87ef3ab165 --- /dev/null +++ b/configs/recognition/videomae/README.md @@ -0,0 +1,61 @@ +# VideoMAE + +[VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) + + + +## Abstract + + + +Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking with an extremely high ratio. This simple design makes video reconstruction a more challenging self-supervision task, thus encouraging extracting more effective video representations during this pre-training process. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables a higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets is an important issue. Notably, our VideoMAE with the vanilla ViT can achieve 87.4% on Kinetics-400, 75.4% on Something-Something V2, 91.3% on UCF101, and 62.6% on HMDB51, without using any extra data. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | backbone | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :------------: | :------: | :------: | :------: | :--------------------------------: | :--------------------------------: | :---------------: | :---: | :----: | :--------------------: | :-------------------: | +| 16x4x1 | short-side 320 | ViT-B | 81.3 | 95.0 | 81.5 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 95.1 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 5 clips x 3 crops | 180G | 87M | [config](/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth) \[1\] | +| 16x4x1 | short-side 320 | ViT-L | 85.3 | 96.7 | 85.2 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 96.8 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 5 clips x 3 crops | 597G | 305M | [config](/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-229dbb03.pth) \[1\] | + +\[1\] The models are ported from the repo [VideoMAE](https://github.com/MCG-NJU/VideoMAE) and tested on our data. Currently, we only support the testing of VideoMAE models; training will be available soon. + +1. The values in the "reference" columns are the results reported by the original repo. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [preparing_kinetics](/tools/data/kinetics/README.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test ViT-base model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
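The `--dump result.pkl` option above writes the raw per-sample predictions to disk. Below is a minimal sketch for loading and inspecting that file offline; the exact fields of each entry depend on the mmaction2 version, so it only prints whatever is stored.

```python
# Hedged sketch: inspect the predictions dumped by `tools/test.py ... --dump result.pkl`.
import pickle

with open('result.pkl', 'rb') as f:
    results = pickle.load(f)

print(type(results), len(results))  # typically a list with one entry per test video
print(results[0])                   # look at the fields of the first prediction
```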
+ +## Citation + +```BibTeX +@inproceedings{tong2022videomae, + title={Video{MAE}: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training}, + author={Zhan Tong and Yibing Song and Jue Wang and Limin Wang}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022} +} +``` diff --git a/configs/recognition/videomae/README_zh-CN.md b/configs/recognition/videomae/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..802411f1fb2e01b62553baf1de3d223b332c2ab5 --- /dev/null +++ b/configs/recognition/videomae/README_zh-CN.md @@ -0,0 +1,47 @@ +# VideoMAE + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{tong2022videomae, + title={Video{MAE}: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training}, + author={Zhan Tong and Yibing Song and Jue Wang and Limin Wang}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022} +} +``` + +## ๆจกๅž‹ๅบ“ + +### Kinetics-400 + +| ๅธง้‡‡ๆ ท็ญ–็•ฅ | ๅˆ†่พจ็އ | ไธปๅนฒ็ฝ‘็ปœ | top1 ๅ‡†็กฎ็އ | top5 ๅ‡†็กฎ็އ | ๅ‚่€ƒไปฃ็ ็š„ top1 ๅ‡†็กฎ็އ | ๅ‚่€ƒไปฃ็ ็š„ top5ๅ‡†็กฎ็އ | ๆต‹่ฏ•ๅ่ฎฎ | ๆตฎ็‚น่ฟ็ฎ—ๆ•ฐ | ๅ‚ๆ•ฐ้‡ | ้…็ฝฎๆ–‡ไปถ | ckpt | +| :--------: | :------: | :------: | :---------: | :---------: | :---------------------------------: | :--------------------------------: | :---------------: | :--------: | :----: | :-----------------------: | :-----------------------: | +| 16x4x1 | ็Ÿญ่พน 320 | ViT-B | 81.3 | 95.0 | 81.5 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 95.1 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 5 clips x 3 crops | 180G | 87M | [config](/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth) \[1\] | +| 16x4x1 | ็Ÿญ่พน 320 | ViT-L | 85.3 | 96.7 | 85.2 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 96.8 \[[VideoMAE](https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md)\] | 5 clips x 3 crops | 597G | 305M | [config](/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-229dbb03.pth) \[1\] | + +\[1\] ่ฏฅๆจกๅž‹็งปๆค่‡ช [VideoMAE](https://github.com/MCG-NJU/VideoMAE) ๅนถๅœจๆˆ‘ไปฌ็š„ๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒๆต‹่ฏ•ใ€‚็›ฎๅ‰ไป…ๆ”ฏๆŒVideoMAEๆจกๅž‹็š„ๆต‹่ฏ•๏ผŒ่ฎญ็ปƒๅณๅฐ†ๆŽจๅ‡บใ€‚ + +1. ๅ‚่€ƒไปฃ็ ็š„ๅ‡†็กฎ็އๆ•ฐๆฎๆฅๆบไบŽๅŽŸๅง‹ไป“ๅบ“ใ€‚ +2. ๆˆ‘ไปฌไฝฟ็”จ็š„ Kinetics400 ้ชŒ่ฏ้›†ๅŒ…ๅซ 19796 ไธช่ง†้ข‘ใ€‚ ็”จๆˆทๅฏไปฅไปŽ [้ชŒ่ฏ้›†่ง†้ข‘](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB)ไธ‹่ฝฝ่ฟ™ไบ›่ง†้ข‘ใ€‚ ๅŒๆ—ถไนŸๆไพ›ไบ†ๅฏนๅบ”็š„ [ๆ•ฐๆฎๅˆ—่กจ](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (ๆฏ่กŒๆ ผๅผไธบ๏ผš่ง†้ข‘ ID๏ผŒ่ง†้ข‘ๅธงๆ•ฐ็›ฎ๏ผŒ็ฑปๅˆซๅบๅท) ไปฅๅŠ [ๆ˜ ๅฐ„ๆ ‡็ญพ](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) ใ€‚ + +ๅ…ณไบŽๆ•ฐๆฎๅค„็†็š„ๆ›ดๅคš็ป†่Š‚๏ผŒ็”จๆˆทๅฏไปฅๅ‚็…ง [preparing_kinetics](/tools/data/kinetics/README_zh-CN.md). 
+ +## ๅฆ‚ไฝ•ๆต‹่ฏ• + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๆŒ‡ไปค่ฟ›่กŒๆจกๅž‹ๆต‹่ฏ•ใ€‚ + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +ไพ‹ๅฆ‚๏ผšๅœจ Kinetics-400 ๆ•ฐๆฎ้›†ไธŠๆต‹่ฏ• ViT-base ๆจกๅž‹๏ผŒๅนถๅฐ†็ป“ๆžœๅฏผๅ‡บไธบไธ€ไธช pkl ๆ–‡ไปถใ€‚ + +```shell +python tools/test.py configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +ๆ›ดๅคšๆต‹่ฏ•็ป†่Š‚๏ผŒๅฏๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md) ไธญ็š„ **ๆต‹่ฏ•** ้ƒจๅˆ†ใ€‚ diff --git a/configs/recognition/videomae/metafile.yml b/configs/recognition/videomae/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..9d0c8f5969e9279f838cbeafd67f7b3f353ed42c --- /dev/null +++ b/configs/recognition/videomae/metafile.yml @@ -0,0 +1,43 @@ +Collections: +- Name: VideoMAE + README: configs/recognition/videomae/README.md + Paper: + URL: https://arxiv.org/abs/2203.12602 + Title: "VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training" + +Models: + - Name: vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400 + Config: configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py + In Collection: VideoMAE + Metadata: + Architecture: ViT-B + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md + Code: https://github.com/MCG-NJU/VideoMAE/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.3 + Top 5 Accuracy: 95.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth + + - Name: vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400 + Config: configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py + In Collection: VideoMAE + Metadata: + Architecture: ViT-L + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/MCG-NJU/VideoMAE/blob/main/MODEL_ZOO.md + Code: https://github.com/MCG-NJU/VideoMAE/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 85.3 + Top 5 Accuracy: 96.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-229dbb03.pth diff --git a/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py b/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py new file mode 100644 index 0000000000000000000000000000000000000000..1313e5515c9dd8d86f6d2c68ad0bd597aee8b2fc --- /dev/null +++ b/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py @@ -0,0 +1,61 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='VisionTransformer', + img_size=224, + patch_size=16, + embed_dims=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + num_frames=16, + norm_cfg=dict(type='LN', eps=1e-6)), + cls_head=dict( + type='TimeSformerHead', + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_test = 
'data/kinetics400/kinetics400_val_list_videos.txt' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py b/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py new file mode 100644 index 0000000000000000000000000000000000000000..a2c921dc08c7d39640c73e8b5b06936311ced7e3 --- /dev/null +++ b/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py @@ -0,0 +1,6 @@ +_base_ = ['vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py'] + +# model settings +model = dict( + backbone=dict(embed_dims=1024, depth=24, num_heads=16), + cls_head=dict(in_channels=1024)) diff --git a/configs/recognition/videomaev2/README.md b/configs/recognition/videomaev2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4ec88cbbadf0085c3949501f84426c18c6763a95 --- /dev/null +++ b/configs/recognition/videomaev2/README.md @@ -0,0 +1,63 @@ +# VideoMAE V2 + +[VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking](https://arxiv.org/abs/2303.16727) + + + +## Abstract + + + +Scale is the primary factor for building a powerful foundation model that could well generalize to a variety of downstream tasks. However, it is still challenging to train video foundation models with billions of parameters. This paper shows that video masked autoencoder (VideoMAE) is a scalable and general self-supervised pre-trainer for building video foundation models. We scale the VideoMAE in both model and data with a core design. Specifically, we present a dual masking strategy for efficient pre-training, with an encoder operating on a subset of video tokens and a decoder processing another subset of video tokens. Although VideoMAE is very efficient due to high masking ratio in encoder, masking decoder can still further reduce the overall computational cost. This enables the efficient pre-training of billion-level models in video. We also use a progressive training paradigm that involves an initial pre-training on a diverse multi-sourced unlabeled dataset, followed by a post-pre-training on a mixed labeled dataset. Finally, we successfully train a video ViT model with a billion parameters, which achieves a new state-of-the-art performance on the datasets of Kinetics (90.0% on K400 and 89.9% on K600) and Something-Something (68.7% on V1 and 77.0% on V2). In addition, we extensively verify the pre-trained video ViT models on a variety of downstream tasks, demonstrating its effectiveness as a general video representation learner. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | backbone | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :------------: | :------: | :------: | :------: | :--------------------------------: | :--------------------------------: | :---------------: | :---: | :----: | :--------------------: | :-------------------: | +| 16x4x1 | short-side 320 | ViT-S | 83.6 | 96.3 | 83.7 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 96.2 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 5 clips x 3 crops | 57G | 22M | [config](/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth) \[1\] | +| 16x4x1 | short-side 320 | ViT-B | 86.6 | 97.3 | 86.6 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 97.3 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 5 clips x 3 crops | 180G | 87M | [config](/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth) \[1\] | + +\[1\] The models were distilled from the VideoMAE V2-g model. Specifically, models are initialized with VideoMAE V2 pretraining, then distilled on Kinetics 710 dataset. They are ported from the repo [VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2) and tested on our data. The VideoMAE V2-g model can be obtained from the original repository. Currently, we only support the testing of VideoMAE V2 models. + +1. The values in columns named after "reference" are the results of the original repo. +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [preparing_kinetics](/tools/data/kinetics/README.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test ViT-base model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
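If you prefer to call a converted checkpoint from Python rather than through `tools/test.py`, the sketch below uses the high-level inference API. It assumes the mmaction2 1.x `init_recognizer`/`inference_recognizer` helpers and a local video file; the checkpoint path is a placeholder, as in the shell example above.

```python
# Hedged sketch: single-video inference with a converted VideoMAE V2 checkpoint.
from mmaction.apis import inference_recognizer, init_recognizer

config = 'configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py'
checkpoint = 'checkpoints/SOME_CHECKPOINT.pth'  # placeholder path

model = init_recognizer(config, checkpoint, device='cpu')  # or 'cuda:0'
result = inference_recognizer(model, 'demo/demo.mp4')      # any local video file
print(result)  # an ActionDataSample holding the predicted scores/labels
```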
+ +## Citation + +```BibTeX +@misc{wang2023videomaev2, + title={VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking}, + author={Limin Wang and Bingkun Huang and Zhiyu Zhao and Zhan Tong and Yinan He and Yi Wang and Yali Wang and Yu Qiao}, + year={2023}, + eprint={2303.16727}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/recognition/videomaev2/metafile.yml b/configs/recognition/videomaev2/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..daec99ffc09442403bd4b33ecc3b37b4605fa6a9 --- /dev/null +++ b/configs/recognition/videomaev2/metafile.yml @@ -0,0 +1,43 @@ +Collections: +- Name: VideoMAEv2 + README: configs/recognition/videomaev2/README.md + Paper: + URL: https://arxiv.org/abs/2303.16727 + Title: "VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking" + +Models: + - Name: vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400 + Config: configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py + In Collection: VideoMAEv2 + Metadata: + Architecture: ViT-S + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/VideoMAEv2/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 83.6 + Top 5 Accuracy: 96.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth + + - Name: vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400 + Config: configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py + In Collection: VideoMAEv2 + Metadata: + Architecture: ViT-B + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/VideoMAEv2/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 86.6 + Top 5 Accuracy: 97.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth diff --git a/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py b/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py new file mode 100644 index 0000000000000000000000000000000000000000..1313e5515c9dd8d86f6d2c68ad0bd597aee8b2fc --- /dev/null +++ b/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py @@ -0,0 +1,61 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='VisionTransformer', + img_size=224, + patch_size=16, + embed_dims=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + num_frames=16, + norm_cfg=dict(type='LN', eps=1e-6)), + cls_head=dict( + type='TimeSformerHead', + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 
'data/kinetics400/videos_val' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py b/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py new file mode 100644 index 0000000000000000000000000000000000000000..80ff18ea77aed47c6d1fd97dddb1de3312e5ce54 --- /dev/null +++ b/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py @@ -0,0 +1,6 @@ +_base_ = ['vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py'] + +# model settings +model = dict( + backbone=dict(embed_dims=384, depth=12, num_heads=6), + cls_head=dict(in_channels=384)) diff --git a/configs/recognition/x3d/README.md b/configs/recognition/x3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..723764c80b05dcbca90e992d5b3f3dda78ac367a --- /dev/null +++ b/configs/recognition/x3d/README.md @@ -0,0 +1,63 @@ +# X3D + +[X3D: Expanding Architectures for Efficient Video Recognition](https://openaccess.thecvf.com/content_CVPR_2020/html/Feichtenhofer_X3D_Expanding_Architectures_for_Efficient_Video_Recognition_CVPR_2020_paper.html) + + + +## Abstract + + + +This paper presents X3D, a family of efficient video networks that progressively expand a tiny 2D image classification architecture along multiple network axes, in space, time, width and depth. Inspired by feature selection methods in machine learning, a simple stepwise network expansion approach is employed that expands a single axis in each step, such that good accuracy to complexity trade-off is achieved. To expand X3D to a specific target complexity, we perform progressive forward expansion followed by backward contraction. X3D achieves state-of-the-art performance while requiring 4.8x and 5.5x fewer multiply-adds and parameters for similar accuracy as previous work. Our most surprising finding is that networks with high spatiotemporal resolution can perform well, while being extremely light in terms of network width and parameters. We report competitive accuracy at unprecedented efficiency on video classification and detection benchmarks. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | resolution | backbone | top1 10-view | top1 30-view | reference top1 10-view | reference top1 30-view | config | ckpt | +| :---------------------: | :--------: | :------: | :----------: | :----------: | :----------------------------------------: | :----------------------------------------: | :------------------------: | :-----------------------: | +| 13x6x1 | 160x160 | X3D_S | 73.2 | 73.3 | 73.1 \[[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)\] | 73.5 \[[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)\] | [config](/configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/x3d/facebook/x3d_s_13x6x1_facebook-kinetics400-rgb_20201027-623825a0.pth)\[1\] | +| 16x5x1 | 224x224 | X3D_M | 75.2 | 76.4 | 75.1 \[[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)\] | 76.2 \[[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)\] | [config](/configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/x3d/facebook/x3d_m_16x5x1_facebook-kinetics400-rgb_20201027-3f42382a.pth)\[1\] | + +\[1\] The models are ported from the repo [SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of X3D models; training will be available soon. + +1. The values in the "reference" columns are obtained by testing the checkpoints released by the original repo with its code, using the same dataset as ours. +2. The validation set of Kinetics400 we used is the same as that of the repo [SlowFast](https://github.com/facebookresearch/SlowFast/); it is available [here](https://github.com/facebookresearch/video-nonlocal-net/issues/67). + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test X3D model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
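The "30-view" protocol in the table follows directly from the test pipeline of this config: 10 temporal clips from `SampleFrames` times 3 spatial crops from `ThreeCrop`. The snippet below is a small illustration (not part of the repo) that reads those values back from the config; it assumes mmengine is installed and the command is run from the repository root.

```python
# Hedged sketch: derive the number of test views from the released X3D config.
from mmengine.config import Config

cfg = Config.fromfile('configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py')
sample = next(s for s in cfg.test_pipeline if s['type'] == 'SampleFrames')
crops = 3 if any(s['type'] == 'ThreeCrop' for s in cfg.test_pipeline) else 1
print(f"{sample['num_clips']} clips x {crops} crops = {sample['num_clips'] * crops} views")  # 10 x 3 = 30
```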
+ +## Citation + +```BibTeX +@misc{feichtenhofer2020x3d, + title={X3D: Expanding Architectures for Efficient Video Recognition}, + author={Christoph Feichtenhofer}, + year={2020}, + eprint={2004.04730}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/recognition/x3d/metafile.yml b/configs/recognition/x3d/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..22e5be5a5bfe4bb8c01e040a58979d656cac2f0f --- /dev/null +++ b/configs/recognition/x3d/metafile.yml @@ -0,0 +1,49 @@ +Collections: +- Name: X3D + README: configs/recognition/x3d/README.md + Paper: + URL: https://arxiv.org/abs/2004.04730 + Title: "X3D: Expanding Architectures for Efficient Video Recognition" + +Models: + - Name: x3d_s_13x6x1_facebook-kinetics400-rgb + Config: configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py + In Collection: X3D + Metadata: + Architecture: X3D_S + FLOPs: 2967543760 + Parameters: 3794322 + Resolution: 160x160 + Modality: RGB + Converted From: + Weights: https://dl.fbaipublicfiles.com/pyslowfast/x3d_models/x3d_s.pyth + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/x3d/facebook/x3d_s_13x6x1_facebook-kinetics400-rgb_20201027-623825a0.pth + reference top1 10-view: 73.1 [[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)] + reference top1 30-view: 73.5 [[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)] + + - Name: x3d_m_16x5x1_facebook-kinetics400-rgb + Config: configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py + In Collection: X3D + Metadata: + Architecture: X3D_M + FLOPs: 6490866832 + Parameters: 3794322 + Resolution: 224x224 + Modality: RGB + Converted From: + Weights: https://dl.fbaipublicfiles.com/pyslowfast/x3d_models/x3d_m.pyth + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.4 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/x3d/facebook/x3d_m_16x5x1_facebook-kinetics400-rgb_20201027-3f42382a.pth + reference top1 10-view: 75.1 [[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)] + reference top1 30-view: 76.2 [[SlowFast](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md)] diff --git a/configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py b/configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..af215c16e8149c1ada55d13638bdf53b810e1c7a --- /dev/null +++ b/configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py @@ -0,0 +1,36 @@ +_base_ = ['../../_base_/models/x3d.py', '../../_base_/default_runtime.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=5, + num_clips=10, + target_fps=30, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py b/configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e330ced45bcc4cf28e1b550fdfcef81b9fd48618 --- /dev/null +++ b/configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py @@ -0,0 +1,36 @@ +_base_ = ['../../_base_/models/x3d.py', '../../_base_/default_runtime.py'] + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=13, + frame_interval=6, + num_clips=10, + target_fps=30, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 182)), + dict(type='ThreeCrop', crop_size=182), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition_audio/audioonly/audioonly_r50_8xb160-64x1x1-100e_kinetics400-audio-feature.py b/configs/recognition_audio/audioonly/audioonly_r50_8xb160-64x1x1-100e_kinetics400-audio-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..c6bb4de9c6d520dc44fba15f3163210e774c8e2e --- /dev/null +++ b/configs/recognition_audio/audioonly/audioonly_r50_8xb160-64x1x1-100e_kinetics400-audio-feature.py @@ -0,0 +1,97 @@ +_base_ = [ + '../../_base_/models/audioonly_r50.py', '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'AudioDataset' +data_root = 'data/kinetics400/audio_features_train' +data_root_val = 'data/kinetics400/audio_features_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_audio_features.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_audio_features.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_audio_features.txt' +train_pipeline = [ + dict(type='LoadAudioFeature'), + dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), + dict(type='AudioFeatureSelector'), + dict(type='FormatAudioShape', input_format='NCTF'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='LoadAudioFeature'), + dict( + type='SampleFrames', + clip_len=64, + frame_interval=1, + num_clips=1, + test_mode=True), + dict(type='AudioFeatureSelector'), + dict(type='FormatAudioShape', input_format='NCTF'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='LoadAudioFeature'), + dict( + type='SampleFrames', + clip_len=64, + frame_interval=1, + num_clips=10, + test_mode=True), + dict(type='AudioFeatureSelector'), + dict(type='FormatAudioShape', input_format='NCTF'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=160, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
ann_file=ann_file_train, + pipeline=train_pipeline, + data_prefix=dict(audio=data_root), + suffix='.npy')) +val_dataloader = dict( + batch_size=160, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + pipeline=val_pipeline, + data_prefix=dict(audio=data_root_val), + suffix='.npy', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + pipeline=test_pipeline, + data_prefix=dict(audio=data_root_val), + suffix='.npy', + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='CosineAnnealingLR', eta_min=0, T_max=100, by_epoch=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=2.0, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict( + checkpoint=dict(max_keep_ckpts=3, interval=5), logger=dict(interval=20)) diff --git a/configs/recognition_audio/resnet/README.md b/configs/recognition_audio/resnet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..84b1855e2ec54633b7192609e267665db60462df --- /dev/null +++ b/configs/recognition_audio/resnet/README.md @@ -0,0 +1,70 @@ +# ResNet for Audio + +[Audiovisual SlowFast Networks for Video Recognition](https://arxiv.org/abs/2001.08740) + + + +## Abstract + + + +We present Audiovisual SlowFast Networks, an architecture for integrated audiovisual perception. AVSlowFast has Slow and Fast visual pathways that are deeply integrated with a Faster Audio pathway to model vision and sound in a unified representation. We fuse audio and visual features at multiple layers, enabling audio to contribute to the formation of hierarchical audiovisual concepts. To overcome training difficulties that arise from different learning dynamics for audio and visual modalities, we introduce DropPathway, which randomly drops the Au- dio pathway during training as an effective regularization technique. Inspired by prior studies in neuroscience, we perform hierarchical audiovisual synchronization to learn joint audiovisual features. We report state-of-the-art results on six video action classification and detection datasets, perform detailed ablation studies, and show the generalization of AVSlowFast to learn self-supervised audiovisual features. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| frame sampling strategy | n_fft | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :---: | :--: | :------: | :------: | :------: | :------: | :--------------: | :---: | :----: | :------------------------------------: | :----------------------------------: | :---------------------------------: | +| 64x1x1 | 1024 | 8 | Resnet18 | None | 13.7 | 27.3 | 1 clips | 0.37G | 11.4M | [config](/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature_20230702-e4642fb0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.log) | + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train ResNet model on Kinetics-400 audio dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test ResNet model on Kinetics-400 audio dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
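Both the train and test configs above read pre-extracted spectrogram features stored as `.npy` files (see `LoadAudioFeature` and the `audio_features_*` prefixes in the configs). A quick sanity check of one feature file is sketched below; the file name is a placeholder for any feature under the training directory, and the exact array layout depends on how the features were extracted.

```python
# Hedged sketch: verify that a pre-extracted audio feature file loads as expected.
import numpy as np

feat = np.load('data/kinetics400/audio_features_train/SOME_CLIP.npy')  # placeholder file name
print(feat.shape, feat.dtype)  # typically a 2-D (time x frequency) spectrogram array
```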
+ +## Citation + +```BibTeX +@article{xiao2020audiovisual, + title={Audiovisual SlowFast Networks for Video Recognition}, + author={Xiao, Fanyi and Lee, Yong Jae and Grauman, Kristen and Malik, Jitendra and Feichtenhofer, Christoph}, + journal={arXiv preprint arXiv:2001.08740}, + year={2020} +} +``` diff --git a/configs/recognition_audio/resnet/metafile.yml b/configs/recognition_audio/resnet/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..71a246e8559301e4717b3d21a9f5a450b2e44294 --- /dev/null +++ b/configs/recognition_audio/resnet/metafile.yml @@ -0,0 +1,30 @@ +Collections: + - Name: Audio + README: configs/recognition_audio/resnet/README.md + Paper: + URL: https://arxiv.org/abs/2001.08740 + Title: "Audiovisual SlowFast Networks for Video Recognition" + +Models: + - Name: tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature + Config: configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py + In Collection: Audio + Metadata: + Architecture: ResNet18 + Batch Size: 320 + Epochs: 100 + FLOPs: 0.37G + Parameters: 11.4M + Pretrained: None + n_fft: 1024 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: Audio + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 13.7 + Top 5 Accuracy: 27.3 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature_20230702-e4642fb0.pth diff --git a/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py b/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py new file mode 100644 index 0000000000000000000000000000000000000000..e44f8cdd1e8e3047476bc4fb8e1c7537f3ba1d3f --- /dev/null +++ b/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py @@ -0,0 +1,95 @@ +_base_ = '../../_base_/default_runtime.py' + +# model settings +model = dict( + type='RecognizerAudio', + backbone=dict(type='ResNet', depth=18, in_channels=1, norm_eval=False), + cls_head=dict( + type='TSNAudioHead', + num_classes=400, + in_channels=512, + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob')) + +# dataset settings +dataset_type = 'AudioDataset' +data_root = 'data/kinetics400' +ann_file_train = 'kinetics400_train_list_audio_features.txt' +ann_file_val = 'kinetics400_val_list_audio_features.txt' +ann_file_test = 'kinetics400_val_list_audio_features.txt' + +train_pipeline = [ + dict(type='LoadAudioFeature'), + dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), + dict(type='AudioFeatureSelector'), + dict(type='FormatAudioShape', input_format='NCTF'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='LoadAudioFeature'), + dict( + type='SampleFrames', + clip_len=64, + frame_interval=1, + num_clips=1, + test_mode=True), + dict(type='AudioFeatureSelector'), + dict(type='FormatAudioShape', input_format='NCTF'), + dict(type='PackActionInputs') +] +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=320, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + pipeline=train_pipeline, + 
data_root=data_root, + data_prefix=dict(audio='audio_features_train'))) +val_dataloader = dict( + batch_size=320, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + pipeline=val_pipeline, + data_root=data_root, + data_prefix=dict(audio='audio_features_val'), + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + pipeline=test_pipeline, + data_root=data_root, + data_prefix=dict(audio='audio_features_val'), + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='CosineAnnealingLR', eta_min=0, T_max=100, by_epoch=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3, interval=5)) diff --git a/configs/retrieval/clip4clip/README.md b/configs/retrieval/clip4clip/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd247e6cd9eff6c072c56f5c4917a139d14cc9ae --- /dev/null +++ b/configs/retrieval/clip4clip/README.md @@ -0,0 +1,74 @@ +# CLIP4Clip + +[CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval](https://arxiv.org/abs/2104.08860) + + + +## Abstract + + + +Video-text retrieval plays an essential role in multi-modal research and has been widely used in many real-world web applications. The CLIP (Contrastive Language-Image Pre-training), an image-language pre-training model, has demonstrated the power of visual concepts learning from web collected image-text datasets. In this paper, we propose a CLIP4Clip model to transfer the knowledge of the CLIP model to video-language retrieval in an end-to-end manner. Several questions are investigated via empirical studies: 1) Whether image feature is enough for video-text retrieval? 2) How a post-pretraining on a large-scale video-text dataset based on the CLIP affect the performance? 3) What is the practical mechanism to model temporal dependency between video frames? And 4) The Hyper-parameters sensitivity of the model on video-text retrieval task. Extensive experimental results present that the CLIP4Clip model transferred from the CLIP can achieve SOTA results on various video-text retrieval datasets, including MSR-VTT, MSVC, LSMDC, ActivityNet, and DiDeMo. + + + +
+ +
+ +## Results and Models + +### MSRVTT-9k + +| frame sampling strategy | resolution | gpus | backbone | adapter | pretrain | Recall@1 | Recall@5 | Recall@10 | MdR | MnR | testing protocol | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :-----: | :------: | :------: | :------: | :-------: | :-: | :--: | :--------------: | :------------------------------: | :----------------------------: | :----------------------------: | +| uniform 12 | 224x224 | 8 | ViT-B/32 | Mean | clip | 43.1 | 69.4 | 78.9 | 2.0 | 16.8 | 1 clips x 1 crop | [config](/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb_20230612-b9706e54.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.log) | + +For more details on data preparation, you can refer to [video_retrieval](/tools/data/video_retrieval/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train CLIP4Clip model on MSRVTT-9k dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test CLIP4Clip model on MSRVTT-9k dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
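The table above reports Recall@K (higher is better) plus the median rank (MdR) and mean rank (MnR) of the ground-truth video (lower is better). As a rough illustration of what these numbers mean, the sketch below computes them from a text-to-video similarity matrix, assuming text `i` matches video `i`; it is a simplified stand-in, not the `RetrievalMetric` implementation used by the config.

```python
# Hedged sketch: retrieval metrics from a (num_texts x num_videos) similarity matrix.
import numpy as np

def retrieval_metrics(sim):
    ranks = []
    for i, row in enumerate(sim):
        order = np.argsort(row)[::-1]                      # candidates sorted best-first
        ranks.append(int(np.where(order == i)[0][0]) + 1)  # rank of the ground-truth video
    ranks = np.asarray(ranks)
    return {
        'R@1': 100.0 * np.mean(ranks <= 1),
        'R@5': 100.0 * np.mean(ranks <= 5),
        'R@10': 100.0 * np.mean(ranks <= 10),
        'MdR': float(np.median(ranks)),  # median rank, lower is better
        'MnR': float(np.mean(ranks)),    # mean rank, lower is better
    }

print(retrieval_metrics(np.random.rand(10, 10)))  # toy example with random similarities
```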
+ +## Citation + +```BibTeX +@article{luo2022clip4clip, + title={CLIP4Clip: An empirical study of CLIP for end to end video clip retrieval and captioning}, + author={Luo, Huaishao and Ji, Lei and Zhong, Ming and Chen, Yang and Lei, Wen and Duan, Nan and Li, Tianrui}, + journal={Neurocomputing}, + volume={508}, + pages={293--304}, + year={2022}, +} +``` diff --git a/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py b/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab3ff8e2533aa655504c545ad0c967188b054a3 --- /dev/null +++ b/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py @@ -0,0 +1,122 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='CLIPSimilarity', + clip_arch='ViT-B/32', + to_float32=True, + frozen_layers=0, + data_preprocessor=dict( + type='MultiModalDataPreprocessor', + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW'), + text=dict(type='ActionDataPreprocessor', to_float32=False))), + adapter=dict(type='SimpleMeanAdapter')) + +dataset_type = 'VideoTextDataset' +data_root = 'data/video_retrieval/msrvtt' +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=12, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='CLIPTokenize'), + dict(type='PackActionInputs', collect_keys=('imgs', 'text')) +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=12, num_clips=1, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='CLIPTokenize'), + dict(type='PackActionInputs', collect_keys=('imgs', 'text')) +] +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='train_9k.json', + data_root=data_root, + data_prefix=dict(video='videos'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file='test_JSFUSION.json', + data_root=data_root, + data_prefix=dict(video='videos'), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file='test_JSFUSION.json', + data_root=data_root, + data_prefix=dict(video='videos'), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='RetrievalMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.05, + by_epoch=True, + begin=0, + end=0.5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + 
T_max=4.5, + eta_min=0, + by_epoch=True, + begin=0.5, + end=5, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=1e-05, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.), +) + +default_hooks = dict(checkpoint=dict(save_best=None)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/retrieval/clip4clip/metafile.yml b/configs/retrieval/clip4clip/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..6daba4b5830c0ac03f7b0f74511cb95edff6c26e --- /dev/null +++ b/configs/retrieval/clip4clip/metafile.yml @@ -0,0 +1,28 @@ +Collections: + - Name: CLIP4Clip + README: configs/retrieval/clip4clip/README.md + Paper: + URL: https://arxiv.org/abs/2104.08860 + Title: 'CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval' + +Models: + - Name: clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb + Config: configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py + In Collection: CLIP4Clip + Metadata: + Architecture: ViT-B/32 + Batch Size: 16 + Epochs: 5 + Training Data: MSRVTT-9k + Training Resources: 8 GPUs + Results: + Dataset: MSRVTT + Task: Video Retrieval + Metrics: + Recall@1: 43.1 + Recall@5: 69.4 + Recall@10: 78.9 + MdR: 2.0 + MnR: 16.8 + Training Log: https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb_20230612-b9706e54.pth diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..9b28b21b8ed4e960fd760d46a101496ca79947eb --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = '2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..f712bdda9df5a2bf43714ed9d0c35b1e95f0975b --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = '2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..8fc1889823723d1af0523cca16f7746ef42fc77e --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = '2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + 
dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf26ccd83975a8ee8755a553cf8e3a4c4669015 --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = '2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + 
ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..f38299d287636c460a08642b4229e88f398efb06 --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = '2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..00573321f77e008afc3f05110f62b89fe3c04c7d --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = '2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + 
type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..0f30aac2c13d6ef5eb5c06b65da3807f6916435f --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='AAGCN', + graph_cfg=dict(layout='coco', mode='spatial'), + gcn_attention=False), # degenerate AAGCN to AGCN + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6c90bed6eca07e0a6e2ee1397f38d865778ac7 --- /dev/null +++ b/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='AAGCN', + graph_cfg=dict(layout='nturgb+d', mode='spatial'), + gcn_attention=False), # degenerate AAGCN to AGCN + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg 
= dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/2s-agcn/README.md b/configs/skeleton/2s-agcn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e7bf5d2645ad3b34d197241b72a2802b8db8fdd7 --- /dev/null +++ b/configs/skeleton/2s-agcn/README.md @@ -0,0 +1,90 @@ +# AGCN + +[Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition](https://openaccess.thecvf.com/content_CVPR_2019/html/Shi_Two-Stream_Adaptive_Graph_Convolutional_Networks_for_Skeleton-Based_Action_Recognition_CVPR_2019_paper.html) + + + +## Abstract + + + +In skeleton-based action recognition, graph convolutional networks (GCNs), which model the human body skeletons as spatiotemporal graphs, have achieved remarkable performance. However, in existing GCN-based methods, the topology of the graph is set manually, and it is fixed over all layers and input samples. This may not be optimal for the hierarchical GCN and diverse samples in action recognition tasks. In addition, the second-order information (the lengths and directions of bones) of the skeleton data, which is naturally more informative and discriminative for action recognition, is rarely investigated in existing methods. In this work, we propose a novel two-stream adaptive graph convolutional network (2s-AGCN) for skeleton-based action recognition. The topology of the graph in our model can be either uniformly or individually learned by the BP algorithm in an end-to-end manner. This data-driven method increases the flexibility of the model for graph construction and brings more generality to adapt to various data samples. Moreover, a two-stream framework is proposed to model both the first-order and the second-order information simultaneously, which shows notable improvement for the recognition accuracy. Extensive experiments on the two large-scale datasets, NTU-RGBD and Kinetics-Skeleton, demonstrate that the performance of our model exceeds the state-of-the-art with a significant margin. + + + +
+ +
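+The four input variants used by the configs in this folder (`feats=['j']`, `['b']`, `['jm']`, `['bm']` in `GenSkeFeat`) correspond to the first-order (joint) and second-order (bone) information described above, plus their temporal differences. The NumPy sketch below is a simplified illustration of the idea only; the actual joint pairing and preprocessing are defined by the `coco` / `nturgb+d` layouts inside the `GenSkeFeat` transform and may differ in detail.
+
+```python
+import numpy as np
+
+def to_streams(joint: np.ndarray, parents: np.ndarray) -> dict:
+    """joint: (T, V, C) keypoints over T frames; parents: (V,) parent index of each joint."""
+    bone = joint - joint[:, parents, :]    # second-order info: bone vectors (length + direction)
+    joint_motion = np.diff(joint, axis=0)  # temporal difference of joints ('jm')
+    bone_motion = np.diff(bone, axis=0)    # temporal difference of bones ('bm')
+    return {'j': joint, 'b': bone, 'jm': joint_motion, 'bm': bone_motion}
+```
+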
+ +## Results and Models + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | AGCN | 88.60 | 10 clips | 4.4G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221222-4c0ed77e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | bone | 8 | AGCN | 91.59 | 10 clips | 4.4G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d_20221222-293878b5.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | joint-motion | 8 | AGCN | 88.02 | 10 clips | 4.4G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d_20221222-0c86e3a1.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | bone-motion | 8 | AGCN | 88.82 | 10 clips | 4.4G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d_20221222-87996f0d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.log) | +| | two-stream | | | 91.95 | | | | | | | +| | four-stream | | | 92.34 | | | | | | | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | AGCN | 88.26 | 10 clips | 6.5G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20221222-24dabf78.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) |
+| uniform 100 | bone | 8 | AGCN | 89.22 | 10 clips | 6.5G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d_20221222-abe70a7f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.log) |
+| uniform 100 | joint-motion | 8 | AGCN | 86.73 | 10 clips | 6.5G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d_20221222-923cd3c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.log) |
+| uniform 100 | bone-motion | 8 | AGCN | 86.41 | 10 clips | 6.5G | 3.5M | [config](/configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d_20221222-3d8f6f43.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.log) |
+| | two-stream | | | 90.27 | | | | | | |
+| | four-stream | | | 90.89 | | | | | | |
+
+1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this option scales the learning rate according to the ratio between the actual batch size and the original batch size.
+2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/useful_tools.md#multi-stream-fusion).
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the AGCN model on the NTU60-2D dataset with deterministic training and periodic validation.
+
+```shell
+python tools/train.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \
+    --seed 0 --deterministic
+```
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the AGCN model on the NTU60-2D dataset and dump the result to a pickle file.
+ +```shell +python tools/test.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@inproceedings{shi2019two, + title={Two-stream adaptive graph convolutional networks for skeleton-based action recognition}, + author={Shi, Lei and Zhang, Yifan and Cheng, Jian and Lu, Hanqing}, + booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, + pages={12026--12035}, + year={2019} +} +``` diff --git a/configs/skeleton/2s-agcn/metafile.yml b/configs/skeleton/2s-agcn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d4430ebe240af5891d1f6b9295fca3c366826f --- /dev/null +++ b/configs/skeleton/2s-agcn/metafile.yml @@ -0,0 +1,159 @@ +Collections: + - Name: AGCN + README: configs/skeleton/2s-agcn/README.md + Paper: + URL: https://arxiv.org/abs/1805.07694 + Title: 'Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition' + +Models: + - Name: 2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 4.4G + Parameters: 3.5M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.60 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221222-4c0ed77e.pth + + - Name: 2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 4.4G + Parameters: 3.5M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 91.59 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d_20221222-293878b5.pth + + - Name: 2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 4.4G + Parameters: 3.5M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.02 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d_20221222-0c86e3a1.pth + + - Name: 2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 4.4G + Parameters: 3.5M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.82 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d_20221222-87996f0d.pth + + - Name: 2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 6.5G + Parameters: 3.5M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.26 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20221222-24dabf78.pth + + - Name: 2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 6.5G + Parameters: 3.5M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 89.22 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d_20221222-abe70a7f.pth + + - Name: 2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 6.5G + Parameters: 3.5M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 86.73 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d_20221222-923cd3c3.pth + + - Name: 2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: AGCN + Metadata: + Architecture: AGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 6.5G + Parameters: 3.5M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 86.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/2s-agcn/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/2s-agcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d_20221222-3d8f6f43.pth diff --git a/configs/skeleton/posec3d/README.md b/configs/skeleton/posec3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cc4e49b35266a696fb0339e7b97e5e60d60f1a8f --- /dev/null +++ b/configs/skeleton/posec3d/README.md @@ -0,0 +1,137 @@ +# PoseC3D + +[Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586) + + + +## Abstract + + + +Human skeleton, as a compact representation of human action, has received increasing attention in recent years. Many skeleton-based action recognition methods adopt graph convolutional networks (GCN) to extract features on top of human skeletons. Despite the positive results shown in previous works, GCN-based methods are subject to limitations in robustness, interoperability, and scalability. In this work, we propose PoseC3D, a new approach to skeleton-based action recognition, which relies on a 3D heatmap stack instead of a graph sequence as the base representation of human skeletons. Compared to GCN-based methods, PoseC3D is more effective in learning spatiotemporal features, more robust against pose estimation noises, and generalizes better in cross-dataset settings. Also, PoseC3D can handle multiple-person scenarios without additional computation cost, and its features can be easily integrated with other modalities at early fusion stages, which provides a great design space to further boost the performance. On four challenging datasets, PoseC3D consistently obtains superior performance, when used alone on skeletons and in combination with the RGB modality. + + + +
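+As a rough intuition for the heatmap-based representation described above: each frame's 2D keypoints can be rendered as per-keypoint Gaussian heatmaps, and stacking them over time yields the 3D heatmap volume consumed by the 3D-CNN. The sketch below is a simplified illustration only; the actual logic (including limb heatmaps, score weighting, and cropping) lives in the `GeneratePoseTarget` and related transforms used by the configs below.
+
+```python
+import numpy as np
+
+def keypoints_to_heatmap_volume(kpts, scores, img_h, img_w, sigma=0.6):
+    """kpts: (T, V, 2) 2D keypoints, scores: (T, V) confidences -> (T, V, H, W) pseudo heatmaps."""
+    T, V, _ = kpts.shape
+    ys, xs = np.mgrid[0:img_h, 0:img_w]
+    volume = np.zeros((T, V, img_h, img_w), dtype=np.float32)
+    for t in range(T):
+        for v in range(V):
+            x, y = kpts[t, v]
+            g = np.exp(-((xs - x) ** 2 + (ys - y) ** 2) / (2 * sigma ** 2))
+            volume[t, v] = scores[t, v] * g  # weight each Gaussian by the keypoint confidence
+    return volume  # stacked over T frames: the "3D heatmap volume" fed to the 3D-CNN backbone
+```
+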
+<!-- GIF demos omitted: Pose Estimation Results / Keypoint Heatmap Volume Visualization / Limb Heatmap Volume Visualization -->
+
+ +## Results and Models + +### FineGYM + +| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :--------: | :--------------: | :---: | :----: | :------------------------------------: | :----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) | +| uniform 48 | limb | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) | + +### NTU60_XSub + +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) | +| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) | +| | Fusion | | | 94.0 | | | | | | | + +### UCF101 + +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.8 | 10 clips | 14.6G | 3.1M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) |
+
+### HMDB51
+
+| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: |
+| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.6 | 10 clips | 14.6G | 3.0M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) |
+
+### Kinetics400
+
+| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: |
+| uniform 48 | keypoint | 8 | SlowOnly-R50 | 47.4 | 10 clips | 19.1G | 3.2M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint/slowonly_r50_8xb32-u48-240e_k400-keypoint_20230731-7f498b55.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint/slowonly_r50_8xb32-u48-240e_k400-keypoint.log) |
+
+You can follow the guide in [Preparing Skeleton Dataset](/tools/data/skeleton/README.md) to obtain the skeleton annotations used in the above configs.
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the PoseC3D model on the FineGYM dataset with deterministic training.
+
+```shell
+python tools/train.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py \
+    --seed=0 --deterministic
+```
+
+For training with your custom dataset, you can refer to [Custom Dataset Training](/configs/skeleton/posec3d/custom_dataset_training.md).
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the PoseC3D model on the FineGYM dataset.
+
+```shell
+python tools/test.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py \
+    checkpoints/SOME_CHECKPOINT.pth
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@misc{duan2021revisiting,
+      title={Revisiting Skeleton-based Action Recognition},
+      author={Haodong Duan and Yue Zhao and Kai Chen and Dian Shao and Dahua Lin and Bo Dai},
+      year={2021},
+      eprint={2104.13586},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
diff --git a/configs/skeleton/posec3d/custom_dataset_training.md b/configs/skeleton/posec3d/custom_dataset_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..15d8e10fbe2368568c14aef05db5f75be19f0ffb
--- /dev/null
+++ b/configs/skeleton/posec3d/custom_dataset_training.md
@@ -0,0 +1,41 @@
+# Custom Dataset Training with PoseC3D
+
+We provide a step-by-step tutorial on how to train PoseC3D on your custom dataset.
+
+1. First, note that action recognition with PoseC3D requires skeleton information only, so you need to prepare custom annotation files (for training and validation). To start with, install MMDetection and MMPose. Then use [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py), as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations), to extract 2D keypoints for each video in your custom dataset. The command looks like this (assuming the name of your video is `some_video_from_my_dataset.mp4`):
+
+   ```shell
+   # Run this command for each of your training and validation videos to generate one pickle file per video.
+   python ntu_pose_extraction.py some_video_from_my_dataset.mp4 some_video_from_my_dataset.pkl
+   ```
+
+   @kennymckormick's [note](https://github.com/open-mmlab/mmaction2/issues/1216#issuecomment-950130079):
+
+   > The only thing you may need to change is that, since ntu_pose_extraction.py is developed specifically for pose extraction of NTU videos, you can skip the [ntu_det_postproc](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py#L307) step when using this script to extract poses from your custom video datasets.
+
+2. Then, collect all the training pickle files into one list (and do the same for validation) and save each list as a single file (like `custom_dataset_train.pkl` or `custom_dataset_val.pkl`). At this point, the annotation files for your custom dataset are ready.
+
+3. Next, you can train as shown in [PoseC3D/Train](https://github.com/open-mmlab/mmaction2/blob/master/configs/skeleton/posec3d/README.md#train), adapting the following command to your needs: `python tools/train.py configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py --work-dir work_dirs/slowonly_r50_u48_240e_ntu120_xsub_keypoint --validate --test-best --gpus 2 --seed 0 --deterministic`
+
+   - Before running the above script, you need to modify the variables to point at your newly made annotation files:
+
+     ```python
+     model = dict(
+         ...
+         cls_head=dict(
+             ...
+             num_classes=4, # Your class number
+             ...
+         ),
+         ...
+ ) + + ann_file_train = 'data/posec3d/custom_dataset_train.pkl' # Your annotation for training + ann_file_val = 'data/posec3d/custom_dataset_val.pkl' # Your annotation for validation + + load_from = 'pretrained_weight.pth' # Your can use released weights for initialization, set to None if training from scratch + + # You can also alter the hyper parameters or training schedule + ``` + +With that, your machine should start its work to let you grab a cup of coffee and watch how the training goes. diff --git a/configs/skeleton/posec3d/metafile.yml b/configs/skeleton/posec3d/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..eaa4594b11b6b3cc350c08d461b4750ef3175f98 --- /dev/null +++ b/configs/skeleton/posec3d/metafile.yml @@ -0,0 +1,147 @@ +Collections: + - Name: PoseC3D + README: configs/skeleton/posec3d/README.md + Paper: + URL: https://arxiv.org/abs/2104.13586 + Title: "Revisiting Skeleton-based Action Recognition" + +Models: + - Name: slowonly_r50_8xb16-u48-240e_gym-keypoint + Config: configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py + In Collection: PoseC3D + Metadata: + Architecture: SlowOnly-R50 + Batch Size: 16 + Epochs: 240 + FLOPs: 20.6G + Parameters: 2.0M + Training Data: FineGYM + Training Resources: 8 GPUs + pseudo heatmap: keypoint + Results: + - Dataset: FineGYM + Task: Skeleton-based Action Recognition + Metrics: + mean Top 1 Accuracy: 93.5 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth + + - Name: slowonly_r50_8xb16-u48-240e_gym-limb + Config: configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py + In Collection: PoseC3D + Metadata: + Architecture: SlowOnly-R50 + Batch Size: 16 + Epochs: 240 + FLOPs: 20.6G + Parameters: 2.0M + Training Data: FineGYM + Training Resources: 8 GPUs + pseudo heatmap: limb + Results: + - Dataset: FineGYM + Task: Skeleton-based Action Recognition + Metrics: + mean Top 1 Accuracy: 93.6 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth + + - Name: slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint + Config: configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py + In Collection: PoseC3D + Metadata: + Architecture: SlowOnly-R50 + Batch Size: 16 + Epochs: 240 + FLOPs: 20.6G + Parameters: 2.0M + Training Data: NTU60-XSub + Training Resources: 8 GPUs + pseudo heatmap: keypoint + Results: + - Dataset: NTU60-XSub + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 93.6 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth + + - Name: slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb + Config: configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py + In Collection: PoseC3D + Metadata: + 
Architecture: SlowOnly-R50 + Batch Size: 16 + Epochs: 240 + FLOPs: 20.6G + Parameters: 2.0M + Training Data: NTU60-XSub + Training Resources: 8 GPUs + pseudo heatmap: limb + Results: + - Dataset: NTU60-XSub + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 93.5 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth + + - Name: slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint + Config: configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py + In Collection: PoseC3D + Metadata: + Architecture: SlowOnly-R50 + Batch Size: 16 + Epochs: 120 + FLOPs: 14.6G + Parameters: 3.0M + Training Data: HMDB51 + Training Resources: 8 GPUs + pseudo heatmap: keypoint + Results: + - Dataset: HMDB51 + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 69.6 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth + + - Name: slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint + Config: configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py + In Collection: PoseC3D + Metadata: + Architecture: SlowOnly-R50 + Batch Size: 16 + Epochs: 120 + FLOPs: 14.6G + Parameters: 3.1M + Training Data: UCF101 + Training Resources: 8 GPUs + pseudo heatmap: keypoint + Results: + - Dataset: UCF101 + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 86.8 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth + + - Name: slowonly_r50_8xb32-u48-240e_k400-keypoint + Config: configs/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint.py + In Collection: PoseC3D + Metadata: + Architecture: SlowOnly-R50 + Batch Size: 32 + Epochs: 240 + FLOPs: 19.1G + Parameters: 3.2M + Training Data: Kinetic400 + Training Resources: 8 GPUs + pseudo heatmap: keypoint + Results: + - Dataset: Kinetic400 + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 47.4 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint/slowonly_r50_8xb32-u48-240e_k400-keypoint.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint/slowonly_r50_8xb32-u48-240e_k400-keypoint_20230731-7f498b55.pth diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/README.md b/configs/skeleton/posec3d/rgbpose_conv3d/README.md new file mode 100644 
index 0000000000000000000000000000000000000000..74f74a958fee21df4c057668b1e8c3581997cd5c
--- /dev/null
+++ b/configs/skeleton/posec3d/rgbpose_conv3d/README.md
@@ -0,0 +1,107 @@
+# RGBPoseConv3D
+
+## Introduction
+
+RGBPoseConv3D is a framework that jointly uses 2D human skeletons and RGB appearance for human action recognition. It is a 3D CNN with two streams, with the architecture borrowed from SlowFast. In RGBPoseConv3D:
+
+- The RGB stream corresponds to the `slow` stream in SlowFast; the skeleton stream corresponds to the `fast` stream.
+- The input resolution of RGB frames is `4x` larger than that of the pseudo heatmaps.
+- Bilateral connections are used for early feature fusion between the two modalities.
+
+A minimal sketch of the corresponding backbone configuration is shown below.
+
+ +
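+For concreteness, here is an abridged excerpt of the `backbone_cfg` used in `merge_pretrain.ipynb` later in this PR (most fields are omitted); the comments are explanatory annotations added here, not part of the original config.
+
+```python
+backbone_cfg = dict(
+    type='RGBPoseConv3D',
+    speed_ratio=4,        # temporal ratio between the two pathways, as in SlowFast
+    channel_ratio=4,      # channel ratio between the two pathways, as in SlowFast
+    rgb_pathway=dict(     # the `slow` stream: RGB frames
+        num_stages=4,
+        base_channels=64,
+        lateral=True,                   # receives lateral connections for early fusion
+        lateral_activate=[0, 0, 1, 1],
+        fusion_kernel=7),
+    pose_pathway=dict(    # the `fast` stream: pseudo-heatmap (skeleton) input
+        num_stages=3,
+        in_channels=17,                 # one heatmap channel per keypoint
+        base_channels=32,
+        lateral=True,
+        lateral_inv=True,               # inverted lateral direction; together with the RGB laterals this gives bilateral fusion
+        lateral_activate=(0, 1, 1),
+        fusion_kernel=7))
+```
+
+The `4x` input-resolution gap mentioned above comes from the data pipeline (RGB frames vs. pseudo heatmaps), not from these backbone fields.
+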
+
+## Citation
+
+```BibTeX
+@inproceedings{duan2022revisiting,
+  title={Revisiting skeleton-based action recognition},
+  author={Duan, Haodong and Zhao, Yue and Chen, Kai and Lin, Dahua and Dai, Bo},
+  booktitle={CVPR},
+  pages={2969--2978},
+  year={2022}
+}
+```
+
+## How to train RGBPoseConv3D (on NTURGB+D, for example)?
+
+#### Step 0. Data Preparation
+
+Besides the skeleton annotations, you also need RGB videos to train RGBPoseConv3D. You need to download them from the official website of [NTURGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) and put these videos in `$MMACTION2/data/nturgbd_raw`. After that, use the provided script to compress the raw videos (from `1920x1080` to `960x540`) and change the suffix to `.mp4`:
+
+```bash
+# This step is mandatory, unless you know how to modify the code & config to make it work for raw videos!
+python tools/data/skeleton/compress_nturgbd.py
+```
+
+After that, you will find the processed videos in `$MMACTION2/data/nturgbd_videos`, named like `S001C001P001R001A001.mp4`.
+
+#### Step 1. Pretraining
+
+You first need to train the RGB-only and Pose-only models on the target dataset; their checkpoints will be used to initialize the RGBPoseConv3D model.
+
+You can either train these two models from scratch with the provided config files:
+
+```bash
+# We train each model for 180 epochs. By default, we use 8 GPUs.
+# Train the RGB-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py 8
+# Train the Pose-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py 8
+```
+
+or directly download and use the provided pretrained models:
+
+| Dataset | Config | Checkpoint | Top-1 (1 clip testing) | Top-1 (10 clip testing) |
+| :-----------: | :------------------------------------------------------------------: | :------------------------------------------------------------------------: | :--------------------: | :---------------------: |
+| NTURGB+D XSub | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth) | 94.9 | 95.4 |
+| NTURGB+D XSub | [pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 93.1 | 93.5 |
+
+#### Step 2. Generate the initialization weights for RGBPoseConv3D
+
+You can use the provided [IPython notebook](/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb) to merge the two pretrained models into a single `rgbpose_conv3d_init.pth`.
+
+You can do it on your own or directly download and use the provided [rgbpose_conv3d_init.pth](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth).
+
+#### Step 3. Finetune RGBPoseConv3D
+
+You can use the provided config file to finetune RGBPoseConv3D jointly on the two modalities (RGB & Pose):
+
+```bash
+# We finetune RGBPoseConv3D for 20 epochs on NTURGB+D XSub (8 GPUs)
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py 8
+# After finetuning, you can test the model with the following command (8 GPUs)
+bash tools/dist_test.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py $CKPT 8 --dump result.pkl
+```
+
+**Notes**
+
+1. We use the linear scaling learning rate rule (`Initial LR` ∝ `Batch Size`).
If you change the total training batch size, remember to scale the initial LR proportionally (for example, doubling the batch size means doubling the initial LR).
+
+2. Although optimized, multi-clip testing may still consume a large amount of time. For faster inference, you can change the test_pipeline to disable multi-clip testing; this may lead to a small drop in recognition performance. Below is the guide:
+
+   ```python
+   test_pipeline = [
+       dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8, Pose=32), num_clips=10, test_mode=True),  # change `num_clips=10` to `num_clips=1`
+       dict(type='MMDecode'),
+       dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+       dict(type='Resize', scale=(256, 256), keep_ratio=False),
+       dict(type='GeneratePoseTarget', sigma=0.7, use_score=True, with_kp=True, with_limb=False, scaling=0.25),
+       dict(type='FormatShape', input_format='NCTHW'),
+       dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+   ]
+   ```
+
+## Results
+
+On action recognition with multiple modalities (RGB & Pose), RGBPoseConv3D can achieve better recognition performance than the late fusion baseline.
+
+| Dataset | Fusion | Config | Checkpoint | RGB Stream Top-1
(1-clip / 10-clip) | Pose Stream Top-1
(1-clip / 10-clip) | 2 Stream Top-1 (1:1)
(1-clip / 10-clip) | +| :-----------: | :-------------------: | :-------------------: | :------------------------: | :------------------------------------: | :-------------------------------------: | :----------------------------------------: | +| NTURGB+D XSub | Late Fusion | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py)
[pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth)
[pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 94.9 / 95.4 | 93.1 / 93.5 | 96.0 / 96.2 | +| NTURGB+D XSub | Early Fusion + Late Fusion | [config](/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_20230301-ac7b0e77.pth) | 96.2 / 96.4 | 96.0 / 96.2 | 96.6 / 96.8 | + +**Notes** + +For both `Late Fusion` and `Early Fusion + Late Fusion`, we combine the action scores based on two modalities with 1:1 ratio to get the final prediction. diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d6e952ccfccee2bb25b7b3d8d9266720d2c80ef4 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import copy as cp\n", + "from collections import OrderedDict\n", + "\n", + "import torch\n", + "from mmengine.runner.checkpoint import _load_checkpoint\n", + "\n", + "from mmaction.utils import register_all_modules\n", + "from mmaction.registry import MODELS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "backbone_cfg = dict(\n", + " type='RGBPoseConv3D',\n", + " speed_ratio=4,\n", + " channel_ratio=4,\n", + " rgb_pathway=dict(\n", + " num_stages=4,\n", + " lateral=True,\n", + " lateral_infl=1,\n", + " lateral_activate=[0, 0, 1, 1],\n", + " fusion_kernel=7,\n", + " base_channels=64,\n", + " conv1_kernel=(1, 7, 7),\n", + " inflate=(0, 0, 1, 1),\n", + " with_pool2=False),\n", + " pose_pathway=dict(\n", + " num_stages=3,\n", + " stage_blocks=(4, 6, 3),\n", + " lateral=True,\n", + " lateral_inv=True,\n", + " lateral_infl=16,\n", + " lateral_activate=(0, 1, 1),\n", + " fusion_kernel=7,\n", + " in_channels=17,\n", + " base_channels=32,\n", + " out_indices=(2, ),\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_s=1,\n", + " conv1_stride_t=1,\n", + " pool1_stride_s=1,\n", + " pool1_stride_t=1,\n", + " inflate=(0, 1, 1),\n", + " spatial_strides=(2, 2, 2),\n", + " temporal_strides=(1, 1, 1),\n", + " dilations=(1, 1, 1),\n", + " with_pool2=False))\n", + "head_cfg = dict(\n", + " type='RGBPoseHead',\n", + " num_classes=60,\n", + " in_channels=[2048, 512],\n", + " average_clips='prob')\n", + "model_cfg = dict(\n", + " type='Recognizer3D',\n", + " backbone=backbone_cfg,\n", + " cls_head=head_cfg)\n", + "\n", + "register_all_modules()\n", + "model = MODELS.build(model_cfg)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "# set your paths of the pretrained weights here\n", + "rgb_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth'\n", + "pose_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth'" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by http backend from path: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\rgb_only_20230226-8bd9d8df.pth\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\pose_only_20230226-fa40054e.pth\n" + ] + } + ], + "source": [ + "rgb_ckpt = _load_checkpoint(rgb_filepath, map_location='cpu')['state_dict']\n", + "pose_ckpt = _load_checkpoint(pose_filepath, map_location='cpu')['state_dict']" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "rgb_ckpt = {k.replace('backbone', 'backbone.rgb_path').replace('fc_cls', 'fc_rgb'): v for k, v in rgb_ckpt.items()}\n", + "pose_ckpt = {k.replace('backbone', 'backbone.pose_path').replace('fc_cls', 'fc_pose'): v for k, v in pose_ckpt.items()}" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "old_ckpt = {}\n", + "old_ckpt.update(rgb_ckpt)\n", + "old_ckpt.update(pose_ckpt)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [], + "source": [ + "# The difference is in dim-1\n", + "def padding(weight, new_shape):\n", + " new_weight = weight.new_zeros(new_shape)\n", + " new_weight[:, :weight.shape[1]] = weight\n", + " return new_weight" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [], + "source": [ + "ckpt = cp.deepcopy(old_ckpt)\n", + "name = 'backbone.rgb_path.layer3.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (256, 640, 3, 1, 1))\n", + "name = 'backbone.rgb_path.layer3.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (1024, 640, 1, 1, 1))\n", + "name = 'backbone.rgb_path.layer4.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (512, 1280, 3, 1, 1))\n", + "name = 'backbone.rgb_path.layer4.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (2048, 1280, 1, 1, 1))\n", + "name = 'backbone.pose_path.layer2.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (64, 160, 3, 1, 1))\n", + "name = 'backbone.pose_path.layer2.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (256, 160, 1, 1, 1))\n", + "name = 'backbone.pose_path.layer3.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (128, 320, 3, 1, 1))\n", + "name = 'backbone.pose_path.layer3.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (512, 320, 1, 1, 1))\n", + "ckpt = OrderedDict(ckpt)\n", + "torch.save({'state_dict': ckpt}, 'rgbpose_conv3d_init.pth')" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + 
"cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": "_IncompatibleKeys(missing_keys=['backbone.rgb_path.layer2_lateral.conv.weight', 'backbone.rgb_path.layer3_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.bn.weight', 'backbone.pose_path.layer1_lateral.bn.bias', 'backbone.pose_path.layer1_lateral.bn.running_mean', 'backbone.pose_path.layer1_lateral.bn.running_var', 'backbone.pose_path.layer2_lateral.conv.weight', 'backbone.pose_path.layer2_lateral.bn.weight', 'backbone.pose_path.layer2_lateral.bn.bias', 'backbone.pose_path.layer2_lateral.bn.running_mean', 'backbone.pose_path.layer2_lateral.bn.running_var'], unexpected_keys=[])" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.load_state_dict(ckpt, strict=False)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py new file mode 100644 index 0000000000000000000000000000000000000000..263c6c9b04cc59970082560f51b34e73f3eb58b8 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py @@ -0,0 +1,127 @@ +_base_ = '../../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=32), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=32, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=32, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), 
keep_ratio=False), + dict( + type='GeneratePoseTarget', + with_kp=True, + with_limb=False, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=18, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py new file mode 100644 index 0000000000000000000000000000000000000000..f825a59e219353f92510b0e80140a77fefcc4183 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py @@ -0,0 +1,126 @@ +_base_ = '../../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +dataset_type = 'PoseDataset' +data_root = 'data/nturgbd_videos/' +ann_file = 'data/skeleton/ntu60_2d.pkl' + +train_pipeline = [ + dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8), num_clips=1), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8), + num_clips=1, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + 
type='MMUniformSampleFrames', + clip_len=dict(RGB=8), + num_clips=10, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=18, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (12 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=96) diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py new file mode 100644 index 0000000000000000000000000000000000000000..cbcc1aa6d1c1e094cd7f6c7638f537c5bd4ecc48 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py @@ -0,0 +1,190 @@ +_base_ = '../../../_base_/default_runtime.py' + +# model_cfg +backbone_cfg = dict( + type='RGBPoseConv3D', + speed_ratio=4, + channel_ratio=4, + rgb_pathway=dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=[0, 0, 1, 1], + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway=dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False)) +head_cfg = dict( + type='RGBPoseHead', + num_classes=60, + in_channels=[2048, 512], + loss_components=['rgb', 'pose'], + loss_weights=[1., 1.], + average_clips='prob') +data_preprocessor = dict( + type='MultiModalDataPreprocessor', + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) +model = dict( + type='MMRecognizer3D', + backbone=backbone_cfg, + cls_head=head_cfg, + data_preprocessor=data_preprocessor) + +dataset_type = 'PoseDataset' +data_root = 'data/nturgbd_videos/' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=1), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] +val_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=1, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] +test_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + 
dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] + +train_dataloader = dict( + batch_size=6, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_train', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.0075, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[12, 16], + gamma=0.1) +] + +load_from = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth' # noqa: E501 + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5e4ab88ced288d9689b84c7927f94b3f392061 --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py @@ -0,0 +1,146 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(3, 4, 6), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=51, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + train_cfg=None, + test_cfg=None) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/hmdb51_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(48, 48), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 56)), + dict(type='CenterCrop', crop_size=56), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 56)), + dict(type='CenterCrop', crop_size=56), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train1', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='test1', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + 
ann_file=ann_file, + split='test1', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[9, 11], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +load_from = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/k400_posec3d-041f49c6.pth' # noqa: E501 diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6e52e0f2ea62142defa67a29ee7f4a2fbfbd3b --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py @@ -0,0 +1,146 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(3, 4, 6), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=101, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + train_cfg=None, + test_cfg=None) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ucf101_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(48, 48), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 56)), + dict(type='CenterCrop', crop_size=56), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 56)), + dict(type='CenterCrop', crop_size=56), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = 
dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train1', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='test1', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='test1', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[9, 11], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +load_from = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/k400_posec3d-041f49c6.pth' # noqa: E501 diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..30d69bc4308a45984a80820c985715abf69fe268 --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py @@ -0,0 +1,139 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=99, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/gym_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + 
dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=24, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py new file mode 100644 index 0000000000000000000000000000000000000000..e29e1c739381a6ceaafe57c638d4a4ed995c479f --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py @@ -0,0 +1,149 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=99, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/gym_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], + [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], + [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, 
right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons, + double=True, + left_kp=left_kp, + right_kp=right_kp, + left_limb=left_limb, + right_limb=right_limb), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline))), +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=24, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2e92415a50d7a980f49a22a52e087dbfa62b40b9 --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py @@ -0,0 +1,138 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob')) + 
+dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=24, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa05edfad2025985a71873d23e54da520ba62dd --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py @@ -0,0 +1,146 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + 
type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], + [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], + [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='CenterCrop', crop_size=64), + dict( + type='GeneratePoseTarget', + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons, + double=True, + left_limb=left_limb, + right_limb=right_limb), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + 
type='CosineAnnealingLR', + eta_min=0, + T_max=24, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..456ac8a57859c8724472b7e1c025b2f4335c2c88 --- /dev/null +++ b/configs/skeleton/posec3d/slowonly_r50_8xb32-u48-240e_k400-keypoint.py @@ -0,0 +1,146 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(3, 4, 6), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +data_root = 'data/skeleton/kpfiles' +ann_file = 'data/skeleton/k400_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +box_thr = 0.5 +valid_ratio = 0.0 + +train_pipeline = [ + dict(type='DecompressPose', squeeze=True), + dict(type='UniformSampleFrames', clip_len=48), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(-1, 64)), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecompressPose', squeeze=True), + dict(type='UniformSampleFrames', clip_len=48, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecompressPose', squeeze=True), + dict( + type='UniformSampleFrames', clip_len=48, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict( + type='GeneratePoseTarget', + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline, + box_thr=box_thr, + data_prefix=dict(skeleton=data_root)))) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='val', + pipeline=val_pipeline, + box_thr=box_thr, + 
data_prefix=dict(skeleton=data_root), + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='val', + pipeline=test_pipeline, + box_thr=box_thr, + data_prefix=dict(skeleton=data_root), + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=24, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.4, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) diff --git a/configs/skeleton/stgcn/README.md b/configs/skeleton/stgcn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d42ddb97b69157aba06621a646eb62101ca0a1c0 --- /dev/null +++ b/configs/skeleton/stgcn/README.md @@ -0,0 +1,111 @@ +# STGCN + +[Spatial temporal graph convolutional networks for skeleton-based action recognition](https://ojs.aaai.org/index.php/AAAI/article/view/12328) + + + +## Abstract + + + +Dynamics of human body skeletons convey significant information for human action recognition. Conventional approaches for modeling skeletons usually rely on hand-crafted parts or traversal rules, thus resulting in limited expressive power and difficulties of generalization. In this work, we propose a novel model of dynamic skeletons called Spatial-Temporal Graph Convolutional Networks (ST-GCN), which moves beyond the limitations of previous methods by automatically learning both the spatial and temporal patterns from data. This formulation not only leads to greater expressive power but also stronger generalization capability. On two large datasets, Kinetics and NTU-RGBD, it achieves substantial improvements over mainstream methods. + + + +
+ +
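To make the idea in the abstract concrete, the snippet below is a minimal, schematic ST-GCN block in PyTorch: a 1x1 convolution mixes channels per joint, a normalized adjacency matrix aggregates neighboring joints (the spatial graph convolution), and a temporal convolution then models motion across frames. The class name, tensor shapes, and the identity placeholder for the adjacency matrix `A` are illustrative assumptions only and do not mirror the exact `STGCN` implementation shipped in MMAction2.

```python
# Schematic ST-GCN block: spatial graph convolution over joints followed by a
# temporal convolution over frames. Shapes and the adjacency are placeholders.
import torch
import torch.nn as nn


class STGCNBlock(nn.Module):
    """One spatial-temporal block: GCN over the joint graph, then temporal conv."""

    def __init__(self, in_channels, out_channels, A, t_kernel=9):
        super().__init__()
        # Normalized adjacency of the skeleton graph, shape (V, V).
        self.register_buffer('A', A)
        self.gcn = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.tcn = nn.Sequential(
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels,
                      kernel_size=(t_kernel, 1),
                      padding=((t_kernel - 1) // 2, 0)),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True))

    def forward(self, x):
        # x: (N, C, T, V) -- batch, channels, frames, joints
        x = self.gcn(x)                                # mix channels per joint
        x = torch.einsum('nctv,vw->nctw', x, self.A)   # aggregate neighbor joints
        return self.tcn(x)                             # model temporal patterns


# Toy usage: 17 COCO keypoints, 100 frames, batch of 2.
V = 17
A = torch.eye(V)  # placeholder adjacency; real models use the skeleton graph
block = STGCNBlock(3, 64, A)
out = block(torch.randn(2, 3, 100, V))  # -> (2, 64, 100, 17)
```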
+ +## Results and Models + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | STGCN | 88.95 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221129-484a394a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | bone | 8 | STGCN | 91.69 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d_20221129-c4b44488.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | joint-motion | 8 | STGCN | 86.90 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d_20221129-f18eb408.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | bone-motion | 8 | STGCN | 87.86 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d_20221129-99c60e2d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.log) | +| | two-stream | | | 92.12 | | | | | | | +| | four-stream | | | 92.34 | | | | | | | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | STGCN | 88.11 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20221129-850308e1.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | +| uniform 100 | bone | 8 | STGCN | 88.76 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d_20221129-9c8d2970.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.log) | +| uniform 100 | joint-motion | 8 | STGCN | 86.06 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d_20221129-927648ea.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.log) | +| uniform 100 | bone-motion | 8 | STGCN | 85.49 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d_20221129-593162ca.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.log) | +| | two-stream | | | 90.14 | | | | | | | +| | four-stream | | | 90.39 | | | | | | | + +### NTU120_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | STGCN | 83.19 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d_20221129-612416c6.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.log) | +| uniform 100 | bone | 8 | STGCN | 83.36 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d_20221129-131e63c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.log) | +| uniform 100 | joint-motion | 8 | STGCN | 78.87 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d_20221129-7cb38ec2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.log) | +| uniform 100 | bone-motion | 8 | STGCN | 79.55 | 10 clips | 3.8G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d_20221129-f5b19892.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.log) | +| | two-stream | | | 84.84 | | | | | | | +| | four-stream | | | 85.23 | | | | | | | + +### NTU120_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | STGCN | 82.15 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d_20221129-0484f579.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.log) | +| uniform 100 | bone | 8 | STGCN | 84.28 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d_20221129-bc007510.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.log) | +| uniform 100 | joint-motion | 8 | STGCN | 78.93 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d_20221129-5d54f525.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.log) | +| uniform 100 | bone-motion | 8 | STGCN | 80.02 | 10 clips | 5.7G | 3.1M | [config](/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d_20221129-3cb0e4e1.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.log) |
+| | two-stream | | | 85.68 | | | | | | |
+| | four-stream | | | 86.19 | | | | | | |
+
+1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. If you want to use a different number of GPUs or a different number of videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this flag scales the learning rate according to the ratio between the actual batch size and the original batch size.
+2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/useful_tools.md#multi-stream-fusion); a minimal fusion sketch is also given at the end of the Test section below.
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the STGCN model on the NTU60-2D dataset deterministically, with periodic validation.
+
+```shell
+python tools/train.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \
+    --seed 0 --deterministic
+```
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the STGCN model on the NTU60-2D dataset and dump the results to a pickle file.
+
+```shell
+python tools/test.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \
+    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
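+
+The fusion weights from note 2 above can be applied directly to dumped score files. The following is a minimal sketch, not a script shipped with this repo: it assumes each stream was tested separately with `--dump <stream>.pkl`, that every dumped entry exposes a `pred_score` vector and a `gt_label` (adjust the two accessors if your dump differs), and the file names are hypothetical. It simply sums the weighted class scores and reports top-1 accuracy.
+
+```python
+import pickle
+
+import numpy as np
+
+
+def load_scores(path):
+    """Stack per-sample class scores and labels from one dumped pickle."""
+    with open(path, 'rb') as f:
+        results = pickle.load(f)
+    scores = np.stack([np.asarray(r['pred_score']) for r in results])
+    # gt_label may be a plain int, an array or a tensor; flatten defensively.
+    labels = np.array([np.asarray(r['gt_label']).reshape(-1)[0] for r in results])
+    return scores, labels
+
+
+# Hypothetical per-stream dumps produced by `tools/test.py ... --dump`.
+streams = ['joint.pkl', 'joint_motion.pkl', 'bone.pkl', 'bone_motion.pkl']
+weights = [2, 1, 2, 1]  # four-stream ratio from note 2; use [1, 1] for joint + bone
+
+fused, labels = None, None
+for path, weight in zip(streams, weights):
+    scores, labels = load_scores(path)
+    fused = weight * scores if fused is None else fused + weight * scores
+
+top1 = float((fused.argmax(axis=1) == labels).mean())
+print(f'four-stream top-1 accuracy: {top1:.4f}')
+```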
+ +## Citation + +```BibTeX +@inproceedings{yan2018spatial, + title={Spatial temporal graph convolutional networks for skeleton-based action recognition}, + author={Yan, Sijie and Xiong, Yuanjun and Lin, Dahua}, + booktitle={Thirty-second AAAI conference on artificial intelligence}, + year={2018} +} +``` diff --git a/configs/skeleton/stgcn/metafile.yml b/configs/skeleton/stgcn/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..32aa80857a7549bb7f6312b7e4d51a2608ba5d8d --- /dev/null +++ b/configs/skeleton/stgcn/metafile.yml @@ -0,0 +1,311 @@ +Collections: + - Name: STGCN + README: configs/skeleton/stgcn/README.md + Paper: + URL: https://arxiv.org/abs/1801.07455 + Title: 'Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition' + +Models: + - Name: stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.95 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221129-484a394a.pth + + - Name: stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 91.69 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d_20221129-c4b44488.pth + + - Name: stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 86.90 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d_20221129-f18eb408.pth + + - Name: stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + 
Parameters: 3.1M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 87.86 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d_20221129-99c60e2d.pth + + - Name: stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.11 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20221129-850308e1.pth + + - Name: stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.76 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d_20221129-9c8d2970.pth + + - Name: stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 86.06 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d_20221129-927648ea.pth + + - Name: stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 85.49 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d_20221129-593162ca.pth + + - Name: stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU120-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 83.19 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d_20221129-612416c6.pth + + - Name: stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU120-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 83.36 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d_20221129-131e63c3.pth + + - Name: stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU120-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 78.87 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d_20221129-7cb38ec2.pth + + - Name: stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 3.8G + Parameters: 3.1M + Training Data: NTU120-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 79.55 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d_20221129-f5b19892.pth + + - Name: stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU120-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 82.15 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d_20221129-0484f579.pth + + - Name: stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU120-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 84.28 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d_20221129-bc007510.pth + + - Name: stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU120-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 78.93 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d_20221129-5d54f525.pth + + - Name: stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d + Config: configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.py + In Collection: STGCN + Metadata: + Architecture: STGCN + Batch Size: 16 + Epochs: 80 + FLOPs: 5.7G + Parameters: 3.1M + Training Data: NTU120-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU120-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 80.02 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d_20221129-3cb0e4e1.pth diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..3089ea4df3a52b6349cfa30f3709b3ae17c929b8 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0d01696391d69133c8ede26152668c89b0fb77cf --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu120-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] 
+test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..843e109930e869c4978833f332c55e63fd6df144 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 
100644 index 0000000000000000000000000000000000000000..49f79ed3eb7ff1f5c6c86a6fb6407d6ece02b7bd --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..7594ff49213f06ad3eae211260980302bed38ab9 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..6500ebb7709e80c546ebb18f5284fa454c03579d --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu120-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..7a5881b6def48fe62836bfc0c804866603b58a4b --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + 
dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..d413c9e19fea4daf459201145ce418385889cb28 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + 
split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a01c3490b78ace283442ea1cb0fa06f528cffba1 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..8c7f6c1694ce72c39969cfbb27700929d279b37e --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu120-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + 
dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..05b964fe1ea163829504f25c943db4a49828b8ea --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git 
a/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..1ddf2f4321e0ce12498632928f21ecd78ed6dcc6 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b6936b056d1817e76bbd2ffec15a133b0b07e3 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-2d.py @@ -0,0 +1,102 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='coco', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=120, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + 
dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..f0d827484ffe4724371a452eeab0f0b8ba50113e --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu120-xsub-keypoint-3d.py @@ -0,0 +1,102 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='nturgb+d', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=120, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu120_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf4cff813cf724b1ad4b990ab5057f8f88fd1e8 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,102 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='coco', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..21a8b1a6ae039b51c18300d46919e1c40d3ee612 --- /dev/null +++ b/configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,102 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='nturgb+d', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/stgcnpp/README.md b/configs/skeleton/stgcnpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3799300b99dd4aee6157b658c54c2c0b368e9443 --- /dev/null +++ b/configs/skeleton/stgcnpp/README.md @@ -0,0 +1,84 @@ +# STGCN++ + +[PYSKL: Towards Good Practices for Skeleton Action Recognition](https://arxiv.org/abs/2205.09443) + + + +## Abstract + + + +We present PYSKL: an open-source toolbox for skeleton-based action recognition based on PyTorch. The toolbox supports a wide variety of skeleton action recognition algorithms, including approaches based on GCN and CNN. In contrast to existing open-source skeleton action recognition projects that include only one or two algorithms, PYSKL implements six different algorithms under a unified framework with both the latest and original good practices to ease the comparison of efficacy and efficiency. We also provide an original GCN-based skeleton action recognition model named ST-GCN++, which achieves competitive recognition performance without any complicated attention schemes, serving as a strong baseline. Meanwhile, PYSKL supports the training and testing of nine skeleton-based action recognition benchmarks and achieves state-of-the-art recognition performance on eight of them. To facilitate future research on skeleton action recognition, we also provide a large number of trained models and detailed benchmark results to give some insights. PYSKL is released at this https URL and is actively maintained. We will update this report when we add new features or benchmarks. The current version corresponds to PYSKL v0.2. + +## Results and Models + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | STGCN++ | 89.29 | 10 clips | 1.95G | 1.39M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221228-86e1e77a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | bone | 8 | STGCN++ | 92.30 | 10 clips | 1.95G | 1.39M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d_20221228-cd11a691.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | joint-motion | 8 | STGCN++ | 87.30 | 10 clips | 1.95G | 1.39M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d_20221228-19a34aba.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.log) | +| uniform 100 | bone-motion | 8 | STGCN++ | 88.76 | 10 clips | 1.95G | 1.39M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d_20221228-c02a0749.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.log) | +| | two-stream | | | 92.61 | | | | | | | +| | four-stream | | | 92.77 | | | | | | | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :----------: | :--: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| uniform 100 | joint | 8 | STGCN++ | 89.14 | 10 clips | 2.96G | 1.4M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20221230-4e455ce3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | +| uniform 100 | bone | 8 | STGCN++ | 90.21 | 10 clips | 2.96G | 1.4M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d_20221230-7f356072.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.log) | +| uniform 100 | joint-motion | 8 | STGCN++ | 86.67 | 10 clips | 2.96G | 1.4M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d_20221230-650de5cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.log) | +| uniform 100 | bone-motion | 8 | STGCN++ | 87.45 | 10 clips | 2.96G | 1.4M | [config](/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d_20221230-b00440d2.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.log) | +| | two-stream | | | 91.39 | | | | | | | +| | four-stream | | | 91.87 | | | | | | | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/useful_tools.md#multi-stream-fusion). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train STGCN++ model on NTU60-2D dataset in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test STGCN++ model on NTU60-2D dataset and dump the result to a pickle file. + +```shell +python tools/test.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
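As a complement to the multi-stream fusion ratios listed above (joint : bone = 1 : 1 for two-stream; joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1 for four-stream), the snippet below is a minimal sketch, not part of the codebase, of how the two-stream / four-stream numbers can be reproduced by weighted late fusion of per-stream classification scores. It assumes you have already collected each stream's scores into a NumPy array of shape `(num_samples, num_classes)` (for example, from results dumped with `--dump`); the helper names and the loading step are hypothetical.

```python
import numpy as np

def fuse_streams(stream_scores, stream_weights):
    """Weighted late fusion of per-stream score arrays.

    Both arguments are dicts keyed by stream name; each score array is
    assumed to have shape (num_samples, num_classes).
    """
    return sum(stream_weights[name] * scores for name, scores in stream_scores.items())

def top1_accuracy(scores, labels):
    """Top-1 accuracy of (fused) scores against integer ground-truth labels."""
    return float((scores.argmax(axis=1) == np.asarray(labels)).mean())

# Hypothetical usage for four-stream fusion with
# joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1:
# scores = {'joint': j, 'joint_motion': jm, 'bone': b, 'bone_motion': bm}
# weights = {'joint': 2, 'joint_motion': 1, 'bone': 2, 'bone_motion': 1}
# print(top1_accuracy(fuse_streams(scores, weights), labels))
```

For two-stream fusion, the same sketch applies with only the joint and bone streams and equal weights (1 : 1).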
+ +## Citation + +```BibTeX +@misc{duan2022PYSKL, + url = {https://arxiv.org/abs/2205.09443}, + author = {Duan, Haodong and Wang, Jiaqi and Chen, Kai and Lin, Dahua}, + title = {PYSKL: Towards Good Practices for Skeleton Action Recognition}, + publisher = {arXiv}, + year = {2022} +} +``` diff --git a/configs/skeleton/stgcnpp/metafile.yml b/configs/skeleton/stgcnpp/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..daf79fb49b9ee59ed43ccf49dd044b4f6c9b3c97 --- /dev/null +++ b/configs/skeleton/stgcnpp/metafile.yml @@ -0,0 +1,159 @@ +Collections: + - Name: STGCN++ + README: configs/skeleton/stgcnpp/README.md + Paper: + URL: https://arxiv.org/abs/2205.09443 + Title: 'PYSKL: Towards Good Practices for Skeleton Action Recognition' + +Models: + - Name: stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 1.95G + Parameters: 1.39M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 89.29 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221228-86e1e77a.pth + + - Name: stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 1.95G + Parameters: 1.39M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 92.30 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d_20221228-cd11a691.pth + + - Name: stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 1.95G + Parameters: 1.39M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 87.30 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d_20221228-19a34aba.pth + + - Name: stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + 
Batch Size: 16 + Epochs: 80 + FLOPs: 1.95G + Parameters: 1.39M + Training Data: NTU60-XSub-2D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-2D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 88.76 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d_20221228-c02a0749.pth + + - Name: stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 2.96G + Parameters: 1.4M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 89.14 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20221230-4e455ce3.pth + + - Name: stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 2.96G + Parameters: 1.4M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 90.21 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d_20221230-7f356072.pth + + - Name: stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 2.96G + Parameters: 1.4M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: + Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 86.67 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d_20221230-650de5cc.pth + + - Name: stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d + Config: configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py + In Collection: STGCN++ + Metadata: + Architecture: STGCN++ + Batch Size: 16 + Epochs: 80 + FLOPs: 2.96G + Parameters: 1.4M + Training Data: NTU60-XSub-3D + Training Resources: 8 GPUs + Results: 
+ Dataset: NTU60-XSub-3D + Task: Skeleton-based Action Recognition + Metrics: + Top 1 Accuracy: 87.45 + Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.log + Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d_20221230-b00440d2.pth diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..422d8b883968d7d6cdde9091fc276c4b3940c981 --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..26bbe6618dd3fc1ed28380fde1007b81ec76a507 --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-motion-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + 
dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['bm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..1a329662ad1d2336efb11953f5966408d62d4c03 --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..bbb31cc108a0ba693fe53aa2603c6c97dd1fd085 --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-bone-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['b']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bbe90186ce93782e5cbaa4521707a329de705a --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + 
dict(type='GenSkeFeat', dataset='coco', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..23cfe7690fe3a7f95e38fc8040e65ecb5b59df8d --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-motion-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,67 @@ +_base_ = 'stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py' + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['jm']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 
0000000000000000000000000000000000000000..a423e7101731d46f0543bd9bc7ce9c1f9f744c26 --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,106 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', + gcn_adaptive='init', + gcn_with_res=True, + tcn_type='mstcn', + graph_cfg=dict(layout='coco', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..04cb0edd45b84b8d49763745bb2163be5639d5f9 --- /dev/null +++ b/configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,106 @@ +_base_ = '../../_base_/default_runtime.py' + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', + gcn_adaptive='init', + gcn_with_res=True, + tcn_type='mstcn', + graph_cfg=dict(layout='nturgb+d', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/dataset-index.yml b/dataset-index.yml new file mode 100644 index 0000000000000000000000000000000000000000..de9f03728f0d47eff0f2a47ec55ddbbcda2ae0c4 --- /dev/null +++ b/dataset-index.yml @@ -0,0 +1,40 @@ +openxlab: true +kinetics400: + dataset: OpenMMLab/Kinetics-400 + download_root: data + data_root: data/kinetics400 + script: tools/data/kinetics/preprocess_k400.sh + +kinetics600: + dataset: OpenMMLab/Kinetics600 + download_root: data + data_root: data/kinetics600 + script: tools/data/kinetics/preprocess_k600.sh + +kinetics700: + dataset: OpenMMLab/Kinetics_700 + download_root: data + data_root: data/kinetics700 + script: tools/data/kinetics/preprocess_k700.sh + +sthv2: + dataset: OpenDataLab/sthv2 + download_root: data + data_root: data/sthv2 + script: tools/data/sthv2/preprocess.sh + +ucf-101: + dataset: OpenDataLab/UCF101 + download_root: data + data_root: data/ucf101 + +finegym: + dataset: OpenDataLab/FineGym + download_root: data + data_root: data/gym + +diving48: + dataset: OpenDataLab/diving48 + download_root: data + data_root: data/diving48 + script: tools/data/diving48/preprocess.sh diff --git a/demo/README.md b/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..70b8b065b4950066b2f2cb59a8cb4311888d053c --- /dev/null +++ b/demo/README.md @@ -0,0 +1,743 @@ +# Demo + +## Outline + +- [Modify configs through script arguments](#modify-config-through-script-arguments): Tricks to directly modify configs through script arguments. +- [Video demo](#video-demo): A demo script to predict the recognition result using a single video. +- [Video GradCAM Demo](#video-gradcam-demo): A demo script to visualize GradCAM results using a single video. +- [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera. +- [Long Video demo](#long-video-demo): a demo script to predict different labels using a single long video. +- [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video. +- [SpatioTemporal Action Detection Webcam Demo](#spatiotemporal-action-detection-webcam-demo): A demo script to implement real-time spatio-temporal action detection from a web camera. +- [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the spatiotemporal action detection result using a single video. +- [SpatioTemporal Action Detection ONNX Video Demo](#spatiotemporal-action-detection-onnx-video-demo): A demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. +- [Inferencer Demo](#inferencer): A demo script to implement fast predict for video analysis tasks based on unified inferencer interface. +- [Audio Demo](#audio-demo): A demo script to predict the recognition result using a single audio file. +- [Video Structuralize Demo](#video-structuralize-demo): A demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection result using a single video. + +## Modify configs through script arguments + +When running demos using our provided scripts, you may specify `--cfg-options` to in-place modify the config. + +- Update config keys of dict. + + The config options can be specified following the order of the dict keys in the original config. 
+ For example, `--cfg-options model.backbone.norm_eval=False` changes all the BN modules in the model backbones to `train` mode. + +- Update keys inside a list of configs. + + Some config dicts are composed as a list in your config. For example, the training pipeline `train_dataloader.dataset.pipeline` is normally a list + e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline, + you may specify `--cfg-options train_dataloader.dataset.pipeline.0.type=DenseSampleFrames`. + +- Update values of list/tuples. + + If the value to be updated is a list or a tuple: for example, the config file normally sets `workflow=[('train', 1)]`. If you want to + change this key, you may specify `--cfg-options workflow="[(train,1),(val,1)]"`. Note that the quotation mark " is necessary to + support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value. + +## Video demo + +MMAction2 provides a demo script to predict the recognition result using a single video. In order to get prediction results in the range `[0, 1]`, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file. + +```shell +python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \ + [--device ${DEVICE_TYPE}] [--fps ${FPS}] [--font-scale ${FONT_SCALE}] [--font-color ${FONT_COLOR}] \ + [--target-resolution ${TARGET_RESOLUTION}] [--out-filename ${OUT_FILE}] +``` + +Optional arguments: + +- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30. +- `FONT_SCALE`: Font scale of the text added in the video. If not specified, it will be None. +- `FONT_COLOR`: Font color of the text added in the video. If not specified, it will be `white`. +- `TARGET_RESOLUTION`: Resolution (desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio. +- `OUT_FILE`: Path to the output file, which can be in video or gif format. If not specified, it will be set to `None` and no output file will be generated. + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use a checkpoint url from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Recognize a video file as input by using a TSN model on cuda by default. + + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt + ``` + +2. Recognize a video file as input by using a TSN model on cuda by default, loading checkpoint from url.
+ + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt + ``` + +3. Recognize a video file as input by using a TSN model and then generate an mp4 file. + + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --out-filename demo/demo_out.mp4 + ``` + +## Video GradCAM Demo + +MMAction2 provides a demo script to visualize GradCAM results using a single video. + +```shell +python tools/visualizations/vis_cam.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} [--use-frames] \ + [--device ${DEVICE_TYPE}] [--target-layer-name ${TARGET_LAYER_NAME}] [--fps {FPS}] \ + [--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}] +``` + +- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30. +- `OUT_FILE`: Path to the output file, which can be in video or gif format. If not specified, it will be set to `None` and no output file will be generated. +- `TARGET_LAYER_NAME`: Layer name to generate the GradCAM localization map. +- `TARGET_RESOLUTION`: Resolution (desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio. +- `RESIZE_ALGORITHM`: Resize algorithm used for resizing. If not specified, it will be set to `bilinear`. + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use a checkpoint url from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Get GradCAM results of an I3D model, using a video file as input and then generate a gif file with 10 fps. + + ```shell + python tools/visualizations/vis_cam.py demo/demo_configs/i3d_r50_32x2x1_video_infer.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth demo/demo.mp4 \ + --target-layer-name backbone/layer4/1/relu --fps 10 \ + --out-filename demo/demo_gradcam.gif + ``` + +2. Get GradCAM results of a TSN model, using a video file as input and then generate a gif file, loading checkpoint from url.
+ + ```shell + python tools/visualizations/vis_cam.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb_20220906-dcbc6e01.pth \ + demo/demo.mp4 --target-layer-name backbone/layer4/1/relu --out-filename demo/demo_gradcam_tsn.gif + ``` + +## Webcam demo + +We provide a demo script to implement real-time action recognition from a web camera. In order to get prediction results in the range `[0, 1]`, make sure to set `model.cls_head.average_clips='prob'` in the config file. + +```shell +python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \ + [--device ${DEVICE_TYPE}] [--camera-id ${CAMERA_ID}] [--threshold ${THRESHOLD}] \ + [--average-size ${AVERAGE_SIZE}] [--drawing-fps ${DRAWING_FPS}] [--inference-fps ${INFERENCE_FPS}] +``` + +Optional arguments: + +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `CAMERA_ID`: ID of the camera device. If not specified, it will be set to 0. +- `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with scores higher than the threshold will be shown. If not specified, it will be set to 0. +- `AVERAGE_SIZE`: Number of latest clips to be averaged for prediction. If not specified, it will be set to 1. +- `DRAWING_FPS`: Upper bound FPS value of the output drawing. If not specified, it will be set to 20. +- `INFERENCE_FPS`: Upper bound FPS value of model inference. If not specified, it will be set to 4. + +If your hardware is good enough, increasing the value of `DRAWING_FPS` and `INFERENCE_FPS` will give a better experience. + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use a checkpoint url from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Recognize the action from a web camera as input by using a TSN model on cpu, averaging the scores over the latest 5 clips + and outputting result labels with score higher than 0.2. + + ```shell + python demo/webcam_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth tools/data/kinetics/label_map_k400.txt --average-size 5 \ + --threshold 0.2 --device cpu + ``` + +2. Recognize the action from a web camera as input by using a TSN model on cpu, averaging the scores over the latest 5 clips + and outputting result labels with score higher than 0.2, loading checkpoint from url. + + ```shell + python demo/webcam_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \ + tools/data/kinetics/label_map_k400.txt --average-size 5 --threshold 0.2 --device cpu + ``` + +3. Recognize the action from a web camera as input by using an I3D model on gpu by default, averaging the scores over the latest 5 clips + and outputting result labels with score higher than 0.2.
+ + ```shell + python demo/webcam_demo.py demo/demo_configs/i3d_r50_32x2x1_video_infer.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth tools/data/kinetics/label_map_k400.txt \ + --average-size 5 --threshold 0.2 + ``` + +Considering the efficiency differences across users' hardware, some modifications might be needed to suit your case. +Users can change: + +- `SampleFrames` step (especially the number of `clip_len` and `num_clips`) of `test_pipeline` in the config file, like `--cfg-options test_pipeline.0.num_clips=3`. +- The crop method (`TenCrop`, `ThreeCrop`, `CenterCrop`, etc.) in `test_pipeline` of the config file, like `--cfg-options test_pipeline.4.type=CenterCrop`. +- The value of `--average-size`. The smaller, the faster. + +## Long video demo + +We provide a demo script to predict different labels using a single long video. In order to get prediction results in the range `[0, 1]`, make sure to set `cls_head = dict(average_clips='prob')` in the config file. + +```shell +python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \ + ${OUT_FILE} [--input-step ${INPUT_STEP}] [--device ${DEVICE_TYPE}] [--threshold ${THRESHOLD}] +``` + +Optional arguments: + +- `OUT_FILE`: Path to the output, either a video or a json file. +- `INPUT_STEP`: Input step for sampling frames, which can help to get a sparser input. If not specified, it will be set to 1. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with scores higher than the threshold will be shown. If not specified, it will be set to 0.01. +- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames (`sample_length` indicates the size of the temporal window from which you sample frames, which equals `clip_len x frame_interval`). For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. The desired value of `STRIDE` is (0, 1\], while it also works for `STRIDE > 1` (the generated predictions will be too sparse). Default: 0. +- `LABEL_COLOR`: Font color of the labels in (B, G, R). Default is white, that is (256, 256, 256). +- `MSG_COLOR`: Font color of the messages in (B, G, R). Default is gray, that is (128, 128, 128). + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use a checkpoint url from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Predict different labels in a long video by using a TSN model on cpu, with 8 frames for input steps (that is, random sample one from each 3 frames) + and outputting result labels with score higher than 0.2. + + ```shell + python demo/long_video_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \ + --input-step 3 --device cpu --threshold 0.2 + ``` + +2. 
Predict different labels in a long video by using a TSN model on cpu, with 8 frames for input steps (that is, random sample one from each 3 frames) + and outputting result labels with score higher than 0.2, loading checkpoint from url. + + ```shell + python demo/long_video_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2 + ``` + +3. Predict different labels in a long video from web by using a TSN model on cpu, with 8 frames for input steps (that is, random sample one from each 3 frames) + and outputting result labels with score higher than 0.2, loading checkpoint from url. + + ```shell + python demo/long_video_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4 \ + tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2 + ``` + +4. Predict different labels in a long video by using a I3D model on gpu, with input_step=1, threshold=0.01 as default and print the labels in cyan. + + ```shell + python demo/long_video_demo.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \ + --label-color 255 255 0 + ``` + +5. Predict different labels in a long video by using a I3D model on gpu and save the results as a `json` file + + ```shell + python demo/long_video_demo.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt ./results.json + ``` + +## Skeleton-based Action Recognition Demo + +MMAction2 provides a demo script to predict the skeleton-based action recognition result using a single video. + +```shell +python demo/demo_skeleton.py ${VIDEO_FILE} ${OUT_FILENAME} \ + [--config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \ + [--checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \ + [--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \ + [--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--short-side] ${SHORT_SIDE} +``` + +Optional arguments: + +- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path. +- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or url. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint path or url. +- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Defaults to 0.9. 
+- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0. +- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint). +- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint path or url (trained on COCO-Keypoint). +- `LABEL_MAP`: The label map used. Defaults to `'tools/data/skeleton/label_map_ntu60.txt'`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `'cuda:0'` or `'cpu'`. Defaults to `'cuda:0'`. +- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 480. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster-RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D-NTURGB+D-60-XSub-Keypoint as the skeleton-based action recognizer. + +```shell +python demo/demo_skeleton.py demo/demo_skeleton.mp4 demo/demo_skeleton_out.mp4 \ + --config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu60_xsub_keypoint/slowonly_r50_u48_240e_ntu60_xsub_keypoint-f3adabf1.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --det-cat-id 0 \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --label-map tools/data/skeleton/label_map_ntu60.txt +``` + +2. Use the Faster-RCNN as the human detector, HRNetw32 as the pose estimator, STGCN-NTURGB+D-60-XSub-Keypoint as the skeleton-based action recognizer. + +```shell +python demo/demo_skeleton.py demo/demo_skeleton.mp4 demo/demo_skeleton_out.mp4 \ + --config configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \ + --checkpoint https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221129-484a394a.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --det-cat-id 0 \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --label-map tools/data/skeleton/label_map_ntu60.txt +``` + +## SpatioTemporal Action Detection Webcam Demo + +We provide a demo script to implement real-time spatio-temporal action detection from a web camera. 
+ +```shell +python demo/webcam_demo_spatiotemporal_det.py \ + [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--input-video] ${INPUT_VIDEO} \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--output-fps ${OUTPUT_FPS}] \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--show] \ + [--display-height] ${DISPLAY_HEIGHT} \ + [--display-width] ${DISPLAY_WIDTH} \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--clip-vis-length] ${CLIP_VIS_LENGTH} +``` + +Optional arguments: + +- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path. +- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint path or URL. +- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Default: 0.4. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9. +- `INPUT_VIDEO`: The webcam id or video path of the source. Default: `0`. +- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`. +- `OUTPUT_FPS`: The FPS of demo video output. Default: 15. +- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: None. +- `--show`: Whether to show predictions with `cv2.imshow`. +- `DISPLAY_HEIGHT`: The height of the display frame. Default: 0. +- `DISPLAY_WIDTH`: The width of the display frame. Default: 0. If `DISPLAY_HEIGHT <= 0 and DISPLAY_WIDTH <= 0`, the display frame and input video share the same shape. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8. +- `CLIP_VIS_LENGTH`: The number of frames drawn for each clip. In other words, for each clip, there are at most `CLIP_VIS_LENGTH` frames to be drawn around the keyframe. Default: 8. + +Tips to get a better experience with the webcam demo: + +- How to choose `--output-fps`? + + - `--output-fps` should be almost equal to the read thread fps. + - The read thread fps is printed by the logger in the format `DEBUG:__main__:Read Thread: {duration} ms, {fps} fps`. + +- How to choose `--predict-stepsize`? + + - It's related to the choice of the human detector and the spatio-temporal model. + - Overall, the duration of the read thread for each task should be greater than or equal to that of model inference. + - The durations for read/inference are both printed by the logger. + - A larger `--predict-stepsize` leads to a larger duration for the read thread. + - To take full advantage of the computation resources, decrease the value of `--predict-stepsize`. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make predictions every 40 frames, with the output FPS set to 20. Show predictions with `cv2.imshow`.
+
+```shell
+python demo/webcam_demo_spatiotemporal_det.py \
+    --input-video 0 \
+    --config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
+    --checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
+    --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
+    --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
+    --det-score-thr 0.9 \
+    --action-score-thr 0.5 \
+    --label-map tools/data/ava/label_map.txt \
+    --predict-stepsize 40 \
+    --output-fps 20 \
+    --show
+```
+
+## SpatioTemporal Action Detection Video Demo
+
+MMAction2 provides a demo script to predict the spatio-temporal action detection result using a single video.
+
+```shell
+python demo/demo_spatiotemporal_det.py --video ${VIDEO_FILE} \
+    [--out-filename ${OUTPUT_FILENAME}] \
+    [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
+    [--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
+    [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
+    [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
+    [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
+    [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \
+    [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
+    [--label-map ${LABEL_MAP}] \
+    [--device ${DEVICE}] \
+    [--short-side ${SHORT_SIDE}] \
+    [--predict-stepsize ${PREDICT_STEPSIZE}] \
+    [--output-stepsize ${OUTPUT_STEPSIZE}] \
+    [--output-fps ${OUTPUT_FPS}]
+```
+
+Optional arguments:
+
+- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`.
+- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
+- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint URL.
+- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
+- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
+- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9.
+- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0.
+- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5.
+- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`.
+- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`.
+- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256.
+- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8.
+- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0` must hold. Defaults to 4.
+- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6.
+
+Examples:
+
+Assume that you are located at `$MMACTION2`.
+
+1. Use the Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make predictions every 8 frames and output 1 frame per 4 frames to the output video. The FPS of the output video is 6.
+
+```shell
+python demo/demo_spatiotemporal_det.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \
+    --config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
+    --checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
+    --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
+    --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
+    --det-score-thr 0.9 \
+    --action-score-thr 0.5 \
+    --label-map tools/data/ava/label_map.txt \
+    --predict-stepsize 8 \
+    --output-stepsize 4 \
+    --output-fps 6
+```
+
+## SpatioTemporal Action Detection ONNX Video Demo
+
+MMAction2 provides a demo script to predict the spatio-temporal action detection result using an ONNX file instead of building the PyTorch model.
+
+```shell
+python demo/demo_spatiotemporal_det_onnx.py --video ${VIDEO_FILE} \
+    [--out-filename ${OUTPUT_FILENAME}] \
+    [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
+    [--onnx-file ${SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE}] \
+    [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
+    [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
+    [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
+    [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \
+    [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
+    [--label-map ${LABEL_MAP}] \
+    [--device ${DEVICE}] \
+    [--short-side ${SHORT_SIDE}] \
+    [--predict-stepsize ${PREDICT_STEPSIZE}] \
+    [--output-stepsize ${OUTPUT_STEPSIZE}] \
+    [--output-fps ${OUTPUT_FPS}]
+```
+
+Optional arguments:
+
+- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`.
+- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
+- `SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE`: The spatiotemporal action detection ONNX file path.
+- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
+- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
+- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9.
+- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0.
+- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5.
+- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`.
+- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`.
+- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256.
+- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8.
+- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0` must hold. Defaults to 4.
+- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6.
+
+Examples:
+
+Assume that you are located at `$MMACTION2`.
+
+1. Export an ONNX file given the config file and checkpoint.
+
+```shell
+python tools/deployment/export_onnx_stdet.py \
+    configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
+    https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
+    --output_file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \
+    --num_frames 8
+```
+
+2. Use the Faster RCNN as the human detector and the generated `slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx` file as the action detector. Make predictions every 8 frames and output 1 frame per 4 frames to the output video. The FPS of the output video is 6.
+
+```shell
+python demo/demo_spatiotemporal_det_onnx.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \
+    --config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
+    --onnx-file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \
+    --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
+    --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
+    --det-score-thr 0.9 \
+    --action-score-thr 0.5 \
+    --label-map tools/data/ava/label_map.txt \
+    --predict-stepsize 8 \
+    --output-stepsize 4 \
+    --output-fps 6
+```
+
+## Inferencer
+
+MMAction2 provides a demo script for fast prediction on video analysis tasks based on the unified inferencer interface; currently it only supports the action recognition task.
+
+```shell
+python demo/demo_inferencer.py ${INPUTS} \
+    [--vid-out-dir ${VID_OUT_DIR}] \
+    [--rec ${RECOG_TASK}] \
+    [--rec-weights ${RECOG_WEIGHTS}] \
+    [--label-file ${LABEL_FILE}] \
+    [--device ${DEVICE_TYPE}] \
+    [--batch-size ${BATCH_SIZE}] \
+    [--show] \
+    [--print-result] \
+    [--pred-out-file ${PRED_OUT_FILE}]
+```
+
+Optional arguments:
+
+- `--show`: If specified, the demo will display the video in a popup window.
+- `--print-result`: If specified, the demo will print the inference results.
+- `VID_OUT_DIR`: Output directory of saved videos. Defaults to None, which means videos will not be saved.
+- `RECOG_TASK`: Type of action recognition algorithm. It can be the path to a config file, or a model name or alias defined in a metafile.
+- `RECOG_WEIGHTS`: Path to the custom checkpoint file of the selected recognition model. If it is not specified and `rec` is a model name in a metafile, the weights will be loaded from the metafile.
+- `LABEL_FILE`: Label file for the dataset the algorithm is pretrained on. Defaults to None, which means no label is shown in the result.
+- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, the available device will be automatically used.
+- `BATCH_SIZE`: The batch size used in inference. Defaults to 1.
+- `PRED_OUT_FILE`: File path to save the inference results. Defaults to None, which means prediction results will not be saved.
+
+Examples:
+
+Assume that you are located at `$MMACTION2`.
+
+1. Recognize a video file as input using a TSN model, loading the checkpoint from the metafile.
+
+   ```shell
+   # The demo.mp4 and label_map_k400.txt are both from Kinetics-400
+   python demo/demo_inferencer.py demo/demo.mp4 \
+       --rec tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb \
+       --label-file tools/data/kinetics/label_map_k400.txt
+   ```
+
+2. Recognize a video file as input using a TSN model, specified by its model alias in the metafile.
+
+   ```shell
+   # The demo.mp4 and label_map_k400.txt are both from Kinetics-400
+   python demo/demo_inferencer.py demo/demo.mp4 \
+       --rec tsn \
+       --label-file tools/data/kinetics/label_map_k400.txt
+   ```
+
+3. Recognize a video file as input using a TSN model, and save the visualization video.
+
+   ```shell
+   # The demo.mp4 and label_map_k400.txt are both from Kinetics-400
+   python demo/demo_inferencer.py demo/demo.mp4 \
+       --vid-out-dir demo_out \
+       --rec tsn \
+       --label-file tools/data/kinetics/label_map_k400.txt
+   ```
+
+## Audio Demo
+
+A demo script to predict the audio-based action recognition result using a single audio feature file.
+
+The script [`extract_audio.py`](/tools/data/extract_audio.py) can be used to extract audio from videos, and the script [`build_audio_features.py`](/tools/data/build_audio_features.py) can be used to extract the audio features.
+
+```shell
+python demo/demo_audio.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${AUDIO_FILE} ${LABEL_FILE} [--device ${DEVICE}]
+```
+
+Optional arguments:
+
+- `DEVICE`: Type of device to run the demo. Allowed values are cuda devices like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
+
+Examples:
+
+Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
+or use a checkpoint URL from `configs/` to load the corresponding checkpoint directly; it will be automatically saved to `$HOME/.cache/torch/checkpoints`.
+
+1. Recognize an audio feature file as input using a TSN model, on CUDA by default.
+
+   ```shell
+   python demo/demo_audio.py \
+       configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py \
+       https://download.openmmlab.com/mmaction/v1.0/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature_20230702-e4642fb0.pth \
+       audio_feature.npy tools/data/kinetics/label_map_k400.txt
+   ```
+
+## Video Structuralize Demo
+
+We provide a demo script to predict skeleton-based and RGB-based action recognition and spatio-temporal action detection results using a single video.
+
+```shell
+python demo/demo_video_structuralize.py \
+    [--rgb-stdet-config ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
+    [--rgb-stdet-checkpoint ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
+    [--skeleton-stdet-checkpoint ${SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
+    [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
+    [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
+    [--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
+    [--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
+    [--skeleton-config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
+    [--skeleton-checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
+    [--rgb-config ${RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
+    [--rgb-checkpoint ${RGB_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
+    [--use-skeleton-stdet ${USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD}] \
+    [--use-skeleton-recog ${USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD}] \
+    [--det-score-thr ${HUMAN_DETECTION_SCORE_THRE}] \
+    [--action-score-thr ${ACTION_DETECTION_SCORE_THRE}] \
+    [--video ${VIDEO_FILE}] \
+    [--label-map-stdet ${LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION}] \
+    [--label-map ${LABEL_MAP}] \
+    [--device ${DEVICE}] \
+    [--out-filename ${OUTPUT_FILENAME}] \
+    [--predict-stepsize ${PREDICT_STEPSIZE}] \
+    [--output-stepsize ${OUTPUT_STEPSIZE}] \
+    [--output-fps ${OUTPUT_FPS}] \
+    [--cfg-options]
+```
+
+Optional arguments:
+
+- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The RGB-based spatio-temporal action detection config file path.
+- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The RGB-based spatio-temporal action detection checkpoint path or URL.
+- `SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The skeleton-based spatio-temporal action detection checkpoint path or URL.
+- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
+- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
+- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint).
+- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint URL (trained on COCO-Keypoint).
+- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path.
+- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or URL.
+- `RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The RGB-based action recognition config file path.
+- `RGB_BASED_ACTION_RECOGNITION_CHECKPOINT`: The RGB-based action recognition checkpoint path or URL.
+- `USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD`: Use the skeleton-based spatio-temporal action detection method.
+- `USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD`: Use the skeleton-based action recognition method.
+- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Default: 0.9.
+- `ACTION_DETECTION_SCORE_THRE`: The score threshold for action detection. Default: 0.4.
+- `LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION`: The label map used for spatio-temporal action detection. Default: `tools/data/ava/label_map.txt`.
+- `LABEL_MAP`: The label map used for action recognition. Default: `tools/data/kinetics/label_map_k400.txt`.
+- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
+- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: `demo/test_stdet_recognition_output.mp4`.
+- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
+- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Default: 1. +- `OUTPUT_FPS`: The FPS of demo video output. Default: 24. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer and the skeleton-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. + +```shell +python demo/demo_video_structuralize.py \ + --skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --skeleton-config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \ + --use-skeleton-stdet \ + --use-skeleton-recog \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` + +2. Use the Faster RCNN as the human detector, TSN-R50-1x1x3 as the rgb-based action recognizer, SlowOnly-8x8-R101 as the rgb-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. + +```shell +python demo/demo_video_structuralize.py \ + --rgb-stdet-config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --rgb-config demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + --rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` + +3. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer, SlowOnly-8x8-R101 as the rgb-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. 
+ +```shell +python demo/demo_video_structuralize.py \ + --rgb-stdet-config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --skeleton-config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \ + --use-skeleton-recog \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` + +4. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, TSN-R50-1x1x3 as the rgb-based action recognizer, PoseC3D as the skeleton-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. + +```shell +python demo/demo_video_structuralize.py + --skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --skeleton-config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --rgb-config demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + --rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + --use-skeleton-stdet \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` diff --git a/demo/demo.ipynb b/demo/demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..509d1852151a49ab24ba64de21d1a44c2f43c7a2 --- /dev/null +++ b/demo/demo.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "from mmaction.apis import init_recognizer, inference_recognizer" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "config_file = '../demo/demo_configs/tsn_r50_1x1x8_video_infer.py'\n", + "# download the checkpoint from model zoo and put it in `checkpoints/`\n", + "checkpoint_file = '../checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth'" + ] + }, + { + "cell_type": "code", + 
"execution_count": 5, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by local backend from path: ../checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth\n" + ] + } + ], + "source": [ + "# build the model from a config file and a checkpoint file\n", + "model = init_recognizer(config_file, checkpoint_file, device='cpu')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# test a single video and show the result:\n", + "video = 'demo.mp4'\n", + "label = '../tools/data/kinetics/label_map_k400.txt'\n", + "results = inference_recognizer(model, video)\n", + "\n", + "pred_scores = results.pred_score.tolist()\n", + "score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))\n", + "score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)\n", + "top5_label = score_sorted[:5]\n", + "\n", + "labels = open(label).readlines()\n", + "labels = [x.strip() for x in labels]\n", + "results = [(labels[k[0]], k[1]) for k in top5_label]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "arm wrestling: 1.0\n", + "rock scissors paper: 1.698846019067312e-15\n", + "massaging feet: 5.157996544393221e-16\n", + "stretching leg: 1.018867278715779e-16\n", + "bench pressing: 7.110452486439706e-17\n" + ] + } + ], + "source": [ + "# show the results\n", + "for result in results:\n", + " print(f'{result[0]}: ', result[1])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mmact_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13 (default, Mar 29 2022, 02:18:16) \n[GCC 7.5.0]" + }, + "vscode": { + "interpreter": { + "hash": "189c342a4747645665e89db23000ac4d4edb7a87c4cd0b2f881610f468fb778d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demo/demo.mp4 b/demo/demo.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cec78ef29eba4f72493a94797869bcf9b61a827b --- /dev/null +++ b/demo/demo.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:071e60535eaf0aed475ddac06269ee0cdfc4740158f22d9ccd2c3b93b42aa344 +size 635539 diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..d536831ed22442328001e22a75b22b46421ed985 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os.path as osp +from operator import itemgetter +from typing import Optional, Tuple + +from mmengine import Config, DictAction + +from mmaction.apis import inference_recognizer, init_recognizer +from mmaction.visualization import ActionVisualizer + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('video', help='video file/url or rawframes directory') + parser.add_argument('label', help='label file') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--fps', + default=30, + type=int, + help='specify fps value of the output video when using rawframes to ' + 'generate file') + parser.add_argument( + '--font-scale', + default=None, + type=float, + help='font scale of the text in output video') + parser.add_argument( + '--font-color', + default='white', + help='font color of the text in output video') + parser.add_argument( + '--target-resolution', + nargs=2, + default=None, + type=int, + help='Target resolution (w, h) for resizing the frames when using a ' + 'video as input. If either dimension is set to -1, the frames are ' + 'resized by keeping the existing aspect ratio') + parser.add_argument('--out-filename', default=None, help='output filename') + args = parser.parse_args() + return args + + +def get_output( + video_path: str, + out_filename: str, + data_sample: str, + labels: list, + fps: int = 30, + font_scale: Optional[str] = None, + font_color: str = 'white', + target_resolution: Optional[Tuple[int]] = None, +) -> None: + """Get demo output using ``moviepy``. + + This function will generate video file or gif file from raw video or + frames, by using ``moviepy``. For more information of some parameters, + you can refer to: https://github.com/Zulko/moviepy. + + Args: + video_path (str): The video file path. + out_filename (str): Output filename for the generated file. + datasample (str): Predicted label of the generated file. + labels (list): Label list of current dataset. + fps (int): Number of picture frames to read per second. Defaults to 30. + font_scale (float): Font scale of the text. Defaults to None. + font_color (str): Font color of the text. Defaults to ``white``. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. 
+ """ + + if video_path.startswith(('http://', 'https://')): + raise NotImplementedError + + # init visualizer + out_type = 'gif' if osp.splitext(out_filename)[1] == '.gif' else 'video' + visualizer = ActionVisualizer() + visualizer.dataset_meta = dict(classes=labels) + + text_cfg = {'colors': font_color} + if font_scale is not None: + text_cfg.update({'font_sizes': font_scale}) + + visualizer.add_datasample( + out_filename, + video_path, + data_sample, + draw_pred=True, + draw_gt=False, + text_cfg=text_cfg, + fps=fps, + out_type=out_type, + out_path=osp.join('demo', out_filename), + target_resolution=target_resolution) + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # Build the recognizer from a config file and checkpoint file/url + model = init_recognizer(cfg, args.checkpoint, device=args.device) + pred_result = inference_recognizer(model, args.video) + + pred_scores = pred_result.pred_score.tolist() + score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) + score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) + top5_label = score_sorted[:5] + + labels = open(args.label).readlines() + labels = [x.strip() for x in labels] + results = [(labels[k[0]], k[1]) for k in top5_label] + + print('The top-5 labels with corresponding scores are:') + for result in results: + print(f'{result[0]}: ', result[1]) + + if args.out_filename is not None: + + if args.target_resolution is not None: + if args.target_resolution[0] == -1: + assert isinstance(args.target_resolution[1], int) + assert args.target_resolution[1] > 0 + if args.target_resolution[1] == -1: + assert isinstance(args.target_resolution[0], int) + assert args.target_resolution[0] > 0 + args.target_resolution = tuple(args.target_resolution) + + get_output( + args.video, + args.out_filename, + pred_result, + labels, + fps=args.fps, + font_scale=args.font_scale, + font_color=args.font_color, + target_resolution=args.target_resolution) + + +if __name__ == '__main__': + main() diff --git a/demo/demo_audio.py b/demo/demo_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..cc6724afc44d059e4e8fbfe9c4f8adee721e1ec8 --- /dev/null +++ b/demo/demo_audio.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from operator import itemgetter + +import torch +from mmengine import Config, DictAction + +from mmaction.apis import inference_recognizer, init_recognizer + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('audio', help='audio file') + parser.add_argument('label', help='label file') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + device = torch.device(args.device) + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + model = init_recognizer(cfg, args.checkpoint, device=device) + + if not args.audio.endswith('.npy'): + raise NotImplementedError('Demo works on extracted audio features') + pred_result = inference_recognizer(model, args.audio) + + pred_scores = pred_result.pred_score.tolist() + score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) + score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) + top5_label = score_sorted[:5] + + labels = open(args.label).readlines() + labels = [x.strip() for x in labels] + results = [(labels[k[0]], k[1]) for k in top5_label] + + print('The top-5 labels with corresponding scores are:') + for result in results: + print(f'{result[0]}: ', result[1]) + + +if __name__ == '__main__': + main() diff --git a/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py new file mode 100644 index 0000000000000000000000000000000000000000..d6cd685881d885ef25dcb5f1ef98e329cf334869 --- /dev/null +++ b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +model = dict( + type='FasterRCNN', + _scope_='mmdet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + 
debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +file_client_args = dict(backend='disk') + +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + metainfo=dict(classes=('person', ), palette=[(220, 20, 60)]))) diff --git a/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py b/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e851abadc79ffb5821cc9e27b099e990e156a1 --- /dev/null +++ b/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
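+# Inference-only Faster R-CNN (ResNet-50 FPN, COCO 2x schedule) config. The
+# demo scripts pass this file via `--det-config` to detect humans; only the
+# model definition and the test pipeline/dataloader are kept, since the demos
+# use it purely for inference.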
+# model settings +model = dict( + type='FasterRCNN', + _scope_='mmdet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +file_client_args = dict(backend='disk') +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/i3d_r50_32x2x1_rawframes_infer.py 
b/demo/demo_configs/i3d_r50_32x2x1_rawframes_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..54259066695fd6081343833b756c9d1184f5dc2f --- /dev/null +++ b/demo/demo_configs/i3d_r50_32x2x1_rawframes_infer.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['../../configs/_base_/models/i3d_r50.py'] + +# dataset settings +dataset_type = 'RawframeDataset' +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/i3d_r50_32x2x1_video_infer.py b/demo/demo_configs/i3d_r50_32x2x1_video_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..fe3b11b819f2afc1ff1cefb0b6861fb7b7d84261 --- /dev/null +++ b/demo/demo_configs/i3d_r50_32x2x1_video_infer.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['../../configs/_base_/models/i3d_r50.py'] + +# dataset settings +dataset_type = 'VideoDataset' +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py b/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..e662527ef1d6d002e749e1963e2c7ec885ad2305 --- /dev/null +++ b/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
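+# Inference-only top-down HRNet-W32 (COCO 256x192) pose estimation config.
+# The skeleton-based demos pass this file via `--pose-config` to extract 2D
+# keypoints, which are then fed to the skeleton-based action recognizer.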
+# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + _scope_='mmpose', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose' + '/pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +test_pipeline = [ + dict(type='mmpose.LoadImage', file_client_args=file_client_args), + dict(type='mmpose.GetBBoxCenterScale'), + dict(type='mmpose.TopdownAffine', input_size=codec['input_size']), + dict(type='mmpose.PackPoseInputs') +] +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# visualizer +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='mmpose.PoseLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') diff --git a/demo/demo_configs/tsn_r50_1x1x8_rawframes_infer.py b/demo/demo_configs/tsn_r50_1x1x8_rawframes_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..7de4667a94185d8308e0834f3c8bc171fc4515f4 --- /dev/null +++ b/demo/demo_configs/tsn_r50_1x1x8_rawframes_infer.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
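+# Inference-only TSN (ResNet-50, 1x1x8) config for rawframe inputs. The model
+# comes from the `_base_` config below; only the test pipeline and dataloader
+# are defined here, with `ann_file` and `data_prefix` left as None so they can
+# be filled in at inference time.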
+_base_ = ['../../configs/_base_/models/tsn_r50.py'] + +# dataset settings +dataset_type = 'RawframeDataset' +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/tsn_r50_1x1x8_video_infer.py b/demo/demo_configs/tsn_r50_1x1x8_video_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..7a256294c0e47efe6df8869835787b8bc24f06f4 --- /dev/null +++ b/demo/demo_configs/tsn_r50_1x1x8_video_infer.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['../../configs/_base_/models/tsn_r50.py'] + +# dataset settings +dataset_type = 'VideoDataset' +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_inferencer.py b/demo/demo_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..82aa9b840e9becf4c0f2a839e26e7c5dbf7a9c7f --- /dev/null +++ b/demo/demo_inferencer.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser + +from mmaction.apis.inferencers import MMAction2Inferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'inputs', type=str, help='Input video file or rawframes folder path.') + parser.add_argument( + '--vid-out-dir', + type=str, + default='', + help='Output directory of videos.') + parser.add_argument( + '--rec', + type=str, + default=None, + help='Pretrained action recognition algorithm. It\'s the path to the ' + 'config file or the model name defined in metafile.') + parser.add_argument( + '--rec-weights', + type=str, + default=None, + help='Path to the custom checkpoint file of the selected recog model. ' + 'If it is not specified and "rec" is a model name of metafile, the ' + 'weights will be loaded from metafile.') + parser.add_argument( + '--label-file', type=str, default=None, help='label file for dataset.') + parser.add_argument( + '--device', + type=str, + default=None, + help='Device used for inference. 
' + 'If not specified, the available device will be automatically used.') + parser.add_argument( + '--batch-size', type=int, default=1, help='Inference batch size.') + parser.add_argument( + '--show', + action='store_true', + help='Display the video in a popup window.') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + parser.add_argument( + '--pred-out-file', + type=str, + default='', + help='File to save the inference results.') + + call_args = vars(parser.parse_args()) + + init_kws = ['rec', 'rec_weights', 'device', 'label_file'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + return init_args, call_args + + +def main(): + init_args, call_args = parse_args() + mmaction2 = MMAction2Inferencer(**init_args) + mmaction2(**call_args) + + +if __name__ == '__main__': + main() diff --git a/demo/demo_skeleton.mp4 b/demo/demo_skeleton.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..52a5a90285b54fa5ed0faf850cc8d7e62e574652 --- /dev/null +++ b/demo/demo_skeleton.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f82b816da14d6098438d606d961613fa67a38d19e091ca88006605a7df85a7f +size 749377 diff --git a/demo/demo_skeleton.py b/demo/demo_skeleton.py new file mode 100644 index 0000000000000000000000000000000000000000..ac59b73b9b373dc8b2b4ef1734c8de8b669e21bb --- /dev/null +++ b/demo/demo_skeleton.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import tempfile + +import cv2 +import mmcv +import mmengine +import torch +from mmengine import DictAction +from mmengine.utils import track_iter_progress + +from mmaction.apis import (detection_inference, inference_skeleton, + init_recognizer, pose_inference) +from mmaction.registry import VISUALIZERS +from mmaction.utils import frame_extract + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.75 +FONTCOLOR = (255, 255, 255) # BGR, white +THICKNESS = 1 +LINETYPE = 1 + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('video', help='video file/url') + parser.add_argument('out_filename', help='output filename') + parser.add_argument( + '--config', + default=('configs/skeleton/posec3d/' + 'slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py'), + help='skeleton model config file path') + parser.add_argument( + '--checkpoint', + default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/' + 'slowonly_r50_u48_240e_ntu60_xsub_keypoint/' + 'slowonly_r50_u48_240e_ntu60_xsub_keypoint-f3adabf1.pth'), + help='skeleton model checkpoint file/url') + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='the category id for human detection') + parser.add_argument( + '--pose-config', + default='demo/demo_configs/' + 
'td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py', + help='human pose estimation config file path (from mmpose)') + parser.add_argument( + '--pose-checkpoint', + default=('https://download.openmmlab.com/mmpose/top_down/hrnet/' + 'hrnet_w32_coco_256x192-c78dce93_20200708.pth'), + help='human pose estimation checkpoint file/url') + parser.add_argument( + '--label-map', + default='tools/data/skeleton/label_map_ntu60.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--short-side', + type=int, + default=480, + help='specify the short-side length of the image') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def visualize(args, frames, data_samples, action_label): + pose_config = mmengine.Config.fromfile(args.pose_config) + visualizer = VISUALIZERS.build(pose_config.visualizer) + visualizer.set_dataset_meta(data_samples[0].dataset_meta) + + vis_frames = [] + print('Drawing skeleton for each frame') + for d, f in track_iter_progress(list(zip(data_samples, frames))): + f = mmcv.imconvert(f, 'bgr', 'rgb') + visualizer.add_datasample( + 'result', + f, + data_sample=d, + draw_gt=False, + draw_heatmap=False, + draw_bbox=True, + show=False, + wait_time=0, + out_file=None, + kpt_thr=0.3) + vis_frame = visualizer.get_image() + cv2.putText(vis_frame, action_label, (10, 30), FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + vis_frames.append(vis_frame) + + vid = mpy.ImageSequenceClip(vis_frames, fps=24) + vid.write_videofile(args.out_filename, remove_temp=True) + + +def main(): + args = parse_args() + + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, frames = frame_extract(args.video, args.short_side, + tmp_dir.name) + + h, w, _ = frames[0].shape + + # Get Human detection results. + det_results, _ = detection_inference(args.det_config, args.det_checkpoint, + frame_paths, args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + + # Get Pose estimation results. 
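+    # `pose_results` holds the per-frame keypoints fed to the skeleton-based
+    # recognizer below; `pose_data_samples` is kept for visualization.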
+ pose_results, pose_data_samples = pose_inference(args.pose_config, + args.pose_checkpoint, + frame_paths, det_results, + args.device) + torch.cuda.empty_cache() + + config = mmengine.Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + + model = init_recognizer(config, args.checkpoint, args.device) + result = inference_skeleton(model, pose_results, (h, w)) + + max_pred_index = result.pred_score.argmax().item() + label_map = [x.strip() for x in open(args.label_map).readlines()] + action_label = label_map[max_pred_index] + + visualize(args, frames, pose_data_samples, action_label) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/demo_spatiotemporal_det.mp4 b/demo/demo_spatiotemporal_det.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..50857d945c9805aa84e093a90bcffecb51dc80ff --- /dev/null +++ b/demo/demo_spatiotemporal_det.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1a5e386878d58b4db49fcd7f61f60b6b02a43d0259f22e1a385fba05e5f4b6 +size 331376 diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py new file mode 100644 index 0000000000000000000000000000000000000000..aebf6e673a7a3c500287ea6f71584819c62aa005 --- /dev/null +++ b/demo/demo_spatiotemporal_det.py @@ -0,0 +1,375 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import copy as cp +import tempfile + +import cv2 +import mmcv +import mmengine +import numpy as np +import torch +from mmengine import DictAction +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData + +from mmaction.apis import detection_inference +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract, get_str_type + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + Returns: + list[np.ndarray]: Visualized frames. 
+ """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_out = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_out[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, f'{score[k]:>.2f}']) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_out + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + img_h (int): The image height. + img_w (int): The image width. + Returns: + tuple: Tuple of human proposal, label name and label score. 
+ """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('video', help='video file/url') + parser.add_argument('out_filename', help='output filename') + parser.add_argument( + '--config', + default=('configs/detection/slowonly/slowonly_kinetics400-pretrained-' + 'r101_8xb16-8x8x1-20e_ava21-rgb.py'), + help='spatialtemporal detection model config file path') + parser.add_argument( + '--checkpoint', + default=('https://download.openmmlab.com/mmaction/detection/ava/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_' + '20201217-16378594.pth'), + help='spatialtemporal detection model checkpoint file/url') + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='the category id for human detection') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.5, + help='the threshold of human action score') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--short-side', + type=int, + default=256, + help='specify the short-side length of the image') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=6, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) + num_frame = len(frame_paths) + h, w, _ = original_frames[0].shape + + # resize frames to shortside + new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Get clip_len, frame_interval and calculate center index of each clip + config = mmengine.Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + val_pipeline = config.val_pipeline + + sampler = [ + x for x in val_pipeline if get_str_type(x['type']) == 'SampleAVAFrames' + ][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Load label_map + label_map = load_label_map(args.label_map) + try: + if config['data']['train']['custom_classes'] is not None: + label_map = { + id + 1: label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + # Get Human detection results + center_frames = [frame_paths[ind - 1] for ind in timestamps] + + human_detections, _ = detection_inference(args.det_config, + args.det_checkpoint, + center_frames, + args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + # Build STDET model + try: + # In our spatiotemporal detection demo, different actions should have + # the same number of bboxes. 
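+        # Setting ``action_thr=0`` keeps all per-class scores in the model
+        # output; the demo applies its own ``--action-score-thr`` threshold
+        # when packing predictions below.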
+ config['model']['test_cfg']['rcnn'] = dict(action_thr=0) + except KeyError: + pass + + config.model.backbone.pretrained = None + model = MODELS.build(config.model) + + load_checkpoint(model, args.checkpoint, map_location='cpu') + model.to(args.device) + model.eval() + + predictions = [] + + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + + print('Performing SpatioTemporal Action Detection for each clip') + assert len(timestamps) == len(human_detections) + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp, proposal in zip(timestamps, human_detections): + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + input_tensor = torch.from_numpy(input_array).to(args.device) + + datasample = ActionDataSample() + datasample.proposals = InstanceData(bboxes=proposal) + datasample.set_metainfo(dict(img_shape=(new_h, new_w))) + with torch.no_grad(): + result = model(input_tensor, [datasample], mode='predict') + scores = result[0].pred_instances.scores + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, + i].item())) + predictions.append(prediction) + prog_bar.update() + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int64) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + print('Performing visualization') + vis_frames = visualize(frames, results) + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..3ee2ff14966951667667469cef6fbfc49fb1e493 --- /dev/null +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -0,0 +1,358 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
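+# ONNX Runtime variant of demo_spatiotemporal_det.py: instead of building the
+# detector with the MMAction2 registry, it feeds the normalized clip tensor
+# and the human proposals (packed into RoIs via ``bbox2roi``) to an exported
+# ONNX model, then applies a sigmoid to the returned ``cls_score`` logits
+# before thresholding with ``--action-score-thr``.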
+import argparse +import copy as cp +import tempfile + +import cv2 +import mmcv +import mmengine +import numpy as np +import onnxruntime +import torch +from mmdet.structures.bbox import bbox2roi +from mmengine import DictAction + +from mmaction.apis import detection_inference +from mmaction.utils import frame_extract, get_str_type + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + Returns: + list[np.ndarray]: Visualized frames. + """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_out = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_out[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_out + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. 
+ img_h (int): The image height. + img_w (int): The image width. + Returns: + tuple: Tuple of human proposal, label name and label score. + """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('video', help='video file/url') + parser.add_argument('out_filename', help='output filename') + parser.add_argument( + '--config', + default=('configs/detection/slowonly/slowonly_k700-pre' + '-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'), + help='spatialtemporal detection model config file path') + parser.add_argument( + '--onnx-file', help='spatialtemporal detection onnx file path') + + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='the category id for human detection') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.5, + help='the threshold of human action score') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--short-side', + type=int, + default=256, + help='specify the short-side length of the image') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=6, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) + num_frame = len(frame_paths) + h, w, _ = original_frames[0].shape + + # resize frames to shortside + new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Get clip_len, frame_interval and calculate center index of each clip + config = mmengine.Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + val_pipeline = config.val_pipeline + + sampler = [ + x for x in val_pipeline if get_str_type(x['type']) == 'SampleAVAFrames' + ][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Load label_map + label_map = load_label_map(args.label_map) + try: + if config['data']['train']['custom_classes'] is not None: + label_map = { + id + 1: label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + # Get Human detection results + center_frames = [frame_paths[ind - 1] for ind in timestamps] + + human_detections, _ = detection_inference(args.det_config, + args.det_checkpoint, + center_frames, + args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + # Build STDET model + session = onnxruntime.InferenceSession(args.onnx_file) + + predictions = [] + + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + + print('Performing SpatioTemporal Action Detection for each clip') + assert len(timestamps) == len(human_detections) + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp, proposal in zip(timestamps, human_detections): + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + rois = bbox2roi([proposal]) + + input_feed = { + 'input_tensor': input_array, + 'rois': rois.cpu().data.numpy() + } + outputs = session.run(['cls_score'], input_feed=input_feed) + logits = outputs[0] + scores = 1 / (1 + np.exp(-logits)) + + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, i].item())) + predictions.append(prediction) + prog_bar.update() + + results = [] + for 
human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int64) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + print('Performing visualization') + vis_frames = visualize(frames, results) + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/demo_video_structuralize.py b/demo/demo_video_structuralize.py new file mode 100644 index 0000000000000000000000000000000000000000..8ff3e584eabfcc407931e260e346ff04eb79c7a2 --- /dev/null +++ b/demo/demo_video_structuralize.py @@ -0,0 +1,672 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import copy as cp +import tempfile +import warnings + +import cv2 +import mmcv +import mmengine +import numpy as np +import torch +from mmengine import DictAction +from mmengine.structures import InstanceData + +from mmaction.apis import (detection_inference, inference_recognizer, + inference_skeleton, init_recognizer, pose_inference) +from mmaction.registry import VISUALIZERS +from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract + +try: + from mmdet.apis import init_detector +except (ImportError, ModuleNotFoundError): + warnings.warn('Failed to import `init_detector` form `mmdet.apis`. ' + 'These apis are required in skeleton-based applications! ') + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +PLATEBLUE = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +PLATEBLUE = PLATEBLUE.split('-') +PLATEBLUE = [hex2color(h) for h in PLATEBLUE] +PLATEGREEN = '004b23-006400-007200-008000-38b000-70e000' +PLATEGREEN = PLATEGREEN.split('-') +PLATEGREEN = [hex2color(h) for h in PLATEGREEN] + + +def visualize(args, + frames, + annotations, + pose_data_samples, + action_result, + plate=PLATEBLUE, + max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted spatio-temporal + detection results. + pose_data_samples (list[list[PoseDataSample]): The pose results. + action_result (str): The predicted action recognition results. + pose_model (nn.Module): The constructed pose model. + plate (str): The plate used for visualization. Default: PLATEBLUE. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + + Returns: + list[np.ndarray]: Visualized frames. 
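+            In addition to the detection boxes, the whole-video
+            ``action_result`` string is drawn on every frame, and pose
+            results are rendered when ``pose_data_samples`` is given.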
+ """ + + assert max_num + 1 <= len(plate) + frames_ = cp.deepcopy(frames) + frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_] + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + + # add pose results + if pose_data_samples: + pose_config = mmengine.Config.fromfile(args.pose_config) + visualizer = VISUALIZERS.build(pose_config.visualizer) + visualizer.set_dataset_meta(pose_data_samples[0].dataset_meta) + for i, (d, f) in enumerate(zip(pose_data_samples, frames_)): + visualizer.add_datasample( + 'result', + f, + data_sample=d, + draw_gt=False, + draw_heatmap=False, + draw_bbox=True, + show=False, + wait_time=0, + out_file=None, + kpt_thr=0.3) + frames_[i] = visualizer.get_image() + cv2.putText(frames_[i], action_result, (10, 30), FONTFACE, + FONTSCALE, FONTCOLOR, THICKNESS, LINETYPE) + + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_[ind] + + # add action result for whole video + cv2.putText(frame, action_result, (10, 30), FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + # add spatio-temporal action detection results + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + if not pose_data_samples: + cv2.rectangle(frame, st, ed, plate[0], 2) + + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, f'{score[k]:.3f}']) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_ + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument( + '--rgb-stdet-config', + default=( + 'configs/detection/slowonly/' + 'slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py' + ), + help='rgb-based spatio temporal detection config file path') + parser.add_argument( + '--rgb-stdet-checkpoint', + default=('https://download.openmmlab.com/mmaction/detection/ava/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb' + '_20201217-16378594.pth'), + help='rgb-based spatio temporal detection checkpoint file/url') + parser.add_argument( + '--skeleton-stdet-checkpoint', + default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/' + 'posec3d_ava.pth'), + help='skeleton-based spatio temporal detection checkpoint file/url') + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/' + 'faster_rcnn/faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--pose-config', + default='demo/demo_configs' + '/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py', + help='human pose estimation config file 
path (from mmpose)') + parser.add_argument( + '--pose-checkpoint', + default=('https://download.openmmlab.com/mmpose/top_down/hrnet/' + 'hrnet_w32_coco_256x192-c78dce93_20200708.pth'), + help='human pose estimation checkpoint file/url') + parser.add_argument( + '--skeleton-config', + default='configs/skeleton/posec3d' + '/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py', + help='skeleton-based action recognition config file path') + parser.add_argument( + '--skeleton-checkpoint', + default='https://download.openmmlab.com/mmaction/skeleton/posec3d/' + 'posec3d_k400.pth', + help='skeleton-based action recognition checkpoint file/url') + parser.add_argument( + '--rgb-config', + default='configs/recognition/tsn/' + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py', + help='rgb-based action recognition config file path') + parser.add_argument( + '--rgb-checkpoint', + default='https://download.openmmlab.com/mmaction/recognition/' + 'tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/' + 'tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth', + help='rgb-based action recognition checkpoint file/url') + parser.add_argument( + '--use-skeleton-stdet', + action='store_true', + help='use skeleton-based spatio temporal detection method') + parser.add_argument( + '--use-skeleton-recog', + action='store_true', + help='use skeleton-based action recognition method') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.4, + help='the threshold of action prediction score') + parser.add_argument( + '--video', + default='demo/test_video_structuralize.mp4', + help='video file/url') + parser.add_argument( + '--label-map-stdet', + default='tools/data/ava/label_map.txt', + help='label map file for spatio-temporal action detection') + parser.add_argument( + '--label-map', + default='tools/data/kinetics/label_map_k400.txt', + help='label map file for action recognition') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--out-filename', + default='demo/test_stdet_recognition_output.mp4', + help='output filename') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a spatio-temporal detection prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=1, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=24, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' 
+ """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + img_h (int): The image height. + img_w (int): The image width. + + Returns: + tuple: Tuple of human proposal, label name and label score. + """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def expand_bbox(bbox, h, w, ratio=1.25): + x1, y1, x2, y2 = bbox + center_x = (x1 + x2) // 2 + center_y = (y1 + y2) // 2 + width = x2 - x1 + height = y2 - y1 + + square_l = max(width, height) + new_width = new_height = square_l * ratio + + new_x1 = max(0, int(center_x - new_width / 2)) + new_x2 = min(int(center_x + new_width / 2), w) + new_y1 = max(0, int(center_y - new_height / 2)) + new_y2 = min(int(center_y + new_height / 2), h) + return (new_x1, new_y1, new_x2, new_y2) + + +def cal_iou(box1, box2): + xmin1, ymin1, xmax1, ymax1 = box1 + xmin2, ymin2, xmax2, ymax2 = box2 + + s1 = (xmax1 - xmin1) * (ymax1 - ymin1) + s2 = (xmax2 - xmin2) * (ymax2 - ymin2) + + xmin = max(xmin1, xmin2) + ymin = max(ymin1, ymin2) + xmax = min(xmax1, xmax2) + ymax = min(ymax1, ymax2) + + w = max(0, xmax - xmin) + h = max(0, ymax - ymin) + intersect = w * h + union = s1 + s2 - intersect + iou = intersect / union + + return iou + + +def skeleton_based_action_recognition(args, pose_results, h, w): + label_map = [x.strip() for x in open(args.label_map).readlines()] + num_class = len(label_map) + + skeleton_config = mmengine.Config.fromfile(args.skeleton_config) + skeleton_config.model.cls_head.num_classes = num_class # for K400 dataset + + skeleton_model = init_recognizer( + skeleton_config, args.skeleton_checkpoint, device=args.device) + result = inference_skeleton(skeleton_model, pose_results, (h, w)) + action_idx = result.pred_score.argmax().item() + return label_map[action_idx] + + +def rgb_based_action_recognition(args): + rgb_config = mmengine.Config.fromfile(args.rgb_config) + rgb_config.model.backbone.pretrained = None + rgb_model = init_recognizer(rgb_config, args.rgb_checkpoint, args.device) + action_results = inference_recognizer(rgb_model, args.video) + rgb_action_result = action_results.pred_score.argmax().item() + label_map = [x.strip() for x in open(args.label_map).readlines()] + return label_map[rgb_action_result] + + +def skeleton_based_stdet(args, label_map, human_detections, pose_results, + num_frame, clip_len, frame_interval, h, w): + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + skeleton_config = mmengine.Config.fromfile(args.skeleton_config) + num_class = max(label_map.keys()) + 1 # for AVA dataset (81) + skeleton_config.model.cls_head.num_classes = num_class + skeleton_stdet_model = init_recognizer(skeleton_config, + args.skeleton_stdet_checkpoint, + args.device) + + skeleton_predictions = [] + + print('Performing SpatioTemporal Action Detection for each clip') + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp in 
timestamps: + proposal = human_detections[timestamp - 1] + if proposal.shape[0] == 0: # no people detected + skeleton_predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + num_frame = len(frame_inds) # 30 + + pose_result = [pose_results[ind] for ind in frame_inds] + + skeleton_prediction = [] + for i in range(proposal.shape[0]): # num_person + skeleton_prediction.append([]) + + fake_anno = dict( + frame_dict='', + label=-1, + img_shape=(h, w), + origin_shape=(h, w), + start_index=0, + modality='Pose', + total_frames=num_frame) + num_person = 1 + + num_keypoint = 17 + keypoint = np.zeros( + (num_person, num_frame, num_keypoint, 2)) # M T V 2 + keypoint_score = np.zeros( + (num_person, num_frame, num_keypoint)) # M T V + + # pose matching + person_bbox = proposal[i][:4] + area = expand_bbox(person_bbox, h, w) + + for j, poses in enumerate(pose_result): # num_frame + max_iou = float('-inf') + index = -1 + if len(poses['keypoints']) == 0: + continue + for k, bbox in enumerate(poses['bboxes']): + iou = cal_iou(bbox, area) + if max_iou < iou: + index = k + max_iou = iou + keypoint[0, j] = poses['keypoints'][index] + keypoint_score[0, j] = poses['keypoint_scores'][index] + + fake_anno['keypoint'] = keypoint + fake_anno['keypoint_score'] = keypoint_score + + output = inference_recognizer(skeleton_stdet_model, fake_anno) + # for multi-label recognition + score = output.pred_score.tolist() + for k in range(len(score)): # 81 + if k not in label_map: + continue + if score[k] > args.action_score_thr: + skeleton_prediction[i].append((label_map[k], score[k])) + + skeleton_predictions.append(skeleton_prediction) + prog_bar.update() + + return timestamps, skeleton_predictions + + +def rgb_based_stdet(args, frames, label_map, human_detections, w, h, new_w, + new_h, w_ratio, h_ratio): + + rgb_stdet_config = mmengine.Config.fromfile(args.rgb_stdet_config) + rgb_stdet_config.merge_from_dict(args.cfg_options) + + val_pipeline = rgb_stdet_config.val_pipeline + sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + + window_size = clip_len * frame_interval + num_frame = len(frames) + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Get img_norm_cfg + img_norm_cfg = dict( + mean=np.array(rgb_stdet_config.model.data_preprocessor.mean), + std=np.array(rgb_stdet_config.model.data_preprocessor.std), + to_rgb=False) + + # Build STDET model + try: + # In our spatiotemporal detection demo, different actions should have + # the same number of bboxes. 
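+        # As in demo_spatiotemporal_det.py, ``action_thr=0`` leaves the score
+        # filtering to the ``--action-score-thr`` argument below.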
+ rgb_stdet_config['model']['test_cfg']['rcnn'] = dict(action_thr=0) + except KeyError: + pass + + rgb_stdet_config.model.backbone.pretrained = None + rgb_stdet_model = init_detector( + rgb_stdet_config, args.rgb_stdet_checkpoint, device=args.device) + + predictions = [] + + print('Performing SpatioTemporal Action Detection for each clip') + prog_bar = mmengine.ProgressBar(len(timestamps)) + # for timestamp, proposal in zip(timestamps, human_detections): + for timestamp in timestamps: + proposal = human_detections[timestamp - 1] + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + input_tensor = torch.from_numpy(input_array).to(args.device) + + datasample = ActionDataSample() + datasample.proposals = InstanceData(bboxes=proposal) + datasample.set_metainfo(dict(img_shape=(new_h, new_w))) + with torch.no_grad(): + result = rgb_stdet_model( + input_tensor, [datasample], mode='predict') + scores = result[0].pred_instances.scores + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, + i].item())) + predictions.append(prediction) + prog_bar.update() + + return timestamps, predictions + + +def main(): + args = parse_args() + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) + num_frame = len(frame_paths) + h, w, _ = original_frames[0].shape + + # Get Human detection results and pose results + human_detections, _ = detection_inference( + args.det_config, + args.det_checkpoint, + frame_paths, + args.det_score_thr, + device=args.device) + pose_datasample = None + if args.use_skeleton_recog or args.use_skeleton_stdet: + pose_results, pose_datasample = pose_inference( + args.pose_config, + args.pose_checkpoint, + frame_paths, + human_detections, + device=args.device) + + # resize frames to shortside 256 + new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Load spatio-temporal detection label_map + stdet_label_map = load_label_map(args.label_map_stdet) + rgb_stdet_config = mmengine.Config.fromfile(args.rgb_stdet_config) + rgb_stdet_config.merge_from_dict(args.cfg_options) + try: + if rgb_stdet_config['data']['train']['custom_classes'] is not None: + stdet_label_map = { + id + 1: stdet_label_map[cls] + for id, cls in enumerate(rgb_stdet_config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + action_result = None + if args.use_skeleton_recog: + print('Use skeleton-based recognition') + action_result = skeleton_based_action_recognition( + args, pose_results, h, w) + else: + print('Use rgb-based recognition') + action_result = rgb_based_action_recognition(args) + + stdet_preds = None + if args.use_skeleton_stdet: + print('Use skeleton-based SpatioTemporal Action Detection') + clip_len, frame_interval = 30, 1 + 
timestamps, stdet_preds = skeleton_based_stdet(args, stdet_label_map, + human_detections, + pose_results, num_frame, + clip_len, + frame_interval, h, w) + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + else: + print('Use rgb-based SpatioTemporal Action Detection') + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + timestamps, stdet_preds = rgb_based_stdet(args, frames, + stdet_label_map, + human_detections, w, h, + new_w, new_h, w_ratio, + h_ratio) + + stdet_results = [] + for timestamp, prediction in zip(timestamps, stdet_preds): + human_detection = human_detections[timestamp - 1] + stdet_results.append( + pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int64) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + output_timestamps = dense_timestamps(timestamps, dense_n) + frames = [ + cv2.imread(frame_paths[timestamp - 1]) + for timestamp in output_timestamps + ] + + if args.use_skeleton_recog or args.use_skeleton_stdet: + pose_datasample = [ + pose_datasample[timestamp - 1] for timestamp in output_timestamps + ] + + vis_frames = visualize(args, frames, stdet_results, pose_datasample, + action_result) + vid = mpy.ImageSequenceClip(vis_frames, fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/fuse/bone.pkl b/demo/fuse/bone.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3b758c7af6f2b3d027a0bb6531b803e5ab1b52fe --- /dev/null +++ b/demo/fuse/bone.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cc4bc5dc56cd42fc26406d3cd6498481cbda66238256555404428076484025 +size 81562 diff --git a/demo/fuse/joint.pkl b/demo/fuse/joint.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13dd82489d5c36a97d5bef24b650d7c542fe96bf --- /dev/null +++ b/demo/fuse/joint.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36731c66dd998e2d1cf9ceee290660638a57166f135b036ac3e925e01761d708 +size 81385 diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a2afe3f28395b07414776e93fe11ff3b864966ea --- /dev/null +++ b/demo/long_video_demo.py @@ -0,0 +1,270 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
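+# Sliding-window demo for long videos: frames are read with OpenCV, buffered
+# in a deque of ``clip_len x num_clips`` frames, and the recognizer is run on
+# each full window. Results are written either as an annotated video or, when
+# ``out_file`` ends with ``.json``, as a JSON file keyed by frame index.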
+ +import argparse +import json +import random +from collections import deque +from operator import itemgetter + +import cv2 +import mmengine +import numpy as np +import torch +from mmengine import Config, DictAction +from mmengine.dataset import Compose + +from mmaction.apis import inference_recognizer, init_recognizer + +FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL +FONTSCALE = 1 +THICKNESS = 1 +LINETYPE = 1 + +EXCLUED_STEPS = [ + 'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit', + 'PyAVDecode', 'RawFrameDecode' +] + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 predict different labels in a long video demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('video_path', help='video file/url') + parser.add_argument('label', help='label file') + parser.add_argument('out_file', help='output result file in video/json') + parser.add_argument( + '--input-step', + type=int, + default=1, + help='input step for sampling frames') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--threshold', + type=float, + default=0.01, + help='recognition score threshold') + parser.add_argument( + '--stride', + type=float, + default=0, + help=('the prediction stride equals to stride * sample_length ' + '(sample_length indicates the size of temporal window from ' + 'which you sample frames, which equals to ' + 'clip_len x frame_interval), if set as 0, the ' + 'prediction stride is 1')) + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--label-color', + nargs='+', + type=int, + default=(255, 255, 255), + help='font color (B, G, R) of the labels in output video') + parser.add_argument( + '--msg-color', + nargs='+', + type=int, + default=(128, 128, 128), + help='font color (B, G, R) of the messages in output video') + args = parser.parse_args() + return args + + +def show_results_video(result_queue, + text_info, + thr, + msg, + frame, + video_writer, + label_color=(255, 255, 255), + msg_color=(128, 128, 128)): + if len(result_queue) != 0: + text_info = {} + results = result_queue.popleft() + for i, result in enumerate(results): + selected_label, score = result + if score < thr: + break + location = (0, 40 + i * 20) + text = selected_label + ': ' + str(round(score, 2)) + text_info[location] = text + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + label_color, THICKNESS, LINETYPE) + elif len(text_info): + for location, text in text_info.items(): + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + label_color, THICKNESS, LINETYPE) + else: + cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, msg_color, + THICKNESS, LINETYPE) + video_writer.write(frame) + return text_info + + +def get_results_json(result_queue, text_info, thr, msg, ind, out_json): + if len(result_queue) != 0: + text_info = {} + results = result_queue.popleft() + for i, result in enumerate(results): + selected_label, score = result + if score < thr: + break + text_info[i + 1] = selected_label + ': ' + str(round(score, 2)) + out_json[ind] = text_info + elif len(text_info): + out_json[ind] = text_info + else: + out_json[ind] = msg + return text_info, out_json + + +def show_results(model, data, label, args): + frame_queue = deque(maxlen=args.sample_length) + result_queue = deque(maxlen=1) + + cap = cv2.VideoCapture(args.video_path) + num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) + + msg = 'Preparing action recognition ...' 
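+    # Until the first ``sample_length`` frames have been buffered, frames are
+    # annotated with ``msg`` only; afterwards the top (label, score) pairs
+    # (at most five) from ``result_queue`` are drawn on each output frame.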
+ text_info = {} + out_json = {} + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + frame_size = (frame_width, frame_height) + + ind = 0 + video_writer = None if args.out_file.endswith('.json') \ + else cv2.VideoWriter(args.out_file, fourcc, fps, frame_size) + prog_bar = mmengine.ProgressBar(num_frames) + backup_frames = [] + + while ind < num_frames: + ind += 1 + prog_bar.update() + ret, frame = cap.read() + if frame is None: + # drop it when encounting None + continue + backup_frames.append(np.array(frame)[:, :, ::-1]) + if ind == args.sample_length: + # provide a quick show at the beginning + frame_queue.extend(backup_frames) + backup_frames = [] + elif ((len(backup_frames) == args.input_step + and ind > args.sample_length) or ind == num_frames): + # pick a frame from the backup + # when the backup is full or reach the last frame + chosen_frame = random.choice(backup_frames) + backup_frames = [] + frame_queue.append(chosen_frame) + + ret, scores = inference(model, data, args, frame_queue) + + if ret: + num_selected_labels = min(len(label), 5) + scores_tuples = tuple(zip(label, scores)) + scores_sorted = sorted( + scores_tuples, key=itemgetter(1), reverse=True) + results = scores_sorted[:num_selected_labels] + result_queue.append(results) + + if args.out_file.endswith('.json'): + text_info, out_json = get_results_json(result_queue, text_info, + args.threshold, msg, ind, + out_json) + else: + text_info = show_results_video(result_queue, text_info, + args.threshold, msg, frame, + video_writer, args.label_color, + args.msg_color) + + cap.release() + if video_writer: + video_writer.release() + cv2.destroyAllWindows() + if args.out_file.endswith('.json'): + with open(args.out_file, 'w') as js: + json.dump(out_json, js) + + +def inference(model, data, args, frame_queue): + if len(frame_queue) != args.sample_length: + # Do no inference when there is no enough frames + return False, None + + cur_windows = list(np.array(frame_queue)) + if data['img_shape'] is None: + data['img_shape'] = frame_queue[0].shape[:2] + + cur_data = data.copy() + cur_data.update( + dict( + array=cur_windows, + modality='RGB', + frame_inds=np.arange(args.sample_length))) + + result = inference_recognizer( + model, cur_data, test_pipeline=args.test_pipeline) + scores = result.pred_score.tolist() + + if args.stride > 0: + pred_stride = int(args.sample_length * args.stride) + for _ in range(pred_stride): + frame_queue.popleft() + + # for case ``args.stride=0`` + # deque will automatically popleft one element + + return True, scores + + +def main(): + args = parse_args() + + args.device = torch.device(args.device) + + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + + model = init_recognizer(cfg, args.checkpoint, device=args.device) + data = dict(img_shape=None, modality='RGB', label=-1) + with open(args.label, 'r') as f: + label = [line.strip() for line in f] + + # prepare test pipeline from non-camera pipeline + cfg = model.cfg + sample_length = 0 + pipeline = cfg.test_pipeline + pipeline_ = pipeline.copy() + for step in pipeline: + if 'SampleFrames' in step['type']: + sample_length = step['clip_len'] * step['num_clips'] + data['num_clips'] = step['num_clips'] + data['clip_len'] = step['clip_len'] + pipeline_.remove(step) + if step['type'] in EXCLUED_STEPS: + # remove step to decode frames + pipeline_.remove(step) + pipeline_.insert(1, dict(type='ArrayDecode')) + test_pipeline = Compose(pipeline_) + + assert sample_length > 0 + args.sample_length = sample_length + args.test_pipeline = test_pipeline + 
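+    # At this point ``test_pipeline`` no longer samples or decodes frames
+    # itself: the sampling/decoding steps were removed above and
+    # ``ArrayDecode`` consumes the frames collected by the capture loop,
+    # while ``sample_length`` tells ``show_results`` how many frames to
+    # buffer before each prediction.
+    # Example invocation (paths are illustrative):
+    #   python demo/long_video_demo.py ${CONFIG} ${CHECKPOINT} ${VIDEO} \
+    #       tools/data/kinetics/label_map_k400.txt out.mp4 --threshold 0.2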
+ show_results(model, data, label, args) + + +if __name__ == '__main__': + main() diff --git a/demo/mmaction2_tutorial.ipynb b/demo/mmaction2_tutorial.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c5ca4e478c6553fe1ae8578524113c982a21d046 --- /dev/null +++ b/demo/mmaction2_tutorial.ipynb @@ -0,0 +1,1936 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "VcjSRFELVbNk" + }, + "source": [ + "# MMAction2 Tutorial\n", + "\n", + "Welcome to MMAction2! This is the official colab tutorial for using MMAction2. In this tutorial, you will learn\n", + "- Perform inference with a MMAction2 recognizer.\n", + "- Train a new recognizer with a new dataset.\n", + "\n", + "\n", + "Let's start!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7LqHGkGEVqpm" + }, + "source": [ + "## Install MMAction2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Bf8PpPXtVvmg", + "outputId": "9d3f4594-f151-4ee9-a19b-09f8a439ac04" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "nvcc: NVIDIA (R) Cuda compiler driver\n", + "Copyright (c) 2005-2022 NVIDIA Corporation\n", + "Built on Wed_Sep_21_10:33:58_PDT_2022\n", + "Cuda compilation tools, release 11.8, V11.8.89\n", + "Build cuda_11.8.r11.8/compiler.31833905_0\n", + "gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "Copyright (C) 2019 Free Software Foundation, Inc.\n", + "This is free software; see the source for copying conditions. There is NO\n", + "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n", + "\n" + ] + } + ], + "source": [ + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "ZPwKGzqydnb2", + "outputId": "27506fa7-48a2-4fe0-d377-56f940dafec4", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://download.pytorch.org/whl/cu118, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.0+cu118)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (0.15.1+cu118)\n", + "Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.3)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision) 
(1.22.4)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchvision) (2.27.1)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision) (8.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.4)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n" + ] + } + ], + "source": [ + "# install dependencies: (if your colab has CUDA 11.8)\n", + "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5PAJ4ArzV5Ry", + "outputId": "eb8539a0-9524-4c48-f3e1-0b013ce0d344" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openmim\n", + " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m51.3/51.3 kB\u001B[0m \u001B[31m4.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n", + "Collecting colorama (from openmim)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting model-index (from openmim)\n", + " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n", + "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n", + "Collecting ordered-set (from model-index->openmim)\n", + " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n", + "Requirement already satisfied: 
numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n", + "Installing collected packages: ordered-set, colorama, model-index, openmim\n", + "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmengine\n", + " Downloading mmengine-0.7.3-py3-none-any.whl (372 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m372.1/372.1 kB\u001B[0m \u001B[31m20.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hCollecting addict (from mmengine)\n", + " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n", + "Collecting yapf (from mmengine)\n", + " Downloading yapf-0.33.0-py2.py3-none-any.whl (200 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m200.9/200.9 kB\u001B[0m \u001B[31m21.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n", + "Installing collected packages: addict, yapf, mmengine\n", + "Successfully installed addict-2.4.0 mmengine-0.7.3 yapf-0.33.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmcv>=2.0.0\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m74.4/74.4 MB\u001B[0m \u001B[31m9.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (2.4.0)\n", + "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (0.7.3)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (1.22.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (0.33.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (4.7.0.72)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv>=2.0.0) (3.7.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv>=2.0.0) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv>=2.0.0) (2.3.0)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv>=2.0.0) (2.0.1)\n", + "Requirement 
already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv>=2.0.0) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv>=2.0.0) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv>=2.0.0) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (1.16.0)\n", + "Installing collected packages: mmcv\n", + "Successfully installed mmcv-2.0.0\n", + "Cloning into 'mmaction2'...\n", + "remote: Enumerating objects: 21284, done.\u001B[K\n", + "remote: Counting objects: 100% (394/394), done.\u001B[K\n", + "remote: Compressing objects: 100% (287/287), done.\u001B[K\n", + "remote: Total 21284 (delta 175), reused 248 (delta 103), pack-reused 20890\u001B[K\n", + "Receiving objects: 100% (21284/21284), 68.63 MiB | 16.59 MiB/s, done.\n", + "Resolving deltas: 100% (14990/14990), done.\n", + "/content/mmaction2\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Obtaining file:///content/mmaction2\n", + " Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n", + " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m13.6/13.6 MB\u001B[0m \u001B[31m76.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hCollecting einops (from mmaction2==1.0.0)\n", + " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m42.2/42.2 kB\u001B[0m \u001B[31m4.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n", + "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n", + "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.0+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.3)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n", + "Installing collected packages: einops, decord, mmaction2\n", + " Running setup.py develop for mmaction2\n", + "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting av>=9.0 (from -r requirements/optional.txt (line 1))\n", + " Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m31.0/31.0 MB\u001B[0m \u001B[31m38.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: future in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 2)) (0.18.3)\n", + "Collecting fvcore (from -r requirements/optional.txt (line 3))\n", + " Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m50.2/50.2 kB\u001B[0m \u001B[31m6.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: imgaug in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 4)) (0.4.0)\n", + "Requirement already satisfied: librosa in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 5)) (0.10.0.post2)\n", + "Collecting lmdb (from -r requirements/optional.txt (line 6))\n", + " Downloading lmdb-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m299.2/299.2 kB\u001B[0m \u001B[31m30.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: moviepy in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 7)) (1.0.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 8)) (23.1)\n", + "Collecting pims (from -r requirements/optional.txt (line 9))\n", + " Downloading PIMS-0.6.1.tar.gz (86 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m86.0/86.0 kB\u001B[0m \u001B[31m12.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "Collecting PyTurboJPEG (from -r requirements/optional.txt (line 10))\n", + " Downloading PyTurboJPEG-1.7.1.tar.gz (11 kB)\n", + " Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: soundfile in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 11)) (0.12.1)\n", + "Requirement already satisfied: tensorboard in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 12)) (2.12.2)\n", + "Collecting wandb (from -r requirements/optional.txt (line 13))\n", + " Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m2.0/2.0 MB\u001B[0m \u001B[31m79.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (1.22.4)\n", + "Collecting yacs>=0.1.6 (from fvcore->-r requirements/optional.txt (line 3))\n", + " Downloading yacs-0.1.8-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (6.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (4.65.0)\n", + "Requirement already satisfied: termcolor>=1.1 in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (2.3.0)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (8.4.0)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (0.8.10)\n", + "Collecting iopath>=0.1.7 (from fvcore->-r requirements/optional.txt (line 3))\n", + " Downloading iopath-0.1.10.tar.gz (42 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m42.2/42.2 kB\u001B[0m \u001B[31m4.8 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (1.16.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (1.10.1)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (3.7.1)\n", + "Requirement already satisfied: scikit-image>=0.14.2 in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (0.19.3)\n", + "Requirement already satisfied: opencv-python in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (4.7.0.72)\n", + "Requirement already satisfied: imageio in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (2.25.1)\n", + "Requirement already satisfied: Shapely in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (2.0.1)\n", + "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (3.0.0)\n", + "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.2.2)\n", + "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.2.0)\n", + "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (4.4.2)\n", + "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (0.56.4)\n", + "Requirement already satisfied: pooch<1.7,>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.6.0)\n", + "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (0.3.5)\n", + "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (4.5.0)\n", + "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (0.2)\n", + "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.0.5)\n", + "Requirement already satisfied: requests<3.0,>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from moviepy->-r requirements/optional.txt (line 7)) (2.27.1)\n", + "Requirement already satisfied: proglog<=1.0.0 in /usr/local/lib/python3.10/dist-packages (from moviepy->-r requirements/optional.txt (line 7)) (0.1.10)\n", + "Requirement already satisfied: imageio-ffmpeg>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from moviepy->-r requirements/optional.txt (line 7)) (0.4.8)\n", + "Collecting slicerator>=0.9.8 (from pims->-r requirements/optional.txt (line 9))\n", + " Downloading slicerator-1.1.0-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile->-r requirements/optional.txt (line 11)) (1.15.1)\n", + "Requirement already satisfied: absl-py>=0.4 in 
/usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.4.0)\n", + "Requirement already satisfied: grpcio>=1.48.2 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.54.0)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (2.17.3)\n", + "Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.0.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (3.4.3)\n", + "Requirement already satisfied: protobuf>=3.19.6 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (3.20.3)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (67.7.2)\n", + "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (0.7.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.8.1)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (2.3.0)\n", + "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (0.40.0)\n", + "Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements/optional.txt (line 13)) (8.1.3)\n", + "Collecting GitPython!=3.1.29,>=1.0.0 (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m184.3/184.3 kB\u001B[0m \u001B[31m22.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements/optional.txt (line 13)) (5.9.5)\n", + "Collecting sentry-sdk>=1.0.0 (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading sentry_sdk-1.22.2-py2.py3-none-any.whl (203 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m203.3/203.3 kB\u001B[0m \u001B[31m25.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hCollecting docker-pycreds>=0.4.0 (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n", + "Collecting pathtools (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading pathtools-0.1.2.tar.gz (11 kB)\n", + " Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "Collecting setproctitle (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n", + "Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements/optional.txt (line 13)) (1.4.4)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile->-r requirements/optional.txt (line 11)) (2.21)\n", + "Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb->-r requirements/optional.txt (line 13))\n", + " Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)\n", + "\u001B[2K \u001B[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001B[0m \u001B[32m62.7/62.7 kB\u001B[0m \u001B[31m9.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (5.3.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (4.9)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->-r requirements/optional.txt (line 12)) (1.3.1)\n", + "Collecting portalocker (from iopath>=0.1.7->fvcore->-r requirements/optional.txt (line 3))\n", + " Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa->-r requirements/optional.txt (line 5)) (0.39.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (3.4)\n", + "Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.14.2->imgaug->-r requirements/optional.txt (line 4)) (3.1)\n", + "Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.14.2->imgaug->-r requirements/optional.txt (line 4)) (2023.4.12)\n", + "Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.14.2->imgaug->-r requirements/optional.txt (line 4)) (1.4.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from 
scikit-learn>=0.20.0->librosa->-r requirements/optional.txt (line 5)) (3.1.0)\n", + "Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard->-r requirements/optional.txt (line 12)) (2.1.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (2.8.2)\n", + "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb->-r requirements/optional.txt (line 13))\n", + " Downloading smmap-5.0.0-py3-none-any.whl (24 kB)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (0.5.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->-r requirements/optional.txt (line 12)) (3.2.2)\n", + "Building wheels for collected packages: fvcore, pims, PyTurboJPEG, iopath, pathtools\n", + " Building wheel for fvcore (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for fvcore: filename=fvcore-0.1.5.post20221221-py3-none-any.whl size=61405 sha256=25c1e50155c8788d00eec898793c96133a746a8bb076ffc5c01f5a4dc256751e\n", + " Stored in directory: /root/.cache/pip/wheels/01/c0/af/77c1cf53a1be9e42a52b48e5af2169d40ec2e89f7362489dd0\n", + " Building wheel for pims (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for pims: filename=PIMS-0.6.1-py3-none-any.whl size=82619 sha256=59a328dc88a438c60cfb6e937e04c8a7dd55ad2a2905034cd41ff80cdbba6497\n", + " Stored in directory: /root/.cache/pip/wheels/cc/bf/3e/bfa77232d942f8244145f9c713b6b38f6ef04b6fb5c021c114\n", + " Building wheel for PyTurboJPEG (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for PyTurboJPEG: filename=PyTurboJPEG-1.7.1-py3-none-any.whl size=12243 sha256=ddf6424c85ac533335abd96dd9e98b014ea1dd4f143c88cd35ecb08d6128f411\n", + " Stored in directory: /root/.cache/pip/wheels/de/6e/b1/e7ba70c328c3395555cb92ca8820babb32950d867858b1948b\n", + " Building wheel for iopath (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for iopath: filename=iopath-0.1.10-py3-none-any.whl size=31531 sha256=db977a4344bebbdd710665e767caab4fbcf53cc6aea0707cd38d26c45718331e\n", + " Stored in directory: /root/.cache/pip/wheels/9a/a3/b6/ac0fcd1b4ed5cfeb3db92e6a0e476cfd48ed0df92b91080c1d\n", + " Building wheel for pathtools (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + " Created wheel for pathtools: filename=pathtools-0.1.2-py3-none-any.whl size=8791 sha256=08bb5753ce029aef01f25c3e81882d93c0e040e5932e90a02a062ad058756b52\n", + " Stored in directory: /root/.cache/pip/wheels/e7/f3/22/152153d6eb222ee7a56ff8617d80ee5207207a8c00a7aab794\n", + "Successfully built fvcore pims PyTurboJPEG iopath pathtools\n", + "Installing collected packages: slicerator, pathtools, lmdb, av, yacs, smmap, setproctitle, sentry-sdk, PyTurboJPEG, portalocker, docker-pycreds, pims, iopath, gitdb, GitPython, fvcore, wandb\n", + "Successfully installed GitPython-3.1.31 PyTurboJPEG-1.7.1 av-10.0.0 docker-pycreds-0.4.0 fvcore-0.1.5.post20221221 gitdb-4.0.10 iopath-0.1.10 lmdb-1.4.1 pathtools-0.1.2 pims-0.6.1 portalocker-2.7.0 sentry-sdk-1.22.2 setproctitle-1.3.2 slicerator-1.1.0 smmap-5.0.0 wandb-0.15.2 yacs-0.1.8\n" + ] + } + ], + "source": [ + "# install MMEngine, MMCV and MMDetection using MIM\n", + "%pip install -U openmim\n", + "!mim install mmengine\n", + "!mim install \"mmcv>=2.0.0\"\n", + "\n", + "# Install mmaction2\n", + "!rm -rf mmaction2\n", + "!git clone https://github.com/open-mmlab/mmaction2.git -b main\n", + "%cd mmaction2\n", + "\n", + "!pip install -e .\n", + "\n", + "# Install some optional requirements\n", + "!pip install -r requirements/optional.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "No_zZAFpWC-a", + "outputId": "9386dd81-2308-4adb-d3cb-798de11c035e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2.0.0+cu118 True\n", + "1.0.0\n", + "11.8\n", + "GCC 9.3\n", + "OrderedDict([('sys.platform', 'linux'), ('Python', '3.10.11 (main, Apr 5 2023, 14:15:10) [GCC 9.4.0]'), ('CUDA available', True), ('numpy_random_seed', 2147483648), ('GPU 0', 'Tesla T4'), ('CUDA_HOME', '/usr/local/cuda'), ('NVCC', 'Cuda compilation tools, release 11.8, V11.8.89'), ('GCC', 'x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0'), ('PyTorch', '2.0.0+cu118'), ('PyTorch compiling details', 'PyTorch built with:\\n - GCC 9.3\\n - C++ Version: 201703\\n - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\\n - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\\n - LAPACK is enabled (usually provided by MKL)\\n - NNPACK is enabled\\n - CPU capability usage: AVX2\\n - CUDA Runtime 11.8\\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\\n - CuDNN 8.7\\n - Magma 2.6.1\\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \\n'), ('TorchVision', '0.15.1+cu118'), ('OpenCV', '4.7.0'), ('MMEngine', '0.7.3')])\n" + ] + } + ], + "source": [ + "# Check Pytorch installation\n", + "import torch, torchvision\n", + "print(torch.__version__, torch.cuda.is_available())\n", + "\n", + "# Check MMAction2 installation\n", + "import mmaction\n", + "print(mmaction.__version__)\n", + "\n", + "# Check MMCV installation\n", + "from mmcv.ops import get_compiling_cuda_version, get_compiler_version\n", + "print(get_compiling_cuda_version())\n", + "print(get_compiler_version())\n", + "\n", + "# Check MMEngine installation\n", + "from mmengine.utils.dl_utils import collect_env\n", + "print(collect_env())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pXf7oV5DWdab" + }, + "source": [ + "## Perform inference with a MMAction2 recognizer\n", + "MMAction2 already provides high level APIs to do inference and training." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "64CW6d_AaT-Q", + "outputId": "ea330d8c-2e20-4dbd-d046-51d7c9ec4f7a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-05-15 03:33:08-- https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.216, 163.181.82.218, 163.181.82.213, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.216|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 97579339 (93M) [application/octet-stream]\n", + "Saving to: โ€˜checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pthโ€™\n", + "\n", + "checkpoints/tsn_r50 100%[===================>] 93.06M 26.1MB/s in 3.6s \n", + "\n", + "2023-05-15 03:33:12 (26.2 MB/s) - โ€˜checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pthโ€™ saved [97579339/97579339]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir checkpoints\n", + "!wget -c https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \\\n", + " -O checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HNZB7NoSabzj", + "outputId": "c0c2ba71-72ff-4cac-a5b8-65590f5a6bb0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loads checkpoint by local backend from path: checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n" + ] + } + ], + "source": [ + "from mmaction.apis import inference_recognizer, init_recognizer\n", + "from mmengine import Config\n", + "\n", + "\n", + "# Choose to use a config and initialize the recognizer\n", + "config = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'\n", + "config = Config.fromfile(config)\n", + "# Setup a checkpoint file to load\n", + "checkpoint = 'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "# Initialize the recognizer\n", + "model = init_recognizer(config, checkpoint, device='cuda:0')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "rEMsBnpHapAn", + "outputId": "ec05049e-7289-4798-94fa-2b773cb23634", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:33:18 - mmengine - WARNING - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "05/15 03:33:18 - mmengine - WARNING - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n" + ] + } + ], + "source": [ + "# Use the recognizer to do inference\n", + "from operator import itemgetter\n", + "video = 'demo/demo.mp4'\n", + "label = 'tools/data/kinetics/label_map_k400.txt'\n", + "results = inference_recognizer(model, video)\n", + "\n", + "pred_scores = results.pred_score.tolist()\n", + "score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))\n", + "score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)\n", + "top5_label = score_sorted[:5]\n", + "\n", + "labels = open(label).readlines()\n", + "labels = [x.strip() for x in labels]\n", + "results = [(labels[k[0]], k[1]) for k in top5_label]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NIyJXqfWathq", + "outputId": "cb25aca9-e72d-4c54-f295-4c889713cb3a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The top-5 labels with corresponding scores are:\n", + "arm wrestling: 1.0\n", + "rock scissors paper: 6.434453414527752e-09\n", + "shaking hands: 2.7599860175087088e-09\n", + "clapping: 1.3454612979302283e-09\n", + "massaging feet: 5.555100823784187e-10\n" + ] + } + ], + "source": [ + "print('The top-5 labels with corresponding scores are:')\n", + "for result in results:\n", + " print(f'{result[0]}: ', result[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QuZG8kZ2fJ5d" + }, + "source": [ + "## Train a recognizer on a customized dataset\n", + "\n", + "To train a new recognizer, there are usually three things to do:\n", + "1. Support a new dataset\n", + "2. Modify the config\n", + "3. Train a new recognizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "neEFyxChfgiJ" + }, + "source": [ + "### Support a new dataset\n", + "\n", + "In this tutorial, we give an example of converting the data into the format of an existing dataset. Other methods and more advanced usages can be found in the [doc](/docs/tutorials/new_dataset.md).\n", + "\n", + "First, let's download a tiny dataset obtained from [Kinetics-400](https://deepmind.com/research/open-source/open-source-datasets/kinetics/). We select 30 videos with their labels as the training set and 10 videos with their labels as the test set." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gjsUj9JzgUlJ", + "outputId": "96a0e6e9-0dd8-4c07-9fed-22b93d5c1318" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "rm: cannot remove 'kinetics400_tiny.zip*': No such file or directory\n", + "--2023-05-15 03:33:27-- https://download.openmmlab.com/mmaction/kinetics400_tiny.zip\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.216, 163.181.82.218, 163.181.82.213, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.216|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 18308682 (17M) [application/zip]\n", + "Saving to: โ€˜kinetics400_tiny.zipโ€™\n", + "\n", + "kinetics400_tiny.zi 100%[===================>] 17.46M 32.7MB/s in 0.5s \n", + "\n", + "2023-05-15 03:33:28 (32.7 MB/s) - โ€˜kinetics400_tiny.zipโ€™ saved [18308682/18308682]\n", + "\n" + ] + } + ], + "source": [ + "# download, decompress the data\n", + "!rm kinetics400_tiny.zip*\n", + "!rm -rf kinetics400_tiny\n", + "!wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip\n", + "!unzip kinetics400_tiny.zip > /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AbZ-o7V6hNw4", + "outputId": "f229f352-1b43-41b7-a374-21404f618581" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 24 not upgraded.\n", + "Need to get 43.0 kB of archives.\n", + "After this operation, 115 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n", + "Fetched 43.0 kB in 1s (48.9 kB/s)\n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 122519 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n", + "Unpacking tree (1.8.0-1) ...\n", + "Setting up tree (1.8.0-1) ...\n", + "Processing triggers for man-db (2.9.1-1) ...\n", + "\u001B[01;34mkinetics400_tiny\u001B[00m\n", + "โ”œโ”€โ”€ kinetics_tiny_train_video.txt\n", + "โ”œโ”€โ”€ kinetics_tiny_val_video.txt\n", + "โ”œโ”€โ”€ \u001B[01;34mtrain\u001B[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ 27_CSXByd3s.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ 34XczvTaRiI.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ A-wiliK50Zw.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ D32_1gwq35E.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ D92m0HsHjcQ.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ DbX8mPslRXg.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ FMlSTTpN3VY.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ h10B9SVE-nk.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ h2YqqUhnR34.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ iRuyZSKhHRg.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ IyfILH9lBRo.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ kFC3KY2bOP8.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ LvcFDgCAXQs.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ O46YA8tI530.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ oMrZaozOvdQ.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ oXy-e_P_cAI.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ P5M-hAts7MQ.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ phDqGd0NKoo.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ PnOe3GZRVX8.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ R8HXQkdgKWA.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ RqnKtCEoEcA.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ soEcZZsBmDs.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ TkkZPZHbAKA.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ T_TMNGzVrDk.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ WaS0qwP46Us.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ Wh_YPQdH1Zg.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ WWP5HZJsg-o.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ xGY2dP0YUjA.mp4\n", + "โ”‚ย ย  โ”œโ”€โ”€ yLC9CtWU5ws.mp4\n", + "โ”‚ย ย  โ””โ”€โ”€ ZQV4U2KQ370.mp4\n", + "โ””โ”€โ”€ \u001B[01;34mval\u001B[00m\n", + " โ”œโ”€โ”€ 0pVGiAU6XEA.mp4\n", + " โ”œโ”€โ”€ AQrbRSnRt8M.mp4\n", + " โ”œโ”€โ”€ b6Q_b7vgc7Q.mp4\n", + " โ”œโ”€โ”€ ddvJ6-faICE.mp4\n", + " โ”œโ”€โ”€ IcLztCtvhb8.mp4\n", + " โ”œโ”€โ”€ ik4BW3-SCts.mp4\n", + " โ”œโ”€โ”€ jqRrH30V0k4.mp4\n", + " โ”œโ”€โ”€ SU_x2LQqSLs.mp4\n", + " โ”œโ”€โ”€ u4Rm6srmIS8.mp4\n", + " โ””โ”€โ”€ y5Iu7XkTqV0.mp4\n", + "\n", + "2 directories, 42 files\n" + ] + } + ], + "source": [ + "# Check the directory structure of 
the tiny data\n", + "\n", + "# Install tree first\n", + "!apt-get -q install tree\n", + "!tree kinetics400_tiny" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fTdi6dI0hY3g", + "outputId": "95f22438-566c-4496-fe0c-50e128b47b5e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "D32_1gwq35E.mp4 0\n", + "iRuyZSKhHRg.mp4 1\n", + "oXy-e_P_cAI.mp4 0\n", + "34XczvTaRiI.mp4 1\n", + "h2YqqUhnR34.mp4 0\n", + "O46YA8tI530.mp4 0\n", + "kFC3KY2bOP8.mp4 1\n", + "WWP5HZJsg-o.mp4 1\n", + "phDqGd0NKoo.mp4 1\n", + "yLC9CtWU5ws.mp4 0\n", + "27_CSXByd3s.mp4 1\n", + "IyfILH9lBRo.mp4 1\n", + "T_TMNGzVrDk.mp4 1\n", + "TkkZPZHbAKA.mp4 0\n", + "PnOe3GZRVX8.mp4 1\n", + "soEcZZsBmDs.mp4 1\n", + "FMlSTTpN3VY.mp4 1\n", + "WaS0qwP46Us.mp4 0\n", + "A-wiliK50Zw.mp4 1\n", + "oMrZaozOvdQ.mp4 1\n", + "ZQV4U2KQ370.mp4 0\n", + "DbX8mPslRXg.mp4 1\n", + "h10B9SVE-nk.mp4 1\n", + "P5M-hAts7MQ.mp4 0\n", + "R8HXQkdgKWA.mp4 0\n", + "D92m0HsHjcQ.mp4 0\n", + "RqnKtCEoEcA.mp4 0\n", + "LvcFDgCAXQs.mp4 0\n", + "xGY2dP0YUjA.mp4 0\n", + "Wh_YPQdH1Zg.mp4 0\n" + ] + } + ], + "source": [ + "# After downloading the data, we need to check the annotation format\n", + "!cat kinetics400_tiny/kinetics_tiny_train_video.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0bq0mxmEi29H" + }, + "source": [ + "According to the format defined in [`VideoDataset`](./datasets/video_dataset.py), each line indicates a sample video with the filepath and label, which are split with a whitespace." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ht_DGJA9jQar" + }, + "source": [ + "### Modify the config\n", + "\n", + "In the next step, we need to modify the config for the training.\n", + "To accelerate the process, we finetune a recognizer using a pre-trained recognizer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "LjCcmCKOjktc" + }, + "outputs": [], + "source": [ + "cfg = Config.fromfile('./configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tc8YhFFGjp3e" + }, + "source": [ + "Given a config that trains a TSN model on kinetics400-full dataset, we need to modify some values to use it for training TSN on Kinetics400-tiny dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tlhu9byjjt-K", + "outputId": "2d984a1d-93f7-493f-fd77-e19af8285f38" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Config:\n", + "model = dict(\n", + " type='Recognizer2D',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth',\n", + " depth=50,\n", + " norm_eval=False),\n", + " cls_head=dict(\n", + " type='TSNHead',\n", + " num_classes=2,\n", + " in_channels=2048,\n", + " spatial_type='avg',\n", + " consensus=dict(type='AvgConsensus', dim=1),\n", + " dropout_ratio=0.4,\n", + " init_std=0.01,\n", + " average_clips='prob'),\n", + " data_preprocessor=dict(\n", + " type='ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCHW'),\n", + " train_cfg=None,\n", + " test_cfg=None)\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=100,\n", + " by_epoch=True,\n", + " milestones=[40, 80],\n", + " gamma=0.1)\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD', lr=7.8125e-05, momentum=0.9, weight_decay=0.0001),\n", + " clip_grad=dict(max_norm=40, norm_type=2))\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook'),\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=20, ignore_last=False),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook', interval=3, save_best='auto', max_keep_ckpts=3),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " sync_buffers=dict(type='SyncBuffersHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')])\n", + "log_level = 'INFO'\n", + "load_from = './checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "resume = False\n", + "dataset_type = 'VideoDataset'\n", + "data_root = 'kinetics400_tiny/train/'\n", + "data_root_val = 'kinetics400_tiny/val/'\n", + "ann_file_train = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "ann_file_val = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleFrames', clip_len=1, 
frame_interval=1, num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "val_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_train_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/train/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames', clip_len=1, frame_interval=1,\n", + " num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ]))\n", + "val_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', 
io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "val_evaluator = dict(type='AccMetric')\n", + "test_evaluator = dict(type='AccMetric')\n", + "auto_scale_lr = dict(enable=False, base_batch_size=256)\n", + "work_dir = './tutorial_exps'\n", + "\n" + ] + } + ], + "source": [ + "from mmengine.runner import set_random_seed\n", + "\n", + "# Modify dataset type and path\n", + "cfg.data_root = 'kinetics400_tiny/train/'\n", + "cfg.data_root_val = 'kinetics400_tiny/val/'\n", + "cfg.ann_file_train = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "cfg.ann_file_val = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "\n", + "\n", + "cfg.test_dataloader.dataset.ann_file = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "cfg.test_dataloader.dataset.data_prefix.video = 'kinetics400_tiny/val/'\n", + "\n", + "cfg.train_dataloader.dataset.ann_file = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "cfg.train_dataloader.dataset.data_prefix.video = 'kinetics400_tiny/train/'\n", + "\n", + "cfg.val_dataloader.dataset.ann_file = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "cfg.val_dataloader.dataset.data_prefix.video = 'kinetics400_tiny/val/'\n", + "\n", + "\n", + "# Modify num classes of the model in cls_head\n", + "cfg.model.cls_head.num_classes = 2\n", + "# We can use the pre-trained TSN model\n", + "cfg.load_from = './checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "\n", + "# Set up working dir to save files and logs.\n", + "cfg.work_dir = './tutorial_exps'\n", + "\n", + "# The original learning rate (LR) is set for 8-GPU training.\n", + "# We divide it by 8 since we only use one GPU.\n", + "cfg.train_dataloader.batch_size = cfg.train_dataloader.batch_size // 16\n", + "cfg.val_dataloader.batch_size = cfg.val_dataloader.batch_size // 16\n", + "cfg.optim_wrapper.optimizer.lr = cfg.optim_wrapper.optimizer.lr / 8 / 16\n", + "cfg.train_cfg.max_epochs = 10\n", + "\n", + "cfg.train_dataloader.num_workers = 2\n", + "cfg.val_dataloader.num_workers = 2\n", + "cfg.test_dataloader.num_workers = 2\n", + "\n", + "# We can initialize the logger for training and have a look\n", + "# at the final config used for training\n", + "print(f'Config:\\n{cfg.pretty_text}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tES-qnZ3k38Z" + }, + "source": [ + "### Train a new recognizer\n", + "\n", + "Finally, lets initialize the dataset and recognizer, then train a new recognizer!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dDBWkdDRk6oz", + "outputId": "044b9e09-2038-41c9-d5a3-8a74ae11ade2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:33:34 - mmengine - INFO - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.11 (main, Apr 5 2023, 14:15:10) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1853452922\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.0+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.1+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.3\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: None\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "05/15 03:33:34 - mmengine - INFO - Config:\n", 
+ "model = dict(\n", + " type='Recognizer2D',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth',\n", + " depth=50,\n", + " norm_eval=False),\n", + " cls_head=dict(\n", + " type='TSNHead',\n", + " num_classes=2,\n", + " in_channels=2048,\n", + " spatial_type='avg',\n", + " consensus=dict(type='AvgConsensus', dim=1),\n", + " dropout_ratio=0.4,\n", + " init_std=0.01,\n", + " average_clips='prob'),\n", + " data_preprocessor=dict(\n", + " type='ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCHW'),\n", + " train_cfg=None,\n", + " test_cfg=None)\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=100,\n", + " by_epoch=True,\n", + " milestones=[40, 80],\n", + " gamma=0.1)\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD', lr=7.8125e-05, momentum=0.9, weight_decay=0.0001),\n", + " clip_grad=dict(max_norm=40, norm_type=2))\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook'),\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=20, ignore_last=False),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook', interval=3, save_best='auto', max_keep_ckpts=3),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " sync_buffers=dict(type='SyncBuffersHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')])\n", + "log_level = 'INFO'\n", + "load_from = './checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "resume = False\n", + "dataset_type = 'VideoDataset'\n", + "data_root = 'kinetics400_tiny/train/'\n", + "data_root_val = 'kinetics400_tiny/val/'\n", + "ann_file_train = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "ann_file_val = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "val_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " 
dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_train_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/train/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames', clip_len=1, frame_interval=1,\n", + " num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ]))\n", + "val_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "val_evaluator = dict(type='AccMetric')\n", + "test_evaluator = dict(type='AccMetric')\n", + "auto_scale_lr = dict(enable=False, base_batch_size=256)\n", + "work_dir = './tutorial_exps'\n", + "\n", + "05/15 03:33:35 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "05/15 03:33:35 - mmengine - INFO - 
Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "Loads checkpoint by http backend from path: https://download.pytorch.org/models/resnet50-11ad3fa6.pth\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Downloading: \"https://download.pytorch.org/models/resnet50-11ad3fa6.pth\" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:33:37 - mmengine - INFO - These parameters in pretrained checkpoint are not loaded: {'fc.weight', 'fc.bias'}\n", + "Loads checkpoint by local backend from path: ./checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n", + "The model and loaded state dict do not match exactly\n", + "\n", + "size mismatch for cls_head.fc_cls.weight: copying a param with shape torch.Size([400, 2048]) from checkpoint, the shape in current model is torch.Size([2, 2048]).\n", + "size mismatch for cls_head.fc_cls.bias: copying a param with shape torch.Size([400]) from checkpoint, the shape in current model is torch.Size([2]).\n", + "05/15 03:33:37 - mmengine - INFO - Load checkpoint from ./checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n", + "05/15 03:33:37 - mmengine - WARNING - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "05/15 03:33:37 - mmengine - INFO - Checkpoints will be saved to /content/mmaction2/tutorial_exps.\n", + "05/15 03:33:41 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:41 - mmengine - INFO - Epoch(train) [1][15/15] lr: 7.8125e-05 eta: 0:00:31 time: 0.2334 data_time: 0.0793 memory: 2917 grad_norm: 11.9900 loss: 0.6971 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.6971\n", + "05/15 03:33:42 - mmengine - INFO - Epoch(val) [1][5/5] acc/top1: 0.3000 acc/top5: 1.0000 acc/mean1: 0.3000 data_time: 0.1994 time: 0.2254\n", + "05/15 03:33:42 - mmengine - INFO - The best checkpoint with 0.3000 acc/top1 at 1 epoch is saved to best_acc_top1_epoch_1.pth.\n", + "05/15 03:33:46 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:46 - mmengine - INFO - Epoch(train) [2][15/15] lr: 7.8125e-05 eta: 0:00:29 time: 0.2373 data_time: 0.1369 memory: 961 grad_norm: 12.4935 loss: 0.7158 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.7158\n", + "05/15 03:33:48 - mmengine - INFO - Epoch(val) [2][5/5] acc/top1: 0.7000 acc/top5: 1.0000 acc/mean1: 0.7000 data_time: 0.2692 time: 0.3006\n", + "05/15 03:33:48 - mmengine - INFO - The previous best checkpoint /content/mmaction2/tutorial_exps/best_acc_top1_epoch_1.pth is removed\n", + "05/15 03:33:48 - mmengine - INFO - The best checkpoint with 0.7000 acc/top1 at 2 epoch is saved to best_acc_top1_epoch_2.pth.\n", + "05/15 03:33:51 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:51 - mmengine - INFO - Epoch(train) [3][15/15] lr: 7.8125e-05 eta: 0:00:24 time: 0.2112 data_time: 0.1163 memory: 961 grad_norm: 13.4063 loss: 0.7338 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7338\n", + "05/15 03:33:51 - mmengine - INFO - Saving checkpoint at 3 epochs\n", + "05/15 03:33:53 - mmengine - INFO - Epoch(val) [3][5/5] acc/top1: 0.4000 acc/top5: 1.0000 acc/mean1: 0.4000 data_time: 0.1669 time: 0.1906\n", + "05/15 03:33:56 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:56 - mmengine - INFO - Epoch(train) [4][15/15] lr: 7.8125e-05 eta: 0:00:19 time: 0.1750 data_time: 0.0907 memory: 961 grad_norm: 12.4322 loss: 0.6894 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.6894\n", + "05/15 03:33:57 - mmengine - INFO - Epoch(val) [4][5/5] acc/top1: 0.7000 acc/top5: 1.0000 acc/mean1: 0.7000 data_time: 0.1791 time: 0.2030\n", + "05/15 03:34:00 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:00 - mmengine - INFO - Epoch(train) [5][15/15] lr: 7.8125e-05 eta: 0:00:16 time: 0.2016 data_time: 0.1155 memory: 961 grad_norm: 11.5982 loss: 0.6940 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.6940\n", + "05/15 03:34:02 - mmengine - INFO - Epoch(val) [5][5/5] acc/top1: 0.7000 acc/top5: 1.0000 acc/mean1: 0.7000 data_time: 0.3145 time: 0.3455\n", + "05/15 03:34:05 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:05 - mmengine - INFO - Epoch(train) [6][15/15] lr: 7.8125e-05 eta: 0:00:13 time: 0.2366 data_time: 0.1440 memory: 961 grad_norm: 12.0952 loss: 0.6667 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.6667\n", + "05/15 03:34:05 - mmengine - INFO - Saving 
checkpoint at 6 epochs\n", + "05/15 03:34:08 - mmengine - INFO - Epoch(val) [6][5/5] acc/top1: 0.6000 acc/top5: 1.0000 acc/mean1: 0.6000 data_time: 0.2172 time: 0.2403\n", + "05/15 03:34:10 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:10 - mmengine - INFO - Epoch(train) [7][15/15] lr: 7.8125e-05 eta: 0:00:09 time: 0.1784 data_time: 0.0942 memory: 961 grad_norm: 12.4209 loss: 0.6570 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.6570\n", + "05/15 03:34:11 - mmengine - INFO - Epoch(val) [7][5/5] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000 data_time: 0.1898 time: 0.2118\n", + "05/15 03:34:11 - mmengine - INFO - The previous best checkpoint /content/mmaction2/tutorial_exps/best_acc_top1_epoch_2.pth is removed\n", + "05/15 03:34:12 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 7 epoch is saved to best_acc_top1_epoch_7.pth.\n", + "05/15 03:34:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:15 - mmengine - INFO - Epoch(train) [8][15/15] lr: 7.8125e-05 eta: 0:00:06 time: 0.2073 data_time: 0.1220 memory: 961 grad_norm: 11.4271 loss: 0.6241 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.6241\n", + "05/15 03:34:17 - mmengine - INFO - Epoch(val) [8][5/5] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 0.3497 time: 0.3890\n", + "05/15 03:34:17 - mmengine - INFO - The previous best checkpoint /content/mmaction2/tutorial_exps/best_acc_top1_epoch_7.pth is removed\n", + "05/15 03:34:18 - mmengine - INFO - The best checkpoint with 1.0000 acc/top1 at 8 epoch is saved to best_acc_top1_epoch_8.pth.\n", + "05/15 03:34:21 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:21 - mmengine - INFO - Epoch(train) [9][15/15] lr: 7.8125e-05 eta: 0:00:03 time: 0.2309 data_time: 0.1390 memory: 961 grad_norm: 12.3066 loss: 0.6451 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6451\n", + "05/15 03:34:21 - mmengine - INFO - Saving checkpoint at 9 epochs\n", + "05/15 03:34:23 - mmengine - INFO - Epoch(val) [9][5/5] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 0.2023 time: 0.2256\n", + "05/15 03:34:26 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:26 - mmengine - INFO - Epoch(train) [10][15/15] lr: 7.8125e-05 eta: 0:00:00 time: 0.1733 data_time: 0.0951 memory: 961 grad_norm: 11.1461 loss: 0.5931 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.5931\n", + "05/15 03:34:26 - mmengine - INFO - Saving checkpoint at 10 epochs\n", + "05/15 03:34:27 - mmengine - INFO - Epoch(val) [10][5/5] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 0.1836 time: 0.2048\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Recognizer2D(\n", + " (data_preprocessor): ActionDataPreprocessor()\n", + " (backbone): ResNet(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n", + " (layer1): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): 
BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer2): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): 
Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer3): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): 
Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (4): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (5): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 
1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer4): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " )\n", + " (cls_head): TSNHead(\n", + " (loss_cls): CrossEntropyLoss()\n", + " (consensus): AvgConsensus()\n", + " (avg_pool): AdaptiveAvgPool2d(output_size=(1, 1))\n", + " (dropout): Dropout(p=0.4, inplace=False)\n", + " (fc_cls): Linear(in_features=2048, out_features=2, bias=True)\n", + " )\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "import os.path as osp\n", + "import 
mmengine\n", + "from mmengine.runner import Runner\n", + "\n", + "# Create work_dir\n", + "mmengine.mkdir_or_exist(osp.abspath(cfg.work_dir))\n", + "\n", + "# build the runner from config\n", + "runner = Runner.from_cfg(cfg)\n", + "\n", + "# start training\n", + "runner.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zdSd7oTLlxIf" + }, + "source": [ + "### Understand the log\n", + "From the log, we can have a basic understanding the training process and know how well the recognizer is trained.\n", + "\n", + "Firstly, the ResNet-50 backbone pre-trained on ImageNet is loaded, this is a common practice since training from scratch is more cost. The log shows that all the weights of the ResNet-50 backbone are loaded except the `fc.bias` and `fc.weight`.\n", + "\n", + "Second, since the dataset we are using is small, we loaded a TSN model and finetune it for action recognition.\n", + "The original TSN is trained on original Kinetics-400 dataset which contains 400 classes but Kinetics-400 Tiny dataset only have 2 classes. Therefore, the last FC layer of the pre-trained TSN for classification has different weight shape and is not used.\n", + "\n", + "Third, after training, the recognizer is evaluated by the default evaluation. The results show that the recognizer achieves 100% top1 accuracy and 100% top5 accuracy on the val dataset,\n", + " \n", + "Not bad!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ryVoSfZVmogw" + }, + "source": [ + "## Test the trained recognizer\n", + "\n", + "After finetuning the recognizer, let's check the prediction results!" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eyY3hCMwyTct", + "outputId": "34fbbdc5-b9fd-4fd2-8030-3ba56b10adbf" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:34:36 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000 data_time: 0.0586 time: 0.7817\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'acc/top1': 0.9, 'acc/top5': 1.0, 'acc/mean1': 0.9}" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "runner.test()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "mmact_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "vscode": { + "interpreter": { + "hash": "189c342a4747645665e89db23000ac4d4edb7a87c4cd0b2f881610f468fb778d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demo/test_video_structuralize.mp4 b/demo/test_video_structuralize.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c91ec1cb203fead76a0d099c9347e2df6692b29c --- /dev/null +++ b/demo/test_video_structuralize.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed105408e9bf71c202c1ddf08f05ff4cfb2f207a7d811d4d791126c415c00261 +size 579876 diff --git a/demo/webcam_demo.py b/demo/webcam_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..cd432b5cc9afc01fa0139d3178b4292ea7a25f22 --- /dev/null +++ b/demo/webcam_demo.py @@ -0,0 +1,223 @@ +# Copyright (c) 
OpenMMLab. All rights reserved. +import argparse +import time +from collections import deque +from operator import itemgetter +from threading import Thread + +import cv2 +import numpy as np +import torch +from mmengine import Config, DictAction +from mmengine.dataset import Compose, pseudo_collate + +from mmaction.apis import init_recognizer +from mmaction.utils import get_str_type + +FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL +FONTSCALE = 1 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 +EXCLUED_STEPS = [ + 'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit', + 'PyAVDecode', 'RawFrameDecode' +] + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 webcam demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('label', help='label file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--camera-id', type=int, default=0, help='camera device id') + parser.add_argument( + '--threshold', + type=float, + default=0.01, + help='recognition score threshold') + parser.add_argument( + '--average-size', + type=int, + default=1, + help='number of latest clips to be averaged for prediction') + parser.add_argument( + '--drawing-fps', + type=int, + default=20, + help='Set upper bound FPS value of the output drawing') + parser.add_argument( + '--inference-fps', + type=int, + default=4, + help='Set upper bound FPS value of model inference') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + assert args.drawing_fps >= 0 and args.inference_fps >= 0, \ + 'upper bound FPS value of drawing and inference should be set as ' \ + 'positive number, or zero for no limit' + return args + + +def show_results(): + print('Press "Esc", "q" or "Q" to exit') + + text_info = {} + cur_time = time.time() + while True: + msg = 'Waiting for action ...' 
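+        # Grab the latest frame from the webcam. OpenCV returns frames in BGR
+        # order, so the channel flip below pushes an RGB copy onto
+        # `frame_queue` for the inference thread, while the original BGR
+        # frame is kept for drawing and display.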
+ _, frame = camera.read() + frame_queue.append(np.array(frame[:, :, ::-1])) + + if len(result_queue) != 0: + text_info = {} + results = result_queue.popleft() + for i, result in enumerate(results): + selected_label, score = result + if score < threshold: + break + location = (0, 40 + i * 20) + text = selected_label + ': ' + str(round(score * 100, 2)) + text_info[location] = text + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + elif len(text_info) != 0: + for location, text in text_info.items(): + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + else: + cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, MSGCOLOR, + THICKNESS, LINETYPE) + + cv2.imshow('camera', frame) + ch = cv2.waitKey(1) + + if ch == 27 or ch == ord('q') or ch == ord('Q'): + camera.release() + cv2.destroyAllWindows() + break + + if drawing_fps > 0: + # add a limiter for actual drawing fps <= drawing_fps + sleep_time = 1 / drawing_fps - (time.time() - cur_time) + if sleep_time > 0: + time.sleep(sleep_time) + cur_time = time.time() + + +def inference(): + score_cache = deque() + scores_sum = 0 + cur_time = time.time() + while True: + cur_windows = [] + + while len(cur_windows) == 0: + if len(frame_queue) == sample_length: + cur_windows = list(np.array(frame_queue)) + if data['img_shape'] is None: + data['img_shape'] = frame_queue.popleft().shape[:2] + + cur_data = data.copy() + cur_data['imgs'] = cur_windows + cur_data = test_pipeline(cur_data) + cur_data = pseudo_collate([cur_data]) + + # Forward the model + with torch.no_grad(): + result = model.test_step(cur_data)[0] + scores = result.pred_score.tolist() + scores = np.array(scores) + score_cache.append(scores) + scores_sum += scores + + if len(score_cache) == average_size: + scores_avg = scores_sum / average_size + num_selected_labels = min(len(label), 5) + + score_tuples = tuple(zip(label, scores_avg)) + score_sorted = sorted( + score_tuples, key=itemgetter(1), reverse=True) + results = score_sorted[:num_selected_labels] + + result_queue.append(results) + scores_sum -= score_cache.popleft() + + if inference_fps > 0: + # add a limiter for actual inference fps <= inference_fps + sleep_time = 1 / inference_fps - (time.time() - cur_time) + if sleep_time > 0: + time.sleep(sleep_time) + cur_time = time.time() + + +def main(): + global average_size, threshold, drawing_fps, inference_fps, \ + device, model, camera, data, label, sample_length, \ + test_pipeline, frame_queue, result_queue + + args = parse_args() + average_size = args.average_size + threshold = args.threshold + drawing_fps = args.drawing_fps + inference_fps = args.inference_fps + + device = torch.device(args.device) + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # Build the recognizer from a config file and checkpoint file/url + model = init_recognizer(cfg, args.checkpoint, device=args.device) + camera = cv2.VideoCapture(args.camera_id) + data = dict(img_shape=None, modality='RGB', label=-1) + + with open(args.label, 'r') as f: + label = [line.strip() for line in f] + + # prepare test pipeline from non-camera pipeline + cfg = model.cfg + sample_length = 0 + pipeline = cfg.test_pipeline + pipeline_ = pipeline.copy() + for step in pipeline: + if 'SampleFrames' in get_str_type(step['type']): + sample_length = step['clip_len'] * step['num_clips'] + data['num_clips'] = step['num_clips'] + data['clip_len'] = step['clip_len'] + pipeline_.remove(step) + if 
get_str_type(step['type']) in EXCLUED_STEPS: + # remove step to decode frames + pipeline_.remove(step) + test_pipeline = Compose(pipeline_) + + assert sample_length > 0 + + try: + frame_queue = deque(maxlen=sample_length) + result_queue = deque(maxlen=1) + pw = Thread(target=show_results, args=(), daemon=True) + pr = Thread(target=inference, args=(), daemon=True) + pw.start() + pr.start() + pw.join() + except KeyboardInterrupt: + pass + + +if __name__ == '__main__': + main() diff --git a/demo/webcam_demo_spatiotemporal_det.py b/demo/webcam_demo_spatiotemporal_det.py new file mode 100644 index 0000000000000000000000000000000000000000..75a534bdf65f07817aed1966d484cc02e0f54abc --- /dev/null +++ b/demo/webcam_demo_spatiotemporal_det.py @@ -0,0 +1,864 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Webcam Spatio-Temporal Action Detection Demo. + +Some codes are based on https://github.com/facebookresearch/SlowFast +""" + +import argparse +import atexit +import copy +import logging +import queue +import threading +import time +from abc import ABCMeta, abstractmethod + +import cv2 +import mmcv +import numpy as np +import torch +from mmengine import Config, DictAction +from mmengine.structures import InstanceData + +from mmaction.structures import ActionDataSample + +try: + from mmdet.apis import inference_detector, init_detector +except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `inference_detector` and ' + '`init_detector` form `mmdet.apis`. These apis are ' + 'required in this demo! ') + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 webcam spatio-temporal detection demo') + + parser.add_argument( + '--config', + default=( + 'configs/detection/slowonly/' + 'slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py' + ), + help='spatio temporal detection config file path') + parser.add_argument( + '--checkpoint', + default=('https://download.openmmlab.com/mmaction/detection/ava/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb' + '_20201217-16378594.pth'), + help='spatio temporal detection checkpoint file/url') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.4, + help='the threshold of human action score') + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--input-video', + default='0', + type=str, + help='webcam id or input video file/url') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--output-fps', + default=15, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--out-filename', + default=None, + type=str, + help='the filename of output video') + parser.add_argument( + '--show', + action='store_true', + 
help='Whether to show results with cv2.imshow') + parser.add_argument( + '--display-height', + type=int, + default=0, + help='Image height for human detector and draw frames.') + parser.add_argument( + '--display-width', + type=int, + default=0, + help='Image width for human detector and draw frames.') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--clip-vis-length', + default=8, + type=int, + help='Number of draw frames per clip.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + + args = parser.parse_args() + return args + + +class TaskInfo: + """Wapper for a clip. + + Transmit data around three threads. + + 1) Read Thread: Create task and put task into read queue. Init `frames`, + `processed_frames`, `img_shape`, `ratio`, `clip_vis_length`. + 2) Main Thread: Get data from read queue, predict human bboxes and stdet + action labels, draw predictions and put task into display queue. Init + `display_bboxes`, `stdet_bboxes` and `action_preds`, update `frames`. + 3) Display Thread: Get data from display queue, show/write frames and + delete task. + """ + + def __init__(self): + self.id = -1 + + # raw frames, used as human detector input, draw predictions input + # and output, display input + self.frames = None + + # stdet params + self.processed_frames = None # model inputs + self.frames_inds = None # select frames from processed frames + self.img_shape = None # model inputs, processed frame shape + # `action_preds` is `list[list[tuple]]`. The outer brackets indicate + # different bboxes and the intter brackets indicate different action + # results for the same bbox. tuple contains `class_name` and `score`. + self.action_preds = None # stdet results + + # human bboxes with the format (xmin, ymin, xmax, ymax) + self.display_bboxes = None # bboxes coords for self.frames + self.stdet_bboxes = None # bboxes coords for self.processed_frames + self.ratio = None # processed_frames.shape[1::-1]/frames.shape[1::-1] + + # for each clip, draw predictions on clip_vis_length frames + self.clip_vis_length = -1 + + def add_frames(self, idx, frames, processed_frames): + """Add the clip and corresponding id. + + Args: + idx (int): the current index of the clip. + frames (list[ndarray]): list of images in "BGR" format. + processed_frames (list[ndarray]): list of resize and normed images + in "BGR" format. 
+ """ + self.frames = frames + self.processed_frames = processed_frames + self.id = idx + self.img_shape = processed_frames[0].shape[:2] + + def add_bboxes(self, display_bboxes): + """Add correspondding bounding boxes.""" + self.display_bboxes = display_bboxes + self.stdet_bboxes = display_bboxes.clone() + self.stdet_bboxes[:, ::2] = self.stdet_bboxes[:, ::2] * self.ratio[0] + self.stdet_bboxes[:, 1::2] = self.stdet_bboxes[:, 1::2] * self.ratio[1] + + def add_action_preds(self, preds): + """Add the corresponding action predictions.""" + self.action_preds = preds + + def get_model_inputs(self, device): + """Convert preprocessed images to MMAction2 STDet model inputs.""" + cur_frames = [self.processed_frames[idx] for idx in self.frames_inds] + input_array = np.stack(cur_frames).transpose((3, 0, 1, 2))[np.newaxis] + input_tensor = torch.from_numpy(input_array).to(device) + datasample = ActionDataSample() + datasample.proposals = InstanceData(bboxes=self.stdet_bboxes) + datasample.set_metainfo(dict(img_shape=self.img_shape)) + + return dict( + inputs=input_tensor, data_samples=[datasample], mode='predict') + + +class BaseHumanDetector(metaclass=ABCMeta): + """Base class for Human Dector. + + Args: + device (str): CPU/CUDA device option. + """ + + def __init__(self, device): + self.device = torch.device(device) + + @abstractmethod + def _do_detect(self, image): + """Get human bboxes with shape [n, 4]. + + The format of bboxes is (xmin, ymin, xmax, ymax) in pixels. + """ + + def predict(self, task): + """Add keyframe bboxes to task.""" + # keyframe idx == (clip_len * frame_interval) // 2 + keyframe = task.frames[len(task.frames) // 2] + + # call detector + bboxes = self._do_detect(keyframe) + + # convert bboxes to torch.Tensor and move to target device + if isinstance(bboxes, np.ndarray): + bboxes = torch.from_numpy(bboxes).to(self.device) + elif isinstance(bboxes, torch.Tensor) and bboxes.device != self.device: + bboxes = bboxes.to(self.device) + + # update task + task.add_bboxes(bboxes) + + return task + + +class MmdetHumanDetector(BaseHumanDetector): + """Wrapper for mmdetection human detector. + + Args: + config (str): Path to mmdetection config. + ckpt (str): Path to mmdetection checkpoint. + device (str): CPU/CUDA device option. + score_thr (float): The threshold of human detection score. + person_classid (int): Choose class from detection results. + Default: 0. Suitable for COCO pretrained models. + """ + + def __init__(self, config, ckpt, device, score_thr, person_classid=0): + super().__init__(device) + self.model = init_detector(config, ckpt, device=device) + self.person_classid = person_classid + self.score_thr = score_thr + + def _do_detect(self, image): + """Get bboxes in shape [n, 4] and values in pixels.""" + det_data_sample = inference_detector(self.model, image) + pred_instance = det_data_sample.pred_instances.cpu().numpy() + # We only keep human detection bboxs with score larger + # than `det_score_thr` and category id equal to `det_cat_id`. + valid_idx = np.logical_and(pred_instance.labels == self.person_classid, + pred_instance.scores > self.score_thr) + bboxes = pred_instance.bboxes[valid_idx] + # result = result[result[:, 4] >= self.score_thr][:, :4] + return bboxes + + +class StdetPredictor: + """Wrapper for MMAction2 spatio-temporal action models. + + Args: + config (str): Path to stdet config. + ckpt (str): Path to stdet checkpoint. + device (str): CPU/CUDA device option. + score_thr (float): The threshold of human action score. 
+ label_map_path (str): Path to label map file. The format for each line + is `{class_id}: {class_name}`. + """ + + def __init__(self, config, checkpoint, device, score_thr, label_map_path): + self.score_thr = score_thr + + # load model + config.model.backbone.pretrained = None + # model = build_detector(config.model, test_cfg=config.get('test_cfg')) + # load_checkpoint(model, checkpoint, map_location='cpu') + # model.to(device) + # model.eval() + model = init_detector(config, checkpoint, device=device) + self.model = model + self.device = device + + # init label map, aka class_id to class_name dict + with open(label_map_path) as f: + lines = f.readlines() + lines = [x.strip().split(': ') for x in lines] + self.label_map = {int(x[0]): x[1] for x in lines} + try: + if config['data']['train']['custom_classes'] is not None: + self.label_map = { + id + 1: self.label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + def predict(self, task): + """Spatio-temporval Action Detection model inference.""" + # No need to do inference if no one in keyframe + if len(task.stdet_bboxes) == 0: + return task + + with torch.no_grad(): + result = self.model(**task.get_model_inputs(self.device)) + scores = result[0].pred_instances.scores + # pack results of human detector and stdet + preds = [] + for _ in range(task.stdet_bboxes.shape[0]): + preds.append([]) + for class_id in range(scores.shape[1]): + if class_id not in self.label_map: + continue + for bbox_id in range(task.stdet_bboxes.shape[0]): + if scores[bbox_id][class_id] > self.score_thr: + preds[bbox_id].append((self.label_map[class_id], + scores[bbox_id][class_id].item())) + + # update task + # `preds` is `list[list[tuple]]`. The outer brackets indicate + # different bboxes and the intter brackets indicate different action + # results for the same bbox. tuple contains `class_name` and `score`. 
+ task.add_action_preds(preds) + + return task + + +class ClipHelper: + """Multithrading utils to manage the lifecycle of task.""" + + def __init__(self, + config, + display_height=0, + display_width=0, + input_video=0, + predict_stepsize=40, + output_fps=25, + clip_vis_length=8, + out_filename=None, + show=True, + stdet_input_shortside=256): + # stdet sampling strategy + val_pipeline = config.val_pipeline + sampler = [x for x in val_pipeline + if x['type'] == 'SampleAVAFrames'][0] + clip_len, frame_interval = sampler['clip_len'], sampler[ + 'frame_interval'] + self.window_size = clip_len * frame_interval + + # asserts + assert (out_filename or show), \ + 'out_filename and show cannot both be None' + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + assert clip_vis_length <= predict_stepsize + assert 0 < predict_stepsize <= self.window_size + + # source params + try: + self.cap = cv2.VideoCapture(int(input_video)) + self.webcam = True + except ValueError: + self.cap = cv2.VideoCapture(input_video) + self.webcam = False + assert self.cap.isOpened() + + # stdet input preprocessing params + h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + self.stdet_input_size = mmcv.rescale_size( + (w, h), (stdet_input_shortside, np.Inf)) + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + self.img_norm_cfg = img_norm_cfg + + # task init params + self.clip_vis_length = clip_vis_length + self.predict_stepsize = predict_stepsize + self.buffer_size = self.window_size - self.predict_stepsize + frame_start = self.window_size // 2 - (clip_len // 2) * frame_interval + self.frames_inds = [ + frame_start + frame_interval * i for i in range(clip_len) + ] + self.buffer = [] + self.processed_buffer = [] + + # output/display params + if display_height > 0 and display_width > 0: + self.display_size = (display_width, display_height) + elif display_height > 0 or display_width > 0: + self.display_size = mmcv.rescale_size( + (w, h), (np.Inf, max(display_height, display_width))) + else: + self.display_size = (w, h) + self.ratio = tuple( + n / o for n, o in zip(self.stdet_input_size, self.display_size)) + if output_fps <= 0: + self.output_fps = int(self.cap.get(cv2.CAP_PROP_FPS)) + else: + self.output_fps = output_fps + self.show = show + self.video_writer = None + if out_filename is not None: + self.video_writer = self.get_output_video_writer(out_filename) + display_start_idx = self.window_size // 2 - self.predict_stepsize // 2 + self.display_inds = [ + display_start_idx + i for i in range(self.predict_stepsize) + ] + + # display multi-theading params + self.display_id = -1 # task.id for display queue + self.display_queue = {} + self.display_lock = threading.Lock() + self.output_lock = threading.Lock() + + # read multi-theading params + self.read_id = -1 # task.id for read queue + self.read_id_lock = threading.Lock() + self.read_queue = queue.Queue() + self.read_lock = threading.Lock() + self.not_end = True # cap.read() flag + + # program state + self.stopped = False + + atexit.register(self.clean) + + def read_fn(self): + """Main function for read thread. + + Contains three steps: + + 1) Read and preprocess (resize + norm) frames from source. + 2) Create task by frames from previous step and buffer. + 3) Put task into read queue. 
+ """ + was_read = True + start_time = time.time() + while was_read and not self.stopped: + # init task + task = TaskInfo() + task.clip_vis_length = self.clip_vis_length + task.frames_inds = self.frames_inds + task.ratio = self.ratio + + # read buffer + frames = [] + processed_frames = [] + if len(self.buffer) != 0: + frames = self.buffer + if len(self.processed_buffer) != 0: + processed_frames = self.processed_buffer + + # read and preprocess frames from source and update task + with self.read_lock: + before_read = time.time() + read_frame_cnt = self.window_size - len(frames) + while was_read and len(frames) < self.window_size: + was_read, frame = self.cap.read() + if not self.webcam: + # Reading frames too fast may lead to unexpected + # performance degradation. If you have enough + # resource, this line could be commented. + time.sleep(1 / self.output_fps) + if was_read: + frames.append(mmcv.imresize(frame, self.display_size)) + processed_frame = mmcv.imresize( + frame, self.stdet_input_size).astype(np.float32) + _ = mmcv.imnormalize_(processed_frame, + **self.img_norm_cfg) + processed_frames.append(processed_frame) + task.add_frames(self.read_id + 1, frames, processed_frames) + + # update buffer + if was_read: + self.buffer = frames[-self.buffer_size:] + self.processed_buffer = processed_frames[-self.buffer_size:] + + # update read state + with self.read_id_lock: + self.read_id += 1 + self.not_end = was_read + + self.read_queue.put((was_read, copy.deepcopy(task))) + cur_time = time.time() + logger.debug( + f'Read thread: {1000*(cur_time - start_time):.0f} ms, ' + f'{read_frame_cnt / (cur_time - before_read):.0f} fps') + start_time = cur_time + + def display_fn(self): + """Main function for display thread. + + Read data from display queue and display predictions. + """ + start_time = time.time() + while not self.stopped: + # get the state of the read thread + with self.read_id_lock: + read_id = self.read_id + not_end = self.not_end + + with self.display_lock: + # If video ended and we have display all frames. + if not not_end and self.display_id == read_id: + break + + # If the next task are not available, wait. + if (len(self.display_queue) == 0 or + self.display_queue.get(self.display_id + 1) is None): + time.sleep(0.02) + continue + + # get display data and update state + self.display_id += 1 + was_read, task = self.display_queue[self.display_id] + del self.display_queue[self.display_id] + display_id = self.display_id + + # do display predictions + with self.output_lock: + if was_read and task.id == 0: + # the first task + cur_display_inds = range(self.display_inds[-1] + 1) + elif not was_read: + # the last task + cur_display_inds = range(self.display_inds[0], + len(task.frames)) + else: + cur_display_inds = self.display_inds + + for frame_id in cur_display_inds: + frame = task.frames[frame_id] + if self.show: + cv2.imshow('Demo', frame) + cv2.waitKey(int(1000 / self.output_fps)) + if self.video_writer: + self.video_writer.write(frame) + + cur_time = time.time() + logger.debug( + f'Display thread: {1000*(cur_time - start_time):.0f} ms, ' + f'read id {read_id}, display id {display_id}') + start_time = cur_time + + def __iter__(self): + return self + + def __next__(self): + """Get data from read queue. + + This function is part of the main thread. 
+ """ + if self.read_queue.qsize() == 0: + time.sleep(0.02) + return not self.stopped, None + + was_read, task = self.read_queue.get() + if not was_read: + # If we reach the end of the video, there aren't enough frames + # in the task.processed_frames, so no need to model inference + # and draw predictions. Put task into display queue. + with self.read_id_lock: + read_id = self.read_id + with self.display_lock: + self.display_queue[read_id] = was_read, copy.deepcopy(task) + + # main thread doesn't need to handle this task again + task = None + return was_read, task + + def start(self): + """Start read thread and display thread.""" + self.read_thread = threading.Thread( + target=self.read_fn, args=(), name='VidRead-Thread', daemon=True) + self.read_thread.start() + self.display_thread = threading.Thread( + target=self.display_fn, + args=(), + name='VidDisplay-Thread', + daemon=True) + self.display_thread.start() + + return self + + def clean(self): + """Close all threads and release all resources.""" + self.stopped = True + self.read_lock.acquire() + self.cap.release() + self.read_lock.release() + self.output_lock.acquire() + cv2.destroyAllWindows() + if self.video_writer: + self.video_writer.release() + self.output_lock.release() + + def join(self): + """Waiting for the finalization of read and display thread.""" + self.read_thread.join() + self.display_thread.join() + + def display(self, task): + """Add the visualized task to the display queue. + + Args: + task (TaskInfo object): task object that contain the necessary + information for prediction visualization. + """ + with self.display_lock: + self.display_queue[task.id] = (True, task) + + def get_output_video_writer(self, path): + """Return a video writer object. + + Args: + path (str): path to the output video file. + """ + return cv2.VideoWriter( + filename=path, + fourcc=cv2.VideoWriter_fourcc(*'mp4v'), + fps=float(self.output_fps), + frameSize=self.display_size, + isColor=True) + + +class BaseVisualizer(metaclass=ABCMeta): + """Base class for visualization tools.""" + + def __init__(self, max_labels_per_bbox): + self.max_labels_per_bbox = max_labels_per_bbox + + def draw_predictions(self, task): + """Visualize stdet predictions on raw frames.""" + # read bboxes from task + bboxes = task.display_bboxes.cpu().numpy() + + # draw predictions and update task + keyframe_idx = len(task.frames) // 2 + draw_range = [ + keyframe_idx - task.clip_vis_length // 2, + keyframe_idx + (task.clip_vis_length - 1) // 2 + ] + assert draw_range[0] >= 0 and draw_range[1] < len(task.frames) + task.frames = self.draw_clip_range(task.frames, task.action_preds, + bboxes, draw_range) + + return task + + def draw_clip_range(self, frames, preds, bboxes, draw_range): + """Draw a range of frames with the same bboxes and predictions.""" + # no predictions to be draw + if bboxes is None or len(bboxes) == 0: + return frames + + # draw frames in `draw_range` + left_frames = frames[:draw_range[0]] + right_frames = frames[draw_range[1] + 1:] + draw_frames = frames[draw_range[0]:draw_range[1] + 1] + + # get labels(texts) and draw predictions + draw_frames = [ + self.draw_one_image(frame, bboxes, preds) for frame in draw_frames + ] + + return list(left_frames) + draw_frames + list(right_frames) + + @abstractmethod + def draw_one_image(self, frame, bboxes, preds): + """Draw bboxes and corresponding texts on one frame.""" + + @staticmethod + def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' 
+ """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +class DefaultVisualizer(BaseVisualizer): + """Tools to visualize predictions. + + Args: + max_labels_per_bbox (int): Max number of labels to visualize for a + person box. Default: 5. + plate (str): The color plate used for visualization. Two recommended + plates are blue plate `03045e-023e8a-0077b6-0096c7-00b4d8-48cae4` + and green plate `004b23-006400-007200-008000-38b000-70e000`. These + plates are generated by https://coolors.co/. + Default: '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'. + text_fontface (int): Fontface from OpenCV for texts. + Default: cv2.FONT_HERSHEY_DUPLEX. + text_fontscale (float): Fontscale from OpenCV for texts. + Default: 0.5. + text_fontcolor (tuple): fontface from OpenCV for texts. + Default: (255, 255, 255). + text_thickness (int): Thickness from OpenCV for texts. + Default: 1. + text_linetype (int): LInetype from OpenCV for texts. + Default: 1. + """ + + def __init__( + self, + max_labels_per_bbox=5, + plate='03045e-023e8a-0077b6-0096c7-00b4d8-48cae4', + text_fontface=cv2.FONT_HERSHEY_DUPLEX, + text_fontscale=0.5, + text_fontcolor=(255, 255, 255), # white + text_thickness=1, + text_linetype=1): + super().__init__(max_labels_per_bbox=max_labels_per_bbox) + self.text_fontface = text_fontface + self.text_fontscale = text_fontscale + self.text_fontcolor = text_fontcolor + self.text_thickness = text_thickness + self.text_linetype = text_linetype + + def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + plate = plate.split('-') + self.plate = [hex2color(h) for h in plate] + + def draw_one_image(self, frame, bboxes, preds): + """Draw predictions on one image.""" + for bbox, pred in zip(bboxes, preds): + # draw bbox + box = bbox.astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, (0, 0, 255), 2) + + # draw texts + for k, (label, score) in enumerate(pred): + if k >= self.max_labels_per_bbox: + break + text = f'{self.abbrev(label)}: {score:.4f}' + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, self.text_fontface, + self.text_fontscale, + self.text_thickness)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, self.plate[k + 1], -1) + cv2.putText(frame, text, location, self.text_fontface, + self.text_fontscale, self.text_fontcolor, + self.text_thickness, self.text_linetype) + + return frame + + +def main(args): + # init human detector + human_detector = MmdetHumanDetector(args.det_config, args.det_checkpoint, + args.device, args.det_score_thr) + + # init action detector + config = Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + + try: + # In our spatiotemporal detection demo, different actions should have + # the same number of bboxes. 
+ config['model']['test_cfg']['rcnn'] = dict(action_thr=0) + except KeyError: + pass + stdet_predictor = StdetPredictor( + config=config, + checkpoint=args.checkpoint, + device=args.device, + score_thr=args.action_score_thr, + label_map_path=args.label_map) + + # init clip helper + clip_helper = ClipHelper( + config=config, + display_height=args.display_height, + display_width=args.display_width, + input_video=args.input_video, + predict_stepsize=args.predict_stepsize, + output_fps=args.output_fps, + clip_vis_length=args.clip_vis_length, + out_filename=args.out_filename, + show=args.show) + + # init visualizer + vis = DefaultVisualizer() + + # start read and display thread + clip_helper.start() + + try: + # Main thread main function contains: + # 1) get data from read queue + # 2) get human bboxes and stdet predictions + # 3) draw stdet predictions and update task + # 4) put task into display queue + for able_to_read, task in clip_helper: + # get data from read queue + + if not able_to_read: + # read thread is dead and all tasks are processed + break + + if task is None: + # when no data in read queue, wait + time.sleep(0.01) + continue + + inference_start = time.time() + + # get human bboxes + human_detector.predict(task) + + # get stdet predictions + stdet_predictor.predict(task) + + # draw stdet predictions in raw frames + vis.draw_predictions(task) + logger.info(f'Stdet Results: {task.action_preds}') + + # add draw frames to display queue + clip_helper.display(task) + + logger.debug('Main thread inference time ' + f'{1000*(time.time() - inference_start):.0f} ms') + + # wait for display thread + clip_helper.join() + except KeyboardInterrupt: + pass + finally: + # close read & display thread, release all resources + clip_helper.clean() + + +if __name__ == '__main__': + main(parse_args()) diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..59dbb71933a017f8f1acdfdbcfb5c9ee1c04419b --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,30 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" +ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" + +# fetch the key refer to https://forums.developer.nvidia.com/t/18-04-cuda-docker-image-is-broken/212892/9 +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub 32 +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub +RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 ffmpeg \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install MMCV +RUN pip install openmim +RUN mim install mmengine mmcv + +# Install MMAction2 +RUN conda clean --all +RUN git clone https://github.com/open-mmlab/mmaction2.git /mmaction2 +WORKDIR /mmaction2 +RUN mkdir -p /mmaction2/data +ENV FORCE_CUDA="1" +RUN git checkout main +RUN pip install cython --no-cache-dir +RUN pip install --no-cache-dir -e . 
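+
+# Example usage (illustrative only; the image tag and ${DATA_DIR} below are
+# assumptions, not fixed by this Dockerfile): build from the repository root
+# and mount a local dataset directory into /mmaction2/data.
+#   docker build -f ./docker/Dockerfile --rm -t mmaction2 .
+#   docker run --gpus all --shm-size=8g -it -v ${DATA_DIR}:/mmaction2/data mmaction2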
diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b9525bb853e9042748d4cfa4106a52caa01c9136 --- /dev/null +++ b/docker/serve/Dockerfile @@ -0,0 +1,51 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="10.2" +ARG CUDNN="7" +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ARG MMCV="1.3.8" +ARG MMACTION="0.24.0" + +ENV PYTHONUNBUFFERED TRUE + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + g++ \ + openjdk-11-jre-headless \ + # MMDET Requirements + ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ + libsndfile1 libturbojpeg \ + && rm -rf /var/lib/apt/lists/* + +ENV PATH="/opt/conda/bin:$PATH" +RUN export FORCE_CUDA=1 + +# TORCHSEVER +RUN pip install torchserve torch-model-archiver + +# MMLAB +ARG PYTORCH +ARG CUDA +RUN ["/bin/bash", "-c", "pip install mmcv-full==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"] +# RUN pip install mmaction2==${MMACTION} +RUN pip install git+https://github.com/open-mmlab/mmaction2.git + +RUN useradd -m model-server \ + && mkdir -p /home/model-server/tmp + +COPY entrypoint.sh /usr/local/bin/entrypoint.sh + +RUN chmod +x /usr/local/bin/entrypoint.sh \ + && chown -R model-server /home/model-server + +COPY config.properties /home/model-server/config.properties +RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store + +EXPOSE 8080 8081 8082 + +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["serve"] diff --git a/docker/serve/config.properties b/docker/serve/config.properties new file mode 100644 index 0000000000000000000000000000000000000000..dd9a685150199972c02d2c8bdcd910ee5c1a3ce4 --- /dev/null +++ b/docker/serve/config.properties @@ -0,0 +1,5 @@ +inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +model_store=/home/model-server/model-store +load_models=all diff --git a/docker/serve/entrypoint.sh b/docker/serve/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..d9aedae68fa0938c6ec096930375661ab49889b9 --- /dev/null +++ b/docker/serve/entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [[ "$1" = "serve" ]]; then + shift 1 + torchserve --start --ts-config /home/model-server/config.properties +else + eval "$@" +fi + +# prevent docker exit +tail -f /dev/null diff --git a/docs/en/Makefile b/docs/en/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..73a28c7134cd1760744f34bac4ebdedfbed40f72 --- /dev/null +++ b/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/en/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..55b3d3f8ffde0fd0e9d00e7f8b73124bba6cfe2d --- /dev/null +++ b/docs/en/_static/css/readthedocs.css @@ -0,0 +1,62 @@ +.header-logo { + background-image: url("../images/logo.png"); + background-size: 130px 40px; + height: 40px; + width: 130px; +} + +@media screen and (min-width: 1100px) { + .header-logo { + top: -12px; + } + } + + pre { + white-space: pre; + } + + @media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } + } + + + article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; + } + + /* Disable the change in tables */ + article.pytorch-article section table code { + padding: unset; + background-color: unset; + border-radius: unset; + } + + table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; + } + + article.pytorch-article p.rubric { + font-weight: bold; + } diff --git a/docs/en/_static/images/logo.png b/docs/en/_static/images/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c759bb78c5424b4394d18a5ba833a8c9f43add Binary files /dev/null and b/docs/en/_static/images/logo.png differ diff --git a/docs/en/_static/js/custom.js b/docs/en/_static/js/custom.js new file mode 100644 index 0000000000000000000000000000000000000000..207dcb32ae79fa2f72220e39816011ce5c1c77c2 --- /dev/null +++ b/docs/en/_static/js/custom.js @@ -0,0 +1,10 @@ +var collapsedSections = ['Dataset Zoo']; + +$(document).ready(function () { + $('.model-summary').DataTable({ + "stateSave": false, + "lengthChange": false, + "pageLength": 20, + "order": [] + }); + }); diff --git a/docs/en/_templates/404.html b/docs/en/_templates/404.html new file mode 100644 index 0000000000000000000000000000000000000000..3dcff6e0ca7f27da4a0d379c9c34aeb087ed7f9e --- /dev/null +++ b/docs/en/_templates/404.html @@ -0,0 +1,18 @@ +{% extends "layout.html" %} + +{% block body %} + +

+<h1>Page Not Found</h1>
+
+<p>
+  The page you are looking for cannot be found.
+</p>
+
+<p>
+  If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
+  the content table left, or go to the homepage.
+</p>
+
+<p>
+  If you cannot find documentation you want, please open an issue to tell us!
+</p>
+ +{% endblock %} diff --git a/docs/en/advanced_guides/customize_dataset.md b/docs/en/advanced_guides/customize_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..7cfa14770fc9c802e36c913cfd1de1ab8aef562b --- /dev/null +++ b/docs/en/advanced_guides/customize_dataset.md @@ -0,0 +1,121 @@ +# Customize Dataset + +In this tutorial, we will introduce some methods about how to customize your own dataset by online conversion. + +- [Customize Dataset](#customize-dataset) + - [General understanding of the Dataset in MMAction2](#general-understanding-of-the-dataset-in-mmaction2) + - [Customize new datasets](#customize-new-datasets) + - [Customize keypoint format for PoseDataset](#customize-keypoint-format-for-posedataset) + +## General understanding of the Dataset in MMAction2 + +MMAction2 provides task-specific `Dataset` class, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, `PoseDataset` for skeleton-based action recognition. These task-specific datasets only require the implementation of `load_data_list(self)` for generating a data list from the annotation file. The remaining functions are automatically handled by the superclass (i.e., `BaseActionDataset` and `BaseDataset`). The following table shows the inheritance relationship and the main method of the modules. + +| Class Name | Class Method | +| ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `MMAction2::VideoDataset` | `load_data_list(self)`
Build data list from the annotation file. | +| `MMAction2::BaseActionDataset` | `get_data_info(self, idx)`
Given the `idx`, return the corresponding data sample from the data list. | +| `MMEngine::BaseDataset` | `__getitem__(self, idx)`
Given the `idx`, call `get_data_info` to get the data sample, then call the `pipeline` to perform transforms and augmentation in `train_pipeline` or `val_pipeline` . | + +## Customize new datasets + +Although offline conversion is the preferred method for utilizing your own data in most cases, MMAction2 offers a convenient process for creating a customized `Dataset` class. As mentioned previously, task-specific datasets only require the implementation of `load_data_list(self)` for generating a data list from the annotation file. It is noteworthy that the elements in the `data_list` are `dict` with fields that are essential for the subsequent processes in the `pipeline`. + +Taking `VideoDataset` as an example, `train_pipeline`/`val_pipeline` require `'filename'` in `DecordInit` and `'label'` in `PackActionInputs`. Consequently, the data samples in the `data_list` must contain 2 fields: `'filename'` and `'label'`. +Please refer to [customize pipeline](customize_pipeline.md) for more details about the `pipeline`. + +``` +data_list.append(dict(filename=filename, label=label)) +``` + +However, `AVADataset` is more complex, data samples in the `data_list` consist of several fields about the video data. Moreover, it overwrites `get_data_info(self, idx)` to convert keys that are indispensable in the spatio-temporal action detection pipeline. + +```python + +class AVADataset(BaseActionDataset): + ... + + def load_data_list(self) -> List[dict]: + ... + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + data_list.append(video_info) + return data_list + + def get_data_info(self, idx: int) -> dict: + ... + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + return data_info +``` + +## Customize keypoint format for PoseDataset + +MMAction2 currently supports three keypoint formats: `coco`, `nturgb+d` and `openpose`. If you use one of these formats, you may simply specify the corresponding format in the following modules: + +For Graph Convolutional Networks, such as AAGCN, STGCN, ... + +- `pipeline`: argument `dataset` in `JointToBone`. +- `backbone`: argument `graph_cfg` in Graph Convolutional Networks. + +For PoseC3D: + +- `pipeline`: In `Flip`, specify `left_kp` and `right_kp` based on the symmetrical relationship between keypoints. +- `pipeline`: In `GeneratePoseTarget`, specify `skeletons`, `left_limb`, `right_limb` if `with_limb` is `True`, and `left_kp`, `right_kp` if `with_kp` is `True`. + +If using a custom keypoint format, it is necessary to include a new graph layout in both the `backbone` and `pipeline`. This layout will define the keypoints and their connection relationship. + +Taking the `coco` dataset as an example, we define a layout named `coco` in `Graph`. The `inward` connections of this layout comprise all node connections, with each **centripetal** connection consisting of a tuple of nodes. Additional settings for `coco` include specifying the number of nodes as `17` the `node 0` as the central node. + +```python + +self.num_node = 17 +self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] +self.center = 0 +``` + +Similarly, we define the `pairs` in `JointToBone`, adding a bone of `(0, 0)` to align the number of bones to the nodes. 
The `pairs` of coco dataset are shown below, and the order of `pairs` in `JointToBone` is irrelevant. + +```python + +self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), + (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), + (12, 0), (13, 11), (14, 12), (15, 13), (16, 14)) +``` + +To use your custom keypoint format, simply define the aforementioned settings as your graph structure and specify them in your config file as shown below, In this example, we will use `STGCN`, with `n` denoting the number of classes and `custom_dataset` defined in `Graph` and `JointToBone`. + +```python +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=n, in_channels=256)) + +train_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +val_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +test_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +``` diff --git a/docs/en/advanced_guides/customize_logging.md b/docs/en/advanced_guides/customize_logging.md new file mode 100644 index 0000000000000000000000000000000000000000..145313a9a7ffc58bebea8fdd54c6a0b36dff14f3 --- /dev/null +++ b/docs/en/advanced_guides/customize_logging.md @@ -0,0 +1,163 @@ +# Customize Logging + +MMAction2 produces a lot of logs during the running process, such as loss, iteration time, learning rate, etc. In this section, we will introduce you how to output custom log. More details about the logging system, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/logging.html). + +- [Customize Logging](#customize-logging) + - [Flexible Logging System](#flexible-logging-system) + - [Customize log](#customize-log) + - [Export the debug log](#export-the-debug-log) + +## Flexible Logging System + +The MMAction2 logging system is configured by the `LogProcessor` in [default_runtime](https://github.com/open-mmlab/mmaction2/tree/main/configs/_base_/default_runtime.py) by default, which is equivalent to: + +```python +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +``` + +By default, the `LogProcessor` captures all fields that begin with `loss` returned by `model.forward`. For instance, in the following model, `loss1` and `loss2` will be logged automatically without any additional configuration. + +```python +from mmengine.model import BaseModel + +class ToyModel(BaseModel): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss1 = (feat - label).pow(2) + loss2 = (feat - label).abs() + return dict(loss1=loss1, loss2=loss2) +``` + +The output log follows the following format: + +``` +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388 +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290 +``` + +`LogProcessor` will output the log in the following format: + +- The prefix of the log: + - epoch mode(`by_epoch=True`): `Epoch(train) [{current_epoch}/{current_iteration}]/{dataloader_length}` + - iteration mode(`by_epoch=False`): `Iter(train) [{current_iteration}/{max_iteration}]` +- Learning rate (`lr`): The learning rate of the last iteration. 
+- Time: + - `time`: The averaged time for inference of the last `window_size` iterations. + - `data_time`: The averaged time for loading data of the last `window_size` iterations. + - `eta`: The estimated time of arrival to finish the training. +- Loss: The averaged loss output by model of the last `window_size` iterations. + +```{warning} +log_processor outputs the epoch based log by default(`by_epoch=True`). To get an expected log matched with the `train_cfg`, we should set the same value for `by_epoch` in `train_cfg` and `log_processor`. +``` + +Based on the rules above, the code snippet will count the average value of the loss1 and the loss2 every 20 iterations. More types of statistical methods, please refer to [mmengine.runner.LogProcessor](mmengine.runner.LogProcessor). + +## Customize log + +The logging system could not only log the `loss`, `lr`, .etc but also collect and output the custom log. For example, if we want to statistic the intermediate loss: + +The `ToyModel` calculate `loss_tmp` in forward, but don't save it into the return dict. + +```python +from mmengine.logging import MessageHub + +class ToyModel(BaseModel): + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss_tmp = (feat - label).abs() + loss = loss_tmp.pow(2) + + message_hub = MessageHub.get_current_instance() + # update the intermediate `loss_tmp` in the message hub + message_hub.update_scalar('train/loss_tmp', loss_tmp.sum()) + return dict(loss=loss) +``` + +Add the `loss_tmp` into the config: + +```python +log_processor = dict( + type='LogProcessor', + window_size=20, + by_epoch=True, + custom_cfg=[ + # statistic the loss_tmp with the averaged value + dict( + data_src='loss_tmp', + window_size=20, + method_name='mean') + ]) +``` + +The `loss_tmp` will be added to the output log: + +``` +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000 +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000 +``` + +## Export the debug log + +To export the debug log to the `work_dir`, you can set log_level in config file as follows: + +``` +log_level='DEBUG' +``` + +``` +08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend +08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook +08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine" +... +``` + +Besides, logs of different ranks will be saved in `debug` mode if you are training your model with the shared storage. The hierarchy of the log is as follows: + +```text +./tmp +โ”œโ”€โ”€ tmp.log +โ”œโ”€โ”€ tmp_rank1.log +โ”œโ”€โ”€ tmp_rank2.log +โ”œโ”€โ”€ tmp_rank3.log +โ”œโ”€โ”€ tmp_rank4.log +โ”œโ”€โ”€ tmp_rank5.log +โ”œโ”€โ”€ tmp_rank6.log +โ””โ”€โ”€ tmp_rank7.log +... 
+โ””โ”€โ”€ tmp_rank63.log +``` + +The log of Multiple machines with independent storage: + +```text +# device: 0: +work_dir/ +โ””โ”€โ”€ exp_name_logs + โ”œโ”€โ”€ exp_name.log + โ”œโ”€โ”€ exp_name_rank1.log + โ”œโ”€โ”€ exp_name_rank2.log + โ”œโ”€โ”€ exp_name_rank3.log + ... + โ””โ”€โ”€ exp_name_rank7.log + +# device: 7: +work_dir/ +โ””โ”€โ”€ exp_name_logs + โ”œโ”€โ”€ exp_name_rank56.log + โ”œโ”€โ”€ exp_name_rank57.log + โ”œโ”€โ”€ exp_name_rank58.log + ... + โ””โ”€โ”€ exp_name_rank63.log +``` diff --git a/docs/en/advanced_guides/customize_models.md b/docs/en/advanced_guides/customize_models.md new file mode 100644 index 0000000000000000000000000000000000000000..3aa02f4cb25f963545aa03f4b49eebbf3a3a189e --- /dev/null +++ b/docs/en/advanced_guides/customize_models.md @@ -0,0 +1,3 @@ +# Customize Models + +coming soon... diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..2e95db15c5c857835f5a3b812b217f546171c03d --- /dev/null +++ b/docs/en/advanced_guides/customize_optimizer.md @@ -0,0 +1,340 @@ +# Customize Optimizer + +In this tutorial, we will introduce some methods about how to build the optimizer and learning rate scheduler for your tasks. + +- [Customize Optimizer](#customize-optimizer) + - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper) + - [Use optimizers supported by PyTorch](#use-optimizers-supported-by-pytorch) + - [Parameter-wise finely configuration](#parameter-wise-finely-configuration) + - [Gradient clipping](#gradient-clipping) + - [Gradient accumulation](#gradient-accumulation) + - [Customize parameter schedules](#customize-parameter-schedules) + - [Customize learning rate schedules](#customize-learning-rate-schedules) + - [Customize momentum schedules](#customize-momentum-schedules) + - [Add new optimizers or constructors](#add-new-optimizers-or-constructors) + - [Add new optimizers](#add-new-optimizers) + - [1. Implement a new optimizer](#1-implement-a-new-optimizer) + - [2. Import the optimizer](#2-import-the-optimizer) + - [3. Specify the optimizer in the config file](#3-specify-the-optimizer-in-the-config-file) + - [Add new optimizer constructors](#add-new-optimizer-constructors) + +## Build optimizers using optim_wrapper + +We use the `optim_wrapper` field to configure the strategies of optimization, which includes choices of the optimizer, parameter-wise configurations, gradient clipping and accumulation. A simple example can be: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001) +) +``` + +In the above example, a SGD optimizer with learning rate 0.0003 and weight decay 0.0001 is built. + +### Use optimizers supported by PyTorch + +We support all the optimizers implemented by PyTorch. To use a different optimizer, just need to change the `optimizer` field of config files. For example, if you want to use `torch.optim.Adam`, the modification in the config file could be as the following. + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer = dict( + type='Adam', + lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False), +) +``` + +First we need to change the value of `type` to the desired optimizer name supported in `torch.optim`. Next we add necessary arguments of this optimizer to the `optimizer` field. 
The above config will build the following optimizer: + +```python +torch.optim.Adam(lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False) +``` + +### Parameter-wise finely configuration + +Some models may have parameter-specific settings for optimization, for example, no weight decay to the BatchNorm layers or using different learning rates for different network layers. +To finely configure them, we can use the `paramwise_cfg` argument in `optim_wrapper`. + +- **Set different hyper-parameter multipliers for different types of parameters.** + + For instance, we can set `norm_decay_mult=0.` in `paramwise_cfg` to change the weight decay of weight and bias of normalization layers to zero. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4), + paramwise_cfg=dict(norm_decay_mult=0.)) + ``` + + More types of parameters are supported to configured, list as follow: + + - `lr_mult`: Multiplier for learning rate of all parameters. + - `decay_mult`: Multiplier for weight decay of all parameters. + - `bias_lr_mult`: Multiplier for learning rate of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `bias_decay_mult`: Multiplier for weight decay of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `norm_decay_mult`: Multiplier for weight decay of weigh and bias of normalization layers. Defaults to 1. + - `dwconv_decay_mult`: Multiplier for weight decay of depth-wise convolution layers. Defaults to 1. + - `bypass_duplicate`: Whether to bypass duplicated parameters. Defaults to `False`. + - `dcn_offset_lr_mult`: Multiplier for learning rate of deformable convolution layers. Defaults to 1. + +- **Set different hyper-parameter multipliers for specific parameters.** + + MMAction2 can use `custom_keys` in `paramwise_cfg` to specify different parameters to use different learning rates or weight decay. + + For example, to set all learning rates and weight decays of `backbone.layer0` to 0, the rest of `backbone` remains the same as the optimizer and the learning rate of `head` to 0.001, use the configs below. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone': dict(lr_mult=1), + 'head': dict(lr_mult=0.1) + })) + ``` + +### Gradient clipping + +During the training process, the loss function may get close to a cliffy region and cause gradient explosion. And gradient clipping is helpful to stabilize the training process. More introduction can be found in [this page](https://paperswithcode.com/method/gradient-clipping). + +Currently we support `clip_grad` option in `optim_wrapper` for gradient clipping, refers to [PyTorch Documentation](torch.nn.utils.clip_grad_norm_). + +Here is an example: + +```python +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + # norm_type: type of the used p-norm, here norm_type is 2. + clip_grad=dict(max_norm=35, norm_type=2)) +``` + +### Gradient accumulation + +When computing resources are lacking, the batch size can only be set to a small value, which may affect the performance of models. Gradient accumulation can be used to solve this problem. We support `accumulative_counts` option in `optim_wrapper` for gradient accumulation. 
+ +Here is an example: + +```python +train_dataloader = dict(batch_size=64) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + accumulative_counts=4) +``` + +Indicates that during training, back-propagation is performed every 4 iters. And the above is equivalent to: + +```python +train_dataloader = dict(batch_size=256) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001)) +``` + +## Customize parameter schedules + +In training, the optimzation parameters such as learing rate, momentum, are usually not fixed but changing through iterations or epochs. PyTorch supports several learning rate schedulers, which are not sufficient for complex strategies. In MMAction2, we provide `param_scheduler` for better controls of different parameter schedules. + +### Customize learning rate schedules + +Learning rate schedulers are widely used to improve performance. We support most of the PyTorch schedulers, including `ExponentialLR`, `LinearLR`, `StepLR`, `MultiStepLR`, etc. + +All available learning rate scheduler can be found {external+mmengine:ref}`here `, and the +names of learning rate schedulers end with `LR`. + +- **Single learning rate schedule** + + In most cases, we use only one learning rate schedule for simplicity. For instance, [`MultiStepLR`](mmengine.optim.MultiStepLR) is used as the default learning rate schedule for ResNet. Here, `param_scheduler` is a dictionary. + + ```python + param_scheduler = dict( + type='MultiStepLR', + by_epoch=True, + milestones=[100, 150], + gamma=0.1) + ``` + + Or, we want to use the [`CosineAnnealingLR`](mmengine.optim.CosineAnnealingLR) scheduler to decay the learning rate: + + ```python + param_scheduler = dict( + type='CosineAnnealingLR', + by_epoch=True, + T_max=num_epochs) + ``` + +- **Multiple learning rate schedules** + + In some of the training cases, multiple learning rate schedules are applied for higher accuracy. For example ,in the early stage, training is easy to be volatile, and warmup is a technique to reduce volatility. + The learning rate will increase gradually from a minor value to the expected value by warmup and decay afterwards by other schedules. + + In MMAction2, simply combines desired schedules in `param_scheduler` as a list can achieve the warmup strategy. + + Here are some examples: + + 1. linear warmup during the first 50 iters. + + ```python + param_scheduler = [ + # linear warm-up by iters + dict(type='LinearLR', + start_factor=0.001, + by_epoch=False, # by iters + end=50), # only warm up for first 50 iters + # main learing rate schedule + dict(type='MultiStepLR', + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + ] + ``` + + 2. linear warmup and update lr by iter during the first 10 epochs. + + ```python + param_scheduler = [ + # linear warm-up by epochs in [0, 10) epochs + dict(type='LinearLR', + start_factor=0.001, + by_epoch=True, + end=10, + convert_to_iter_based=True, # Update learning rate by iter. + ), + # use CosineAnnealing schedule after 10 epochs + dict(type='CosineAnnealingLR', by_epoch=True, begin=10) + ] + ``` + + Notice that, we use `begin` and `end` arguments here to assign the valid range, which is \[`begin`, `end`) for this schedule. And the range unit is defined by `by_epoch` argument. If not specified, the `begin` is 0 and the `end` is the max epochs or iterations. 
+ + If the ranges for all schedules are not continuous, the learning rate will stay constant in ignored range, otherwise all valid schedulers will be executed in order in a specific stage, which behaves the same as PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler). + +### Customize momentum schedules + +We support using momentum schedulers to modify the optimizer's momentum according to learning rate, which could make the loss converge in a faster way. The usage is the same as learning rate schedulers. + +All available learning rate scheduler can be found {external+mmengine:ref}`here `, and the +names of momentum rate schedulers end with `Momentum`. + +Here is an example: + +```python +param_scheduler = [ + # the lr scheduler + dict(type='LinearLR', ...), + # the momentum scheduler + dict(type='LinearMomentum', + start_factor=0.001, + by_epoch=False, + begin=0, + end=1000) +] +``` + +## Add new optimizers or constructors + +This part will modify the MMAction2 source code or add code to the MMAction2 framework, beginners can skip it. + +### Add new optimizers + +In academic research and industrial practice, it may be necessary to use optimization methods not implemented by MMAction2, and you can add them through the following methods. + +#### 1. Implement a new optimizer + +Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`. +You need to create a new file under `mmaction/engine/optimizers`, and implement the new optimizer in the file, for example, in `mmaction/engine/optimizers/my_optimizer.py`: + +```python +from torch.optim import Optimizer +from mmaction.registry import OPTIMIZERS + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c): + ... + + def step(self, closure=None): + ... +``` + +#### 2. Import the optimizer + +To find the above module defined above, this module should be imported during the running. First import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + +```python +# In mmaction/engine/optimizers/__init__.py +... +from .my_optimizer import MyOptimizer # MyOptimizer maybe other class name + +__all__ = [..., 'MyOptimizer'] +``` + +During running, we will automatically import the `mmaction.engine` package and register the `MyOptimizer` at the same time. + +#### 3. Specify the optimizer in the config file + +Then you can use `MyOptimizer` in the `optim_wrapper.optimizer` field of config files. + +```python +optim_wrapper = dict( + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### Add new optimizer constructors + +Some models may have some parameter-specific settings for optimization, like different weight decay rate for all `BatchNorm` layers. + +Although we already can use [the `optim_wrapper.paramwise_cfg` field](#parameter-wise-finely-configuration) to +configure various parameter-specific optimizer settings. It may still not cover your need. + +Of course, you can modify it. By default, we use the [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) +class to deal with the construction of optimizer. And during the construction, it fine-grainedly configures the optimizer settings of +different parameters according to the `paramwise_cfg`๏ผŒwhich could also serve as a template for new optimizer constructor. + +You can overwrite these behaviors by add new optimizer constructors. 
+ +```python +# In mmaction/engine/optimizers/my_optim_constructor.py +from mmengine.optim import DefaultOptimWrapperConstructor +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimWrapperConstructor: + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + ... + + def __call__(self, model): + ... +``` + +And then, import it and use it almost like [the optimizer tutorial](#add-new-optimizers). + +1. Import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + + ```python + # In mmaction/engine/optimizers/__init__.py + ... + from .my_optim_constructor import MyOptimWrapperConstructor + + __all__ = [..., 'MyOptimWrapperConstructor'] + ``` + +2. Use `MyOptimWrapperConstructor` in the `optim_wrapper.constructor` field of config files. + + ```python + optim_wrapper = dict( + constructor=dict(type='MyOptimWrapperConstructor'), + optimizer=..., + paramwise_cfg=..., + ) + ``` diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md new file mode 100644 index 0000000000000000000000000000000000000000..ed33bbbb7682253cd005b9cf73b9127618107f87 --- /dev/null +++ b/docs/en/advanced_guides/customize_pipeline.md @@ -0,0 +1,148 @@ +# Customize Data Pipeline + +In this tutorial, we will introduce some methods about how to build the data pipeline (i.e., data transformations) for your tasks. + +- [Customize Data Pipeline](#customize-data-pipeline) + - [Design of Data Pipeline](#design-of-data-pipeline) + - [Modify the Training/Testing Pipeline](#modify-the-trainingtest-pipeline) + - [Loading](#loading) + - [Sampling Frames and Other Processing](#sampling-frames-and-other-processing) + - [Formatting](#formatting) + - [Add New Data Transforms](#add-new-data-transforms) + +## Design of Data Pipeline + +The data pipeline refers to the procedure of handling the data sample dict when indexing a sample from the dataset, and comprises a series of data transforms. Each data transform accepts a `dict` as input, processes it, and produces a `dict` as output for the subsequent data transform in the sequence. + +Below is an example data pipeline for training SlowFast on Kinetics using `VideoDataset`. The pipeline initially employs [`decord`](https://github.com/dmlc/decord) to read the raw videos and randomly sample one video clip, which comprises `32` frames with a frame interval of `2`. Subsequently, it applies random resized crop and random horizontal flip to all frames before formatting the data shape as `NCTHW`, which is `(1, 3, 32, 224, 224)` in this example. + +```python +train_pipeline = [ + dict(type='DecordInit',), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +A comprehensive list of all available data transforms in MMAction2 can be found in the [mmaction.datasets.transforms](mmaction.datasets.transforms). + +## Modify the Training/Testing Pipeline + +The data pipeline in MMAction2 is highly adaptable, as nearly every step of the data preprocessing can be configured from the config file. However, the wide array of options may be overwhelming for some users. 
+ +Below are some general practices and guidance for building a data pipeline for action recognition tasks. + +### Loading + +At the beginning of a data pipeline, it is customary to load videos. However, if the frames have already been extracted, you should utilize `RawFrameDecode` and modify the dataset type to `RawframeDataset`. + +```python +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +If you need to load data from files with distinct formats (e.g., `pkl`, `bin`, etc.) or from specific locations, you may create a new loading transform and include it at the beginning of the data pipeline. Please refer to [Add New Data Transforms](#add-new-data-transforms) for more details. + +### Sampling Frames and Other Processing + +During training and testing, we may have different strategies to sample frames from the video. + +For instance, when testing SlowFast, we uniformly sample multiple clips as follows: + +```python +test_pipeline = [ + ... + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + ... +] +``` + +In the above example, 10 video clips, each comprising 32 frames, will be uniformly sampled from each video. `test_mode=True` is employed to accomplish this, as opposed to random sampling during training. + +Another example involves `TSN/TSM` models, which sample multiple segments from the video: + +```python +train_pipeline = [ + ... + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + ... +] +``` + +Typically, the data augmentations in the data pipeline handles only video-level transforms, such as resizing or cropping, but not transforms like video normalization or mixup/cutmix. This is because we can do video normalization and mixup/cutmix on batched video data +to accelerate processing using GPUs. To configure video normalization and mixup/cutmix, please use the [mmaction.models.utils.data_preprocessor](mmaction.models.utils.data_preprocessor). + +### Formatting + +Formatting involves collecting training data from the data information dict and converting it into a format that is compatible with the model. + +In most cases, you can simply employ [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs), and it will +convert the image in `NumPy Array` format to `PyTorch Tensor`, and pack the ground truth category information and +other meta information as a dict-like object [`ActionDataSample`](mmaction.structures.ActionDataSample). + +```python +train_pipeline = [ + ... + dict(type='PackActionInputs'), +] +``` + +## Add New Data Transforms + +1. To create a new data transform, write a new transform class in a python file named, for example, `my_transforms.py`. The data transform classes must inherit + the [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) class and override the `transform` method which takes a `dict` as input and returns a `dict`. Finally, place `my_transforms.py` in the folder `mmaction/datasets/transforms/`. 
+ + ```python + from mmcv.transforms import BaseTransform + from mmaction.datasets import TRANSFORMS + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + def __init__(self, msg): + self.msg = msg + + def transform(self, results): + # Modify the data information dict `results`. + print(msg, 'MMAction2.') + return results + ``` + +2. Import the new class in the `mmaction/datasets/transforms/__init__.py`. + + ```python + ... + from .my_transform import MyTransform + + __all__ = [ + ..., 'MyTransform' + ] + ``` + +3. Use it in config files. + + ```python + train_pipeline = [ + ... + dict(type='MyTransform', msg='Hello!'), + ... + ] + ``` diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md new file mode 100644 index 0000000000000000000000000000000000000000..915d888ecddeca9fd635fa1b55db4f7f2d8aa938 --- /dev/null +++ b/docs/en/advanced_guides/dataflow.md @@ -0,0 +1,3 @@ +# Dataflow in MMAction2 + +coming soon... diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md new file mode 100644 index 0000000000000000000000000000000000000000..82fab764a856d26c5575a22f24743411b4e54a5f --- /dev/null +++ b/docs/en/advanced_guides/depoly.md @@ -0,0 +1,3 @@ +# How to deploy MMAction2 models + +coming soon... diff --git a/docs/en/api.rst b/docs/en/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..f3f688462bc92067c883eb4c61bc9246c271f659 --- /dev/null +++ b/docs/en/api.rst @@ -0,0 +1,140 @@ +mmaction.apis +-------------- +.. automodule:: mmaction.apis + :members: + +mmaction.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmaction.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmaction.datasets.transforms + :members: + +mmaction.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmaction.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmaction.engine.runner + :members: + + +mmaction.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmaction.evaluation.metrics + :members: + + +mmaction.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.backbones + :members: + +common +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.common + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.data_preprocessors + :members: + +heads +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.heads + :members: + +localizers +^^^^^^^^^^ +.. automodule:: mmaction.models.localizers + :members: + + +losses +^^^^^^^^^^ +.. automodule:: mmaction.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmaction.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.roi_heads + :members: + +recognizers +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.task_modules + :members: + + +utils +^^^^^^^^^^ +.. automodule:: mmaction.models.utils + :members: + + +mmaction.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmaction.structures.bbox + :members: + + +mmaction.testing +---------------- +.. 
automodule:: mmaction.testing + :members: + +mmaction.visualization +-------------------- +.. automodule:: mmaction.visualization + :members: + +mmaction.utils +-------------- +.. automodule:: mmaction.utils + :members: diff --git a/docs/en/conf.py b/docs/en/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..4b4383705944ef0c34601904b31dcd8f39423954 --- /dev/null +++ b/docs/en/conf.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'MMAction2' +copyright = '2020, OpenMMLab' +author = 'MMAction2 Authors' +version_file = '../.././mmaction/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_markdown_tables', + 'sphinx_copybutton', + 'sphinx_tabs.tabs', + 'notfound.extension', + 'sphinxcontrib.jquery', +] + +# numpy and torch are required +autodoc_mock_imports = ['mmaction.version', 'PIL'] + +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- +source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'} + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+ +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] +html_theme_options = { + # 'logo_url': 'https://mmaction2.readthedocs.io/en/latest/', + 'menu': [ + { + 'name': + 'Tutorial', + 'url': + 'https://colab.research.google.com/github/' + 'open-mmlab/mmaction2/blob/master/demo/mmaction2_tutorial.ipynb' + }, + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmaction2' + }, + { + 'name': + 'Upstream', + 'children': [{ + 'name': + 'MMCV', + 'url': + 'https://github.com/open-mmlab/mmcv', + 'description': + 'Foundational library for computer vision' + }, { + 'name': + 'MMPreTrain', + 'url': + 'https://github.com/open-mmlab/mmpretrain', + 'description': + 'Open source pre-training toolbox based on PyTorch' + }, { + 'name': + 'MMDetection', + 'url': + 'https://github.com/open-mmlab/mmdetection', + 'description': + 'Object detection toolbox and benchmark' + }, { + 'name': + 'MMPose', + 'url': + 'https://github.com/open-mmlab/mmpose', + 'description': + 'Open-source toolbox for pose estimation based on PyTorch.' + }] + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'en' +} + +language = 'en' +master_doc = 'index' + +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +# The not found page +notfound_template = '404.html' + + +def builder_inited_handler(app): + if subprocess.run(['python', './stat.py']).returncode != 0: + raise RuntimeError('Failed to run the script `stat.py`.') + if subprocess.run(['python', './project_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `project_zoo.py`.') + if subprocess.run(['python', './dataset_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `dataset_zoo.py`.') + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/docs/en/dataset_zoo.py b/docs/en/dataset_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..6980475ede264d1d8bf59434d0e18cf50b6a04b2 --- /dev/null +++ b/docs/en/dataset_zoo.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +import re +from pathlib import Path + +from utils import replace_link + +DATASETS_ROOT = Path('dataset_zoo') # Path to save generated paper pages. +MODELZOO_TEMPLATE = """\ +# Dataset Zoo Summary + +In this page, we list [all datasets](#all-supported-datasets) we support. You can click the link to jump to the corresponding dataset pages. 
+ +## All supported datasets + +* Number of datasets: {num_datasets} +{dataset_msg} + +""" # noqa: E501 + + +def generate_datasets_pages(): + dataset_list = Path('../../tools/data').glob('*/README.md') + num_datasets = 0 + dataset_msgs = [] + + for file in dataset_list: + num_datasets += 1 + + copy = DATASETS_ROOT / file.parent.with_suffix('.md').name + + with open(file, 'r') as f: + content = f.read() + + title = re.match(r'^# Preparing (.*)', content).group(1) + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + dataset_msgs.append(f'\t - [{title}]({copy})') + + with open(copy, 'w') as f: + f.write(content) + + dataset_msg = '\n'.join(dataset_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_datasets=num_datasets, + dataset_msg=dataset_msg, + ) + + with open('datasetzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +DATASETS_ROOT.mkdir(exist_ok=True) +generate_datasets_pages() diff --git a/docs/en/docutils.conf b/docs/en/docutils.conf new file mode 100644 index 0000000000000000000000000000000000000000..ddd79c377666db4a615151f0676f7fec32d38359 --- /dev/null +++ b/docs/en/docutils.conf @@ -0,0 +1,2 @@ +[html writers] +table_style: colwidths-auto diff --git a/docs/en/get_started/contribution_guide.md b/docs/en/get_started/contribution_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..452cc17cd2cff980e329a830c91aaa39d8c734ba --- /dev/null +++ b/docs/en/get_started/contribution_guide.md @@ -0,0 +1,61 @@ +# How to contribute to MMAction2 + +All kinds of contributions are welcome, including but not limited to the following. + +- Fixes (typo, bugs) +- New features and components +- Add documentation or translate the documentation into other languages +- Add new project (Recommended) about video understanding algorithm with less restriction, refer to [here](../projectzoo.md) for details + +## Workflow + +1. Fork and pull the latest mmaction2 +2. Checkout a new branch with a meaningful name (do not use main branch for PRs) +3. Commit your changes +4. Create a PR + +```{note} +- If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. +- If you are the author of some papers and would like to include your method to mmaction2, please contact us. We will much appreciate your contribution. +``` + +## Code style + +### Python + +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. + +We use the following tools for linting and formatting: + +- [flake8](http://flake8.pycqa.org/en/latest/): linter +- [yapf](https://github.com/google/yapf): formatter +- [isort](https://github.com/timothycrosley/isort): sort imports +- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. +- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. +- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. + +Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/main/setup.cfg). 
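+
+If you want to run the linters and formatters manually instead of through pre-commit, a minimal sketch is shown below. It assumes the commands are run from the repository root; the target folders are illustrative.
+
+```shell
+# report style violations
+flake8 mmaction tests tools
+# sort imports in place
+isort mmaction tests tools
+# reformat code in place according to the style configured in setup.cfg
+yapf -r -i mmaction tests tools
+```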
+
+We use a [pre-commit hook](https://pre-commit.com/) that checks and formats code with `flake8`, `yapf` and `isort`, trims `trailing whitespaces`, formats `markdown files`,
+fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma` and `mixed-line-ending`, and sorts `requirements.txt` automatically on every commit.
+The config of the pre-commit hook is stored in [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/main/.pre-commit-config.yaml).
+
+After you clone the repository, you will need to install and initialize the pre-commit hook.
+
+```shell
+pip install -U pre-commit
+```
+
+Then, from the repository folder, run
+
+```shell
+pre-commit install
+```
+
+After this, the code linters and formatter will be enforced on every commit.
+
+> Before you create a PR, make sure that your code lints and is formatted by yapf.
+
+### C++ and CUDA
+
+We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..a0d4a84a5516ed50e25a494f03a7305f90fa7ad6
--- /dev/null
+++ b/docs/en/get_started/faq.md
@@ -0,0 +1,132 @@
+# FAQ
+
+## Outline
+
+We list some common issues faced by many users and their corresponding solutions here.
+
+- [FAQ](#faq)
+  - [Outline](#outline)
+  - [Installation](#installation)
+  - [Data](#data)
+  - [Training](#training)
+  - [Testing](#testing)
+
+Feel free to enrich the list if you find any frequent issues and have ways to help others solve them.
+If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmaction2/tree/main/.github/ISSUE_TEMPLATE/error-report.md) and make sure you fill in all required information in the template.
+
+## Installation
+
+- **"No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"**
+
+  1. Uninstall the existing mmcv in the environment using `pip uninstall mmcv`
+  2. Install mmcv following the [installation instruction](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html#install-mmcv)
+
+- **"OSError: MoviePy Error: creation of None failed because of the following error"**
+
+  Refer to [install.md](https://github.com/open-mmlab/mmaction2/blob/master/docs/install.md#requirements)
+
+  1. For Windows users, [ImageMagick](https://www.imagemagick.org/script/index.php) will not be automatically detected by MoviePy. You need to modify the `moviepy/config_defaults.py` file by providing the path to the ImageMagick binary called `magick`, like `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"`
+  2. For Linux users, if ImageMagick is not detected by MoviePy, you need to modify the `/etc/ImageMagick-6/policy.xml` file by commenting out the `path` policy line, i.e. wrapping it in `<!-- -->`.
+
+- **"Why do I get the error message 'Please install XXCODEBASE to use XXX' even though I have already installed XXCODEBASE?"**
+
+  You get this error message because our project failed to import a function or a class from XXCODEBASE. You can try running the corresponding import line to see what happens. One possible reason is that, for some codebases in OpenMMLab, you need to install mmcv and mmengine before installing them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html#installation) to install them.
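+
+  For example, if the message complains about MMDetection, a quick way to locate the real cause is to try the import directly in the same Python environment (a hypothetical check; replace `mmdet` with whichever codebase the message names):
+
+  ```python
+  # if this raises, the traceback points to the missing or mismatched dependency
+  import mmdet
+  print(mmdet.__version__)
+  ```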
+ +## Data + +- **FileNotFound like `No such file or directory: xxx/xxx/img_00300.jpg`** + + In our repo, we set `start_index=1` as default value for rawframe dataset, and `start_index=0` as default value for video dataset. + If users encounter FileNotFound error for the first or last frame of the data, there is a need to check the files begin with offset 0 or 1, + that is `xxx_00000.jpg` or `xxx_00001.jpg`, and then change the `start_index` value of data pipeline in configs. + +- **How should we preprocess the videos in the dataset? Resizing them to a fix size(all videos with the same height-width ratio) like `340x256` (1) or resizing them so that the short edges of all videos are of the same length (256px or 320px) (2)** + + We have tried both preprocessing approaches and found (2) is a better solution in general, so we use (2) with short edge length 256px as the default preprocessing setting. We benchmarked these preprocessing approaches and you may find the results in [TSN Data Benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn) and [SlowOnly Data Benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/slowonly). + +- **Mismatched data pipeline items lead to errors like `KeyError: 'total_frames'`** + + We have both pipeline for processing videos and frames. + + **For videos**, We should decode them on the fly in the pipeline, so pairs like `DecordInit & DecordDecode`, `OpenCVInit & OpenCVDecode`, `PyAVInit & PyAVDecode` should be used for this case like [this example](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py#L14-L16). + + **For Frames**, the image has been decoded offline, so pipeline item `RawFrameDecode` should be used for this case like [this example](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py#L17). + + `KeyError: 'total_frames'` is caused by incorrectly using `RawFrameDecode` step for videos, since when the input is a video, it can not get the `total_frames` beforehand. + +## Training + +- **How to just use trained recognizer models for backbone pre-training?** + + In order to use the pre-trained model for the whole network, the new config adds the link of pre-trained models in the `load_from`. + + And to use backbone for pre-training, you can change `pretrained` value in the backbone dict of config files to the checkpoint path / url. + When training, the unexpected keys will be ignored. + +- **How to fix stages of backbone when finetuning a model?** + + You can refer to [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L791) and [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L369-L370). + Reminding to set `find_unused_parameters = True` in config files for distributed training or testing. + + Actually, users can set `frozen_stages` to freeze stages in backbones except C3D model, since almost all backbones inheriting from `ResNet` and `ResNet3D` support the inner function `_freeze_stages()`. + +- **How to set memcached setting in config files?** + + In MMAction2, you can pass memcached kwargs to `class DecordInit` for video dataset or `RawFrameDecode` for rawframes dataset. + For more details, you can refer to \[`class FileClient`\] in MMEngine for more details. 
+ Here is an example to use memcached for rawframes dataset: + + ```python + mc_cfg = dict(server_list_cfg='server_list_cfg', client_cfg='client_cfg', sys_path='sys_path') + + train_pipeline = [ + ... + dict(type='RawFrameDecode', io_backend='memcached', **mc_cfg), + ... + ] + ``` + +- **How to set `load_from` value in config files to finetune models?** + + In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](https://github.com/open-mmlab/mmaction2/tree/main/docs/en/user_guides/config.md), + users can directly change it by setting `load_from` in their configs. + +- **How to use `RawFrameDataset` for training?** + + In MMAction2 1.x version, most of the configs take `VideoDataset` as the default dataset type, which is much more friendly to file storage. If you want to use `RawFrameDataset` instead, there are two steps to modify: + + - Dataset: + modify dataset in `train_dataloader`/`val_dataloader`/`test_dataloader` from + + ``` + dataset=dict( + type=VideoDataset, + data_prefix=dict(video=xxx), + ...) + ``` + + to + + ``` + dataset=dict( + type=RawFrameDataset, + data_prefix=dict(img=xxx), + filename_tmpl='{:05}.jpg', + ...) + ``` + + remaining fields of `dataset` don't need to be modified. Please make sure that `filename_tmpl` is matching with your frame data, and you can refer to [config document](../user_guides/config.md) for more details about config file. + + - Transforms: delete `dict(type='DecordInit', **file_client_args)`, modify `dict(type='DecordDecode')` to `dict(type='RawFrameDecode', **file_client_args)` in `train_pipeline`/`val_pipeline`/`test_pipeline`, and please make sure that `file_client_args = dict(io_backend='disk')` has been defined in your config. + + For more modifications about customizing datasets, please refer to [prepare dataset](../user_guides/prepare_dataset.md) and [customize dataset](../advanced_guides/customize_dataset.md). + +## Testing + +- **How to make predicted score normalized by softmax within \[0, 1\]?** + + change this in the config, make `model.cls_head.average_clips = 'prob'`. + +- **What if the model is too large and the GPU memory can not fit even only one testing sample?** + + By default, the 3d models are tested with 10clips x 3crops, which are 30 views in total. For extremely large models, the GPU memory can not fit even only one testing sample (cuz there are 30 views). To handle this, you can set `max_testing_views=n` in `model['test_cfg']` of the config file. If so, n views will be used as a batch during forwarding to save GPU memory used. diff --git a/docs/en/get_started/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md new file mode 100644 index 0000000000000000000000000000000000000000..790f3895a0557f8a60a38d332c6c8d73bc00862d --- /dev/null +++ b/docs/en/get_started/guide_to_framework.md @@ -0,0 +1,761 @@ +# A 20-Minute Guide to MMAction2 FrameWork + +In this tutorial, we will demonstrate the overall architecture of our `MMACTION2 1.0` through a step-by-step example of video action recognition. 
+ +The structure of this tutorial is as follows: + +- [A 20-Minute Guide to MMAction2 FrameWork](#a-20-minute-guide-to-mmaction2-framework) + - [Step0: Prepare Data](#step0-prepare-data) + - [Step1: Build a Pipeline](#step1-build-a-pipeline) + - [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) + - [Step3: Build a Recognizer](#step3-build-a-recognizer) + - [Step4: Build a Evaluation Metric](#step4-build-a-evaluation-metric) + - [Step5: Train and Test with Native PyTorch](#step5-train-and-test-with-native-pytorch) + - [Step6: Train and Test with MMEngine (Recommended)](#step6-train-and-test-with-mmengine-recommended) + +First, we need to initialize the `scope` for registry, to ensure that each module is registered under the scope of `mmaction`. For more detailed information about registry, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html). + +```python +from mmaction.utils import register_all_modules + +register_all_modules(init_default_scope=True) +``` + +## Step0: Prepare Data + +Please download our self-made [kinetics400_tiny](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset and extract it to the `$MMACTION2/data` directory. +The directory structure after extraction should be as follows: + +``` +mmaction2 +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ kinetics400_tiny +โ”‚ โ”‚ โ”œโ”€โ”€ kinetics_tiny_train_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ kinetics_tiny_val_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ train +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 27_CSXByd3s.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 34XczvTaRiI.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ A-wiliK50Zw.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ””โ”€โ”€ val +โ”‚ โ”‚ โ”œโ”€โ”€ 0pVGiAU6XEA.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ AQrbRSnRt8M.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ ... +``` + +Here are some examples from the annotation file `kinetics_tiny_train_video.txt`: + +``` +D32_1gwq35E.mp4 0 +iRuyZSKhHRg.mp4 1 +oXy-e_P_cAI.mp4 0 +34XczvTaRiI.mp4 1 +h2YqqUhnR34.mp4 0 +``` + +Each line in the file represents the annotation of a video, where the first item denotes the video filename (e.g., `D32_1gwq35E.mp4`), and the second item represents the corresponding label (e.g., label `0` for `D32_1gwq35E.mp4`). In this dataset, there are only `two` categories. + +## Step1: Build a Pipeline + +In order to `decode`, `sample`, `resize`, `crop`, `format`, and `pack` the input video and corresponding annotation, we need to design a pipeline to handle these processes. Specifically, we design seven `Transform` classes to build this video processing pipeline. Note that all `Transform` classes in OpenMMLab must inherit from the `BaseTransform` class in `mmcv`, implement the abstract method `transform`, and be registered to the `TRANSFORMS` registry. For more detailed information about data transform, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html). 
+ +```python +import mmcv +import decord +import numpy as np +from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class VideoInit(BaseTransform): + def transform(self, results): + container = decord.VideoReader(results['filename']) + results['total_frames'] = len(container) + results['video_reader'] = container + return results + + +@TRANSFORMS.register_module() +class VideoSample(BaseTransform): + def __init__(self, clip_len, num_clips, test_mode=False): + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def transform(self, results): + total_frames = results['total_frames'] + interval = total_frames // self.clip_len + + if self.test_mode: + # Make the sampling during testing deterministic + np.random.seed(42) + + inds_of_all_clips = [] + for i in range(self.num_clips): + bids = np.arange(self.clip_len) * interval + offset = np.random.randint(interval, size=bids.shape) + inds = bids + offset + inds_of_all_clips.append(inds) + + results['frame_inds'] = np.concatenate(inds_of_all_clips) + results['clip_len'] = self.clip_len + results['num_clips'] = self.num_clips + return results + + +@TRANSFORMS.register_module() +class VideoDecode(BaseTransform): + def transform(self, results): + frame_inds = results['frame_inds'] + container = results['video_reader'] + + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoResize(BaseTransform): + def __init__(self, r_size): + self.r_size = (np.inf, r_size) + + def transform(self, results): + img_h, img_w = results['img_shape'] + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size) + + imgs = [mmcv.imresize(img, (new_w, new_h)) + for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoCrop(BaseTransform): + def __init__(self, c_size): + self.c_size = c_size + + def transform(self, results): + img_h, img_w = results['img_shape'] + center_x, center_y = img_w // 2, img_h // 2 + x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2 + y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2 + imgs = [img[y1:y2, x1:x2] for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoFormat(BaseTransform): + def transform(self, results): + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + # [num_clips*clip_len, H, W, C] + imgs = np.array(imgs) + # [num_clips, clip_len, H, W, C] + imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:]) + # [num_clips, C, clip_len, H, W] + imgs = imgs.transpose(0, 4, 1, 2, 3) + + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class VideoPack(BaseTransform): + def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')): + self.meta_keys = meta_keys + + def transform(self, results): + packed_results = dict() + inputs = to_tensor(results['imgs']) + data_sample = ActionDataSample() + data_sample.set_gt_label(results['label']) + metainfo = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(metainfo) + packed_results['inputs'] = inputs + 
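+        # bundle the annotation (`ActionDataSample`) together with the input
+        # tensor so that both are handed to the model as a single dict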
packed_results['data_samples'] = data_sample + return packed_results +``` + +Below, we provide a code snippet (using `D32_1gwq35E.mp4 0` from the annotation file) to demonstrate how to use the pipeline. + +```python +import os.path as osp +from mmengine.dataset import Compose + +pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +pipeline = Compose(pipeline_cfg) +data_prefix = 'data/kinetics400_tiny/train' +results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0) +packed_results = pipeline(results) + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_label) +``` + +``` +shape of the inputs: torch.Size([1, 3, 16, 224, 224]) +image_shape: (224, 224) +num_clips: 1 +clip_len: 16 +label: tensor([0]) +``` + +## Step2: Build a Dataset and DataLoader + +All `Dataset` classes in OpenMMLab must inherit from the `BaseDataset` class in `mmengine`. We can customize annotation loading process by overriding the `load_data_list` method. Additionally, we can add more information to the `results` dict that is passed as input to the `pipeline` by overriding the `get_data_info` method. For more detailed information about `BaseDataset` class, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html). + +```python +import os.path as osp +from mmengine.fileio import list_from_file +from mmengine.dataset import BaseDataset +from mmaction.registry import DATASETS + + +@DATASETS.register_module() +class DatasetZelda(BaseDataset): + def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''), + test_mode=False, modality='RGB', **kwargs): + self.modality = modality + super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root, + data_prefix=data_prefix, test_mode=test_mode, + **kwargs) + + def load_data_list(self): + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + filename, label = line_split + label = int(label) + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list + + def get_data_info(self, idx: int) -> dict: + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + return data_info +``` + +Next, we will demonstrate how to use dataset and dataloader to index data. We will use the `Runner.build_dataloader` method to construct the dataloader. For more detailed information about dataloader, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader). 
+ +```python +from mmaction.registry import DATASETS + +train_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +val_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +train_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_train_video.txt', + pipeline=train_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='train')) + +val_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_val_video.txt', + pipeline=val_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='val')) + +train_dataset = DATASETS.build(train_dataset_cfg) + +packed_results = train_dataset[0] + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_label) + +from mmengine.runner import Runner + +BATCH_SIZE = 2 + +train_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_cfg) + +val_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=val_dataset_cfg) + +train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg) +val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg) + +batched_packed_results = next(iter(train_data_loader)) + +batched_inputs = batched_packed_results['inputs'] +batched_data_sample = batched_packed_results['data_samples'] + +assert len(batched_inputs) == BATCH_SIZE +assert len(batched_data_sample) == BATCH_SIZE +``` + +The terminal output should be the same as the one shown in the [Step1: Build a Pipeline](#step1-build-a-pipeline). + +## Step3: Build a Recognizer + +Next, we will construct the `recognizer`, which mainly consists of three parts: `data preprocessor` for batching and normalizing the data, `backbone` for feature extraction, and `cls_head` for classification. 
+ +The implementation of `data_preprocessor` is as follows: + +```python +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DataPreprocessorZelda(BaseDataPreprocessor): + def __init__(self, mean, std): + super().__init__() + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1), + False) + + def forward(self, data, training=False): + data = self.cast_data(data) + inputs = data['inputs'] + batch_inputs = stack_batch(inputs) # Batching + batch_inputs = (batch_inputs - self.mean) / self.std # Normalization + data['inputs'] = batch_inputs + return data +``` + +Here is the usage of data_preprocessor: feed the `batched_packed_results` obtained from the [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) into the `data_preprocessor` for batching and normalization. + +```python +from mmaction.registry import MODELS + +data_preprocessor_cfg = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) + +data_preprocessor = MODELS.build(data_preprocessor_cfg) + +preprocessed_inputs = data_preprocessor(batched_packed_results) +print(preprocessed_inputs['inputs'].shape) +``` + +``` +torch.Size([2, 1, 3, 16, 224, 224]) +``` + +The implementations of `backbone`, `cls_head` and `recognizer` are as follows: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule, Sequential +from mmengine.structures import LabelData +from mmaction.registry import MODELS + + +@MODELS.register_module() +class BackBoneZelda(BaseModule): + def __init__(self, init_cfg=None): + if init_cfg is None: + init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"), + dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)] + + super(BackBoneZelda, self).__init__(init_cfg=init_cfg) + + self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7), + stride=(1, 2, 2), padding=(1, 3, 3)), + nn.BatchNorm3d(64), nn.ReLU()) + self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1), + nn.BatchNorm3d(128), nn.ReLU()) + + def forward(self, imgs): + # imgs: [batch_size*num_views, 3, T, H, W] + # features: [batch_size*num_views, 128, T/2, H//8, W//8] + features = self.conv(self.maxpool(self.conv1(imgs))) + return features + + +@MODELS.register_module() +class ClsHeadZelda(BaseModule): + def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None): + if init_cfg is None: + init_cfg = dict(type='Normal', layer='Linear', std=0.01) + + super(ClsHeadZelda, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.in_channels = in_channels + self.average_clips = average_clips + + if dropout != 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + self.fc = nn.Linear(self.in_channels, self.num_classes) + self.pool = nn.AdaptiveAvgPool3d(1) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x): + N, C, T, H, W = x.shape + x = self.pool(x) + x = x.view(N, C) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores + + def loss(self, feats, data_samples): + 
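+        # forward the backbone features through the head to obtain class scores,
+        # then compute the cross-entropy loss against the ground-truth labels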
cls_scores = self(feats) + labels = torch.stack([x.gt_label for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + + loss_cls = self.loss_fn(cls_scores, labels) + return dict(loss_cls=loss_cls) + + def predict(self, feats, data_samples): + cls_scores = self(feats) + num_views = cls_scores.shape[0] // len(data_samples) + # assert num_views == data_samples[0].num_clips + cls_scores = self.average_clip(cls_scores, num_views) + + for ds, sc in zip(data_samples, cls_scores): + pred = LabelData(item=sc) + ds.pred_scores = pred + return data_samples + + def average_clip(self, cls_scores, num_views): + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. ' + f'Currently supported ones are ' + f'["score", "prob", None]') + + total_views = cls_scores.shape[0] + cls_scores = cls_scores.view(total_views // num_views, num_views, -1) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores + + +@MODELS.register_module() +class RecognizerZelda(BaseModel): + def __init__(self, backbone, cls_head, data_preprocessor): + super().__init__(data_preprocessor=data_preprocessor) + + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def extract_feat(self, inputs): + inputs = inputs.view((-1, ) + inputs.shape[2:]) + return self.backbone(inputs) + + def loss(self, inputs, data_samples): + feats = self.extract_feat(inputs) + loss = self.cls_head.loss(feats, data_samples) + return loss + + def predict(self, inputs, data_samples): + feats = self.extract_feat(inputs) + predictions = self.cls_head.predict(feats, data_samples) + return predictions + + def forward(self, inputs, data_samples=None, mode='tensor'): + if mode == 'tensor': + return self.extract_feat(inputs) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode: {mode}') +``` + +The `init_cfg` is used for model weight initialization. For more information on model weight initialization, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html). 
The usage of the above modules is as follows: + +```python +import torch +import copy +from mmaction.registry import MODELS + +model_cfg = dict( + type='RecognizerZelda', + backbone=dict(type='BackBoneZelda'), + cls_head=dict( + type='ClsHeadZelda', + num_classes=2, + in_channels=128, + average_clips='prob'), + data_preprocessor = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375])) + +model = MODELS.build(model_cfg) + +# Train +model.train() +model.init_weights() +data_batch_train = copy.deepcopy(batched_packed_results) +data = model.data_preprocessor(data_batch_train, training=True) +loss = model(**data, mode='loss') +print('loss dict: ', loss) + +# Test +with torch.no_grad(): + model.eval() + data_batch_test = copy.deepcopy(batched_packed_results) + data = model.data_preprocessor(data_batch_test, training=False) + predictions = model(**data, mode='predict') +print('Label of Sample[0]', predictions[0].gt_label) +print('Scores of Sample[0]', predictions[0].pred_score) +``` + +```shell +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.bias - torch.Size([64]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.weight - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.bias - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.bias - torch.Size([128]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.weight - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.bias - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.weight - torch.Size([2, 128]): +NormalInit: mean=0, std=0.01, bias=0 + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.bias - torch.Size([2]): +NormalInit: mean=0, std=0.01, bias=0 + +loss dict: {'loss_cls': tensor(0.6853, grad_fn=)} +Label of Sample[0] tensor([0]) +Scores of Sample[0] tensor([0.5240, 0.4760]) +``` + +## Step4: Build a Evaluation Metric + +Note that all `Metric` classes in `OpenMMLab` must inherit from the `BaseMetric` class in `mmengine` and implement the abstract methods, `process` and `compute_metrics`. For more information on evaluation, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html). 
+ +```python +import copy +from collections import OrderedDict +from mmengine.evaluator import BaseMetric +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import METRICS + + +@METRICS.register_module() +class AccuracyMetric(BaseMetric): + def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topk = topk + + def process(self, data_batch, data_samples): + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + scores = data_sample['pred_score'].cpu().numpy() + label = data_sample['gt_label'].item() + result['scores'] = scores + result['label'] = label + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + eval_results = OrderedDict() + labels = [res['label'] for res in results] + scores = [res['scores'] for res in results] + topk_acc = top_k_accuracy(scores, labels, self.topk) + for k, acc in zip(self.topk, topk_acc): + eval_results[f'topk{k}'] = acc + return eval_results +``` + +```python +from mmaction.registry import METRICS + +metric_cfg = dict(type='AccuracyMetric', topk=(1, 5)) + +metric = METRICS.build(metric_cfg) + +data_samples = [d.to_dict() for d in predictions] + +metric.process(batched_packed_results, data_samples) +acc = metric.compute_metrics(metric.results) +print(acc) +``` + +```shell +OrderedDict([('topk1', 0.5), ('topk5', 1.0)]) +``` + +## Step5: Train and Test with Native PyTorch + +```python +import torch.optim as optim +from mmengine import track_iter_progress + + +device = 'cuda' # or 'cpu' +max_epochs = 10 + +optimizer = optim.Adam(model.parameters(), lr=0.01) + +for epoch in range(max_epochs): + model.train() + losses = [] + for data_batch in track_iter_progress(train_data_loader): + data = model.data_preprocessor(data_batch, training=True) + loss_dict = model(**data, mode='loss') + loss = loss_dict['loss_cls'] + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses.append(loss.item()) + + print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader)) + + with torch.no_grad(): + model.eval() + for data_batch in track_iter_progress(val_data_loader): + data = model.data_preprocessor(data_batch, training=False) + predictions = model(**data, mode='predict') + data_samples = [d.to_dict() for d in predictions] + metric.process(data_batch, data_samples) + + acc = metric.acc = metric.compute_metrics(metric.results) + for name, topk in acc.items(): + print(f'{name}: ', topk) +``` + +## Step6: Train and Test with MMEngine (Recommended) + +For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). 
+ +```python +from mmengine.runner import Runner + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +val_cfg = dict(type='ValLoop') + +optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01)) + +runner = Runner(model=model_cfg, work_dir='./work_dirs/guide', + train_dataloader=train_dataloader_cfg, + train_cfg=train_cfg, + val_dataloader=val_dataloader_cfg, + val_cfg=val_cfg, + optim_wrapper=optim_wrapper, + val_evaluator=[metric_cfg], + default_scope='mmaction') +runner.train() +``` diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..dc2153c1e68b695f63451a218d17d9300133c0f7 --- /dev/null +++ b/docs/en/get_started/installation.md @@ -0,0 +1,209 @@ +# Installation + +## Prerequisites + +In this section we demonstrate how to prepare an environment with PyTorch. + +MMAction2 works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 10.2+ and PyTorch 1.8+. + +```{note} +If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation. +``` + +**Step 1.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html). + +**Step 2.** Create a conda environment and activate it. + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**Step 3.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/), e.g. + +On GPU platforms: + +```shell +conda install pytorch torchvision -c pytorch +``` + +```{warning} +This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. +``` + +On CPU platforms: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +## Best Practices + +We recommend that users follow our best practices to install MMAction2. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information. + +**Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine), [MMCV](https://github.com/open-mmlab/mmcv), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional) and [MMPose](https://github.com/open-mmlab/mmpose) (optional) using [MIM](https://github.com/open-mmlab/mim). + +```shell +pip install -U openmim +mim install mmengine +mim install mmcv +mim install mmdet +mim install mmpose +``` + +**Step 2.** Install MMAction2. + +According to your needs, we support two install modes: + +- [Install from source (Recommended)](#build-mmaction2-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided. +- [Install as a Python package](#install-as-a-python-package): You just want to call MMAction2's APIs or import MMAction2's modules in your project. + +### Build MMAction2 from source + +In this case, install mmaction2 from source: + +```shell +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +pip install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without re-installation. 
+``` + +Optionally, if you want to contribute to MMAction2 or experience experimental functions, please checkout to the `dev-1.x` branch: + +```shell +git checkout dev-1.x +``` + +### Install as a Python package + +Just install with pip. + +```shell +pip install mmaction2 +``` + +## Verify the installation + +To verify whether MMAction2 is installed correctly, we provide some sample codes to run an inference demo. + +**Step 1.** Download the config and checkpoint files. + +```shell +mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest . +``` + +**Step 2.** Verify the inference demo. + +Option (a). If you install mmaction2 from source, you can run the following command: + +```shell +# The demo.mp4 and label_map_k400.txt are both from Kinetics-400 +python demo/demo.py tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \ + tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt +``` + +You will see the top-5 labels with corresponding scores in your terminal. + +Option (b). If you install mmaction2 as a python package, you can run the following codes in your python interpreter, which will do the similar verification: + +```python +from operator import itemgetter +from mmaction.apis import init_recognizer, inference_recognizer + +config_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' +video_file = 'demo/demo.mp4' +label_file = 'tools/data/kinetics/label_map_k400.txt' +model = init_recognizer(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +pred_result = inference_recognizer(model, video_file) + +pred_scores = pred_result.pred_score.tolist() +score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) +score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) +top5_label = score_sorted[:5] + +labels = open(label_file).readlines() +labels = [x.strip() for x in labels] +results = [(labels[k[0]], k[1]) for k in top5_label] + +print('The top-5 labels with corresponding scores are:') +for result in results: + print(f'{result[0]}: ', result[1]) +``` + +## Customize Installation + +### CUDA versions + +When installing PyTorch, you may need to specify the version of CUDA. If you are +not clear on which to choose, follow our recommendations: + +- For Ampere-based NVIDIA GPUs, such as GeForce 30 series and NVIDIA A100, CUDA 11 is a must. +- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight. + +Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information. + +```{note} +Installing CUDA runtime libraries is enough if you follow our best practices, +because no CUDA code will be compiled locally. However if you hope to compile +MMCV from source or develop other CUDA operators, you need to install the +complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), +and its version should match the CUDA version of PyTorch. i.e., the specified +version of cudatoolkit in `conda install` command. +``` + +### Install MMCV without MIM + +MMCV contains C++ and CUDA extensions, so it depends on PyTorch in a complex +way. 
MIM solves such dependencies automatically and makes the installation +easier. However, it is not a must. + +To install MMCV with pip instead of MIM, please follow +[MMCV installation guides](https://mmcv.readthedocs.io/en/latest/get_started/installation.html). +This requires manually specifying a find-url based on PyTorch version and its CUDA version. + +For example, the following command install mmcv built for PyTorch 1.10.x and CUDA 11.3. + +```shell +pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +``` + +### Install on CPU-only platforms + +MMAction2 can be built for CPU-only environment. In CPU mode you can train, test or inference a model. + +Some functionalities are gone in this mode, usually GPU-compiled ops. But don't +worry, almost all models in MMAction2 don't depend on these ops. + +### Using MMAction2 with Docker + +We provide a [Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/docker/Dockerfile) +to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >=19.03. + +```shell +# build an image with PyTorch 1.8.1, CUDA 10.2, CUDNN 7. +# If you prefer other versions, just modified the Dockerfile +docker build -f ./docker/Dockerfile --rm -t mmaction2 . +``` + +Run it with + +```shell +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2 +``` + +## Troubleshooting + +1. When migrating from the old version `0.x` to the new version `1.x`, you may encounter issues with mismatched versions of dependent libraries. Below is a display of the versions of each dependent library after following the aforementioned installation process, as shown by `pip list` command. Please ensure that the versions of each dependent library displayed in your terminal are greater than or equal to (i.e., `>=`) the versions shown below for each dependent library. + +```shell +mmaction2 1.0.0 +mmcv 2.0.0 +mmdet 3.0.0 +mmengine 0.7.2 +mmpose 1.0.0 +``` diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..64f0d523eb4c886282bccc5196cf7c696a806f43 --- /dev/null +++ b/docs/en/get_started/overview.md @@ -0,0 +1,97 @@ +# Overview + +## What is MMAction2 + +MMAction2 is an open source toolkit based on PyTorch, supporting numerous video understanding models, including **action recognition, skeleton-based action recognition, spatio-temporal action detection and temporal action localization**. Moreover, it supports widely-used academic datasets and offers many useful tools, assisting users in exploring various aspects of models and datasets, as well as implementing high-quality algorithms. Generally, the toolkit boasts the following features: + +**One-stop, Multi-model**: MMAction2 supports various video understanding tasks and implements state-of-the-art models for action recognition, localization, detection. + +**Modular Design**: The modular design of MMAction2 enables users to define and reuse modules in the model as required. + +**Various Useful Tools**: MMAction2 provides an array of analysis tools, such as visualizers, validation scripts, evaluators, etc., to aid users in troubleshooting, fine-tuning, or comparing models. + +**Powered by OpenMMLab**: Similar to other algorithm libraries in the OpenMMLab family, MMAction2 adheres to OpenMMLab's rigorous development guidelines and interface conventions, considerably reducing the learning cost for users familiar with other OpenMMLab projects. 
Furthermore, due to the unified interfaces among OpenMMLab projects, it is easy to call models implemented in other OpenMMLab projects (such as MMClassification) in MMAction2, which greatly facilitates cross-domain research and real-world applications.
+
+<!-- Demo gallery (images removed): Action Recognition | Skeleton-based Action Recognition | Spatio-Temporal Action Detection | Spatio-Temporal Action Detection -->
+
+## How to use the documentation
+
+We have prepared a wealth of documents to meet your various needs:
+
+**For the basic usage of MMAction2**
+
+- [Installation](installation.md)
+- [Quick Run](quick_run.md)
+- [Inference with existing models](../user_guides/inference.md)
+
+**For training on supported datasets**
+
+- [Learn about Configs](../user_guides/config.md)
+- [Prepare Dataset](../user_guides/prepare_dataset.md)
+- [Training and Test](../user_guides/train_test.md)
+
+**For troubleshooting common issues**
+
+- [FAQ](faq.md)
+- [Useful tools](../useful_tools.md)
+
+**For a general understanding of MMAction2**
+
+- [A 20-Minute Guide to MMAction2 FrameWork](guide_to_framework.md)
+- [Dataflow in MMAction2](../advanced_guides/dataflow.md)
+
+**For advanced usage, such as custom training**
+
+- [Customize Model](../advanced_guides/customize_models.md)
+- [Customize Dataset](../advanced_guides/customize_dataset.md)
+- [Customize Data Pipeline](../advanced_guides/customize_pipeline.md)
+- [Customize Optimizer](../advanced_guides/customize_optimizer.md)
+- [Customize Logging](../advanced_guides/customize_logging.md)
+
+**For the supported model zoo and dataset zoo**
+
+- [Model Zoo](../modelzoo_statistics.md)
+- [Dataset Zoo](../datasetzoo_statistics.md)
+
+**For migration from MMAction2 0.x**
+
+- [Migration](../migration.md)
+
+**For researchers and developers who want to contribute to MMAction2**
+
+- [How to contribute to MMAction2](contribution_guide.md)
diff --git a/docs/en/get_started/quick_run.md b/docs/en/get_started/quick_run.md new file mode 100644 index 0000000000000000000000000000000000000000..63c7f2429eaf8afff17ddde530110d07a079ec40 --- /dev/null +++ b/docs/en/get_started/quick_run.md @@ -0,0 +1,219 @@ +# Quick Run + +This chapter will introduce you to the fundamental functionalities of MMAction2. We assume that you have [installed MMAction2 from source](installation.md#best-practices). + +- [Quick Run](#quick-run) + - [Inference](#inference) + - [Prepare a Dataset](#prepare-a-dataset) + - [Modify the Config](#modify-the-config) + - [Modify Dataset](#modify-dataset) + - [Modify Runtime Config](#modify-runtime-config) + - [Modify Model Config](#modify-model-config) + - [Browse the Dataset](#browse-the-dataset) + - [Training](#training) + - [Testing](#testing) + +## Inference + +Run the following command in the root directory of MMAction2: + +```shell +python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn --print-result \ + --label-file tools/data/kinetics/label_map_k400.txt +``` + +You should be able to see a pop-up video and the inference result printed out in the console. + +
+ +```bash +# Inference result +{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]} +``` + +```{note} +If you are running MMAction2 on a server without a GUI or via an SSH tunnel with X11 forwarding disabled, you may not see the pop-up window. +``` + +A detailed description of MMAction2's inference interface can be found [here](https://github.com/open-mmlab/mmaction2/tree/main/demo/README.md#inferencer). + +In addition to using our well-provided pre-trained models, you can also train models on your own datasets. In the next section, we will take you through the basic functions of MMAction2 by training TSN on the tiny [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset as an example. + +## Prepare a Dataset + +Since the variety of video dataset formats are not conducive to switching datasets, MMAction2 proposes a uniform [data format](../user_guides/2_data_prepare.md), and provides [dataset preparer](../user_guides/data_prepare/dataset_preparer.md) for commonly used video datasets. Usually, to use those datasets in MMAction2, you just need to follow the steps to get them ready for use. + +```{note} +But here, efficiency means everything. +``` + +To get started, please download our pre-prepared [kinetics400_tiny.zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) and extract it to the `data/` directory in the root directory of MMAction2. This will provide you with the necessary videos and annotation file. + +```Bash +wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip +mkdir -p data/ +unzip kinetics400_tiny.zip -d data/ +``` + +## Modify the Config + +After preparing the dataset, the next step is to modify the config file to specify the location of the training set and training parameters. + +In this example, we will train a TSN using resnet50 as its backbone. Since MMAction2 already has a config file for the full Kinetics400 dataset (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`), we just need to make some modifications on top of it. + +### Modify Dataset + +We first need to modify the path to the dataset. Open `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` and replace keys as followed: + +```Python +data_root = 'data/kinetics400_tiny/train' +data_root_val = 'data/kinetics400_tiny/val' +ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt' +ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt' +``` + +### Modify Runtime Config + +Additionally, due to the reduced size of the dataset, we recommend decreasing the training batch size to 4 and the number of training epochs to 10 accordingly. Furthermore, we suggest shortening the validation and weight storage intervals to 1 round each, and modifying the learning rate decay strategy. Modify the corresponding keys in `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as following lines to take effect. 
+ +```python +# set training batch size to 4 +train_dataloader['batch_size'] = 4 + +# Save checkpoints every epoch, and only keep the latest checkpoint +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1)) +# Set the maximum number of epochs to 10, and validate the model every 1 epochs +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +# adjust learning rate schedule according to 10 epochs +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[4, 8], + gamma=0.1) +] +``` + +### Modify Model Config + +Further, due to the small size of tiny Kinetics dataset, it is recommended to load a pre-trained model on the original Kinetics dataset. Additionally, the model needs to be modified according to the actual number of classes. Please directly add the following lines to `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`. + +```python +model = dict( + cls_head=dict(num_classes=2)) +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' +``` + +Here, we have rewritten the corresponding parameters in the base configuration directly through the inheritance ({external+mmengine:doc}`MMEngine: Config `) mechanism of the config. The original fields are distributed in `configs/_base_/models/tsn_r50.py`, `configs/_base_/schedules/sgd_100e.py` and `configs/_base_/default_runtime.py`. + +```{note} +For a more detailed description of config, please refer to [here](../user_guides/1_config.md). +``` + +## Browse the Dataset + +Before we start the training, we can also visualize the frames processed by training-time data transforms. It's quite simple: pass the config file we need to visualize into the [browse_dataset.py](https://github.com/open-mmlab/mmaction2/tree/main/tools/analysis_tools/browse_dataset.py) script. + +```Bash +python tools/visualizations/browse_dataset.py \ + configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + browse_out --mode pipeline +``` + +The transformed videos will be saved to `browse_out` folder. + +
+ +
+

```{note}
For details on the parameters and usage of this script, please refer to [here](../user_guides/useful_tools.md).
```

```{tip}
In addition to satisfying our curiosity, visualization can also help us catch issues that may affect the model's performance before training starts, such as problems in the config, dataset, or data transforms.
```

We can further visualize the learning rate schedule to make sure that the config is as expected with the following script:

```Bash
python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
```

The learning rate schedule used during training will be displayed in a pop-up window.
+ +
+

```{note}
The learning rate is automatically scaled according to the actual batch size.
```

## Training

Start the training by running the following command:

```Bash
python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
```

Depending on your system environment, MMAction2 will automatically select the best available device for training. If a GPU is available, single-GPU training is started by default. Once you see the loss values being printed, training has started successfully.

```Bash
03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267
03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130
03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900
03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs
03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000 data_time: 1.2716 time: 1.3658
03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth.
```

Without extra configuration, model weights will be saved to `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`, while the logs will be stored in `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/TIMESTAMP/`. Now we just need to wait patiently for training to finish.

```{note}
For advanced usage of training, such as CPU training, multi-GPU training, and cluster training, please refer to [Training and Testing](../user_guides/train_test.md).
```

## Testing

After 10 epochs, we observe that TSN performs best at the 6th epoch, with `acc/top1` reaching 1.0000:

```Bash
03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 1.0210 time: 1.1091
```

```{note}
The result is quite high because the model was pre-trained on the full Kinetics400 dataset; you may see a different result.
```

However, this value only reflects the validation performance of TSN on the mini Kinetics dataset, while test results are usually higher thanks to the additional augmentation in the test pipeline.

Start testing:

```Bash
python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
    work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth
```

You will get output like:

```Bash
03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000 data_time: 0.0420 time: 1.0795
```

The model achieves a top-1 accuracy of 1.0000 on this dataset.
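You can also load the trained checkpoint directly in Python for quick spot checks. Below is a minimal sketch using the high-level `mmaction.apis` helpers; the checkpoint path follows the `work_dirs` layout above, and the sample video path is a placeholder you should replace with a real file:

```python
from mmaction.apis import inference_recognizer, init_recognizer

config_file = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
# Point this at whichever best checkpoint your own run produced.
checkpoint_file = 'work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth'
video_file = 'data/kinetics400_tiny/val/some_video.mp4'  # placeholder: pick any validation video

# Build the recognizer from the modified config and the trained checkpoint.
model = init_recognizer(config_file, checkpoint_file, device='cpu')

# Run inference on a single video; the result is an `ActionDataSample`
# carrying the predicted scores for the two classes.
result = inference_recognizer(model, video_file)
print(result)
```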
+ +```{note} +For advanced usage of testing, such as CPU testing, multi-GPU testing, and cluster testing, please refer to [Training and Testing](../user_guides/train_test.md). +``` diff --git a/docs/en/index.rst b/docs/en/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ad136df520d82eedb6e257bc996738c3285b7c5f --- /dev/null +++ b/docs/en/index.rst @@ -0,0 +1,96 @@ +Welcome to MMAction2's documentation! +===================================== + +You can switch between Chinese and English documents in the lower-left corner of the layout. + +.. toctree:: + :maxdepth: 1 + :caption: Get Started + + get_started/overview.md + get_started/installation.md + get_started/quick_run.md + get_started/guide_to_framework.md + get_started/contribution_guide.md + get_started/faq.md + +.. toctree:: + :maxdepth: 1 + :caption: User Guides + + user_guides/inference.md + user_guides/config.md + user_guides/train_test.md + user_guides/prepare_dataset.md + user_guides/finetune.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Guides + + advanced_guides/dataflow.md + advanced_guides/customize_models.md + advanced_guides/customize_dataset.md + advanced_guides/customize_pipeline.md + advanced_guides/customize_optimizer.md + advanced_guides/customize_logging.md + advanced_guides/deploy.md + useful_tools.md + +.. toctree:: + :maxdepth: 1 + :caption: Model Zoo + + modelzoo_statistics.md + model_zoo/recognition.md + model_zoo/recognition_audio.md + model_zoo/skeleton.md + model_zoo/detection.md + model_zoo/retrieval.md + model_zoo/localization.md + +.. toctree:: + :maxdepth: 1 + :caption: Dataset Zoo + :glob: + + datasetzoo_statistics.md + dataset_zoo/* + +.. toctree:: + :maxdepth: 1 + :caption: Projects + + projectzoo.md + +.. toctree:: + :maxdepth: 1 + :caption: Migration + + migration.md + +.. toctree:: + :maxdepth: 1 + :caption: API Reference + + api.rst + +.. toctree:: + :maxdepth: 1 + :caption: Notes + + notes/ecosystem.md + notes/changelog.md + +.. toctree:: + :caption: Switch Language + + switch_language.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/en/make.bat b/docs/en/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..2119f51099bf37e4fdb6071dce9f451ea44c62dd --- /dev/null +++ b/docs/en/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/en/migration.md b/docs/en/migration.md new file mode 100644 index 0000000000000000000000000000000000000000..ea1935169dda03121d5fb2a8ffb2687cdd504e8c --- /dev/null +++ b/docs/en/migration.md @@ -0,0 +1,498 @@ +# Migration from MMAction2 0.x + +MMAction2 1.x introduced major refactorings and modifications including some BC-breaking changes. 
We provide this tutorial to help you migrate your projects from MMAction2 0.x smoothly.

## New dependencies

MMAction2 1.x depends on the following packages. We recommend preparing a new, clean environment and installing them according to the [install tutorial](./get_started/installation.md).

1. [MMEngine](https://github.com/open-mmlab/mmengine): MMEngine is a foundational library for training deep learning models, introduced in the OpenMMLab 2.0 architecture.
2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0`, which is more compact and efficient than `mmcv-full==1.x`.

## Configuration files

In MMAction2 1.x, we refactored the structure of configuration files, and configuration files written in the old style are incompatible.

In this section, we will introduce all the changes to the configuration files. We assume you are already familiar with the [config files](./user_guides/config.md).

### Model settings

No changes in `model.backbone` and `model.neck`. For `model.cls_head`, we move `average_clips` inside it; it was originally set in `model.test_cfg`.

### Data settings

#### Changes in **`data`**

- The original `data` field is split into `train_dataloader`, `val_dataloader` and
  `test_dataloader`. This allows us to configure them at a fine-grained level. For example,
  you can specify different samplers and batch sizes for training and testing.
- The `videos_per_gpu` is renamed to `batch_size`.
- The `workers_per_gpu` is renamed to `num_workers`.

+ + + + + + + + +
Original + +```python +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict(...), + val=dict(...), + test=dict(...), +) +``` + +
New + +```python +train_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=True) # necessary +) + +val_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=False) # necessary +) + +test_dataloader = val_dataloader +``` + +
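Because the three dataloaders are now independent, settings that were previously shared through `data` can be tuned separately. As a sketch, a child config (the filename is hypothetical) that keeps the training settings of its base but enlarges the evaluation batch can rely on MMEngine's config inheritance and dict merging:

```python
# my_kinetics_config.py -- a hypothetical child config
_base_ = ['./tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py']

# Only the listed keys are overridden; the rest of each dataloader
# (dataset, sampler, num_workers, ...) is inherited from the base config.
val_dataloader = dict(batch_size=64)
test_dataloader = dict(batch_size=64)
```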
+

#### Changes in **`pipeline`**

- The original formatting transforms **`ToTensor`** and **`Collect`** are combined into `PackActionInputs`.
- We don't recommend doing **`Normalize`** in the dataset pipeline. Please remove it from your pipelines and set it in the `model.data_preprocessor` field instead.

+ + + + + + + + +
Original + +```python + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +``` + +
New + +```python +model.data_preprocessor = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +``` + +
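The comparison above only covers the training pipeline. For reference, a validation or test pipeline follows the same pattern and also ends with `PackActionInputs`; the sketch below uses common transform settings that may need adjusting for your model:

```python
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]
```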
+

#### Changes in **`evaluation`**

- The **`evaluation`** field is split into `val_evaluator` and `test_evaluator`, and it no longer supports the `interval` and `save_best` arguments.
- The `interval` argument is moved to `train_cfg.val_interval` and the `save_best` argument is moved to `default_hooks.checkpoint.save_best`.
- The 'mean_average_precision', 'mean_class_accuracy', 'mmit_mean_average_precision' and 'top_k_accuracy' metrics are combined into `AccMetric`, and you can use `metric_list` to specify which metrics to calculate.
- The `AVAMetric` is used to evaluate the AVA dataset.
- The `ANetMetric` is used to evaluate the ActivityNet dataset.

+ + + + + + + + +
Original + +```python +evaluation = dict( + interval=5, + metrics=['top_k_accuracy', 'mean_class_accuracy']) +``` + +
New + +```python +val_evaluator = dict( + type='AccMetric', + metric_list=('top_k_accuracy', 'mean_class_accuracy')) +test_evaluator = val_evaluator +``` + +
+

### Schedule settings

#### Changes in **`optimizer`** and **`optimizer_config`**

- We now use the `optim_wrapper` field to configure the optimization process, and
  `optimizer` becomes a sub-field of `optim_wrapper`.
- `paramwise_cfg` is also a sub-field of `optim_wrapper`, parallel to `optimizer`.
- `optimizer_config` is removed, and all of its configurations are moved to `optim_wrapper`.
- `grad_clip` is renamed to `clip_grad`.

+ + + + + + + + +
Original + +```python +optimizer = dict( + type='AdamW', + lr=0.0015, + weight_decay=0.3, + paramwise_cfg = dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + )) + +optimizer_config = dict(grad_clip=dict(max_norm=1.0)) +``` + +
New

```python
optim_wrapper = dict(
    optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3),
    paramwise_cfg=dict(
        norm_decay_mult=0.0,
        bias_decay_mult=0.0,
    ),
    clip_grad=dict(max_norm=1.0),
)
```

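A practical benefit of the new `optim_wrapper` field is that mixed-precision training no longer needs a dedicated hook: it is enabled by switching the wrapper type. A minimal sketch using MMEngine's `AmpOptimWrapper` (keep your own optimizer and clipping settings):

```python
optim_wrapper = dict(
    type='AmpOptimWrapper',  # enables automatic mixed precision
    optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3),
    clip_grad=dict(max_norm=1.0),
)
```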
+

#### Changes in **`lr_config`**

- The `lr_config` field is removed and replaced by the new `param_scheduler` field.
- The `warmup`-related arguments are removed, since we use a combination of schedulers to implement this
  functionality.

The new scheduler combination mechanism is very flexible, and you can use it to design many kinds of learning
rate / momentum curves.

+ + + + + + + + +
Original + +```python +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_iters=5, + warmup_ratio=0.01, + warmup_by_epoch=True) +``` + +
New + +```python +param_scheduler = [ + # warmup + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + end=5, + # Update the learning rate after every iters. + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', by_epoch=True, begin=5), +] +``` + +
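As another common case, the widely used 0.x step policy maps onto a single `MultiStepLR` scheduler. A sketch of the translation, with placeholder milestones:

```python
# 0.x style:
# lr_config = dict(policy='step', step=[40, 80])

# 1.x equivalent:
param_scheduler = [
    dict(
        type='MultiStepLR',
        by_epoch=True,
        begin=0,
        end=100,
        milestones=[40, 80],
        gamma=0.1)
]
```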
+

#### Changes in **`runner`**

Most configurations in the original `runner` field are moved to `train_cfg`, `val_cfg` and `test_cfg`, which
configure the loops for training, validation and testing.

+ + + + + + + + +
Original + +```python +runner = dict(type='EpochBasedRunner', max_epochs=100) +``` + +
New + +```python +# The `val_interval` is the original `evaluation.interval`. +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') # Use the default validation loop. +test_cfg = dict(type='TestLoop') # Use the default test loop. +``` + +
+

In fact, in OpenMMLab 2.0, we introduced `Loop` to control the behavior of training, validation and testing, and
the functionality of `Runner` has also changed. You can find more details in the [MMEngine tutorials](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html).

### Runtime settings

#### Changes in **`checkpoint_config`** and **`log_config`**

The `checkpoint_config` is moved to `default_hooks.checkpoint` and the `log_config` is moved to `default_hooks.logger`.
We also move many hook settings from the script code to the `default_hooks` field in the runtime configuration.

```python
default_hooks = dict(
    # update runtime information, e.g. current iter and lr.
    runtime_info=dict(type='RuntimeInfoHook'),

    # record the time of every iteration.
    timer=dict(type='IterTimerHook'),

    # print log every 100 iterations.
    logger=dict(type='LoggerHook', interval=100),

    # enable the parameter scheduler.
    param_scheduler=dict(type='ParamSchedulerHook'),

    # save checkpoint per epoch, and automatically save the best checkpoint.
    checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'),

    # set sampler seed in distributed environment.
    sampler_seed=dict(type='DistSamplerSeedHook'),

    # synchronize model buffers at the end of each epoch.
    sync_buffers=dict(type='SyncBuffersHook')
)
```

In addition, we split the original logger into a logger and a visualizer. The logger is used to record
information, and the visualizer is used to display it on different backends, such as the terminal, TensorBoard
and Wandb.

+ + + + + + + + +
Original + +```python +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + ]) +``` + +
New + +```python +default_hooks = dict( + ... + logger=dict(type='LoggerHook', interval=100), +) + +visualizer = dict( + type='ActionVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], +) +``` + +
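Other experiment trackers are added in the same way. For example, the sketch below additionally logs to Weights & Biases through MMEngine's `WandbVisBackend` (the `init_kwargs` values are illustrative):

```python
visualizer = dict(
    type='ActionVisualizer',
    vis_backends=[
        dict(type='LocalVisBackend'),
        dict(type='TensorboardVisBackend'),
        # Requires `pip install wandb`; `init_kwargs` is forwarded to `wandb.init`.
        dict(type='WandbVisBackend', init_kwargs=dict(project='mmaction2')),
    ],
)
```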
+ +#### Changes in **`load_from`** and **`resume_from`** + +- The `resume_from` is removed. And we use `resume` and `load_from` to replace it. + - If `resume=True` and `load_from` is not None, resume training from the checkpoint in `load_from`. + - If `resume=True` and `load_from` is None, try to resume from the latest checkpoint in the work directory. + - If `resume=False` and `load_from` is not None, only load the checkpoint, not resume training. + - If `resume=False` and `load_from` is None, do not load nor resume. + +#### Changes in **`dist_params`** + +The `dist_params` field is a sub field of `env_cfg` now. And there are some new configurations in the `env_cfg`. + +```python +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) +``` + +#### Changes in **`workflow`** + +`Workflow` related functionalities are removed. + +#### New field **`visualizer`** + +The visualizer is a new design in OpenMMLab 2.0 architecture. We use a visualizer instance in the runner to handle results & log visualization and save to different backends. + +```python +visualizer = dict( + type='ActionVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + # Uncomment the below line to save the log and visualization results to TensorBoard. + # dict(type='TensorboardVisBackend') + ] +) +``` + +#### New field **`default_scope`** + +The start point to search module for all registries. The `default_scope` in MMAction2 is `mmaction`. See [the registry tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/registry.html) for more details. + +## Packages + +### `mmaction.apis` + +The documentation can be found [here](mmaction.apis). + +| Function | Changes | +| :--------------------: | :---------------------------------------------: | +| `init_recognizer` | No changes | +| `inference_recognizer` | No changes | +| `train_model` | Removed, use `runner.train` to train. | +| `multi_gpu_test` | Removed, use `runner.test` to test. | +| `single_gpu_test` | Removed, use `runner.test` to test. | +| `set_random_seed` | Removed, use `mmengine.runner.set_random_seed`. | +| `init_random_seed` | Removed, use `mmengine.dist.sync_random_seed`. | + +### `mmaction.core` + +The `mmaction.core` package is renamed to [`mmaction.engine`](mmaction.engine). + +| Sub package | Changes | +| :----------: | :-------------------------------------------------------------------------------------------------: | +| `evaluation` | Removed, use the metrics in `mmaction.evaluation`. | +| `hooks` | Moved to `mmaction.engine.hooks` | +| `optimizer` | Moved to `mmaction.engine.optimizers` | +| `utils` | Removed, the distributed environment related functions can be found in the `mmengine.dist` package. 
| + +### `mmaction.datasets` + +The documentation can be found [here](mmaction.datasets) + +#### Changes in [`BaseActionDataset`](mmaction.datasets.BaseActionDataset): + +| Method | Changes | +| :--------------------: | :-------------------------------------------: | +| `prepare_train_frames` | Replaced by `get_data_info` | +| `preprare_test_frames` | Replaced by `get_data_info` | +| `evaluate` | Removed, use `mmengine.evaluator.Evaluator` | +| `dump_results` | Removed, use `mmengine.evaluator.DumpResults` | +| `load_annotations` | Replaced by `load_data_list` | + +Now, you can write a new Dataset class inherited from `BaseActionDataset` and overwrite `load_data_list` only. To load more data information, you could overwrite `get_data_info` like `RawframeDataset` and `AVADataset`. +The `mmaction.datasets.pipelines` is renamed to `mmaction.datasets.transforms` and the `mmaction.datasets.pipelines.augmentations` is renamed to `mmaction.datasets.pipelines.processing`. + +### `mmaction.models` + +The documentation can be found [here](mmaction.models). The interface of all **backbones**, **necks** and **losses** didn't change. + +#### Changes in [`BaseRecognizer`](mmaction.models.BaseRecognizer): + +| Method | Changes | +| :-------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| `extract_feat` | Enhanced method, which now supports output features of three stages (`backbone`, `neck`, `head`) and can handle different modes, such as `train_mode` and `test_mode`. | +| `forward` | Now only accepts three arguments: `inputs`, `data_samples` and `mode`. See [the documentation](mmaction.models.BaseRecognizer) for more details. | +| `forward_train` | Replaced by `loss`. | +| `forward_test` | Replaced by `predict`. | +| `train_step` | The `optimizer` argument is replaced by `optim_wrapper` and it accepts [`OptimWrapper`](mmengine.optim.OptimWrapper). | +| `val_step` | The original `val_step` is the same as `train_step`, now it calls `predict`. | +| `test_step` | New method, and it's the same as `val_step`. | + +#### Changes in [BaseHead](mmaction.models.BaseHead): + +| Method | Changes | +| :-------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| `forward` | No changes | +| `loss` | It accepts `feats` and `data_samples` instead of `cls_score` and `labels` to calculate loss. The `data_samples` is a list of [ActionDataSample](mmaction.structures.ActionDataSample). | +| `predict` | New method. It accepts `feats` and `data_samples` to predict classification scores. | + +### `mmaction.utils` + +| Function | Changes | +| :---------------------: | :-----------------------------------------------------------: | +| `collect_env` | No changes | +| `get_root_logger` | Removed, use `mmengine.MMLogger.get_current_instance` | +| `setup_multi_processes` | Removed, use `mmengine.utils.dl_utils.setup_multi_processes`. | + +### Other changes + +- We moved the definition of all registries in different packages to the `mmaction.registry` package. 
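In practice, this means custom modules should now be registered through the central `mmaction.registry` package. A minimal sketch for a custom head (the class itself is hypothetical and only illustrates the registration pattern):

```python
from mmaction.models import BaseHead
from mmaction.registry import MODELS


@MODELS.register_module()
class MyTinyHead(BaseHead):
    """A hypothetical head that only illustrates how registration works."""

    def forward(self, x, **kwargs):
        # A real head would apply pooling and a classification layer here.
        return x
```

After registration, the head can be referenced in configs with `type='MyTinyHead'`, just like the built-in modules.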
diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md new file mode 100644 index 0000000000000000000000000000000000000000..04acb16e91ee21086d2ba7dda4b3623c28a9ad83 --- /dev/null +++ b/docs/en/notes/changelog.md @@ -0,0 +1,1082 @@ +# Changelog + +## 1.2.0 (10/12/2023) + +**Highlights** + +- Support the Training of ActionClip +- Support VindLU multi-modality algorithm +- Support MobileOne TSN/TSM + +**New Features** + +- Support the Training of ActionClip ([2620](https://github.com/open-mmlab/mmaction2/pull/2620)) +- Support video retrieval dataset MSVD ([2622](https://github.com/open-mmlab/mmaction2/pull/2622)) +- Support VindLU multi-modality algorithm ([2667](https://github.com/open-mmlab/mmaction2/pull/2667)) +- Support Dense Regression Network for Video Grounding ([2668](https://github.com/open-mmlab/mmaction2/pull/2668)) + +**Improvements** + +- Support Video Demos ([2602](https://github.com/open-mmlab/mmaction2/pull/2602)) +- Support Audio Demos ([2603](https://github.com/open-mmlab/mmaction2/pull/2603)) +- Add README_zh-CN.md for Swin and VideoMAE ([2621](https://github.com/open-mmlab/mmaction2/pull/2621)) +- Support MobileOne TSN/TSM ([2656](https://github.com/open-mmlab/mmaction2/pull/2656)) +- Support SlowOnly K700 feature to train localization models ([2673](https://github.com/open-mmlab/mmaction2/pull/2673)) + +**Bug Fixes** + +- Refine ActionDataSample structure ([2658](https://github.com/open-mmlab/mmaction2/pull/2658)) +- Fix MPS device ([2619](https://github.com/open-mmlab/mmaction2/pull/2619)) + +## 1.1.0 (7/3/2023) + +**Highlights** + +- Support HACS-segments dataset(ICCV'2019), MultiSports dataset(ICCV'2021), Kinetics-710 dataset(Arxiv'2022) +- Support rich projects: gesture recognition, spatio-temporal action detection tutorial, and knowledge distillation +- Support TCANet(CVPR'2021) +- Support VideoMAE V2(CVPR'2023), and VideoMAE(NeurIPS'2022) on action detection +- Support CLIP-based multi-modality models: ActionCLIP(Arxiv'2021) and CLIP4clip(ArXiv'2022) +- Support [Pure Python style Configuration File](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) and downloading datasets by MIM + +**New Features** + +- Support HACS-segments dataset ([2224](https://github.com/open-mmlab/mmaction2/pull/2224)) +- Support TCANet ([2271](https://github.com/open-mmlab/mmaction2/pull/2271)) +- Support MultiSports dataset ([2280](https://github.com/open-mmlab/mmaction2/pull/2280)) +- Support spatio-temporal action detection tutorial ([2428](https://github.com/open-mmlab/mmaction2/pull/2428)) +- Support knowledge distillation based on MMRazor ([2458](https://github.com/open-mmlab/mmaction2/pull/2458)) +- Support VideoMAE V2 ([2460](https://github.com/open-mmlab/mmaction2/pull/2460)) +- Support ActionCLIP ([2470](https://github.com/open-mmlab/mmaction2/pull/2470)) +- Support CLIP4clip ([2489](https://github.com/open-mmlab/mmaction2/pull/2489)) +- Support Kinetics-710 dataset ([2534](https://github.com/open-mmlab/mmaction2/pull/2534)) +- Support gesture recognition project ([2539](https://github.com/open-mmlab/mmaction2/pull/2539)) +- Support VideoMAE on action detection ([2547](https://github.com/open-mmlab/mmaction2/pull/2547)) +- Support downloading datasets by MIM ([2465](https://github.com/open-mmlab/mmaction2/pull/1465)) +- Support new config ([2542](https://github.com/open-mmlab/mmaction2/pull/2542)) + +**Improvements** + +- Refactor TSM init_weights 
([2396](https://github.com/open-mmlab/mmaction2/pull/2396)) +- Add unit test for Recognizer 2D ([2432](https://github.com/open-mmlab/mmaction2/pull/2432)) +- Enhance inference APIs ([2472](https://github.com/open-mmlab/mmaction2/pull/2472)) +- Support converting ST-GCN and PoseC3D to ONNX ([2543](https://github.com/open-mmlab/mmaction2/pull/2543)) +- Support feature extraction head ([2525](https://github.com/open-mmlab/mmaction2/pull/2525)) + +**Bug Fixes** + +- Fix CircleCI ([2351](https://github.com/open-mmlab/mmaction2/pull/2351)) +- Fix demo skeleton script ([2380](https://github.com/open-mmlab/mmaction2/pull/2380)) +- Fix docker file branch ([2397](https://github.com/open-mmlab/mmaction2/pull/2397)) +- Fix NTU pose extraction script ([2402](https://github.com/open-mmlab/mmaction2/pull/2402)) +- Rename typing and enhance collect_env script ([2420](https://github.com/open-mmlab/mmaction2/pull/2420)) +- Fix multi-label classification ([2425](https://github.com/open-mmlab/mmaction2/pull/2425), [2466](https://github.com/open-mmlab/mmaction2/pull/2466), [2532](https://github.com/open-mmlab/mmaction2/pull/2532)) +- Fix lfb configs ([2426](https://github.com/open-mmlab/mmaction2/pull/2426)) +- Fix a warning caused by `torch.div` ([2449](https://github.com/open-mmlab/mmaction2/pull/2449)) +- Fix incompatibility of ImgAug and latest Numpy ([2451](https://github.com/open-mmlab/mmaction2/pull/2451)) +- Fix MViT with_cls_token argument ([2480](https://github.com/open-mmlab/mmaction2/pull/2480)) +- Fix timm BC-breaking for TSN ([2497](https://github.com/open-mmlab/mmaction2/pull/2497)) +- Close FileHandler in Windows to make the temporary directory can be deleted ([2565](https://github.com/open-mmlab/mmaction2/pull/2565)) +- Update minimum PyTorch version to 1.8.1 ([2568](https://github.com/open-mmlab/mmaction2/pull/2568)) + +**Documentation** + +- Fix document links in README ([2358](https://github.com/open-mmlab/mmaction2/pull/2358), [2372](https://github.com/open-mmlab/mmaction2/pull/2372), [2376](https://github.com/open-mmlab/mmaction2/pull/2376), [2382](https://github.com/open-mmlab/mmaction2/pull/2382)) +- Update installation document ([2362](https://github.com/open-mmlab/mmaction2/pull/2362)) +- Update upstream library version requirement ([2383](https://github.com/open-mmlab/mmaction2/pull/2383)) +- Fix Colab tutorial ([2384](https://github.com/open-mmlab/mmaction2/pull/2384), [2391](https://github.com/open-mmlab/mmaction2/pull/2391), [2475](https://github.com/open-mmlab/mmaction2/pull/2475)) +- Refine documents ([2404](https://github.com/open-mmlab/mmaction2/pull/2404)) +- Update outdated config in readme ([2419](https://github.com/open-mmlab/mmaction2/pull/2419)) +- Update OpenMMLab related repo list ([2429](https://github.com/open-mmlab/mmaction2/pull/2429)) +- Fix UniFormer README and metafile ([2450](https://github.com/open-mmlab/mmaction2/pull/2450)) +- Add finetune document ([2457](https://github.com/open-mmlab/mmaction2/pull/2457)) +- Update FAQ document ([2476](https://github.com/open-mmlab/mmaction2/pull/2476), [2482](https://github.com/open-mmlab/mmaction2/pull/2482) +- Update download datasets document ([2495](https://github.com/open-mmlab/mmaction2/pull/2495)) +- Translate Chinese document ([2516](https://github.com/open-mmlab/mmaction2/pull/2516), [2506](https://github.com/open-mmlab/mmaction2/pull/2506), [2499](https://github.com/open-mmlab/mmaction2/pull/2499)) +- Refactor model zoo and dataset zoo ([2552](https://github.com/open-mmlab/mmaction2/pull/2552)) +- Refactor 
Chinese document ([2567](https://github.com/open-mmlab/mmaction2/pull/2567)) + +## 1.0.0 (4/6/2023) + +**Highlights** + +- Support RGB-PoseC3D(CVPR'2022). +- Support training UniFormer V2(Arxiv'2022). +- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects. +- Refactor and provide more user-friendly documentation. + +**New Features** + +- Support RGB-PoseC3D ([2182](https://github.com/open-mmlab/mmaction2/pull/2182)) +- Support training UniFormer V2 ([2221](https://github.com/open-mmlab/mmaction2/pull/2221)) +- Support MSG3D and CTRGCN in projects. ([2269](https://github.com/open-mmlab/mmaction2/pull/2269), [2291](https://github.com/open-mmlab/mmaction2/pull/2291)) + +**Improvements** + +- Use MMEngine to calculate FLOPs ([2300](https://github.com/open-mmlab/mmaction2/pull/2300)) +- Speed up LFB training ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Support multiprocessing on AVA evaluation ([2146](https://github.com/open-mmlab/mmaction2/pull/2146)) +- Add a demo for exporting spatial-temporal detection model to ONNX ([2225](https://github.com/open-mmlab/mmaction2/pull/2225)) +- Update spatial-temporal detection related folders ([2262](https://github.com/open-mmlab/mmaction2/pull/2262)) + +**Bug Fixes** + +- Fix flip config of TSM for sth v1/v2 dataset ([#2247](https://github.com/open-mmlab/mmaction2/pull/2247)) +- Fix circle ci ([2336](https://github.com/open-mmlab/mmaction2/pull/2336), [2334](https://github.com/open-mmlab/mmaction2/pull/2334)) +- Fix accepting an unexpected argument local-rank in PyTorch 2.0 ([2320](https://github.com/open-mmlab/mmaction2/pull/2320)) +- Fix TSM config link ([2315](https://github.com/open-mmlab/mmaction2/pull/2315)) +- Fix numpy version requirement in CI ([2284](https://github.com/open-mmlab/mmaction2/pull/2284)) +- Fix NTU pose extraction script ([2246](https://github.com/open-mmlab/mmaction2/pull/2246)) +- Fix TSM-MobileNet V2 ([2332](https://github.com/open-mmlab/mmaction2/pull/2332)) +- Fix command bugs in localization tasks' README ([2244](https://github.com/open-mmlab/mmaction2/pull/2244)) +- Fix duplicate name in DecordInit and SampleAVAFrame ([2251](https://github.com/open-mmlab/mmaction2/pull/2251)) +- Fix channel order when showing video ([2308](https://github.com/open-mmlab/mmaction2/pull/2308)) +- Specify map_location to cpu when using \_load_checkpoint ([2252](https://github.com/open-mmlab/mmaction2/pull/2254)) + +**Documentation** + +- Refactor and provide more user-friendly documentation ([2341](https://github.com/open-mmlab/mmaction2/pull/2341), [2312](https://github.com/open-mmlab/mmaction2/pull/2312), [2325](https://github.com/open-mmlab/mmaction2/pull/2325)) +- Add README_zh-CN ([2252](https://github.com/open-mmlab/mmaction2/pull/2252)) +- Add social networking links ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Fix sthv2 dataset annotations preparation document ([2248](https://github.com/open-mmlab/mmaction2/pull/2248)) + +## 1.0.0rc3 (2/10/2023) + +**Highlights** + +- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). +- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. 
+ +**New Features** + +- Support UniFormer V1/V2 ([#2153](https://github.com/open-mmlab/mmaction2/pull/2153)) +- Support training MViT, and MaskFeat fine-tuning ([#2186](https://github.com/open-mmlab/mmaction2/pull/2186)) +- Support a unified inference interface: Inferencer ([#2164](https://github.com/open-mmlab/mmaction2/pull/2164)) + +**Improvements** + +- Support load data list from multi-backends ([#2176](https://github.com/open-mmlab/mmaction2/pull/2176)) + +**Bug Fixes** + +- Upgrade isort to fix CI ([#2198](https://github.com/open-mmlab/mmaction2/pull/2198)) +- Fix bug in skeleton demo ([#2214](https://github.com/open-mmlab/mmaction2/pull/2214)) + +**Documentation** + +- Add Chinese documentation for config.md ([#2188](https://github.com/open-mmlab/mmaction2/pull/2188)) +- Add readme for Omnisource ([#2205](https://github.com/open-mmlab/mmaction2/pull/2205)) + +## 1.0.0rc2 (1/10/2023) + +**Highlights** + +- Support Action Recognition model VideoMAE(NeurIPS'2022), MViT V2(CVPR'2022), C2D and skeleton-based action recognition model STGCN++ +- Support Omni-Source training on ImageNet and Kinetics datasets +- Support exporting spatial-temporal detection models to ONNX + +**New Features** + +- Support VideoMAE ([#1942](https://github.com/open-mmlab/mmaction2/pull/1942)) +- Support MViT V2 ([#2007](https://github.com/open-mmlab/mmaction2/pull/2007)) +- Support C2D ([#2022](https://github.com/open-mmlab/mmaction2/pull/2022)) +- Support AVA-Kinetics dataset ([#2080](https://github.com/open-mmlab/mmaction2/pull/2080)) +- Support STGCN++ ([#2156](https://github.com/open-mmlab/mmaction2/pull/2156)) +- Support exporting spatial-temporal detection models to ONNX ([#2148](https://github.com/open-mmlab/mmaction2/pull/2148)) +- Support Omni-Source training on ImageNet and Kinetics datasets ([#2143](https://github.com/open-mmlab/mmaction2/pull/2143)) + +**Improvements** + +- Support repeat batch data augmentation ([#2170](https://github.com/open-mmlab/mmaction2/pull/2170)) +- Support calculating FLOPs tool powered by fvcore ([#1997](https://github.com/open-mmlab/mmaction2/pull/1997)) +- Support Spatial-temporal detection demo ([#2019](https://github.com/open-mmlab/mmaction2/pull/2019)) +- Add SyncBufferHook and add randomness config in train.py ([#2044](https://github.com/open-mmlab/mmaction2/pull/2044)) +- Refactor gradcam ([#2049](https://github.com/open-mmlab/mmaction2/pull/2049)) +- Support init_cfg in Swin and ViTMAE ([#2055](https://github.com/open-mmlab/mmaction2/pull/2055)) +- Refactor STGCN and related pipelines ([#2087](https://github.com/open-mmlab/mmaction2/pull/2087)) +- Refactor visualization tools ([#2092](https://github.com/open-mmlab/mmaction2/pull/2092)) +- Update `SampleFrames` transform and improve most models' performance ([#1942](https://github.com/open-mmlab/mmaction2/pull/1942)) +- Support real-time webcam demo ([#2152](https://github.com/open-mmlab/mmaction2/pull/2152)) +- Refactor and enhance 2s-AGCN ([#2130](https://github.com/open-mmlab/mmaction2/pull/2130)) +- Support adjusting fps in `SampleFrame` ([#2157](https://github.com/open-mmlab/mmaction2/pull/2157)) + +**Bug Fixes** + +- Fix CI upstream library dependency ([#2000](https://github.com/open-mmlab/mmaction2/pull/2000)) +- Fix SlowOnly readme typos and results ([#2006](https://github.com/open-mmlab/mmaction2/pull/2006)) +- Fix VideoSwin readme ([#2010](https://github.com/open-mmlab/mmaction2/pull/2010)) +- Fix tools and mim error ([#2028](https://github.com/open-mmlab/mmaction2/pull/2028)) +- Fix Imgaug wrapper 
([#2024](https://github.com/open-mmlab/mmaction2/pull/2024)) +- Remove useless scripts ([#2032](https://github.com/open-mmlab/mmaction2/pull/2032)) +- Fix multi-view inference ([#2045](https://github.com/open-mmlab/mmaction2/pull/2045)) +- Update mmcv maximum version to 1.8.0 ([#2047](https://github.com/open-mmlab/mmaction2/pull/2047)) +- Fix torchserver dependency ([#2053](https://github.com/open-mmlab/mmaction2/pull/2053)) +- Fix `gen_ntu_rgbd_raw` script ([#2076](https://github.com/open-mmlab/mmaction2/pull/2076)) +- Update AVA-Kinetics experiment configs and results ([#2099](https://github.com/open-mmlab/mmaction2/pull/2099)) +- Add `joint.pkl` and `bone.pkl` used in multi-stream fusion tool ([#2106](https://github.com/open-mmlab/mmaction2/pull/2106)) +- Fix lint CI config ([#2110](https://github.com/open-mmlab/mmaction2/pull/2110)) +- Update testing accuracy for modified `SampleFrames` ([#2117](https://github.com/open-mmlab/mmaction2/pull/2117)), ([#2121](https://github.com/open-mmlab/mmaction2/pull/2121)), ([#2122](https://github.com/open-mmlab/mmaction2/pull/2122)), ([#2124](https://github.com/open-mmlab/mmaction2/pull/2124)), ([#2125](https://github.com/open-mmlab/mmaction2/pull/2125)), ([#2126](https://github.com/open-mmlab/mmaction2/pull/2126)), ([#2129](https://github.com/open-mmlab/mmaction2/pull/2129)), ([#2128](https://github.com/open-mmlab/mmaction2/pull/2128)) +- Fix timm related bug ([#1976](https://github.com/open-mmlab/mmaction2/pull/1976)) +- Fix `check_videos.py` script ([#2134](https://github.com/open-mmlab/mmaction2/pull/2134)) +- Update CI maximum torch version to 1.13.0 ([#2118](https://github.com/open-mmlab/mmaction2/pull/2118)) + +**Documentation** + +- Add MMYOLO description in README ([#2011](https://github.com/open-mmlab/mmaction2/pull/2011)) +- Add v1.x introduction in README ([#2023](https://github.com/open-mmlab/mmaction2/pull/2023)) +- Fix link in README ([#2035](https://github.com/open-mmlab/mmaction2/pull/2035)) +- Refine some docs ([#2038](https://github.com/open-mmlab/mmaction2/pull/2038)), ([#2040](https://github.com/open-mmlab/mmaction2/pull/2040)), ([#2058](https://github.com/open-mmlab/mmaction2/pull/2058)) +- Update TSN/TSM Readme ([#2082](https://github.com/open-mmlab/mmaction2/pull/2082)) +- Add chinese document ([#2083](https://github.com/open-mmlab/mmaction2/pull/2083)) +- Adjust document structure ([#2088](https://github.com/open-mmlab/mmaction2/pull/2088)) +- Fix Sth-Sth and Jester dataset links ([#2103](https://github.com/open-mmlab/mmaction2/pull/2103)) +- Fix doc link ([#2131](https://github.com/open-mmlab/mmaction2/pull/2131)) + +## 1.0.0rc1 (10/14/2022) + +**Highlights** + +- Support Video Swin Transformer + +**New Features** + +- Support Video Swin Transformer ([#1939](https://github.com/open-mmlab/mmaction2/pull/1939)) + +**Improvements** + +- Add colab tutorial for 1.x ([#1956](https://github.com/open-mmlab/mmaction2/pull/1956)) +- Support skeleton-based action recognition demo ([#1920](https://github.com/open-mmlab/mmaction2/pull/1920)) + +**Bug Fixes** + +- Fix link in doc ([#1986](https://github.com/open-mmlab/mmaction2/pull/1986), [#1967](https://github.com/open-mmlab/mmaction2/pull/1967), [#1951](https://github.com/open-mmlab/mmaction2/pull/1951), [#1926](https://github.com/open-mmlab/mmaction2/pull/1926),[#1944](https://github.com/open-mmlab/mmaction2/pull/1944), [#1944](https://github.com/open-mmlab/mmaction2/pull/1944), [#1927](https://github.com/open-mmlab/mmaction2/pull/1927), 
[#1925](https://github.com/open-mmlab/mmaction2/pull/1925)) +- Fix CI ([#1987](https://github.com/open-mmlab/mmaction2/pull/1987), [#1930](https://github.com/open-mmlab/mmaction2/pull/1930), [#1923](https://github.com/open-mmlab/mmaction2/pull/1923)) +- Fix pre-commit hook config ([#1971](https://github.com/open-mmlab/mmaction2/pull/1971)) +- Fix TIN config ([#1912](https://github.com/open-mmlab/mmaction2/pull/1912)) +- Fix UT for BMN and BSN ([#1966](https://github.com/open-mmlab/mmaction2/pull/1966)) +- Fix UT for Recognizer2D ([#1937](https://github.com/open-mmlab/mmaction2/pull/1937)) +- Fix BSN and BMN configs for localization ([#1913](https://github.com/open-mmlab/mmaction2/pull/1913)) +- Modeify ST-GCN configs ([#1913](https://github.com/open-mmlab/mmaction2/pull/1914)) +- Fix typo in migration doc ([#1931](https://github.com/open-mmlab/mmaction2/pull/1931)) +- Remove Onnx related tools ([#1928](https://github.com/open-mmlab/mmaction2/pull/1928)) +- Update TANet readme ([#1916](https://github.com/open-mmlab/mmaction2/pull/1916), [#1890](https://github.com/open-mmlab/mmaction2/pull/1890)) +- Update 2S-AGCN readme ([#1915](https://github.com/open-mmlab/mmaction2/pull/1915)) +- Fix TSN configs ([#1905](https://github.com/open-mmlab/mmaction2/pull/1905)) +- Fix configs for detection ([#1903](https://github.com/open-mmlab/mmaction2/pull/1903)) +- Fix typo in TIN config ([#1904](https://github.com/open-mmlab/mmaction2/pull/1904)) +- Fix PoseC3D readme ([#1899](https://github.com/open-mmlab/mmaction2/pull/1899)) +- Fix ST-GCN configs ([#1891](https://github.com/open-mmlab/mmaction2/pull/1891)) +- Fix audio recognition readme ([#1898](https://github.com/open-mmlab/mmaction2/pull/1898)) +- Fix TSM readme ([#1887](https://github.com/open-mmlab/mmaction2/pull/1887)) +- Fix SlowOnly readme ([#1889](https://github.com/open-mmlab/mmaction2/pull/1889)) +- Fix TRN readme ([#1888](https://github.com/open-mmlab/mmaction2/pull/1888)) +- Fix typo in get_started doc ([#1895](https://github.com/open-mmlab/mmaction2/pull/1895)) + +## 1.0.0rc0 (09/01/2022) + +We are excited to announce the release of MMAction2 v1.0.0rc0. +MMAction2 1.0.0beta is the first version of MMAction2 1.x, a part of the OpenMMLab 2.0 projects. +Built upon the new [training engine](https://github.com/open-mmlab/mmengine). + +**Highlights** + +- **New engines**. MMAction2 1.x is based on MMEngine\](https://github.com/open-mmlab/mmengine), which provides a general and powerful runner that allows more flexible customizations and significantly simplifies the entrypoints of high-level interfaces. + +- **Unified interfaces**. As a part of the OpenMMLab 2.0 projects, MMAction2 1.x unifies and refactors the interfaces and internal logics of train, testing, datasets, models, evaluation, and visualization. All the OpenMMLab 2.0 projects share the same design in those interfaces and logics to allow the emergence of multi-task/modality algorithms. + +- **More documentation and tutorials**. We add a bunch of documentation and tutorials to help users get started more smoothly. Read it [here](https://github.com/open-mmlab/mmaction2/blob/main/docs/en/migration.md). + +**Breaking Changes** + +In this release, we made lots of major refactoring and modifications. Please refer to the [migration guide](../migration.md) for details and migration instructions. 
+ +## 0.24.0 (05/05/2022) + +**Highlights** + +- Support different seeds + +**New Features** + +- Add lateral norm in multigrid config ([#1567](https://github.com/open-mmlab/mmaction2/pull/1567)) +- Add openpose 25 joints in graph config ([#1578](https://github.com/open-mmlab/mmaction2/pull/1578)) +- Support MLU Backend ([#1608](https://github.com/open-mmlab/mmaction2/pull/1608)) + +**Bug and Typo Fixes** + +- Fix local_rank ([#1558](https://github.com/open-mmlab/mmaction2/pull/1558)) +- Fix install typo ([#1571](https://github.com/open-mmlab/mmaction2/pull/1571)) +- Fix the inference API doc ([#1580](https://github.com/open-mmlab/mmaction2/pull/1580)) +- Fix zh-CN demo.md and getting_started.md ([#1587](https://github.com/open-mmlab/mmaction2/pull/1587)) +- Remove Recommonmark ([#1595](https://github.com/open-mmlab/mmaction2/pull/1595)) +- Fix inference with ndarray ([#1603](https://github.com/open-mmlab/mmaction2/pull/1603)) +- Fix the log error when `IterBasedRunner` is used ([#1606](https://github.com/open-mmlab/mmaction2/pull/1606)) + +## 0.23.0 (04/01/2022) + +**Highlights** + +- Support different seeds +- Provide multi-node training & testing script +- Update error log + +**New Features** + +- Support different seeds([#1502](https://github.com/open-mmlab/mmaction2/pull/1502)) +- Provide multi-node training & testing script([#1521](https://github.com/open-mmlab/mmaction2/pull/1521)) +- Update error log([#1546](https://github.com/open-mmlab/mmaction2/pull/1546)) + +**Documentations** + +- Update gpus in Slowfast readme([#1497](https://github.com/open-mmlab/mmaction2/pull/1497)) +- Fix work_dir in multigrid config([#1498](https://github.com/open-mmlab/mmaction2/pull/1498)) +- Add sub bn docs([#1503](https://github.com/open-mmlab/mmaction2/pull/1503)) +- Add shortcycle sampler docs([#1513](https://github.com/open-mmlab/mmaction2/pull/1513)) +- Update Windows Declaration([#1520](https://github.com/open-mmlab/mmaction2/pull/1520)) +- Update the link for ST-GCN([#1544](https://github.com/open-mmlab/mmaction2/pull/1544)) +- Update install commands([#1549](https://github.com/open-mmlab/mmaction2/pull/1549)) + +**Bug and Typo Fixes** + +- Update colab tutorial install cmds([#1522](https://github.com/open-mmlab/mmaction2/pull/1522)) +- Fix num_iters_per_epoch in analyze_logs.py([#1530](https://github.com/open-mmlab/mmaction2/pull/1530)) +- Fix distributed_sampler([#1532](https://github.com/open-mmlab/mmaction2/pull/1532)) +- Fix cd dir error([#1545](https://github.com/open-mmlab/mmaction2/pull/1545)) +- Update arg names([#1548](https://github.com/open-mmlab/mmaction2/pull/1548)) + +**ModelZoo** + +## 0.22.0 (03/05/2022) + +**Highlights** + +- Support Multigrid training strategy +- Support CPU training +- Support audio demo +- Support topk customizing in models/heads/base.py + +**New Features** + +- Support Multigrid training strategy([#1378](https://github.com/open-mmlab/mmaction2/pull/1378)) +- Support STGCN in demo_skeleton.py([#1391](https://github.com/open-mmlab/mmaction2/pull/1391)) +- Support CPU training([#1407](https://github.com/open-mmlab/mmaction2/pull/1407)) +- Support audio demo([#1425](https://github.com/open-mmlab/mmaction2/pull/1425)) +- Support topk customizing in models/heads/base.py([#1452](https://github.com/open-mmlab/mmaction2/pull/1452)) + +**Documentations** + +- Add OpenMMLab platform([#1393](https://github.com/open-mmlab/mmaction2/pull/1393)) +- Update links([#1394](https://github.com/open-mmlab/mmaction2/pull/1394)) +- Update readme in 
configs([#1404](https://github.com/open-mmlab/mmaction2/pull/1404)) +- Update instructions to install mmcv-full([#1426](https://github.com/open-mmlab/mmaction2/pull/1426)) +- Add shortcut([#1433](https://github.com/open-mmlab/mmaction2/pull/1433)) +- Update modelzoo([#1439](https://github.com/open-mmlab/mmaction2/pull/1439)) +- add video_structuralize in readme([#1455](https://github.com/open-mmlab/mmaction2/pull/1455)) +- Update OpenMMLab repo information([#1482](https://github.com/open-mmlab/mmaction2/pull/1482)) + +**Bug and Typo Fixes** + +- Update train.py([#1375](https://github.com/open-mmlab/mmaction2/pull/1375)) +- Fix printout bug([#1382](<(https://github.com/open-mmlab/mmaction2/pull/1382)>)) +- Update multi processing setting([#1395](https://github.com/open-mmlab/mmaction2/pull/1395)) +- Setup multi processing both in train and test([#1405](https://github.com/open-mmlab/mmaction2/pull/1405)) +- Fix bug in nondistributed multi-gpu training([#1406](https://github.com/open-mmlab/mmaction2/pull/1406)) +- Add variable fps in ava_dataset.py([#1409](https://github.com/open-mmlab/mmaction2/pull/1409)) +- Only support distributed training([#1414](https://github.com/open-mmlab/mmaction2/pull/1414)) +- Set test_mode for AVA configs([#1432](https://github.com/open-mmlab/mmaction2/pull/1432)) +- Support single label([#1434](https://github.com/open-mmlab/mmaction2/pull/1434)) +- Add check copyright([#1447](https://github.com/open-mmlab/mmaction2/pull/1447)) +- Support Windows CI([#1448](https://github.com/open-mmlab/mmaction2/pull/1448)) +- Fix wrong device of class_weight in models/losses/cross_entropy_loss.py([#1457](https://github.com/open-mmlab/mmaction2/pull/1457)) +- Fix bug caused by distributed([#1459](https://github.com/open-mmlab/mmaction2/pull/1459)) +- Update readme([#1460](https://github.com/open-mmlab/mmaction2/pull/1460)) +- Fix lint caused by colab automatic upload([#1461](https://github.com/open-mmlab/mmaction2/pull/1461)) +- Refine CI([#1471](https://github.com/open-mmlab/mmaction2/pull/1471)) +- Update pre-commit([#1474](https://github.com/open-mmlab/mmaction2/pull/1474)) +- Add deprecation message for deploy tool([#1483](https://github.com/open-mmlab/mmaction2/pull/1483)) + +**ModelZoo** + +- Support slowfast_steplr([#1421](https://github.com/open-mmlab/mmaction2/pull/1421)) + +## 0.21.0 (31/12/2021) + +**Highlights** + +- Support 2s-AGCN +- Support publish models in Windows +- Improve some sthv1 related models +- Support BABEL + +**New Features** + +- Support 2s-AGCN([#1248](https://github.com/open-mmlab/mmaction2/pull/1248)) +- Support skip postproc in ntu_pose_extraction([#1295](https://github.com/open-mmlab/mmaction2/pull/1295)) +- Support publish models in Windows([#1325](https://github.com/open-mmlab/mmaction2/pull/1325)) +- Add copyright checkhook in pre-commit-config([#1344](https://github.com/open-mmlab/mmaction2/pull/1344)) + +**Documentations** + +- Add MMFlow ([#1273](https://github.com/open-mmlab/mmaction2/pull/1273)) +- Revise README.md and add projects.md ([#1286](https://github.com/open-mmlab/mmaction2/pull/1286)) +- Add 2s-AGCN in Updates([#1289](https://github.com/open-mmlab/mmaction2/pull/1289)) +- Add MMFewShot([#1300](https://github.com/open-mmlab/mmaction2/pull/1300)) +- Add MMHuman3d([#1304](https://github.com/open-mmlab/mmaction2/pull/1304)) +- Update pre-commit([#1313](https://github.com/open-mmlab/mmaction2/pull/1313)) +- Use share menu from the theme instead([#1328](https://github.com/open-mmlab/mmaction2/pull/1328)) +- Update installation 
command([#1340](https://github.com/open-mmlab/mmaction2/pull/1340)) + +**Bug and Typo Fixes** + +- Update the inference part in notebooks([#1256](https://github.com/open-mmlab/mmaction2/pull/1256)) +- Update the map_location([#1262](<(https://github.com/open-mmlab/mmaction2/pull/1262)>)) +- Fix bug that start_index is not used in RawFrameDecode([#1278](https://github.com/open-mmlab/mmaction2/pull/1278)) +- Fix bug in init_random_seed([#1282](https://github.com/open-mmlab/mmaction2/pull/1282)) +- Fix bug in setup.py([#1303](https://github.com/open-mmlab/mmaction2/pull/1303)) +- Fix interrogate error in workflows([#1305](https://github.com/open-mmlab/mmaction2/pull/1305)) +- Fix typo in slowfast config([#1309](https://github.com/open-mmlab/mmaction2/pull/1309)) +- Cancel previous runs that are not completed([#1327](https://github.com/open-mmlab/mmaction2/pull/1327)) +- Fix missing skip_postproc parameter([#1347](https://github.com/open-mmlab/mmaction2/pull/1347)) +- Update ssn.py([#1355](https://github.com/open-mmlab/mmaction2/pull/1355)) +- Use latest youtube-dl([#1357](https://github.com/open-mmlab/mmaction2/pull/1357)) +- Fix test-best([#1362](https://github.com/open-mmlab/mmaction2/pull/1362)) + +**ModelZoo** + +- Improve some sthv1 related models([#1306](https://github.com/open-mmlab/mmaction2/pull/1306)) +- Support BABEL([#1332](https://github.com/open-mmlab/mmaction2/pull/1332)) + +## 0.20.0 (07/10/2021) + +**Highlights** + +- Support TorchServe +- Add video structuralize demo +- Support using 3D skeletons for skeleton-based action recognition +- Benchmark PoseC3D on UCF and HMDB + +**New Features** + +- Support TorchServe ([#1212](https://github.com/open-mmlab/mmaction2/pull/1212)) +- Support 3D skeletons pre-processing ([#1218](https://github.com/open-mmlab/mmaction2/pull/1218)) +- Support video structuralize demo ([#1197](https://github.com/open-mmlab/mmaction2/pull/1197)) + +**Documentations** + +- Revise README.md and add projects.md ([#1214](https://github.com/open-mmlab/mmaction2/pull/1214)) +- Add CN docs for Skeleton dataset, PoseC3D and ST-GCN ([#1228](https://github.com/open-mmlab/mmaction2/pull/1228), [#1237](https://github.com/open-mmlab/mmaction2/pull/1237), [#1236](https://github.com/open-mmlab/mmaction2/pull/1236)) +- Add tutorial for custom dataset training for skeleton-based action recognition ([#1234](https://github.com/open-mmlab/mmaction2/pull/1234)) + +**Bug and Typo Fixes** + +- Fix tutorial link ([#1219](https://github.com/open-mmlab/mmaction2/pull/1219)) +- Fix GYM links ([#1224](https://github.com/open-mmlab/mmaction2/pull/1224)) + +**ModelZoo** + +- Benchmark PoseC3D on UCF and HMDB ([#1223](https://github.com/open-mmlab/mmaction2/pull/1223)) +- Add ST-GCN + 3D skeleton model for NTU60-XSub ([#1236](https://github.com/open-mmlab/mmaction2/pull/1236)) + +## 0.19.0 (07/10/2021) + +**Highlights** + +- Support ST-GCN +- Refactor the inference API +- Add code spell check hook + +**New Features** + +- Support ST-GCN ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123)) + +**Improvement** + +- Add label maps for every dataset ([#1127](https://github.com/open-mmlab/mmaction2/pull/1127)) +- Remove useless code MultiGroupCrop ([#1180](https://github.com/open-mmlab/mmaction2/pull/1180)) +- Refactor Inference API ([#1191](https://github.com/open-mmlab/mmaction2/pull/1191)) +- Add code spell check hook ([#1208](https://github.com/open-mmlab/mmaction2/pull/1208)) +- Use docker in CI ([#1159](https://github.com/open-mmlab/mmaction2/pull/1159)) + +**Documentations** 
+ +- Update metafiles to new OpenMMLAB protocols ([#1134](https://github.com/open-mmlab/mmaction2/pull/1134)) +- Switch to new doc style ([#1160](https://github.com/open-mmlab/mmaction2/pull/1160)) +- Improve the ERROR message ([#1203](https://github.com/open-mmlab/mmaction2/pull/1203)) +- Fix invalid URL in getting_started ([#1169](https://github.com/open-mmlab/mmaction2/pull/1169)) + +**Bug and Typo Fixes** + +- Compatible with new MMClassification ([#1139](https://github.com/open-mmlab/mmaction2/pull/1139)) +- Add missing runtime dependencies ([#1144](https://github.com/open-mmlab/mmaction2/pull/1144)) +- Fix THUMOS tag proposals path ([#1156](https://github.com/open-mmlab/mmaction2/pull/1156)) +- Fix LoadHVULabel ([#1194](https://github.com/open-mmlab/mmaction2/pull/1194)) +- Switch the default value of `persistent_workers` to False ([#1202](https://github.com/open-mmlab/mmaction2/pull/1202)) +- Fix `_freeze_stages` for MobileNetV2 ([#1193](https://github.com/open-mmlab/mmaction2/pull/1193)) +- Fix resume when building rawframes ([#1150](https://github.com/open-mmlab/mmaction2/pull/1150)) +- Fix device bug for class weight ([#1188](https://github.com/open-mmlab/mmaction2/pull/1188)) +- Correct Arg names in extract_audio.py ([#1148](https://github.com/open-mmlab/mmaction2/pull/1148)) + +**ModelZoo** + +- Add TSM-MobileNetV2 ported from TSM ([#1163](https://github.com/open-mmlab/mmaction2/pull/1163)) +- Add ST-GCN for NTURGB+D-XSub-60 ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123)) + +## 0.18.0 (02/09/2021) + +**Improvement** + +- Add CopyRight ([#1099](https://github.com/open-mmlab/mmaction2/pull/1099)) +- Support NTU Pose Extraction ([#1076](https://github.com/open-mmlab/mmaction2/pull/1076)) +- Support Caching in RawFrameDecode ([#1078](https://github.com/open-mmlab/mmaction2/pull/1078)) +- Add citations & Support python3.9 CI & Use fixed-version sphinx ([#1125](https://github.com/open-mmlab/mmaction2/pull/1125)) + +**Documentations** + +- Add Descriptions of PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053)) + +**Bug and Typo Fixes** + +- Fix SSV2 checkpoints ([#1101](https://github.com/open-mmlab/mmaction2/pull/1101)) +- Fix CSN normalization ([#1116](https://github.com/open-mmlab/mmaction2/pull/1116)) +- Fix typo ([#1121](https://github.com/open-mmlab/mmaction2/pull/1121)) +- Fix new_crop_quadruple bug ([#1108](https://github.com/open-mmlab/mmaction2/pull/1108)) + +## 0.17.0 (03/08/2021) + +**Highlights** + +- Support PyTorch 1.9 +- Support Pytorchvideo Transforms +- Support PreciseBN + +**New Features** + +- Support Pytorchvideo Transforms ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008)) +- Support PreciseBN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038)) + +**Improvements** + +- Remove redundant augmentations in config files ([#996](https://github.com/open-mmlab/mmaction2/pull/996)) +- Make resource directory to hold common resource pictures ([#1011](https://github.com/open-mmlab/mmaction2/pull/1011)) +- Remove deprecated FrameSelector ([#1010](https://github.com/open-mmlab/mmaction2/pull/1010)) +- Support Concat Dataset ([#1000](https://github.com/open-mmlab/mmaction2/pull/1000)) +- Add `to-mp4` option to resize_videos.py ([#1021](https://github.com/open-mmlab/mmaction2/pull/1021)) +- Add option to keep tail frames ([#1050](https://github.com/open-mmlab/mmaction2/pull/1050)) +- Update MIM support ([#1061](https://github.com/open-mmlab/mmaction2/pull/1061)) +- Calculate Top-K accurate and inaccurate classes 
([#1047](https://github.com/open-mmlab/mmaction2/pull/1047)) + +**Bug and Typo Fixes** + +- Fix bug in PoseC3D demo ([#1009](https://github.com/open-mmlab/mmaction2/pull/1009)) +- Fix some problems in resize_videos.py ([#1012](https://github.com/open-mmlab/mmaction2/pull/1012)) +- Support torch1.9 ([#1015](https://github.com/open-mmlab/mmaction2/pull/1015)) +- Remove redundant code in CI ([#1046](https://github.com/open-mmlab/mmaction2/pull/1046)) +- Fix bug about persistent_workers ([#1044](https://github.com/open-mmlab/mmaction2/pull/1044)) +- Support TimeSformer feature extraction ([#1035](https://github.com/open-mmlab/mmaction2/pull/1035)) +- Fix ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025)) + +**ModelZoo** + +- Add TSM-R50 sthv1 models trained by PytorchVideo RandAugment and AugMix ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008)) +- Update SlowOnly SthV1 checkpoints ([#1034](https://github.com/open-mmlab/mmaction2/pull/1034)) +- Add SlowOnly Kinetics400 checkpoints trained with Precise-BN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038)) +- Add CSN-R50 from scratch checkpoints ([#1045](https://github.com/open-mmlab/mmaction2/pull/1045)) +- TPN Kinetics-400 Checkpoints trained with the new ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025)) + +**Documentation** + +- Add Chinese translation of feature_extraction.md ([#1020](https://github.com/open-mmlab/mmaction2/pull/1020)) +- Fix the code snippet in getting_started.md ([#1023](https://github.com/open-mmlab/mmaction2/pull/1023)) +- Fix TANet config table ([#1028](https://github.com/open-mmlab/mmaction2/pull/1028)) +- Add description to PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053)) + +## 0.16.0 (01/07/2021) + +**Highlights** + +- Support using backbone from pytorch-image-models(timm) +- Support PIMS Decoder +- Demo for skeleton-based action recognition +- Support Timesformer + +**New Features** + +- Support using backbones from pytorch-image-models(timm) for TSN ([#880](https://github.com/open-mmlab/mmaction2/pull/880)) +- Support torchvision transformations in preprocessing pipelines ([#972](https://github.com/open-mmlab/mmaction2/pull/972)) +- Demo for skeleton-based action recognition ([#972](https://github.com/open-mmlab/mmaction2/pull/972)) +- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839)) + +**Improvements** + +- Add a tool to find invalid videos ([#907](https://github.com/open-mmlab/mmaction2/pull/907), [#950](https://github.com/open-mmlab/mmaction2/pull/950)) +- Add an option to specify spectrogram_type ([#909](https://github.com/open-mmlab/mmaction2/pull/909)) +- Add json output to video demo ([#906](https://github.com/open-mmlab/mmaction2/pull/906)) +- Add MIM related docs ([#918](https://github.com/open-mmlab/mmaction2/pull/918)) +- Rename lr to scheduler ([#916](https://github.com/open-mmlab/mmaction2/pull/916)) +- Support `--cfg-options` for demos ([#911](https://github.com/open-mmlab/mmaction2/pull/911)) +- Support number counting for flow-wise filename template ([#922](https://github.com/open-mmlab/mmaction2/pull/922)) +- Add Chinese tutorial ([#941](https://github.com/open-mmlab/mmaction2/pull/941)) +- Change ResNet3D default values ([#939](https://github.com/open-mmlab/mmaction2/pull/939)) +- Adjust script structure ([#935](https://github.com/open-mmlab/mmaction2/pull/935)) +- Add font color to args in long_video_demo ([#947](https://github.com/open-mmlab/mmaction2/pull/947)) +- Polish 
code style with Pylint ([#908](https://github.com/open-mmlab/mmaction2/pull/908)) +- Support PIMS Decoder ([#946](https://github.com/open-mmlab/mmaction2/pull/946)) +- Improve Metafiles ([#956](https://github.com/open-mmlab/mmaction2/pull/956), [#979](https://github.com/open-mmlab/mmaction2/pull/979), [#966](https://github.com/open-mmlab/mmaction2/pull/966)) +- Add links to download Kinetics400 validation ([#920](https://github.com/open-mmlab/mmaction2/pull/920)) +- Audit the usage of shutil.rmtree ([#943](https://github.com/open-mmlab/mmaction2/pull/943)) +- Polish localizer related codes([#913](https://github.com/open-mmlab/mmaction2/pull/913)) + +**Bug and Typo Fixes** + +- Fix spatiotemporal detection demo ([#899](https://github.com/open-mmlab/mmaction2/pull/899)) +- Fix docstring for 3D inflate ([#925](https://github.com/open-mmlab/mmaction2/pull/925)) +- Fix bug of writing text to video with TextClip ([#952](https://github.com/open-mmlab/mmaction2/pull/952)) +- Fix mmcv install in CI ([#977](https://github.com/open-mmlab/mmaction2/pull/977)) + +**ModelZoo** + +- Add TSN with Swin Transformer backbone as an example for using pytorch-image-models(timm) backbones ([#880](https://github.com/open-mmlab/mmaction2/pull/880)) +- Port CSN checkpoints from VMZ ([#945](https://github.com/open-mmlab/mmaction2/pull/945)) +- Release various checkpoints for UCF101, HMDB51 and Sthv1 ([#938](https://github.com/open-mmlab/mmaction2/pull/938)) +- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839)) +- Update TSM modelzoo ([#981](https://github.com/open-mmlab/mmaction2/pull/981)) + +## 0.15.0 (31/05/2021) + +**Highlights** + +- Support PoseC3D +- Support ACRN +- Support MIM + +**New Features** + +- Support PoseC3D ([#786](https://github.com/open-mmlab/mmaction2/pull/786), [#890](https://github.com/open-mmlab/mmaction2/pull/890)) +- Support MIM ([#870](https://github.com/open-mmlab/mmaction2/pull/870)) +- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891)) +- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864)) + +**Improvements** + +- Add `metric_options` for evaluation to docs ([#873](https://github.com/open-mmlab/mmaction2/pull/873)) +- Support creating a new label map based on custom classes for demos about spatio temporal demo ([#879](https://github.com/open-mmlab/mmaction2/pull/879)) +- Improve document about AVA dataset preparation ([#878](https://github.com/open-mmlab/mmaction2/pull/878)) +- Provide a script to extract clip-level feature ([#856](https://github.com/open-mmlab/mmaction2/pull/856)) + +**Bug and Typo Fixes** + +- Fix issues about resume ([#877](https://github.com/open-mmlab/mmaction2/pull/877), [#878](https://github.com/open-mmlab/mmaction2/pull/878)) +- Correct the key name of `eval_results` dictionary for metric 'mmit_mean_average_precision' ([#885](https://github.com/open-mmlab/mmaction2/pull/885)) + +**ModelZoo** + +- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864)) +- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891)) + +## 0.14.0 (30/04/2021) + +**Highlights** + +- Support TRN +- Support Diving48 + +**New Features** + +- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755)) +- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835)) +- Support Webcam Demo for Spatio-temporal Action Detection Models ([#795](https://github.com/open-mmlab/mmaction2/pull/795)) + +**Improvements** + +- Add 
softmax option for pytorch2onnx tool ([#781](https://github.com/open-mmlab/mmaction2/pull/781)) +- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755)) +- Test with onnx models and TensorRT engines ([#758](https://github.com/open-mmlab/mmaction2/pull/758)) +- Speed up AVA Testing ([#784](https://github.com/open-mmlab/mmaction2/pull/784)) +- Add `self.with_neck` attribute ([#796](https://github.com/open-mmlab/mmaction2/pull/796)) +- Update installation document ([#798](https://github.com/open-mmlab/mmaction2/pull/798)) +- Use a random master port ([#809](https://github.com/open-mmlab/mmaction2/pull/8098)) +- Update AVA processing data document ([#801](https://github.com/open-mmlab/mmaction2/pull/801)) +- Refactor spatio-temporal augmentation ([#782](https://github.com/open-mmlab/mmaction2/pull/782)) +- Add QR code in CN README ([#812](https://github.com/open-mmlab/mmaction2/pull/812)) +- Add Alternative way to download Kinetics ([#817](https://github.com/open-mmlab/mmaction2/pull/817), [#822](https://github.com/open-mmlab/mmaction2/pull/822)) +- Refactor Sampler ([#790](https://github.com/open-mmlab/mmaction2/pull/790)) +- Use EvalHook in MMCV with backward compatibility ([#793](https://github.com/open-mmlab/mmaction2/pull/793)) +- Use MMCV Model Registry ([#843](https://github.com/open-mmlab/mmaction2/pull/843)) + +**Bug and Typo Fixes** + +- Fix a bug in pytorch2onnx.py when `num_classes <= 4` ([#800](https://github.com/open-mmlab/mmaction2/pull/800), [#824](https://github.com/open-mmlab/mmaction2/pull/824)) +- Fix `demo_spatiotemporal_det.py` error ([#803](https://github.com/open-mmlab/mmaction2/pull/803), [#805](https://github.com/open-mmlab/mmaction2/pull/805)) +- Fix loading config bugs when resume ([#820](https://github.com/open-mmlab/mmaction2/pull/820)) +- Make HMDB51 annotation generation more robust ([#811](https://github.com/open-mmlab/mmaction2/pull/811)) + +**ModelZoo** + +- Update checkpoint for 256 height in something-V2 ([#789](https://github.com/open-mmlab/mmaction2/pull/789)) +- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835)) + +## 0.13.0 (31/03/2021) + +**Highlights** + +- Support LFB +- Support using backbone from MMCls/TorchVision +- Add Chinese documentation + +**New Features** + +- Support LFB ([#553](https://github.com/open-mmlab/mmaction2/pull/553)) +- Support using backbones from MMCls for TSN ([#679](https://github.com/open-mmlab/mmaction2/pull/679)) +- Support using backbones from TorchVision for TSN ([#720](https://github.com/open-mmlab/mmaction2/pull/720)) +- Support Mixup and Cutmix for recognizers ([#681](https://github.com/open-mmlab/mmaction2/pull/681)) +- Support Chinese documentation ([#665](https://github.com/open-mmlab/mmaction2/pull/665), [#680](https://github.com/open-mmlab/mmaction2/pull/680), [#689](https://github.com/open-mmlab/mmaction2/pull/689), [#701](https://github.com/open-mmlab/mmaction2/pull/701), [#702](https://github.com/open-mmlab/mmaction2/pull/702), [#703](https://github.com/open-mmlab/mmaction2/pull/703), [#706](https://github.com/open-mmlab/mmaction2/pull/706), [#716](https://github.com/open-mmlab/mmaction2/pull/716), [#717](https://github.com/open-mmlab/mmaction2/pull/717), [#731](https://github.com/open-mmlab/mmaction2/pull/731), [#733](https://github.com/open-mmlab/mmaction2/pull/733), [#735](https://github.com/open-mmlab/mmaction2/pull/735), [#736](https://github.com/open-mmlab/mmaction2/pull/736), [#737](https://github.com/open-mmlab/mmaction2/pull/737), 
[#738](https://github.com/open-mmlab/mmaction2/pull/738), [#739](https://github.com/open-mmlab/mmaction2/pull/739), [#740](https://github.com/open-mmlab/mmaction2/pull/740), [#742](https://github.com/open-mmlab/mmaction2/pull/742), [#752](https://github.com/open-mmlab/mmaction2/pull/752), [#759](https://github.com/open-mmlab/mmaction2/pull/759), [#761](https://github.com/open-mmlab/mmaction2/pull/761), [#772](https://github.com/open-mmlab/mmaction2/pull/772), [#775](https://github.com/open-mmlab/mmaction2/pull/775)) + +**Improvements** + +- Add slowfast config/json/log/ckpt for training custom classes of AVA ([#678](https://github.com/open-mmlab/mmaction2/pull/678)) +- Set RandAugment as Imgaug default transforms ([#585](https://github.com/open-mmlab/mmaction2/pull/585)) +- Add `--test-last` & `--test-best` for `tools/train.py` to test checkpoints after training ([#608](https://github.com/open-mmlab/mmaction2/pull/608)) +- Add fcn_testing in TPN ([#684](https://github.com/open-mmlab/mmaction2/pull/684)) +- Remove redundant recall functions ([#741](https://github.com/open-mmlab/mmaction2/pull/741)) +- Recursively remove pretrained step for testing ([#695](https://github.com/open-mmlab/mmaction2/pull/695)) +- Improve demo by limiting inference fps ([#668](https://github.com/open-mmlab/mmaction2/pull/668)) + +**Bug and Typo Fixes** + +- Fix a bug about multi-class in VideoDataset ([#723](https://github.com/open-mmlab/mmaction2/pull/678)) +- Reverse key-value in anet filelist generation ([#686](https://github.com/open-mmlab/mmaction2/pull/686)) +- Fix flow norm cfg typo ([#693](https://github.com/open-mmlab/mmaction2/pull/693)) + +**ModelZoo** + +- Add LFB for AVA2.1 ([#553](https://github.com/open-mmlab/mmaction2/pull/553)) +- Add TSN with ResNeXt-101-32x4d backbone as an example for using MMCls backbones ([#679](https://github.com/open-mmlab/mmaction2/pull/679)) +- Add TSN with Densenet161 backbone as an example for using TorchVision backbones ([#720](https://github.com/open-mmlab/mmaction2/pull/720)) +- Add slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb ([#690](https://github.com/open-mmlab/mmaction2/pull/690)) +- Add slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb ([#704](https://github.com/open-mmlab/mmaction2/pull/704)) +- Add slowonly_nl_kinetics_pretrained_r50_4x16x1(8x8x1)\_20e_ava_rgb ([#730](https://github.com/open-mmlab/mmaction2/pull/730)) + +## 0.12.0 (28/02/2021) + +**Highlights** + +- Support TSM-MobileNetV2 +- Support TANet +- Support GPU Normalize + +**New Features** + +- Support TSM-MobileNetV2 ([#415](https://github.com/open-mmlab/mmaction2/pull/415)) +- Support flip with label mapping ([#591](https://github.com/open-mmlab/mmaction2/pull/591)) +- Add seed option for sampler ([#642](https://github.com/open-mmlab/mmaction2/pull/642)) +- Support GPU Normalize ([#586](https://github.com/open-mmlab/mmaction2/pull/586)) +- Support TANet ([#595](https://github.com/open-mmlab/mmaction2/pull/595)) + +**Improvements** + +- Training custom classes of ava dataset ([#555](https://github.com/open-mmlab/mmaction2/pull/555)) +- Add CN README in homepage ([#592](https://github.com/open-mmlab/mmaction2/pull/592), [#594](https://github.com/open-mmlab/mmaction2/pull/594)) +- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625)) +- Refactor config: Specify `train_cfg` and `test_cfg` in `model` ([#629](https://github.com/open-mmlab/mmaction2/pull/629)) +- Provide an alternative way to download older kinetics annotations 
([#597](https://github.com/open-mmlab/mmaction2/pull/597)) +- Update FAQ for + - 1). data pipeline about video and frames ([#598](https://github.com/open-mmlab/mmaction2/pull/598)) + - 2). how to show results ([#598](https://github.com/open-mmlab/mmaction2/pull/598)) + - 3). batch size setting for batchnorm ([#657](https://github.com/open-mmlab/mmaction2/pull/657)) + - 4). how to fix stages of backbone when finetuning models ([#658](https://github.com/open-mmlab/mmaction2/pull/658)) +- Modify default value of `save_best` ([#600](https://github.com/open-mmlab/mmaction2/pull/600)) +- Use BibTex rather than latex in markdown ([#607](https://github.com/open-mmlab/mmaction2/pull/607)) +- Add warnings of uninstalling mmdet and supplementary documents ([#624](https://github.com/open-mmlab/mmaction2/pull/624)) +- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625)) + +**Bug and Typo Fixes** + +- Fix value of `pem_low_temporal_iou_threshold` in BSN ([#556](https://github.com/open-mmlab/mmaction2/pull/556)) +- Fix ActivityNet download script ([#601](https://github.com/open-mmlab/mmaction2/pull/601)) + +**ModelZoo** + +- Add TSM-MobileNetV2 for Kinetics400 ([#415](https://github.com/open-mmlab/mmaction2/pull/415)) +- Add deeper SlowFast models ([#605](https://github.com/open-mmlab/mmaction2/pull/605)) + +## 0.11.0 (31/01/2021) + +**Highlights** + +- Support imgaug +- Support spatial temporal demo +- Refactor EvalHook, config structure, unittest structure + +**New Features** + +- Support [imgaug](https://imgaug.readthedocs.io/en/latest/index.html) for augmentations in the data pipeline ([#492](https://github.com/open-mmlab/mmaction2/pull/492)) +- Support setting `max_testing_views` for extremely large models to save GPU memory used ([#511](https://github.com/open-mmlab/mmaction2/pull/511)) +- Add spatial temporal demo ([#547](https://github.com/open-mmlab/mmaction2/pull/547), [#566](https://github.com/open-mmlab/mmaction2/pull/566)) + +**Improvements** + +- Refactor EvalHook ([#395](https://github.com/open-mmlab/mmaction2/pull/395)) +- Refactor AVA hook ([#567](https://github.com/open-mmlab/mmaction2/pull/567)) +- Add repo citation ([#545](https://github.com/open-mmlab/mmaction2/pull/545)) +- Add dataset size of Kinetics400 ([#503](https://github.com/open-mmlab/mmaction2/pull/503)) +- Add lazy operation docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504)) +- Add class_weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509)) +- add some explanation about the resampling in slowfast ([#502](https://github.com/open-mmlab/mmaction2/pull/502)) +- Modify paper title in README.md ([#512](https://github.com/open-mmlab/mmaction2/pull/512)) +- Add alternative ways to download Kinetics ([#521](https://github.com/open-mmlab/mmaction2/pull/521)) +- Add OpenMMLab projects link in README ([#530](https://github.com/open-mmlab/mmaction2/pull/530)) +- Change default preprocessing to shortedge to 256 ([#538](https://github.com/open-mmlab/mmaction2/pull/538)) +- Add config tag in dataset README ([#540](https://github.com/open-mmlab/mmaction2/pull/540)) +- Add solution for markdownlint installation issue ([#497](https://github.com/open-mmlab/mmaction2/pull/497)) +- Add dataset overview in readthedocs ([#548](https://github.com/open-mmlab/mmaction2/pull/548)) +- Modify the trigger mode of the warnings of missing mmdet ([#583](https://github.com/open-mmlab/mmaction2/pull/583)) +- Refactor config structure 
([#488](https://github.com/open-mmlab/mmaction2/pull/488), [#572](https://github.com/open-mmlab/mmaction2/pull/572)) +- Refactor unittest structure ([#433](https://github.com/open-mmlab/mmaction2/pull/433)) + +**Bug and Typo Fixes** + +- Fix a bug about ava dataset validation ([#527](https://github.com/open-mmlab/mmaction2/pull/527)) +- Fix a bug about ResNet pretrain weight initialization ([#582](https://github.com/open-mmlab/mmaction2/pull/582)) +- Fix a bug in CI due to MMCV index ([#495](https://github.com/open-mmlab/mmaction2/pull/495)) +- Remove invalid links of MiT and MMiT ([#516](https://github.com/open-mmlab/mmaction2/pull/516)) +- Fix frame rate bug for AVA preparation ([#576](https://github.com/open-mmlab/mmaction2/pull/576)) + +**ModelZoo** + +## 0.10.0 (31/12/2020) + +**Highlights** + +- Support Spatio-Temporal Action Detection (AVA) +- Support precise BN + +**New Features** + +- Support precise BN ([#501](https://github.com/open-mmlab/mmaction2/pull/501/)) +- Support Spatio-Temporal Action Detection (AVA) ([#351](https://github.com/open-mmlab/mmaction2/pull/351)) +- Support to return feature maps in `inference_recognizer` ([#458](https://github.com/open-mmlab/mmaction2/pull/458)) + +**Improvements** + +- Add arg `stride` to long_video_demo.py, to make inference faster ([#468](https://github.com/open-mmlab/mmaction2/pull/468)) +- Support training and testing for Spatio-Temporal Action Detection ([#351](https://github.com/open-mmlab/mmaction2/pull/351)) +- Fix CI due to pip upgrade ([#454](https://github.com/open-mmlab/mmaction2/pull/454)) +- Add markdown lint in pre-commit hook ([#255](https://github.com/open-mmlab/mmaction2/pull/225)) +- Speed up confusion matrix calculation ([#465](https://github.com/open-mmlab/mmaction2/pull/465)) +- Use title case in modelzoo statistics ([#456](https://github.com/open-mmlab/mmaction2/pull/456)) +- Add FAQ documents for easy troubleshooting. ([#413](https://github.com/open-mmlab/mmaction2/pull/413), [#420](https://github.com/open-mmlab/mmaction2/pull/420), [#439](https://github.com/open-mmlab/mmaction2/pull/439)) +- Support Spatio-Temporal Action Detection with context ([#471](https://github.com/open-mmlab/mmaction2/pull/471)) +- Add class weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509)) +- Add Lazy OPs docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504)) + +**Bug and Typo Fixes** + +- Fix typo in default argument of BaseHead ([#446](https://github.com/open-mmlab/mmaction2/pull/446)) +- Fix potential bug about `output_config` overwrite ([#463](https://github.com/open-mmlab/mmaction2/pull/463)) + +**ModelZoo** + +- Add SlowOnly, SlowFast for AVA2.1 ([#351](https://github.com/open-mmlab/mmaction2/pull/351)) + +## 0.9.0 (30/11/2020) + +**Highlights** + +- Support GradCAM utils for recognizers +- Support ResNet Audio model + +**New Features** + +- Automatically add modelzoo statistics to readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327)) +- Support GYM99 ([#331](https://github.com/open-mmlab/mmaction2/pull/331), [#336](https://github.com/open-mmlab/mmaction2/pull/336)) +- Add AudioOnly Pathway from AVSlowFast. 
([#355](https://github.com/open-mmlab/mmaction2/pull/355)) +- Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324)) +- Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345)) +- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291)) + +**Improvements** + +- Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312)) +- Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274)) +- Update docs bout test crops ([#359](https://github.com/open-mmlab/mmaction2/pull/359)) +- Polish code format using pylint manually ([#338](https://github.com/open-mmlab/mmaction2/pull/338)) +- Update unittest coverage ([#358](https://github.com/open-mmlab/mmaction2/pull/358), [#322](https://github.com/open-mmlab/mmaction2/pull/322), [#325](https://github.com/open-mmlab/mmaction2/pull/325)) +- Add random seed for building filelists ([#323](https://github.com/open-mmlab/mmaction2/pull/323)) +- Update colab tutorial ([#367](https://github.com/open-mmlab/mmaction2/pull/367)) +- set default batch_size of evaluation and testing to 1 ([#250](https://github.com/open-mmlab/mmaction2/pull/250)) +- Rename the preparation docs to `README.md` ([#388](https://github.com/open-mmlab/mmaction2/pull/388)) +- Move docs about demo to `demo/README.md` ([#329](https://github.com/open-mmlab/mmaction2/pull/329)) +- Remove redundant code in `tools/test.py` ([#310](https://github.com/open-mmlab/mmaction2/pull/310)) +- Automatically calculate number of test clips for Recognizer2D ([#359](https://github.com/open-mmlab/mmaction2/pull/359)) + +**Bug and Typo Fixes** + +- Fix rename Kinetics classnames bug ([#384](https://github.com/open-mmlab/mmaction2/pull/384)) +- Fix a bug in BaseDataset when `data_prefix` is None ([#314](https://github.com/open-mmlab/mmaction2/pull/314)) +- Fix a bug about `tmp_folder` in `OpenCVInit` ([#357](https://github.com/open-mmlab/mmaction2/pull/357)) +- Fix `get_thread_id` when not using disk as backend ([#354](https://github.com/open-mmlab/mmaction2/pull/354), [#357](https://github.com/open-mmlab/mmaction2/pull/357)) +- Fix the bug of HVU object `num_classes` from 1679 to 1678 ([#307](https://github.com/open-mmlab/mmaction2/pull/307)) +- Fix typo in `export_model.md` ([#399](https://github.com/open-mmlab/mmaction2/pull/399)) +- Fix OmniSource training configs ([#321](https://github.com/open-mmlab/mmaction2/pull/321)) +- Fix Issue #306: Bug of SampleAVAFrames ([#317](https://github.com/open-mmlab/mmaction2/pull/317)) + +**ModelZoo** + +- Add SlowOnly model for GYM99, both RGB and Flow ([#336](https://github.com/open-mmlab/mmaction2/pull/336)) +- Add auto modelzoo statistics in readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327)) +- Add TSN for HMDB51 pretrained on Kinetics400, Moments in Time and ImageNet. 
([#372](https://github.com/open-mmlab/mmaction2/pull/372)) + +## v0.8.0 (31/10/2020) + +**Highlights** + +- Support [OmniSource](https://arxiv.org/abs/2003.13042) +- Support C3D +- Support video recognition with audio modality +- Support HVU +- Support X3D + +**New Features** + +- Support AVA dataset preparation ([#266](https://github.com/open-mmlab/mmaction2/pull/266)) +- Support the training of video recognition dataset with multiple tag categories ([#235](https://github.com/open-mmlab/mmaction2/pull/235)) +- Support joint training with multiple training datasets of multiple formats, including images, untrimmed videos, etc. ([#242](https://github.com/open-mmlab/mmaction2/pull/242)) +- Support to specify a start epoch to conduct evaluation ([#216](https://github.com/open-mmlab/mmaction2/pull/216)) +- Implement X3D models, support testing with model weights converted from SlowFast ([#288](https://github.com/open-mmlab/mmaction2/pull/288)) +- Support specify a start epoch to conduct evaluation ([#216](https://github.com/open-mmlab/mmaction2/pull/216)) + +**Improvements** + +- Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232)) +- Extend HVU datatools to generate individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258)) +- Support data preparation for Kinetics-600 and Kinetics-700 ([#254](https://github.com/open-mmlab/mmaction2/pull/254)) +- Use `metric_dict` to replace hardcoded arguments in `evaluate` function ([#286](https://github.com/open-mmlab/mmaction2/pull/286)) +- Add `cfg-options` in arguments to override some settings in the used config for convenience ([#212](https://github.com/open-mmlab/mmaction2/pull/212)) +- Rename the old evaluating protocol `mean_average_precision` as `mmit_mean_average_precision` since it is only used on MMIT and is not the `mAP` we usually talk about. 
Add `mean_average_precision`, which is the real `mAP` ([#235](https://github.com/open-mmlab/mmaction2/pull/235))
+- Add accurate setting (Three crop * 2 clip) and report corresponding performance for TSM model ([#241](https://github.com/open-mmlab/mmaction2/pull/241))
+- Add citations in each preparing_dataset.md in `tools/data/dataset` ([#289](https://github.com/open-mmlab/mmaction2/pull/289))
+- Update the performance of audio-visual fusion on Kinetics-400 ([#281](https://github.com/open-mmlab/mmaction2/pull/281))
+- Support data preparation of OmniSource web datasets, including GoogleImage, InsImage, InsVideo and KineticsRawVideo ([#294](https://github.com/open-mmlab/mmaction2/pull/294))
+- Use `metric_options` dict to provide metric args in `evaluate` ([#286](https://github.com/open-mmlab/mmaction2/pull/286))
+
+**Bug Fixes**
+
+- Register `FrameSelector` in `PIPELINES` ([#268](https://github.com/open-mmlab/mmaction2/pull/268))
+- Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245))
+- Fix multi-node dist test ([#292](https://github.com/open-mmlab/mmaction2/pull/292))
+- Fix the data preparation bug for `something-something` dataset ([#278](https://github.com/open-mmlab/mmaction2/pull/278))
+- Fix the invalid config url in slowonly README data benchmark ([#249](https://github.com/open-mmlab/mmaction2/pull/249))
+- Validate that the performance of models trained with videos has no significant difference compared to the performance of models trained with rawframes ([#256](https://github.com/open-mmlab/mmaction2/pull/256))
+- Correct the `img_norm_cfg` used by the TSN-3seg-R50 UCF-101 model, improving the Top-1 accuracy by 3% ([#273](https://github.com/open-mmlab/mmaction2/pull/273))
+
+**ModelZoo**
+
+- Add Baselines for Kinetics-600 and Kinetics-700, including TSN-R50-8seg and SlowOnly-R50-8x8 ([#259](https://github.com/open-mmlab/mmaction2/pull/259))
+- Add OmniSource benchmark on MiniKinetics ([#296](https://github.com/open-mmlab/mmaction2/pull/296))
+- Add Baselines for HVU, including TSN-R18-8seg on 6 tag categories of HVU ([#287](https://github.com/open-mmlab/mmaction2/pull/287))
+- Add X3D models ported from [SlowFast](https://github.com/facebookresearch/SlowFast/) ([#288](https://github.com/open-mmlab/mmaction2/pull/288))
+
+## v0.7.0 (30/9/2020)
+
+**Highlights**
+
+- Support TPN
+- Support JHMDB, UCF101-24, HVU dataset preparation
+- Support ONNX model conversion
+
+**New Features**
+
+- Support the data pre-processing pipeline for the HVU Dataset ([#277](https://github.com/open-mmlab/mmaction2/pull/227/))
+- Support real-time action recognition from a web camera ([#171](https://github.com/open-mmlab/mmaction2/pull/171))
+- Support ONNX ([#160](https://github.com/open-mmlab/mmaction2/pull/160))
+- Support UCF101-24 preparation ([#219](https://github.com/open-mmlab/mmaction2/pull/219))
+- Support evaluating mAP for ActivityNet with [CUHK17_activitynet_pred](http://activity-net.org/challenges/2017/evaluation.html) ([#176](https://github.com/open-mmlab/mmaction2/pull/176))
+- Add the data pipeline for ActivityNet, including downloading videos, extracting RGB and Flow frames, finetuning TSN and extracting features ([#190](https://github.com/open-mmlab/mmaction2/pull/190))
+- Support JHMDB preparation ([#220](https://github.com/open-mmlab/mmaction2/pull/220))
+
+**ModelZoo**
+
+- Add finetuning setting for SlowOnly ([#173](https://github.com/open-mmlab/mmaction2/pull/173))
+- Add TSN and SlowOnly models trained with
[OmniSource](https://arxiv.org/abs/2003.13042), which achieve 75.7% Top-1 with TSN-R50-3seg and 80.4% Top-1 with SlowOnly-R101-8x8 ([#215](https://github.com/open-mmlab/mmaction2/pull/215)) + +**Improvements** + +- Support demo with video url ([#165](https://github.com/open-mmlab/mmaction2/pull/165)) +- Support multi-batch when testing ([#184](https://github.com/open-mmlab/mmaction2/pull/184)) +- Add tutorial for adding a new learning rate updater ([#181](https://github.com/open-mmlab/mmaction2/pull/181)) +- Add config name in meta info ([#183](https://github.com/open-mmlab/mmaction2/pull/183)) +- Remove git hash in `__version__` ([#189](https://github.com/open-mmlab/mmaction2/pull/189)) +- Check mmcv version ([#189](https://github.com/open-mmlab/mmaction2/pull/189)) +- Update url with 'https://download.openmmlab.com' ([#208](https://github.com/open-mmlab/mmaction2/pull/208)) +- Update Docker file to support PyTorch 1.6 and update `install.md` ([#209](https://github.com/open-mmlab/mmaction2/pull/209)) +- Polish readsthedocs display ([#217](https://github.com/open-mmlab/mmaction2/pull/217), [#229](https://github.com/open-mmlab/mmaction2/pull/229)) + +**Bug Fixes** + +- Fix the bug when using OpenCV to extract only RGB frames with original shape ([#184](https://github.com/open-mmlab/mmaction2/pull/187)) +- Fix the bug of sthv2 `num_classes` from 339 to 174 ([#174](https://github.com/open-mmlab/mmaction2/pull/174), [#207](https://github.com/open-mmlab/mmaction2/pull/207)) + +## v0.6.0 (2/9/2020) + +**Highlights** + +- Support TIN, CSN, SSN, NonLocal +- Support FP16 training + +**New Features** + +- Support NonLocal module and provide ckpt in TSM and I3D ([#41](https://github.com/open-mmlab/mmaction2/pull/41)) +- Support SSN ([#33](https://github.com/open-mmlab/mmaction2/pull/33), [#37](https://github.com/open-mmlab/mmaction2/pull/37), [#52](https://github.com/open-mmlab/mmaction2/pull/52), [#55](https://github.com/open-mmlab/mmaction2/pull/55)) +- Support CSN ([#87](https://github.com/open-mmlab/mmaction2/pull/87)) +- Support TIN ([#53](https://github.com/open-mmlab/mmaction2/pull/53)) +- Support HMDB51 dataset preparation ([#60](https://github.com/open-mmlab/mmaction2/pull/60)) +- Support encoding videos from frames ([#84](https://github.com/open-mmlab/mmaction2/pull/84)) +- Support FP16 training ([#25](https://github.com/open-mmlab/mmaction2/pull/25)) +- Enhance demo by supporting rawframe inference ([#59](https://github.com/open-mmlab/mmaction2/pull/59)), output video/gif ([#72](https://github.com/open-mmlab/mmaction2/pull/72)) + +**ModelZoo** + +- Update Slowfast modelzoo ([#51](https://github.com/open-mmlab/mmaction2/pull/51)) +- Update TSN, TSM video checkpoints ([#50](https://github.com/open-mmlab/mmaction2/pull/50)) +- Add data benchmark for TSN ([#57](https://github.com/open-mmlab/mmaction2/pull/57)) +- Add data benchmark for SlowOnly ([#77](https://github.com/open-mmlab/mmaction2/pull/77)) +- Add BSN/BMN performance results with feature extracted by our codebase ([#99](https://github.com/open-mmlab/mmaction2/pull/99)) + +**Improvements** + +- Polish data preparation codes ([#70](https://github.com/open-mmlab/mmaction2/pull/70)) +- Improve data preparation scripts ([#58](https://github.com/open-mmlab/mmaction2/pull/58)) +- Improve unittest coverage and minor fix ([#62](https://github.com/open-mmlab/mmaction2/pull/62)) +- Support PyTorch 1.6 in CI ([#117](https://github.com/open-mmlab/mmaction2/pull/117)) +- Support `with_offset` for rawframe dataset 
([#48](https://github.com/open-mmlab/mmaction2/pull/48))
+- Support JSON annotation files ([#119](https://github.com/open-mmlab/mmaction2/pull/119))
+- Support `multi-class` in TSMHead ([#104](https://github.com/open-mmlab/mmaction2/pull/104))
+- Support using `val_step()` to validate data for each `val` workflow ([#123](https://github.com/open-mmlab/mmaction2/pull/123))
+- Use `xxInit()` method to get `total_frames` and make `total_frames` a required key ([#90](https://github.com/open-mmlab/mmaction2/pull/90))
+- Add paper introduction in model readme ([#140](https://github.com/open-mmlab/mmaction2/pull/140))
+- Adjust the directory structure of `tools/` and rename some script files ([#142](https://github.com/open-mmlab/mmaction2/pull/142))
+
+**Bug Fixes**
+
+- Fix configs for localization test ([#67](https://github.com/open-mmlab/mmaction2/pull/67))
+- Fix configs of SlowOnly by fixing lr for 8 GPUs ([#136](https://github.com/open-mmlab/mmaction2/pull/136))
+- Fix the bug in analyze_log ([#54](https://github.com/open-mmlab/mmaction2/pull/54))
+- Fix the bug of generating HMDB51 class index file ([#69](https://github.com/open-mmlab/mmaction2/pull/69))
+- Fix the bug of using `load_checkpoint()` in ResNet ([#93](https://github.com/open-mmlab/mmaction2/pull/93))
+- Fix the bug of `--work-dir` when using the slurm training script ([#110](https://github.com/open-mmlab/mmaction2/pull/110))
+- Correct the sthv1/sthv2 rawframes filelist generation command ([#71](https://github.com/open-mmlab/mmaction2/pull/71))
+- Fix `CosineAnnealing` typo ([#47](https://github.com/open-mmlab/mmaction2/pull/47))
+
+## v0.5.0 (9/7/2020)
+
+**Highlights**
+
+- MMAction2 is released
+
+**New Features**
+
+- Support various datasets: UCF101, Kinetics-400, Something-Something V1&V2, Moments in Time,
+  Multi-Moments in Time, THUMOS14
+- Support various action recognition methods: TSN, TSM, R(2+1)D, I3D, SlowOnly, SlowFast, Non-local
+- Support various action localization methods: BSN, BMN
+- Colab demo for action recognition
diff --git a/docs/en/notes/ecosystem.md b/docs/en/notes/ecosystem.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f1b2a784d7e20a5cae0d95e0d3838ba20d3feb7
--- /dev/null
+++ b/docs/en/notes/ecosystem.md
@@ -0,0 +1,24 @@
+# Ecosystem Projects based on MMAction2
+
+There are many research works and projects built on MMAction2.
+We list some of them as examples of how to extend MMAction2 for your own projects.
+As this page may not be complete, please feel free to create a PR to update it.
+
+## Projects as an extension
+
+- [OTEAction2](https://github.com/openvinotoolkit/mmaction2): OpenVINO Training Extensions for Action Recognition.
+- [PYSKL](https://github.com/kennymckormick/pyskl): A Toolbox Focusing on Skeleton-Based Action Recognition.
+
+## Projects of papers
+
+There are also projects released with papers.
+Some of the papers were published in top-tier conferences (CVPR, ICCV, and ECCV), while the others are also highly influential.
+To make this list also a reference for the community to develop and compare new video understanding algorithms, we list them in the chronological order of top-tier conferences.
+Methods already supported and maintained by MMAction2 are not listed.
+
+- Video Swin Transformer, CVPR 2022. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer)
+- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 Oral.
[\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR)
+- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 Oral. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS)
+- MGSampler: An Explainable Sampling Strategy for Video Action Recognition, ICCV 2021. [\[paper\]](https://arxiv.org/abs/2104.09952)[\[github\]](https://github.com/MCG-NJU/MGSampler)
+- MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions, ICCV 2021. [\[paper\]](https://arxiv.org/abs/2105.07404)
+- Long Short-Term Transformer for Online Action Detection, NeurIPS 2021. [\[paper\]](https://arxiv.org/abs/2107.03377)[\[github\]](https://github.com/amazon-research/long-short-term-transformer)
diff --git a/docs/en/notes/pytorch2.0.md b/docs/en/notes/pytorch2.0.md
new file mode 100644
index 0000000000000000000000000000000000000000..09499beacd30f21384ebf64ab62e2607a2675d11
--- /dev/null
+++ b/docs/en/notes/pytorch2.0.md
@@ -0,0 +1,21 @@
+# PyTorch 2.0 Compatibility and Benchmark
+
+PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speed up training & validation. We provide benchmark results and compatibility information for typical models in MMAction2. Except for one model (MViT) that fails to compile, the performance of the other models remains consistent before and after compilation.
+
+| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric |
+| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 |
+| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 |
+| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 |
+| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible |
diff --git a/docs/en/project_zoo.py b/docs/en/project_zoo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef5909e41e5680a7d12c98177df4e031e7c55bd1
--- /dev/null
+++ b/docs/en/project_zoo.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+from pathlib import Path
+
+from utils import replace_link
+
+# This script reads /projects/*/README.md and generates projectzoo.md
+
+all_files = list(Path('../../projects/').glob('*/README.md'))
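+# Put the example project at the front of the list so it appears first in the generated page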
+example_project = '../../projects/example_project/README.md' +all_files.remove(Path(example_project)) +all_files.insert(0, Path(example_project)) + +project_zoo = open('../../projects/README.md').read() +for file in all_files: + with open(file) as f: + content = f.read() + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + + project_zoo += content + +with open('projectzoo.md', 'w') as f: + f.write(project_zoo) diff --git a/docs/en/stat.py b/docs/en/stat.py new file mode 100644 index 0000000000000000000000000000000000000000..350fa113e9bd80a8a6b81cd2874a138f7e7cbe06 --- /dev/null +++ b/docs/en/stat.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +import re +import shutil +from collections import defaultdict +from pathlib import Path + +from modelindex.load_model_index import load +from modelindex.models.Result import Result +from tabulate import tabulate +from utils import replace_link + +MMACT_ROOT = Path(__file__).absolute().parents[2] +PAPERS_ROOT = Path('model_zoo') # Path to save generated paper pages. +GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' +MODELZOO_TEMPLATE = """\ +# Model Zoo Summary + +In this page, we list [all algorithms](#all-supported-algorithms) we support. You can click the link to jump to the corresponding model pages. + +And we also list all checkpoints for different tasks we provide. You can sort or search checkpoints in the table and click the corresponding link to model pages for more details. + +## All supported algorithms + +* Number of papers: {num_papers} +{type_msg} + +* Number of checkpoints: {num_ckpts} +{paper_msg} + +""" # noqa: E501 + +METRIC_ALIAS = { + 'Top 1 Accuracy': 'Top-1 (%)', + 'Top 5 Accuracy': 'Top-5 (%)', +} + +TASK_MAP = dict( + detection='Spatio Temporal Action Detection Models', + localization='Action Localization Models', + recognition='Action Recognition Models', + skeleton='Skeleton-based Action Recognition Models', + retrieval='Video Retrieval Models', + recognition_audio='Audio-based Action Recognition Models') + +model_index = load(str(MMACT_ROOT / 'model-index.yml')) + + +def build_collections(model_index): + # add models for collections + col_by_name = {} + for col in model_index.collections: + setattr(col, 'models', []) + col_by_name[col.name] = col + + for model in model_index.models: + col = col_by_name[model.in_collection] + col.models.append(model) + setattr(model, 'collection', col) + if model.results is None: + setattr(model, 'tasks', []) + else: + setattr(model, 'tasks', [result.task for result in model.results]) + + +build_collections(model_index) + +# save a map from model name to title in README +model2title = dict() + + +def count_papers(collections): + total_num_ckpts = 0 + type_count = defaultdict(int) + paper_msgs = [] + + for collection in collections: + with open(MMACT_ROOT / collection.readme) as f: + readme = f.read() + + ckpts = set(x.lower().strip() + for x in re.findall(r'\[ckpt.*\]\((https?.*)\)', readme)) + total_num_ckpts += len(ckpts) + title = collection.paper['Title'] + papertype = collection.data.get('type', 'Algorithm') + type_count[papertype] += 1 + + readme_title = re.search(r'^#\s+.+', readme) + + readme = Path(collection.filepath).parents[1].with_suffix('.md').name + model = Path(collection.filepath).parent.name + model2title[model] = readme_title.group()[2:].replace(' ', '-') + paper_msgs.append(f'\t- [{papertype}] [{title}]({PAPERS_ROOT / readme}' + 
f'#{model2title[model]}) ({len(ckpts)} ckpts)') + + type_msg = '\n'.join( + [f'\t- {type_}: {count}' for type_, count in type_count.items()]) + paper_msg = '\n'.join(paper_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_papers=len(collections), + num_ckpts=total_num_ckpts, + type_msg=type_msg, + paper_msg=paper_msg, + ) + + with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +count_papers(model_index.collections) + + +def generate_paper_page(collection): + + # Write a copy of README + with open(MMACT_ROOT / collection.readme) as f: + content = f.read() + readme_path = Path(collection.filepath) + copy = PAPERS_ROOT / readme_path.parents[1].with_suffix('.md').name + if not copy.exists(): + with open(copy, 'w') as copy_file: + task = readme_path.parents[1].name + head_content = f'# {TASK_MAP[task]}\n' + copy_file.write(head_content) + + def lower_heading(match): + return '#' + match.group() + + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + Path(collection.readme)) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + Path(collection.readme)) + + content = re.sub(r'^#+\s+.+', lower_heading, content, flags=re.M) + + with open(copy, 'a') as copy_file: + copy_file.write(content) + + +if PAPERS_ROOT.exists(): + shutil.rmtree(PAPERS_ROOT) +PAPERS_ROOT.mkdir(exist_ok=True) +for collection in model_index.collections: + generate_paper_page(collection) + + +def scatter_results(models): + model_result_pairs = [] + for model in models: + if model.results is None: + result = Result(task=None, dataset=None, metrics={}) + model_result_pairs.append((model, result)) + else: + for result in model.results: + model_result_pairs.append((model, result)) + return model_result_pairs + + +def generate_summary_table(task, model_result_pairs, title=None): + metrics = set() + for model, result in model_result_pairs: + if result.task == task: + metrics = metrics.union(result.metrics.keys()) + metrics = sorted(list(metrics)) + + rows = [] + + def convert2float(number): + units = {'M': 1e6, 'G': 1e9, 'T': 1e12} + if isinstance(number, str): + num = float(number.rstrip('MGT')) + number = num * units[number[-1]] + return number + + for model, result in model_result_pairs: + if result.task != task: + continue + name = model.name + if model.metadata.parameters is not None: + params = convert2float(model.metadata.parameters) + params = f'{params / 1e6:.2f}' # Params + else: + params = None + if model.metadata.flops is not None: + flops = convert2float(model.metadata.flops) + flops = f'{flops / 1e9:.2f}' # Flops + else: + flops = None + + readme = Path( + model.collection.filepath).parents[1].with_suffix('.md').name + model = Path(model.collection.filepath).parent.name + page = f'[link]({PAPERS_ROOT / readme}#{model2title[model]})' + model_metrics = [] + for metric in metrics: + model_metrics.append(str(result.metrics.get(metric, ''))) + + rows.append([name, params, flops, *model_metrics, page]) + + with open('modelzoo_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: model-summary\n""") + header = [ + 'Model', + 'Params (M)', + 'Flops (G)', + *[METRIC_ALIAS.get(metric, metric) for metric in metrics], + 'Readme', + ] + table_cfg = dict( + tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(rows, header, **table_cfg)) + f.write('\n```\n') + + +def generate_dataset_wise_table(task, model_result_pairs, title=None): + dataset_rows = defaultdict(list) + for model, 
result in model_result_pairs: + if result.task == task: + dataset_rows[result.dataset].append((model, result)) + + if title is not None: + with open('modelzoo_statistics.md', 'a') as f: + f.write(f'\n{title}') + for dataset, pairs in dataset_rows.items(): + generate_summary_table(task, pairs, title=f'### {dataset}') + + +model_result_pairs = scatter_results(model_index.models) + +# Generate Action Recognition Summary +generate_dataset_wise_table( + task='Action Recognition', + model_result_pairs=model_result_pairs, + title='## Action Recognition', +) + +# Generate Action Detection Summary +generate_dataset_wise_table( + task='Action Detection', + model_result_pairs=model_result_pairs, + title='## Action Detection', +) + +# Generate Skeleton-based Action Recognition Summary +generate_dataset_wise_table( + task='Skeleton-based Action Recognition', + model_result_pairs=model_result_pairs, + title='## Skeleton-based Action Recognition', +) + +# Generate Video Retrieval Summary +generate_dataset_wise_table( + task='Video Retrieval', + model_result_pairs=model_result_pairs, + title='## Video Retrieval', +) + +# Generate Temporal Action Localization Summary +generate_dataset_wise_table( + task='Temporal Action Localization', + model_result_pairs=model_result_pairs, + title='## Temporal Action Localization', +) diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md new file mode 100644 index 0000000000000000000000000000000000000000..88b3a3777af732797f98e5cba78c68808fa655c2 --- /dev/null +++ b/docs/en/switch_language.md @@ -0,0 +1,3 @@ +## English + +## ็ฎ€ไฝ“ไธญๆ–‡ diff --git a/docs/en/useful_tools.md b/docs/en/useful_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..8805b31e00fbcfb35145ba2d8cf6c2f1cdcbf354 --- /dev/null +++ b/docs/en/useful_tools.md @@ -0,0 +1,92 @@ +# Useful Tools + +Apart from training/testing scripts, We provide lots of useful tools under the `tools/` directory. + +## Useful Tools Link + + + +- [Useful Tools](#useful-tools) + - [Useful Tools Link](#useful-tools-link) + - [Model Conversion](#model-conversion) + - [Prepare a model for publishing](#prepare-a-model-for-publishing) + - [Miscellaneous](#miscellaneous) + - [Evaluating a metric](#evaluating-a-metric) + - [Print the entire config](#print-the-entire-config) + - [Check videos](#check-videos) + - [Multi-Stream Fusion](#multi-stream-fusion) + + + +## Model Conversion + +### Prepare a model for publishing + +`tools/deployment/publish_model.py` helps users to prepare their model for publishing. + +Before you upload a model to AWS, you may want to: + +(1) convert model weights to CPU tensors. +(2) delete the optimizer states. +(3) compute the hash of the checkpoint file and append the hash id to the filename. + +```shell +python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +E.g., + +```shell +python tools/deployment/publish_model.py work_dirs/tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth +``` + +The final output filename will be `tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb-{hash id}.pth`. + +## Miscellaneous + +### Evaluating a metric + +`tools/analysis_tools/eval_metric.py` evaluates certain metrics of the results saved in a file according to a config file. + +The saved result file is created on `tools/test.py` by setting the arguments `--out ${RESULT_FILE}` to indicate the result file, +which stores the final output of the whole model. 
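+
+For reference, a minimal sketch of producing such a result file with the `--out` flag described above (the config and checkpoint paths are placeholders):
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --out ${RESULT_FILE}
+```
+
+The metrics can then be evaluated on the dumped result file: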
+
+```shell
+python tools/analysis_tools/eval_metric.py ${CONFIG_FILE} ${RESULT_FILE} [--eval ${EVAL_METRICS}] [--cfg-options ${CFG_OPTIONS}] [--eval-options ${EVAL_OPTIONS}]
+```
+
+### Print the entire config
+
+`tools/analysis_tools/print_config.py` prints the whole config verbatim, expanding all its imports.
+
+```shell
+python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
+```
+
+### Check videos
+
+`tools/analysis_tools/check_videos.py` uses the specified video decoder to iterate over all samples specified by the input configuration file, looks for invalid videos (corrupted or missing), and saves the corresponding file paths to the output file. Please note that after deleting invalid videos, users need to regenerate the video file list.
+
+```shell
+python tools/analysis_tools/check_videos.py ${CONFIG} [-h] [--options OPTIONS [OPTIONS ...]] [--cfg-options CFG_OPTIONS [CFG_OPTIONS ...]] [--output-file OUTPUT_FILE] [--split SPLIT] [--decoder DECODER] [--num-processes NUM_PROCESSES] [--remove-corrupted-videos]
+```
+
+### Multi-Stream Fusion
+
+`tools/analysis_tools/report_accuracy.py` uses the dumped results (by setting `--dump res.pkl` when testing) to fuse the multi-stream prediction scores, i.e., late fusion.
+
+```shell
+python tools/analysis_tools/report_accuracy.py [--preds ${RESULT_PKL_1 [RESULT_PKL_2 ...]}] [--coefficients ${COEFFICIENT_1 [COEFFICIENT_2, ...]}] [--apply-softmax]
+```
+
+Take joint-bone fusion as an example, which is a common practice in skeleton-based action recognition.
+
+```shell
+python tools/analysis_tools/report_accuracy.py --preds demo/fuse/joint.pkl demo/fuse/bone.pkl --coefficients 1.0 1.0
+```
+
+```
+Mean Class Accuracy: 0.9180
+Top 1 Accuracy: 0.9333
+Top 5 Accuracy: 0.9833
+```
diff --git a/docs/en/user_guides/config.md b/docs/en/user_guides/config.md
new file mode 100644
index 0000000000000000000000000000000000000000..e86aab7fbf75af90ed69f0781fc3aff100ee03cc
--- /dev/null
+++ b/docs/en/user_guides/config.md
@@ -0,0 +1,706 @@
+# Learn about Configs
+
+We use Python files as configs and incorporate modular and inheritance design into our config system, which makes it convenient to conduct various experiments.
+You can find all the provided configs under `$MMAction2/configs`. If you wish to inspect the config file,
+you may run `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` to see the complete config.
+
+
+
+- [Learn about Configs](#learn-about-configs)
+  - [Modify config through script arguments](#modify-config-through-script-arguments)
+  - [Config File Structure](#config-file-structure)
+  - [Config File Naming Convention](#config-file-naming-convention)
+  - [Config System for Action Recognition](#config-system-for-action-recognition)
+  - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection)
+  - [Config System for Action localization](#config-system-for-action-localization)
+
+
+
+## Modify config through script arguments
+
+When submitting jobs using `tools/train.py` or `tools/test.py`, you may specify `--cfg-options` to modify the config in place.
+
+- Update config keys of dict.
+
+  The config options can be specified following the order of the dict keys in the original config.
+  For example, `--cfg-options model.backbone.norm_eval=False` changes all the BN modules in the model backbone to `train` mode.
+
+- Update keys inside a list of configs.
+
+  Some config dicts are composed as a list in your config.
For example, the training pipeline `train_pipeline` is normally a list, e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline,
+  you may specify `--cfg-options train_pipeline.0.type=DenseSampleFrames`.
+
+- Update values of list/tuples.
+
+  Sometimes the value to be updated is a list or a tuple. For example, the config file normally sets `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`. If you want to
+  change this key, you may specify `--cfg-options model.data_preprocessor.mean="[128,128,128]"`. Note that the quotation mark " is necessary to support list/tuple data types.
+
+## Config File Structure
+
+There are 3 basic component types under `configs/_base_`: models, schedules, and default_runtime.
+Many methods, such as TSN, I3D, and SlowOnly, can be easily constructed with one component of each type.
+The configs that are composed of components from `_base_` are called _primitive_.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum inheritance level is 3.
+
+For ease of understanding, we recommend contributors inherit from existing methods.
+For example, if some modification is made based on TSN, users may first inherit the basic TSN structure by specifying `_base_ = ../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`, then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder under `configs/TASK`.
+
+Please refer to [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) for detailed documentation.
+
+## Config File Naming Convention
+
+We follow the style below to name config files. Contributors are advised to follow the same style. The config file names are divided into several parts. Logically, different parts are concatenated by underscores `'_'`, and settings in the same part are concatenated by dashes `'-'`.
+
+```
+{algorithm info}_{module info}_{training info}_{data info}.py
+```
+
+`{xxx}` is a required field and `[yyy]` is optional.
+
+- `{algorithm info}`:
+  - `{model}`: model type, e.g. `tsn`, `i3d`, `swin`, `vit`, etc.
+  - `[model setting]`: specific setting for some models, e.g. `base`, `p16`, `w877`, etc.
+- `{module info}`:
+  - `[pretrained info]`: pretrained information, e.g. `kinetics400-pretrained`, `in1k-pre`, etc.
+  - `{backbone}`: backbone type, e.g. `r50` (ResNet-50), etc.
+  - `[backbone setting]`: specific setting for some backbones, e.g. `nl-dot-product`, `bnfrozen`, `nopool`, etc.
+- `{training info}`:
+  - `{gpu x batch_per_gpu}`: GPUs and samples per GPU.
+  - `{pipeline setting}`: frame sampling setting, e.g. `dense`, `{clip_len}x{frame_interval}x{num_clips}`, `u48`, etc.
+  - `{schedule}`: training schedule, e.g. `coslr-20e`.
+- `{data info}`:
+  - `{dataset}`: dataset name, e.g. `kinetics400`, `mmit`, etc.
+  - `{modality}`: data modality, e.g. `rgb`, `flow`, `keypoint-2d`, etc.
+
+### Config System for Action Recognition
+
+We incorporate modular design into our config system,
+which makes it convenient to conduct various experiments.
+
+- An Example of TSN
+
+  To help users get a basic idea of a complete config structure and the modules in an action recognition system,
+  we make brief comments on the config of TSN as follows.
+ For more detailed usage and alternative for per parameter in each module, please refer to the API documentation. + + ```python + # model settings + model = dict( # Config of the model + type='Recognizer2D', # Class name of the recognizer + backbone=dict( # Dict for backbone + type='ResNet', # Name of the backbone + pretrained='torchvision://resnet50', # The url/site of the pretrained model + depth=50, # Depth of ResNet model + norm_eval=False), # Whether to set BN layers to eval mode when training + cls_head=dict( # Dict for classification head + type='TSNHead', # Name of classification head + num_classes=400, # Number of classes to be classified. + in_channels=2048, # The input channels of classification head. + spatial_type='avg', # Type of pooling in spatial dimension + consensus=dict(type='AvgConsensus', dim=1), # Config of consensus module + dropout_ratio=0.4, # Probability in dropout layer + init_std=0.01, # Std value for linear layer initiation + average_clips='prob'), # Method to average multiple clip results + data_preprocessor=dict( # Dict for data preprocessor + type='ActionDataPreprocessor', # Name of data preprocessor + mean=[123.675, 116.28, 103.53], # Mean values of different channels to normalize + std=[58.395, 57.12, 57.375], # Std values of different channels to normalize + format_shape='NCHW'), # Final image shape format + # model training and testing settings + train_cfg=None, # Config of training hyperparameters for TSN + test_cfg=None) # Config for testing hyperparameters for TSN. + + # dataset settings + dataset_type = 'RawframeDataset' # Type of dataset for training, validation and testing + data_root = 'data/kinetics400/rawframes_train/' # Root path to data for training + data_root_val = 'data/kinetics400/rawframes_val/' # Root path to data for validation and testing + ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # Path to the annotation file for training + ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # Path to the annotation file for validation + ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # Path to the annotation file for testing + + train_pipeline = [ # Training data processing pipeline + dict( # Config of SampleFrames + type='SampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=1, # Frames of each sampled output clip + frame_interval=1, # Temporal interval of adjacent sampled frames + num_clips=3), # Number of clips to be sampled + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of MultiScaleCrop + type='MultiScaleCrop', # Multi scale crop pipeline, cropping images with a list of randomly selected scales + input_size=224, # Input size of the network + scales=(1, 0.875, 0.75, 0.66), # Scales of width and height to be selected + random_crop=False, # Whether to randomly sample cropping bbox + max_wh_scale_gap=1), # Maximum gap of w and h scale levels + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(224, 224), # The scale to resize images + keep_ratio=False), # Whether to resize with changing the aspect ratio + dict( # Config of Flip + type='Flip', # Flip Pipeline + flip_ratio=0.5), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given 
input_format + input_format='NCHW'), # Final image shape format + dict(type='PackActionInputs') # Config of PackActionInputs + ] + val_pipeline = [ # Validation data processing pipeline + dict( # Config of SampleFrames + type='SampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=1, # Frames of each sampled output clip + frame_interval=1, # Temporal interval of adjacent sampled frames + num_clips=3, # Number of clips to be sampled + test_mode=True), # Whether to set test mode in sampling + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of CenterCrop + type='CenterCrop', # Center crop pipeline, cropping the center area from images + crop_size=224), # The size to crop images + dict( # Config of Flip + type='Flip', # Flip pipeline + flip_ratio=0), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCHW'), # Final image shape format + dict(type='PackActionInputs') # Config of PackActionInputs + ] + test_pipeline = [ # Testing data processing pipeline + dict( # Config of SampleFrames + type='SampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=1, # Frames of each sampled output clip + frame_interval=1, # Temporal interval of adjacent sampled frames + num_clips=25, # Number of clips to be sampled + test_mode=True), # Whether to set test mode in sampling + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of TenCrop + type='TenCrop', # Ten crop pipeline, cropping ten area from images + crop_size=224), # The size to crop images + dict( # Config of Flip + type='Flip', # Flip pipeline + flip_ratio=0), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCHW'), # Final image shape format + dict(type='PackActionInputs') # Config of PackActionInputs + ] + + train_dataloader = dict( # Config of train dataloader + batch_size=32, # Batch size of each single GPU during training + num_workers=8, # Workers to pre-fetch data for each single GPU during training + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed + sampler=dict( + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. 
Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # Randomly shuffle the training data in each epoch + dataset=dict( # Config of train dataset + type=dataset_type, + ann_file=ann_file_train, # Path of annotation file + data_prefix=dict(img=data_root), # Prefix of frame path + pipeline=train_pipeline)) + val_dataloader = dict( # Config of validation dataloader + batch_size=1, # Batch size of each single GPU during validation + num_workers=8, # Workers to pre-fetch data for each single GPU during validation + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of validation dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(img=data_root_val), # Prefix of frame path + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = dict( # Config of test dataloader + batch_size=32, # Batch size of each single GPU during testing + num_workers=8, # Workers to pre-fetch data for each single GPU during testing + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of test dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(img=data_root_val), # Prefix of frame path + pipeline=test_pipeline, + test_mode=True)) + + # evaluation settings + val_evaluator = dict(type='AccMetric') # Config of validation evaluator + test_evaluator = val_evaluator # Config of testing evaluator + + train_cfg = dict( # Config of training loop + type='EpochBasedTrainLoop', # Name of training loop + max_epochs=100, # Total training epochs + val_begin=1, # The epoch that begins validating + val_interval=1) # Validation interval + val_cfg = dict( # Config of validation loop + type='ValLoop') # Name of validation loop + test_cfg = dict( # Config of testing loop + type='TestLoop') # Name of testing loop + + # learning policy + param_scheduler = [ # Parameter scheduler for updating optimizer parameters, support dict or list + dict(type='MultiStepLR', # Decays the learning rate once the number of epoch reaches one of the milestones + begin=0, # Step at which to start updating the learning rate + end=100, # Step at which to stop updating the learning rate + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + milestones=[40, 80], # Steps to decay the learning rate + gamma=0.1)] # Multiplicative factor of learning rate decay + + # optimizer + optim_wrapper = dict( # Config of optimizer wrapper + type='OptimWrapper', # Name of optimizer wrapper, switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Config of optimizer. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # Name of optimizer + lr=0.01, # Learning rate + momentum=0.9, # Momentum factor + weight_decay=0.0001), # Weight decay + clip_grad=dict(max_norm=40, norm_type=2)) # Config of gradient clip + + # runtime settings + default_scope = 'mmaction' # The default registry scope to find modules. 
Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # Hooks to execute default actions like updating model parameters and saving checkpoints. + runtime_info=dict(type='RuntimeInfoHook'), # The hook to updates runtime information into message hub + timer=dict(type='IterTimerHook'), # The logger used to record time spent during iteration + logger=dict( + type='LoggerHook', # The logger used to record logs during training/validation/testing phase + interval=20, # Interval to print the log + ignore_last=False), # Ignore the log of last iterations in each epoch + param_scheduler=dict(type='ParamSchedulerHook'), # The hook to update some hyper-parameters in optimizer + checkpoint=dict( + type='CheckpointHook', # The hook to save checkpoints periodically + interval=3, # The saving period + save_best='auto', # Specified metric to mearsure the best checkpoint during evaluation + max_keep_ckpts=3), # The maximum checkpoints to keep + sampler_seed=dict(type='DistSamplerSeedHook'), # Data-loading sampler for distributed training + sync_buffers=dict(type='SyncBuffersHook')) # Synchronize model buffers at the end of each epoch + env_cfg = dict( # Dict for setting environment + cudnn_benchmark=False, # Whether to enable cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # Parameters to setup multiprocessing + dist_cfg=dict(backend='nccl')) # Parameters to setup distributed environment, the port can also be set + + log_processor = dict( + type='LogProcessor', # Log processor used to format log information + window_size=20, # Default smooth interval + by_epoch=True) # Whether to format logs with epoch type + vis_backends = [ # List of visualization backends + dict(type='LocalVisBackend')] # Local visualization backend + visualizer = dict( # Config of visualizer + type='ActionVisualizer', # Name of visualizer + vis_backends=vis_backends) + log_level = 'INFO' # The level of logging + load_from = None # Load model checkpoint as a pre-trained model from a given path. This will not resume training. + resume = False # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`. + ``` + +### Config System for Spatio-Temporal Action Detection + +We incorporate modular design into our config system, which is convenient to conduct various experiments. + +- An Example of FastRCNN + + To help the users have a basic idea of a complete config structure and the modules in a spatio-temporal action detection system, + we make brief comments on the config of FastRCNN as the following. + For more detailed usage and alternative for per parameter in each module, please refer to the API documentation. 
+ + ```python + # model setting + model = dict( # Config of the model + type='FastRCNN', # Class name of the detector + _scope_='mmdet', # The scope of current config + backbone=dict( # Dict for backbone + type='ResNet3dSlowOnly', # Name of the backbone + depth=50, # Depth of ResNet model + pretrained=None, # The url/site of the pretrained model + pretrained2d=False, # If the pretrained model is 2D + lateral=False, # If the backbone is with lateral connections + num_stages=4, # Stages of ResNet model + conv1_kernel=(1, 7, 7), # Conv1 kernel size + conv1_stride_t=1, # Conv1 temporal stride + pool1_stride_t=1, # Pool1 temporal stride + spatial_strides=(1, 2, 2, 1)), # The spatial stride for each ResNet stage + roi_head=dict( # Dict for roi_head + type='AVARoIHead', # Name of the roi_head + bbox_roi_extractor=dict( # Dict for bbox_roi_extractor + type='SingleRoIExtractor3D', # Name of the bbox_roi_extractor + roi_layer_type='RoIAlign', # Type of the RoI op + output_size=8, # Output feature size of the RoI op + with_temporal_pool=True), # If temporal dim is pooled + bbox_head=dict( # Dict for bbox_head + type='BBoxHeadAVA', # Name of the bbox_head + in_channels=2048, # Number of channels of the input feature + num_classes=81, # Number of action classes + 1 + multilabel=True, # If the dataset is multilabel + dropout_ratio=0.5), # The dropout ratio used + data_preprocessor=dict( # Dict for data preprocessor + type='ActionDataPreprocessor', # Name of data preprocessor + mean=[123.675, 116.28, 103.53], # Mean values of different channels to normalize + std=[58.395, 57.12, 57.375], # Std values of different channels to normalize + format_shape='NCHW')), # Final image shape format + # model training and testing settings + train_cfg=dict( # Training config of FastRCNN + rcnn=dict( # Dict for rcnn training config + assigner=dict( # Dict for assigner + type='MaxIoUAssignerAVA', # Name of the assigner + pos_iou_thr=0.9, # IoU threshold for positive examples, > pos_iou_thr -> positive + neg_iou_thr=0.9, # IoU threshold for negative examples, < neg_iou_thr -> negative + min_pos_iou=0.9), # Minimum acceptable IoU for positive examples + sampler=dict( # Dict for sample + type='RandomSampler', # Name of the sampler + num=32, # Batch Size of the sampler + pos_fraction=1, # Positive bbox fraction of the sampler + neg_pos_ub=-1, # Upper bound of the ratio of num negative to num positive + add_gt_as_proposals=True), # Add gt bboxes as proposals + pos_weight=1.0)), # Loss weight of positive examples + test_cfg=dict(rcnn=None)) # Testing config of FastRCNN + + # dataset settings + dataset_type = 'AVADataset' # Type of dataset for training, validation and testing + data_root = 'data/ava/rawframes' # Root path to data + anno_root = 'data/ava/annotations' # Root path to annotations + + ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # Path to the annotation file for training + ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # Path to the annotation file for validation + + exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # Path to the exclude annotation file for training + exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # Path to the exclude annotation file for validation + + label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # Path to the label file + + proposal_file_train = f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # Path to the human detection proposals for training examples + proposal_file_val = 
f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # Path to the human detection proposals for validation examples + + train_pipeline = [ # Training data processing pipeline + dict( # Config of SampleFrames + type='AVASampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=4, # Frames of each sampled output clip + frame_interval=16), # Temporal interval of adjacent sampled frames + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of RandomRescale + type='RandomRescale', # Randomly rescale the shortedge by a given range + scale_range=(256, 320)), # The shortedge size range of RandomRescale + dict( # Config of RandomCrop + type='RandomCrop', # Randomly crop a patch with the given size + size=256), # The size of the cropped patch + dict( # Config of Flip + type='Flip', # Flip Pipeline + flip_ratio=0.5), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCTHW', # Final image shape format + collapse=True), # Collapse the dim N if N == 1 + dict(type='PackActionInputs') # Pack input data + ] + + val_pipeline = [ # Validation data processing pipeline + dict( # Config of SampleFrames + type='AVASampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=4, # Frames of each sampled output clip + frame_interval=16), # Temporal interval of adjacent sampled frames + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCTHW', # Final image shape format + collapse=True), # Collapse the dim N if N == 1 + dict(type='PackActionInputs') # Pack input data + ] + + train_dataloader = dict( # Config of train dataloader + batch_size=32, # Batch size of each single GPU during training + num_workers=8, # Workers to pre-fetch data for each single GPU during training + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed + sampler=dict( + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. 
Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # Randomly shuffle the training data in each epoch + dataset=dict( # Config of train dataset + type=dataset_type, + ann_file=ann_file_train, # Path of annotation file + exclude_file=exclude_file_train, # Path of exclude annotation file + label_file=label_file, # Path of label file + data_prefix=dict(img=data_root), # Prefix of frame path + proposal_file=proposal_file_train, # Path of human detection proposals + pipeline=train_pipeline)) + val_dataloader = dict( # Config of validation dataloader + batch_size=1, # Batch size of each single GPU during evaluation + num_workers=8, # Workers to pre-fetch data for each single GPU during evaluation + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of validation dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + exclude_file=exclude_file_val, # Path of exclude annotation file + label_file=label_file, # Path of label file + data_prefix=dict(img=data_root_val), # Prefix of frame path + proposal_file=proposal_file_val, # Path of human detection proposals + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = val_dataloader # Config of testing dataloader + + # evaluation settings + val_evaluator = dict( # Config of validation evaluator + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) + test_evaluator = val_evaluator # Config of testing evaluator + + train_cfg = dict( # Config of training loop + type='EpochBasedTrainLoop', # Name of training loop + max_epochs=20, # Total training epochs + val_begin=1, # The epoch that begins validating + val_interval=1) # Validation interval + val_cfg = dict( # Config of validation loop + type='ValLoop') # Name of validation loop + test_cfg = dict( # Config of testing loop + type='TestLoop') # Name of testing loop + + # learning policy + param_scheduler = [ # Parameter scheduler for updating optimizer parameters, support dict or list + dict(type='LinearLR', # Decays the learning rate of each parameter group by linearly changing small multiplicative factor + start_factor=0.1, # The number we multiply learning rate in the first epoch + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + begin=0, # Step at which to start updating the learning rate + end=5), # Step at which to stop updating the learning rate + dict(type='MultiStepLR', # Decays the learning rate once the number of epoch reaches one of the milestones + begin=0, # Step at which to start updating the learning rate + end=20, # Step at which to stop updating the learning rate + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + milestones=[10, 15], # Steps to decay the learning rate + gamma=0.1)] # Multiplicative factor of learning rate decay + + # optimizer + optim_wrapper = dict( # Config of optimizer wrapper + type='OptimWrapper', # Name of optimizer wrapper, switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Config of optimizer. Support all kinds of optimizers in PyTorch. 
Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # Name of optimizer + lr=0.2, # Learning rate + momentum=0.9, # Momentum factor + weight_decay=0.0001), # Weight decay + clip_grad=dict(max_norm=40, norm_type=2)) # Config of gradient clip + + # runtime settings + default_scope = 'mmaction' # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # Hooks to execute default actions like updating model parameters and saving checkpoints. + runtime_info=dict(type='RuntimeInfoHook'), # The hook to updates runtime information into message hub + timer=dict(type='IterTimerHook'), # The logger used to record time spent during iteration + logger=dict( + type='LoggerHook', # The logger used to record logs during training/validation/testing phase + interval=20, # Interval to print the log + ignore_last=False), # Ignore the log of last iterations in each epoch + param_scheduler=dict(type='ParamSchedulerHook'), # The hook to update some hyper-parameters in optimizer + checkpoint=dict( + type='CheckpointHook', # The hook to save checkpoints periodically + interval=3, # The saving period + save_best='auto', # Specified metric to mearsure the best checkpoint during evaluation + max_keep_ckpts=3), # The maximum checkpoints to keep + sampler_seed=dict(type='DistSamplerSeedHook'), # Data-loading sampler for distributed training + sync_buffers=dict(type='SyncBuffersHook')) # Synchronize model buffers at the end of each epoch + env_cfg = dict( # Dict for setting environment + cudnn_benchmark=False, # Whether to enable cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # Parameters to setup multiprocessing + dist_cfg=dict(backend='nccl')) # Parameters to setup distributed environment, the port can also be set + + log_processor = dict( + type='LogProcessor', # Log processor used to format log information + window_size=20, # Default smooth interval + by_epoch=True) # Whether to format logs with epoch type + vis_backends = [ # List of visualization backends + dict(type='LocalVisBackend')] # Local visualization backend + visualizer = dict( # Config of visualizer + type='ActionVisualizer', # Name of visualizer + vis_backends=vis_backends) + log_level = 'INFO' # The level of logging + load_from = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth') # Load model checkpoint as a pre-trained model from a given path. This will not resume training. + resume = False # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`. + ``` + +### Config System for Action localization + +We incorporate modular design into our config system, +which is convenient to conduct various experiments. + +- An Example of BMN + + To help the users have a basic idea of a complete config structure and the modules in an action localization system, + we make brief comments on the config of BMN as the following. + For more detailed usage and alternative for per parameter in each module, please refer to the [API documentation](https://mmaction2.readthedocs.io/en/latest/api.html). 
+ + ```python + # model settings + model = dict( # Config of the model + type='BMN', # Class name of the localizer + temporal_dim=100, # Total frames selected for each video + boundary_ratio=0.5, # Ratio for determining video boundaries + num_samples=32, # Number of samples for each proposal + num_samples_per_bin=3, # Number of bin samples for each sample + feat_dim=400, # Dimension of feature + soft_nms_alpha=0.4, # Soft NMS alpha + soft_nms_low_threshold=0.5, # Soft NMS low threshold + soft_nms_high_threshold=0.9, # Soft NMS high threshold + post_process_top_k=100) # Top k proposals in post process + + # dataset settings + dataset_type = 'ActivityNetDataset' # Type of dataset for training, validation and testing + data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # Root path to data for training + data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # Root path to data for validation and testing + ann_file_train = 'data/ActivityNet/anet_anno_train.json' # Path to the annotation file for training + ann_file_val = 'data/ActivityNet/anet_anno_val.json' # Path to the annotation file for validation + ann_file_test = 'data/ActivityNet/anet_anno_test.json' # Path to the annotation file for testing + + train_pipeline = [ # Training data processing pipeline + dict(type='LoadLocalizationFeature'), # Load localization feature pipeline + dict(type='GenerateLocalizationLabels'), # Generate localization labels pipeline + dict( + type='PackLocalizationInputs', # Pack localization data + keys=('gt_bbox'), # Keys of input + meta_keys=('video_name'))] # Meta keys of input + val_pipeline = [ # Validation data processing pipeline + dict(type='LoadLocalizationFeature'), # Load localization feature pipeline + dict(type='GenerateLocalizationLabels'), # Generate localization labels pipeline + dict( + type='PackLocalizationInputs', # Pack localization data + keys=('gt_bbox'), # Keys of input + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # Meta keys of input + test_pipeline = [ # Testing data processing pipeline + dict(type='LoadLocalizationFeature'), # Load localization feature pipeline + dict( + type='PackLocalizationInputs', # Pack localization data + keys=('gt_bbox'), # Keys of input + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # Meta keys of input + train_dataloader = dict( # Config of train dataloader + batch_size=8, # Batch size of each single GPU during training + num_workers=8, # Workers to pre-fetch data for each single GPU during training + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed + sampler=dict( + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. 
Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # Randomly shuffle the training data in each epoch + dataset=dict( # Config of train dataset + type=dataset_type, + ann_file=ann_file_train, # Path of annotation file + data_prefix=dict(video=data_root), # Prefix of video path + pipeline=train_pipeline)) + val_dataloader = dict( # Config of validation dataloader + batch_size=1, # Batch size of each single GPU during evaluation + num_workers=8, # Workers to pre-fetch data for each single GPU during evaluation + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of validation dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(video=data_root_val), # Prefix of video path + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = dict( # Config of test dataloader + batch_size=1, # Batch size of each single GPU during testing + num_workers=8, # Workers to pre-fetch data for each single GPU during testing + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of test dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(video=data_root_val), # Prefix of video path + pipeline=test_pipeline, + test_mode=True)) + + # evaluation settings + work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # Directory to save the model checkpoints and logs for the current experiments + val_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict( # Config of localization output + out=f'{work_dir}/results.json', # Path to the output file + output_format='json')) # File format of the output file + test_evaluator = val_evaluator # Set test_evaluator as val_evaluator + + max_epochs = 9 # Total epochs to train the model + train_cfg = dict( # Config of training loop + type='EpochBasedTrainLoop', # Name of training loop + max_epochs=max_epochs, # Total training epochs + val_begin=1, # The epoch that begins validating + val_interval=1) # Validation interval + val_cfg = dict( # Config of validation loop + type='ValLoop') # Name of validating loop + test_cfg = dict( # Config of testing loop + type='TestLoop') # Name of testing loop + + # learning policy + param_scheduler = [ # Parameter scheduler for updating optimizer parameters, support dict or list + dict(type='MultiStepLR', # Decays the learning rate once the number of epoch reaches one of the milestones + begin=0, # Step at which to start updating the learning rate + end=max_epochs, # Step at which to stop updating the learning rate + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + milestones=[7, ], # Steps to decay the learning rate + gamma=0.1)] # Multiplicative factor of parameter value decay + + # optimizer + optim_wrapper = dict( # Config of optimizer wrapper + type='OptimWrapper', # Name of optimizer wrapper, switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Config of optimizer. Support all kinds of optimizers in PyTorch. 
Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='Adam', # Name of optimizer + lr=0.001, # Learning rate + weight_decay=0.0001), # Weight decay + clip_grad=dict(max_norm=40, norm_type=2)) # Config of gradient clip + + # runtime settings + default_scope = 'mmaction' # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # Hooks to execute default actions like updating model parameters and saving checkpoints. + runtime_info=dict(type='RuntimeInfoHook'), # The hook to updates runtime information into message hub + timer=dict(type='IterTimerHook'), # The logger used to record time spent during iteration + logger=dict( + type='LoggerHook', # The logger used to record logs during training/validation/testing phase + interval=20, # Interval to print the log + ignore_last=False), # Ignore the log of last iterations in each epoch + param_scheduler=dict(type='ParamSchedulerHook'), # The hook to update some hyper-parameters in optimizer + checkpoint=dict( + type='CheckpointHook', # The hook to save checkpoints periodically + interval=3, # The saving period + save_best='auto', # Specified metric to mearsure the best checkpoint during evaluation + max_keep_ckpts=3), # The maximum checkpoints to keep + sampler_seed=dict(type='DistSamplerSeedHook'), # Data-loading sampler for distributed training + sync_buffers=dict(type='SyncBuffersHook')) # Synchronize model buffers at the end of each epoch + env_cfg = dict( # Dict for setting environment + cudnn_benchmark=False, # Whether to enable cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # Parameters to setup multiprocessing + dist_cfg=dict(backend='nccl')) # Parameters to setup distributed environment, the port can also be set + + log_processor = dict( + type='LogProcessor', # Log processor used to format log information + window_size=20, # Default smooth interval + by_epoch=True) # Whether to format logs with epoch type + vis_backends = [ # List of visualization backends + dict(type='LocalVisBackend')] # Local visualization backend + visualizer = dict( # Config of visualizer + type='ActionVisualizer', # Name of visualizer + vis_backends=vis_backends) + log_level = 'INFO' # The level of logging + load_from = None # Load model checkpoint as a pre-trained model from a given path. This will not resume training. + resume = False # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`. + ``` diff --git a/docs/en/user_guides/finetune.md b/docs/en/user_guides/finetune.md new file mode 100644 index 0000000000000000000000000000000000000000..23046c589c509d720adfd9da39523647a917f18c --- /dev/null +++ b/docs/en/user_guides/finetune.md @@ -0,0 +1,331 @@ +# Finetuning Models + +This tutorial provides instructions for users to use the pre-trained models +to finetune them on other datasets, so that better performance can be achieved. + +- [Finetuning Models](#finetuning-models) + - [Outline](#outline) + - [Choose Template Config](#choose-template-config) + - [Modify Head](#modify-head) + - [Modify Dataset](#modify-dataset) + - [Modify Training Schedule](#modify-training-schedule) + - [Use Pre-Trained Model](#use-pre-trained-model) + - [Start Training](#start-training) + +## Outline + +There are two steps to finetune a model on a new dataset. + +1. Add support for the new dataset. 
See [Prepare Dataset](prepare_dataset.md) and [Customize Dataset](../advanced_guides/customize_dataset.md). +2. Modify the configs. This will be discussed in this tutorial. + +## Choose Template Config + +Here, we would like to take `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as an example. We first copy this config file to the same folder and rename it to `tsn_ucf101.py`, then four parts in the config need attention, specifically, add new keys for non-existing keys and modify the original keys for existing keys. + +## Modify Head + +The `num_classes` in the `cls_head` need to be changed to the class number of the new dataset. +The weights of the pre-trained models are reused except for the final prediction layer. +So it is safe to change the class number. +In our case, UCF101 has 101 classes. +So we change it from 400 (class number of Kinetics-400) to 101. + +```python +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # change from 400 to 101 + )) +``` + +## Modify Dataset + +MMAction2 supports UCF101, Kinetics-400, Moments in Time, Multi-Moments in Time, THUMOS14, +Something-Something V1&V2, ActivityNet Dataset. +The users may need to adapt one of the above datasets to fit their special datasets. +You could refer to [Prepare Dataset](prepare_dataset.md) and [Customize Dataset](../advanced_guides/customize_dataset.md) for more details. +In our case, UCF101 is already supported by various dataset types, like `VideoDataset`, +so we change the config as follows. + +```python +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' +``` + +## Modify Training Schedule + +Finetuning usually requires a smaller learning rate and fewer training epochs. + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # change from 100 to 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # change from 100 to 50 + by_epoch=True, + milestones=[20, 40], # change milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # change from 0.01 to 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) +``` + +## Use Pre-Trained Model + +To use the pre-trained model for the whole network, the new config adds the link of pre-trained models in the `load_from`. +We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](config.md), users can directly change it by setting `load_from` in their configs. 
+ +```python +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' # model path can be found in model zoo +``` + +## Start Training + +Now, we have finished the fine-tuning config file as follows: + +```python +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # change from 400 to 101 + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # change from 100 to 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # change from 100 to 50 + by_epoch=True, + milestones=[20, 40], # change milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # change from 0.01 to 0.005 + momentum=0.9, + 
weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +An easier way is to inherit the kinetics400 config and only specify the modified keys. Please make sure that the custom config is in the same folder with `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`. + +```python +_base_ = [ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # inherit template config +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101)) # change from 400 to 101 + + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +train_dataloader = dict( + dataset=dict( + ann_file=ann_file_train, + data_prefix=dict(video=data_root))) +val_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) +test_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # change from 100 to 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # change from 100 to 50 + by_epoch=True, + milestones=[20, 40], # change milestones + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # change from 0.01 to 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +You can use the following command to finetune a model on your dataset. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the TSN model on Kinetics-400 dataset in a deterministic option. + +```shell +python tools/train.py configs/recognition/tsn/tsn_ucf101.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](train_test.md). diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md new file mode 100644 index 0000000000000000000000000000000000000000..7c8a5b63eb7ef3ea85e7276c7a912bc93a2fb591 --- /dev/null +++ b/docs/en/user_guides/inference.md @@ -0,0 +1,40 @@ +# Inference with existing models + +MMAction2 provides pre-trained models for video understanding in [Model Zoo](../modelzoo.md). +This note will show **how to use existing models to inference on given video**. 
+
+As for how to test existing models on standard datasets, please see this [guide](./train_test.md#test).
+
+## Inference on a given video
+
+MMAction2 provides high-level Python APIs for running inference on a given video:
+
+- [init_recognizer](mmaction.apis.init_recognizer): Initialize a recognizer with a config and checkpoint
+- [inference_recognizer](mmaction.apis.inference_recognizer): Run inference on a given video
+
+Here is an example of building the model and running inference on a given video using a Kinetics-400 pre-trained checkpoint.
+
+```{note}
+If you use mmaction2 as a 3rd-party package, you need to download the config and the demo video used in the example.
+
+Run 'mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .' to download the required config.
+
+Run 'wget https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.mp4' to download the desired demo video.
+```
+
+```python
+from mmaction.apis import inference_recognizer, init_recognizer
+
+config_path = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'
+checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' # can be a local path
+img_path = 'demo/demo.mp4'   # you can specify your own video path
+
+# build the model from a config file and a checkpoint file
+model = init_recognizer(config_path, checkpoint_path, device="cpu") # device can be 'cuda:0'
+# test a single video
+result = inference_recognizer(model, img_path)
+```
+
+`result` is a dictionary containing `pred_scores`.
+
+An action recognition demo can be found in [demo/demo.py](https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.py).
diff --git a/docs/en/user_guides/prepare_dataset.md b/docs/en/user_guides/prepare_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..602effa8633249db7013d619350eb85ee62b6f0b
--- /dev/null
+++ b/docs/en/user_guides/prepare_dataset.md
@@ -0,0 +1,305 @@
+# Prepare Dataset
+
+MMAction2 supports many existing datasets. In this chapter, we will guide you through preparing datasets for MMAction2.
+
+- [Prepare Dataset](#prepare-dataset)
+  - [Notes on Video Data Format](#notes-on-video-data-format)
+  - [Use built-in datasets](#use-built-in-datasets)
+  - [Use a custom dataset](#use-a-custom-dataset)
+    - [Action Recognition](#action-recognition)
+    - [Skeleton-based Action Recognition](#skeleton-based-action-recognition)
+    - [Audio-based Action Recognition](#audio-based-action-recognition)
+    - [Spatio-temporal Action Detection](#spatio-temporal-action-detection)
+    - [Temporal Action Localization](#temporal-action-localization)
+  - [Use mixed datasets for training](#use-mixed-datasets-for-training)
+    - [Repeat dataset](#repeat-dataset)
+  - [Browse dataset](#browse-dataset)
+
+## Notes on Video Data Format
+
+MMAction2 supports two types of data formats: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks).
+This is fast when an SSD is available but fails to scale to fast-growing datasets.
+(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.)
+The latter saves much space but has to perform computation-intensive video decoding at execution time.
+To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc. + +## Use built-in datasets + +MMAction2 already supports many datasets, we provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`, please refer to [supported datasets](https://mmaction2.readthedocs.io/en/latest/datasetzoo_statistics.html) for details to prepare specific datasets. + +## Use a custom dataset + +The simplest way is to convert your dataset to existing dataset formats: + +- `RawFrameDataset` and `VideoDataset` for [Action Recognition](#action-recognition) +- `PoseDataset` for [Skeleton-based Action Recognition](#skeleton-based-action-recognition) +- `AudioDataset` for [Audio-based Action Recognition](#Audio-based-action-recognition) +- `AVADataset` for [Spatio-temporal Action Detection](#spatio-temporal-action-detection) +- `ActivityNetDataset` for [Temporal Action Localization](#temporal-action-localization) + +After the data pre-processing, the users need to further modify the config files to use the dataset. +Here is an example of using a custom dataset in rawframe format. + +In `configs/task/method/my_custom_config.py`: + +```python +... +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'path/to/your/root' +data_root_val = 'path/to/your/root_val' +ann_file_train = 'data/custom/custom_train_list.txt' +ann_file_val = 'data/custom/custom_val_list.txt' +ann_file_test = 'data/custom/custom_val_list.txt' +... +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + ...), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + ...), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + ...)) +... +``` + +### Action Recognition + +There are two kinds of annotation files for action recognition. + +- rawframe annotaiton for `RawFrameDataset` + + The annotation of a rawframe dataset is a text file with multiple lines, + and each line indicates `frame_directory` (relative path) of a video, + `total_frames` of a video and the `label` of a video, which are split by a whitespace. + + Here is an example. + + ``` + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + ``` + +- video annotation for `VideoDataset` + + The annotation of a video dataset is a text file with multiple lines, + and each line indicates a sample video with the `filepath` (relative path) and `label`, + which are split by a whitespace. + + Here is an example. + + ``` + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + ``` + +### Skeleton-based Action Recognition + +The task recognizes the action class based on the skeleton sequence (time sequence of keypoints). We provide some methods to build your custom skeleton dataset. + +- Build from RGB video data + + You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](https://github.com/open-mmlab/mmaction2/tree/main/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions. + +- Build from existing keypoint data + + Assuming that you already have keypoint data in coco formats, you can gather them into a pickle file. + + Each pickle file corresponds to an action recognition dataset. 
The content of a pickle file is a dictionary with two fields: `split` and `annotations`.
+
+  1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific split.
+  2. Annotations: The value of the `annotations` field is a list of skeleton annotations. Each skeleton annotation is a dictionary containing the following fields:
+     - `frame_dir` (str): The identifier of the corresponding video.
+     - `total_frames` (int): The number of frames in this video.
+     - `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of `(height, width)`. Only required for 2D skeletons.
+     - `original_shape` (tuple\[int\]): Same as `img_shape`.
+     - `label` (int): The action label.
+     - `keypoint` (np.ndarray, with shape `[M x T x V x C]`): The keypoint annotation.
+       - M: number of persons;
+       - T: number of frames (same as `total_frames`);
+       - V: number of keypoints (25 for NTURGB+D 3D skeletons, 17 for COCO, 18 for OpenPose, etc.);
+       - C: number of dimensions for keypoint coordinates (C=2 for 2D keypoints, C=3 for 3D keypoints).
+     - `keypoint_score` (np.ndarray, with shape `[M x T x V]`): The confidence score of keypoints. Only required for 2D skeletons.
+
+  Here is an example:
+
+  ```
+  {
+      "split":
+          {
+              'xsub_train':
+                  ['S001C001P001R001A001', ...],
+              'xsub_val':
+                  ['S001C001P003R001A001', ...],
+              ...
+          },
+
+      "annotations":
+          [
+              {
+                  'frame_dir': 'S001C001P001R001A001',
+                  'label': 0,
+                  'img_shape': (1080, 1920),
+                  'original_shape': (1080, 1920),
+                  'total_frames': 103,
+                  'keypoint': array([[[[1032. ,  334.8], ...]]]),
+                  'keypoint_score': array([[[0.934 , 0.9766, ...]]])
+              },
+              {
+                  'frame_dir': 'S001C001P003R001A001',
+                  ...
+              },
+              ...
+          ]
+  }
+  ```
+
+  Supporting other keypoint formats requires further modification; please refer to [customize dataset](../advanced_guides/customize_dataset.md).
+
+### Audio-based Action Recognition
+
+MMAction2 provides support for audio-based action recognition tasks utilizing the `AudioDataset`. This task employs mel spectrogram features as input. An example annotation file format is as follows:
+
+```
+ihWykL5mYRI.npy 300 153
+lumzQD42AN8.npy 240 321
+sWFRmD9Of4s.npy 250 250
+w_IpfgRsBVA.npy 300 356
+```
+
+Each line represents a training sample. Taking the first line as an example, `ihWykL5mYRI.npy` corresponds to the filename of the mel spectrogram feature. The value `300` represents the total number of frames of the original video corresponding to this mel spectrogram feature, and `153` denotes the class label. We take the following two steps to prepare the mel spectrogram feature data:
+
+First, extract `audios` from videos:
+
+```shell
+cd $MMACTION2
+python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \
+    [--level ${LEVEL}]
+```
+
+- `ROOT`: The root directory of the videos.
+- `DST_ROOT`: The destination root directory of the audios.
+- `EXT`: Extension of the video files. e.g., `mp4`.
+- `N_WORKERS`: Number of processes to be used.
+
+Next, generate the `mel spectrogram features` offline from the audios:
+
+```shell
+cd $MMACTION2
+python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \
+    [--ext $EXT] [--num-workers $N_WORKERS] [--part $PART]
+```
+
+- `AUDIO_HOME_PATH`: The root directory of the audio files.
+- `SPECTROGRAM_SAVE_PATH`: The destination root directory of the audio features.
+- `EXT`: Extension of the audio files.
e.g., `m4a`.
+- `N_WORKERS`: Number of processes to be used.
+- `PART`: Determines how many parts to split all files into and which part to run, e.g., `2/5` means splitting all files into 5 parts and executing the 2nd part. This is useful if you have several machines.
+
+### Spatio-temporal Action Detection
+
+MMAction2 supports the task based on `AVADataset`. The annotation contains groundtruth bboxes and proposal bboxes.
+
+- groundtruth bbox
+
+  The groundtruth bbox file is a csv file with multiple lines, and each line is a detection sample of one frame, with the following format:
+
+  `video_identifier, time_stamp, lt_x, lt_y, rb_x, rb_y, label, entity_id`
+
+  Each field means:
+
+  - `video_identifier`: The identifier of the corresponding video
+  - `time_stamp`: The time stamp of the current frame
+  - `lt_x`: The normalized x-coordinate of the left top point of the bounding box
+  - `lt_y`: The normalized y-coordinate of the left top point of the bounding box
+  - `rb_x`: The normalized x-coordinate of the right bottom point of the bounding box
+  - `rb_y`: The normalized y-coordinate of the right bottom point of the bounding box
+  - `label`: The action label
+  - `entity_id`: A unique integer allowing this box to be linked to other boxes depicting the same person in adjacent frames of this video
+
+  Here is an example.
+
+  ```
+  _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0
+  _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0
+  ...
+  ```
+
+- proposal bbox
+
+  The proposal bbox file is a pickle file generated by a person detector, and usually needs to be fine-tuned on the target dataset. The pickle file contains a dict with the following data structure:
+
+  `{'video_identifier,time_stamp': bbox_info}`
+
+  - `video_identifier` (str): The identifier of the corresponding video
+  - `time_stamp` (int): The time stamp of the current frame
+  - `bbox_info` (np.ndarray, with shape `[n, 5]`): Detected bboxes in the format `[x1, y1, x2, y2, score]`, where x1, y1, x2, y2 are normalized with respect to the frame size and range between 0.0 and 1.0.
+
+### Temporal Action Localization
+
+We support Temporal Action Localization based on `ActivityNetDataset`. The annotation of the ActivityNet dataset is a json file. Each key is a video name and the corresponding value is the metadata and annotation for the video.
+
+Here is an example.
+
+```
+{
+  "video1": {
+      "duration_second": 211.53,
+      "duration_frame": 6337,
+      "annotations": [
+          {
+              "segment": [
+                  30.025882995319815,
+                  205.2318595943838
+              ],
+              "label": "Rock climbing"
+          }
+      ],
+      "feature_frame": 6336,
+      "fps": 30.0,
+      "rfps": 29.9579255898
+  },
+  "video2": {...
+  }
+  ...
+}
+```
+
+## Use mixed datasets for training
+
+MMAction2 also supports mixing datasets for training. Currently it supports repeating a dataset.
+
+### Repeat dataset
+
+We use `RepeatDataset` as a wrapper to repeat a dataset. For example, suppose the original dataset is `Dataset_A`;
+to repeat it, the config looks like the following:
+
+```python
+dataset_A_train = dict(
+        type='RepeatDataset',
+        times=N,
+        dataset=dict(  # This is the original config of Dataset_A
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+## Browse dataset
+
+coming soon...
diff --git a/docs/en/user_guides/train_test.md b/docs/en/user_guides/train_test.md new file mode 100644 index 0000000000000000000000000000000000000000..637f90c63cccda13c4cbfaff8181bd9911a02925 --- /dev/null +++ b/docs/en/user_guides/train_test.md @@ -0,0 +1,252 @@ +# Training and Test + +- [Training and Test](#training-and-test) + - [Training](#training) + - [Training with your PC](#training-with-your-pc) + - [Training with multiple GPUs](#training-with-multiple-gpus) + - [Training with multiple machines](#training-with-multiple-machines) + - [Multiple machines in the same network](#multiple-machines-in-the-same-network) + - [Multiple machines managed with slurm](#multiple-machines-managed-with-slurm) + - [Test](#test) + - [Test with your PC](#test-with-your-pc) + - [Test with multiple GPUs](#test-with-multiple-gpus) + - [Test with multiple machines](#test-with-multiple-machines) + - [Multiple machines in the same network](#multiple-machines-in-the-same-network-1) + - [Multiple machines managed with slurm](#multiple-machines-managed-with-slurm-1) + +## Training + +### Training with your PC + +You can use `tools/train.py` to train a model on a single machine with a CPU and optionally a GPU. + +Here is the full usage of the script: + +```shell +python tools/train.py ${CONFIG_FILE} [ARGS] +``` + +````{note} +By default, MMAction2 prefers GPU to CPU. If you want to train a model on CPU, please empty `CUDA_VISIBLE_DEVICES` or set it to -1 to make GPU invisible to the program. + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS] +``` +```` + +| ARGS | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | The path to the config file. | +| `--work-dir WORK_DIR` | The target folder to save logs and checkpoints. Defaults to a folder with the same name of the config file under `./work_dirs`. | +| `--resume [RESUME]` | Resume training. If a path is specified, resume from it, while if not specified, try to auto resume from the latest checkpoint. | +| `--amp` | Enable automatic-mixed-precision training. | +| `--no-validate` | **Not suggested**. Disable checkpoint evaluation during training. | +| `--auto-scale-lr` | Auto scale the learning rate according to the actual batch size and the original batch size. | +| `--seed` | Random seed. | +| `--diff-rank-seed` | Whether or not set different seeds for different ranks. | +| `--deterministic` | Whether to set deterministic options for CUDNN backend. | +| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. | +| `--launcher {none,pytorch,slurm,mpi}` | Options for job launcher. Defaults to `none`. | + +### Training with multiple GPUs + +We provide a shell script to start a multi-GPUs task with `torch.distributed.launch`. + +```shell +bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +| ARGS | Description | +| ---------- | ---------------------------------------------------------------------------------- | +| `CONFIG` | The path to the config file. 
| +| `GPUS` | The number of GPUs to be used. | +| `[PYARGS]` | The other optional arguments of `tools/train.py`, see [here](#train-with-your-pc). | + +You can also specify extra arguments of the launcher by environment variables. For example, change the +communication port of the launcher to 29666 by the following command: + +```shell +PORT=29666 bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +If you want to startup multiple training jobs and use different GPUs, you can launch them by specifying +different port and visible devices. + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +``` + +### Training with multiple machines + +#### Multiple machines in the same network + +If you launch a training job with multiple machines connected with ethernet, you can run the following commands: + +On the first machine: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +On the second machine: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +The following extra environment variables need to be specified to train or test models with multiple machines: + +| ENV_VARS | Description | +| ------------- | ----------------------------------------------------------------------------------------------------- | +| `NNODES` | The total number of machines. Defaults to 1. | +| `NODE_RANK` | The index of the local machine. Defaults to 0. | +| `PORT` | The communication port, it should be the same in all machines. Defaults to 29500. | +| `MASTER_ADDR` | The IP address of the master machine, it should be the same in all machines. Defaults to `127.0.0.1`. | + +Usually it is slow if you do not have high speed networking like InfiniBand. + +#### Multiple machines managed with slurm + +If you run MMAction2 on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_train.sh`. + +```shell +[ENV_VARS] bash tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG} [PY_ARGS] +``` + +Here are the arguments description of the script. + +| ARGS | Description | +| ----------- | ---------------------------------------------------------------------------------- | +| `PARTITION` | The partition to use in your cluster. | +| `JOB_NAME` | The name of your job, you can name it as you like. | +| `CONFIG` | The path to the config file. | +| `[PYARGS]` | The other optional arguments of `tools/train.py`, see [here](#train-with-your-pc). | + +Here are the environment variables can be used to configure the slurm job. + +| ENV_VARS | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------- | +| `GPUS` | The number of GPUs to be used. Defaults to 8. | +| `GPUS_PER_NODE` | The number of GPUs to be allocated per node. Defaults to 8. | +| `CPUS_PER_TASK` | The number of CPUs to be allocated per task (Usually one GPU corresponds to one task). Defaults to 5. | +| `SRUN_ARGS` | The other arguments of `srun`. Available options can be found [here](https://slurm.schedmd.com/srun.html). | + +## Test + +### Test with your PC + +You can use `tools/test.py` to test a model on a single machine with a CPU and optionally a GPU. 
+ +Here is the full usage of the script: + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` + +````{note} +By default, MMAction2 prefers GPU to CPU. If you want to test a model on CPU, please empty `CUDA_VISIBLE_DEVICES` or set it to -1 to make GPU invisible to the program. + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` +```` + +| ARGS | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | The path to the config file. | +| `CHECKPOINT_FILE` | The path to the checkpoint file (It can be a http link) | +| `--work-dir WORK_DIR` | The directory to save the file containing evaluation metrics. Defaults to a folder with the same name of the config file under `./work_dirs`. | +| `--dump DUMP` | The path to dump all outputs of the model for offline evaluation. | +| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. | +| `--show-dir SHOW_DIR` | The directory to save the result visualization images. | +| `--show` | Visualize the prediction result in a window. | +| `--interval INTERVAL` | The interval of samples to visualize. Defaults to 1. | +| `--wait-time WAIT_TIME` | The display time of every window (in seconds). Defaults to 2. | +| `--launcher {none,pytorch,slurm,mpi}` | Options for job launcher. Defaults to `none`. | + +### Test with multiple GPUs + +We provide a shell script to start a multi-GPUs task with `torch.distributed.launch`. + +```shell +bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +| ARGS | Description | +| ------------ | -------------------------------------------------------------------------------- | +| `CONFIG` | The path to the config file. | +| `CHECKPOINT` | The path to the checkpoint file (It can be a http link) | +| `GPUS` | The number of GPUs to be used. | +| `[PYARGS]` | The other optional arguments of `tools/test.py`, see [here](#test-with-your-pc). | + +You can also specify extra arguments of the launcher by environment variables. For example, change the +communication port of the launcher to 29666 by the following command: + +```shell +PORT=29666 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +If you want to startup multiple test jobs and use different GPUs, you can launch them by specifying +different port and visible devices. 
+ +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +``` + +### Test with multiple machines + +#### Multiple machines in the same network + +If you launch a test job with multiple machines connected with ethernet, you can run the following commands: + +On the first machine: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +On the second machine: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +Compared with multi-GPUs in a single machine, you need to specify some extra environment variables: + +| ENV_VARS | Description | +| ------------- | ----------------------------------------------------------------------------------------------------- | +| `NNODES` | The total number of machines. Defaults to 1. | +| `NODE_RANK` | The index of the local machine. Defaults to 0. | +| `PORT` | The communication port, it should be the same in all machines. Defaults to 29500. | +| `MASTER_ADDR` | The IP address of the master machine, it should be the same in all machines. Defaults to `127.0.0.1`. | + +Usually it is slow if you do not have high speed networking like InfiniBand. + +#### Multiple machines managed with slurm + +If you run MMAction2 on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_test.sh`. + +```shell +[ENV_VARS] bash tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG} ${CHECKPOINT} [PY_ARGS] +``` + +Here are the arguments description of the script. + +| ARGS | Description | +| ------------ | -------------------------------------------------------------------------------- | +| `PARTITION` | The partition to use in your cluster. | +| `JOB_NAME` | The name of your job, you can name it as you like. | +| `CONFIG` | The path to the config file. | +| `CHECKPOINT` | The path to the checkpoint file (It can be a http link) | +| `[PYARGS]` | The other optional arguments of `tools/test.py`, see [here](#test-with-your-pc). | + +Here are the environment variables can be used to configure the slurm job. + +| ENV_VARS | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------- | +| `GPUS` | The number of GPUs to be used. Defaults to 8. | +| `GPUS_PER_NODE` | The number of GPUs to be allocated per node. Defaults to 8. | +| `CPUS_PER_TASK` | The number of CPUs to be allocated per task (Usually one GPU corresponds to one task). Defaults to 5. | +| `SRUN_ARGS` | The other arguments of `srun`. Available options can be found [here](https://slurm.schedmd.com/srun.html). 
| diff --git a/docs/en/utils.py b/docs/en/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..444e4c147d19d3f20686c81233d7ffc6e0821c19 --- /dev/null +++ b/docs/en/utils.py @@ -0,0 +1,28 @@ +import re +from pathlib import Path + + +def replace_link(pattern, template, content, file_path): + MMACT_ROOT = Path(__file__).absolute().parents[2] + GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' + + def replace_core(matchobj): + name = matchobj.group(1) + link = matchobj.group(2) + if link.startswith('http') or link.startswith('#'): + return template.format(name, link) + # For link relative to project folder, such as '/configs/*/*.py' + elif Path(link).is_absolute(): + link = link.lstrip('/') + folder = MMACT_ROOT + # For link relative to current file, such as './config/*.py' + else: + folder = file_path.parent + file_link = link.split('#')[0] + assert (folder / file_link).exists(), \ + f'Link not found:\n{file_path}: {folder / link}' + rel_link = (folder / link).resolve().relative_to(MMACT_ROOT) + link = GITHUB_PREFIX + str(rel_link) + return template.format(name, link) + + return re.sub(pattern, replace_core, content) diff --git a/docs/zh_cn/Makefile b/docs/zh_cn/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..73a28c7134cd1760744f34bac4ebdedfbed40f72 --- /dev/null +++ b/docs/zh_cn/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..55b3d3f8ffde0fd0e9d00e7f8b73124bba6cfe2d --- /dev/null +++ b/docs/zh_cn/_static/css/readthedocs.css @@ -0,0 +1,62 @@ +.header-logo { + background-image: url("../images/logo.png"); + background-size: 130px 40px; + height: 40px; + width: 130px; +} + +@media screen and (min-width: 1100px) { + .header-logo { + top: -12px; + } + } + + pre { + white-space: pre; + } + + @media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } + } + + + article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; + } + + /* Disable the change in tables */ + article.pytorch-article section table code { + padding: unset; + background-color: unset; + border-radius: unset; + } + + table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; + } + + article.pytorch-article p.rubric { + font-weight: bold; + } diff --git a/docs/zh_cn/_static/images/logo.png b/docs/zh_cn/_static/images/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c759bb78c5424b4394d18a5ba833a8c9f43add Binary files /dev/null and b/docs/zh_cn/_static/images/logo.png differ diff --git a/docs/zh_cn/_static/js/custom.js b/docs/zh_cn/_static/js/custom.js new file mode 100644 index 0000000000000000000000000000000000000000..6afd7b02a2acfc8d608ff935f9951597b9310c11 --- /dev/null +++ b/docs/zh_cn/_static/js/custom.js @@ -0,0 +1,20 @@ +var collapsedSections = ['ๆ•ฐๆฎ้›†ๆ”ฏๆŒ']; + +$(document).ready(function () { + $('.model-summary').DataTable({ + "stateSave": false, + "lengthChange": false, + "pageLength": 20, + "order": [], + "language": { + "info": "ๆ˜พ็คบ _START_ ่‡ณ _END_ ๆก็›ฎ๏ผˆๆ€ป่ฎก _TOTAL_ ๏ผ‰", + "infoFiltered": "๏ผˆ็ญ›้€‰่‡ช _MAX_ ๆก็›ฎ๏ผ‰", + "search": "ๆœ็ดข๏ผš", + "zeroRecords": "ๆฒกๆœ‰ๆ‰พๅˆฐไปปไฝ•ๆก็›ฎ", + "paginate": { + "next": "ไธ‹ไธ€้กต", + "previous": "ไธŠไธ€้กต" + }, + } + }); +}); diff --git a/docs/zh_cn/_templates/404.html b/docs/zh_cn/_templates/404.html new file mode 100644 index 0000000000000000000000000000000000000000..dfdc4c33992e96876205a84e184130cd6806c7f8 --- /dev/null +++ b/docs/zh_cn/_templates/404.html @@ -0,0 +1,16 @@ +{% extends "layout.html" %} + +{% block body %} + +

ๆœชๆ‰พๅˆฐ้กต้ข

+

+ ๆœชๆ‰พๅˆฐไฝ ่ฆๆ‰“ๅผ€็š„้กต้ขใ€‚ +

+

+ ๅฆ‚ๆžœไฝ ๆ˜ฏไปŽๆ—ง็‰ˆๆœฌๆ–‡ๆกฃ่ทณ่ฝฌ่‡ณๆญค๏ผŒๅฏ่ƒฝๆ˜ฏๅฏนๅบ”็š„้กต้ข่ขซ็งปๅŠจไบ†ใ€‚่ฏทไปŽๅทฆไพง็š„็›ฎๅฝ•ไธญๅฏปๆ‰พๆ–ฐ็‰ˆๆœฌๆ–‡ๆกฃ๏ผŒๆˆ–่€…่ทณ่ฝฌ่‡ณ้ฆ–้กตใ€‚ +

+

+ ๅฆ‚ๆžœไฝ ๆ‰พไธๅˆฐๅธŒๆœ›ๆ‰“ๅผ€็š„ๆ–‡ๆกฃ๏ผŒๆฌข่ฟŽๅœจ Issue ไธญๅ‘Š่ฏ‰ๆˆ‘ไปฌ๏ผ +

+ +{% endblock %} diff --git a/docs/zh_cn/advanced_guides/customize_dataset.md b/docs/zh_cn/advanced_guides/customize_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..3aa87119386802b3ae9e5d5ddb0835642484bc80 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_dataset.md @@ -0,0 +1,126 @@ +# ่‡ชๅฎšไน‰ๆ•ฐๆฎ้›† + +ๅœจๆœฌๆ•™็จ‹ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ไป‹็ปๅฆ‚ไฝ•้€š่ฟ‡ๅœจ็บฟ่ฝฌๆขๆฅ่‡ชๅฎšไน‰ไฝ ็š„ๆ•ฐๆฎ้›†ใ€‚ + +- [่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](#่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†) + - [MMAction2 ๆ•ฐๆฎ้›†ๆฆ‚่ฟฐ](#mmaction2-ๆ•ฐๆฎ้›†ๆฆ‚่ฟฐ) + - [ๅฎšๅˆถๆ–ฐ็š„ๆ•ฐๆฎ้›†](#ๅฎšๅˆถๆ–ฐ็š„ๆ•ฐๆฎ้›†) + - [ไธบ PoseDataset ่‡ชๅฎšไน‰ๅ…ณ้”ฎ็‚นๆ ผๅผ](#ไธบ-posedataset-่‡ชๅฎšไน‰ๅ…ณ้”ฎ็‚นๆ ผๅผ) + +## MMAction2 ๆ•ฐๆฎ้›†ๆฆ‚่ฟฐ + +MMAction2 ๆไพ›ไบ†ไปปๅŠก็‰นๅฎš็š„ `Dataset` ็ฑป๏ผŒไพ‹ๅฆ‚็”จไบŽๅŠจไฝœ่ฏ†ๅˆซ็š„ `VideoDataset`/`RawframeDataset`๏ผŒ็”จไบŽๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹็š„ `AVADataset`๏ผŒ็”จไบŽๅŸบไบŽ้ชจ้ชผ็š„ๅŠจไฝœ่ฏ†ๅˆซ็š„`PoseDataset`ใ€‚่ฟ™ไบ›ไปปๅŠก็‰นๅฎš็š„ๆ•ฐๆฎ้›†ๅช้œ€่ฆๅฎž็Žฐ `load_data_list(self)` ๆฅไปŽๆณจ้‡Šๆ–‡ไปถ็”Ÿๆˆๆ•ฐๆฎๅˆ—่กจใ€‚ๅ‰ฉไธ‹็š„ๅ‡ฝๆ•ฐ็”ฑ่ถ…็ฑป๏ผˆๅณ `BaseActionDataset` ๅ’Œ `BaseDataset`๏ผ‰่‡ชๅŠจๅค„็†ใ€‚ไธ‹่กจๆ˜พ็คบไบ†ๆจกๅ—็š„็ปงๆ‰ฟๅ…ณ็ณปๅ’Œไธป่ฆๆ–นๆณ•ใ€‚ + +| ็ฑปๅ | ็ฑปๆ–นๆณ• | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `MMAction2::VideoDataset` | `load_data_list(self)`
ไปŽๆณจ้‡Šๆ–‡ไปถไธญๆž„ๅปบๆ•ฐๆฎๅˆ—่กจใ€‚ | +| `MMAction2::BaseActionDataset` | `get_data_info(self, idx)`
็ป™ๅฎš `idx`๏ผŒไปŽๆ•ฐๆฎๅˆ—่กจไธญ่ฟ”ๅ›ž็›ธๅบ”็š„ๆ•ฐๆฎๆ ทๆœฌใ€‚ | +| `MMEngine::BaseDataset` | `__getitem__(self, idx)`
็ป™ๅฎš `idx`๏ผŒ่ฐƒ็”จ `get_data_info` ่Žทๅ–ๆ•ฐๆฎๆ ทๆœฌ๏ผŒ็„ถๅŽ่ฐƒ็”จ `pipeline` ๅœจ `train_pipeline` ๆˆ– `val_pipeline` ไธญๆ‰ง่กŒๆ•ฐๆฎๅ˜ๆขๅ’Œๅขžๅผบใ€‚ | + +## ๅฎšๅˆถๆ–ฐ็š„ๆ•ฐๆฎ้›†็ฑป + +ๅคงๅคšๆ•ฐๆƒ…ๅ†ตไธ‹๏ผŒๆŠŠไฝ ็š„ๆ•ฐๆฎ้›†็ฆป็บฟ่ฝฌๆขๆˆๆŒ‡ๅฎšๆ ผๅผๆ˜ฏ้ฆ–้€‰ๆ–นๆณ•๏ผŒไฝ† MMAction2 ๆไพ›ไบ†ไธ€ไธชๆ–นไพฟ็š„่ฟ‡็จ‹ๆฅๅˆ›ๅปบไธ€ไธชๅฎšๅˆถ็š„ `Dataset` ็ฑปใ€‚ๅฆ‚ๅ‰ๆ‰€่ฟฐ๏ผŒไปปๅŠก็‰นๅฎš็š„ๆ•ฐๆฎ้›†ๅช้œ€่ฆๅฎž็Žฐ `load_data_list(self)` ๆฅไปŽๆณจ้‡Šๆ–‡ไปถ็”Ÿๆˆๆ•ฐๆฎๅˆ—่กจใ€‚่ฏทๆณจๆ„๏ผŒ`data_list` ไธญ็š„ๅ…ƒ็ด ๆ˜ฏๅŒ…ๅซๅŽ็ปญๆต็จ‹ไธญๅฟ…่ฆๅญ—ๆฎต็š„ `dict`ใ€‚ + +ไปฅ `VideoDataset` ไธบไพ‹๏ผŒ`train_pipeline`/`val_pipeline` ๅœจ `DecordInit` ไธญ้œ€่ฆ `'filename'`๏ผŒๅœจ `PackActionInputs` ไธญ้œ€่ฆ `'label'`ใ€‚ๅ› ๆญค๏ผŒ`data_list` ไธญ็š„ๆ•ฐๆฎๆ ทๆœฌๅฟ…้กปๅŒ…ๅซ2ไธชๅญ—ๆฎต๏ผš`'filename'`ๅ’Œ`'label'`ใ€‚ +่ฏทๅ‚่€ƒ[ๅฎšๅˆถๆ•ฐๆฎๆตๆฐด็บฟ](customize_pipeline.md)ไปฅ่Žทๅ–ๆœ‰ๅ…ณ `pipeline` ็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏใ€‚ + +``` +data_list.append(dict(filename=filename, label=label)) +``` + +`AVADataset` ไผšๆ›ดๅŠ ๅคๆ‚๏ผŒ`data_list` ไธญ็š„ๆ•ฐๆฎๆ ทๆœฌๅŒ…ๅซๆœ‰ๅ…ณ่ง†้ข‘ๆ•ฐๆฎ็š„ๅ‡ ไธชๅญ—ๆฎตใ€‚ๆญคๅค–๏ผŒๅฎƒ้‡ๅ†™ไบ† `get_data_info(self, idx)` ไปฅ่ฝฌๆขๅœจๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹ๆ•ฐๆฎๆตๆฐด็บฟไธญ้œ€่ฆ็”จ็š„ๅญ—ๆฎตใ€‚ + +```python + +class AVADataset(BaseActionDataset): + ... + + def load_data_list(self) -> List[dict]: + ... + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + data_list.append(video_info) + return data_list + + def get_data_info(self, idx: int) -> dict: + ... + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + return data_info +``` + +## ไธบ PoseDataset ่‡ชๅฎšไน‰ๅ…ณ้”ฎ็‚นๆ ผๅผ + +MMAction2 ็›ฎๅ‰ๆ”ฏๆŒไธ‰็งๅ…ณ้”ฎ็‚นๆ ผๅผ๏ผš`coco`๏ผŒ`nturgb+d` ๅ’Œ `openpose`ใ€‚ๅฆ‚ๆžœไฝ ไฝฟ็”จๅ…ถไธญไธ€็งๆ ผๅผ๏ผŒไฝ ๅฏไปฅ็ฎ€ๅ•ๅœฐๅœจไปฅไธ‹ๆจกๅ—ไธญๆŒ‡ๅฎš็›ธๅบ”็š„ๆ ผๅผ๏ผš + +ๅฏนไบŽๅ›พๅท็งฏ็ฝ‘็ปœ๏ผŒๅฆ‚ AAGCN๏ผŒSTGCN๏ผŒ... 
+ +- `pipeline`๏ผšๅœจ `JointToBone` ไธญ็š„ๅ‚ๆ•ฐ `dataset`ใ€‚ +- `backbone`๏ผšๅœจๅ›พๅท็งฏ็ฝ‘็ปœไธญ็š„ๅ‚ๆ•ฐ `graph_cfg`ใ€‚ + +ๅฏนไบŽ PoseC3D๏ผš + +- `pipeline`๏ผšๅœจ `Flip` ไธญ๏ผŒๆ นๆฎๅ…ณ้”ฎ็‚น็š„ๅฏน็งฐๅ…ณ็ณปๆŒ‡ๅฎš `left_kp` ๅ’Œ `right_kp`ใ€‚ +- `pipeline`๏ผšๅœจ `GeneratePoseTarget` ไธญ๏ผŒๅฆ‚ๆžœ `with_limb` ไธบ `True`๏ผŒๆŒ‡ๅฎš`skeletons`๏ผŒ`left_limb`๏ผŒ`right_limb`๏ผŒๅฆ‚ๆžœ `with_kp` ไธบ `True`๏ผŒๆŒ‡ๅฎš`left_kp` ๅ’Œ `right_kp`ใ€‚ + +ๅฆ‚ๆžœไฝฟ็”จ่‡ชๅฎšไน‰ๅ…ณ้”ฎ็‚นๆ ผๅผ๏ผŒ้œ€่ฆๅœจ `backbone` ๅ’Œ `pipeline` ไธญ้ƒฝๅŒ…ๅซไธ€ไธชๆ–ฐ็š„ๅ›พๅธƒๅฑ€ใ€‚่ฟ™ไธชๅธƒๅฑ€ๅฐ†ๅฎšไน‰ๅ…ณ้”ฎ็‚นๅŠๅ…ถ่ฟžๆŽฅๅ…ณ็ณปใ€‚ + +ไปฅ `coco` ๆ•ฐๆฎ้›†ไธบไพ‹๏ผŒๆˆ‘ไปฌๅœจ `Graph` ไธญๅฎšไน‰ไบ†ไธ€ไธชๅไธบ `coco` ็š„ๅธƒๅฑ€ใ€‚่ฟ™ไธชๅธƒๅฑ€็š„ `inward` ่ฟžๆŽฅๅŒ…ๆ‹ฌๆ‰€ๆœ‰่Š‚็‚น่ฟžๆŽฅ๏ผŒๆฏไธช**ๅ‘ๅฟƒ**่ฟžๆŽฅ็”ฑไธ€ไธช่Š‚็‚นๅ…ƒ็ป„็ป„ๆˆใ€‚`coco`็š„้ขๅค–่ฎพ็ฝฎๅŒ…ๆ‹ฌๅฐ†่Š‚็‚นๆ•ฐๆŒ‡ๅฎšไธบ `17`๏ผŒๅฐ† `node 0` ่ฎพไธบไธญๅฟƒ่Š‚็‚นใ€‚ + +```python + +self.num_node = 17 +self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] +self.center = 0 +``` + +ๅŒๆ ท๏ผŒๆˆ‘ไปฌๅœจ `JointToBone` ไธญๅฎšไน‰ไบ† `pairs`๏ผŒๆทปๅŠ ไบ†ไธ€ไธช bone `(0, 0)` ไปฅไฝฟ bone ็š„ๆ•ฐ้‡ๅฏน้ฝๅˆฐ jointใ€‚cocoๆ•ฐๆฎ้›†็š„ `pairs` ๅฆ‚ไธ‹ๆ‰€็คบ๏ผŒ`JointToBone` ไธญ็š„ `pairs` ็š„้กบๅบๆ— ๅ…ณ็ดง่ฆใ€‚ + +```python + +self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), + (5, 0), (6, 0), (7, 5), (8, 6), (9, 7), + (10, 8), (11, 0), (12, 0), (13, 11), (14, 12), + (15, 13), (16, 14)) +``` + +่ฆไฝฟ็”จไฝ ็š„่‡ชๅฎšไน‰ๅ…ณ้”ฎ็‚นๆ ผๅผ๏ผŒๅช้œ€ๅฎšไน‰ไธŠ่ฟฐ่ฎพ็ฝฎไธบไฝ ็š„ๅ›พ็ป“ๆž„๏ผŒๅนถๅœจไฝ ็š„้…็ฝฎๆ–‡ไปถไธญๆŒ‡ๅฎšๅฎƒไปฌ๏ผŒๅฆ‚ไธ‹ๆ‰€็คบใ€‚ๅœจ่ฟ™ไธชไพ‹ๅญไธญ๏ผŒๆˆ‘ไปฌๅฐ†ไฝฟ็”จ `STGCN`๏ผŒๅ…ถไธญ `n` ่กจ็คบ็ฑปๅˆซ็š„ๆ•ฐ้‡๏ผŒ`custom_dataset` ๅœจ `Graph` ๅ’Œ `JointToBone` ไธญๅฎšไน‰ใ€‚ + +```python +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=n, in_channels=256)) + +train_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +val_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +test_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] 
+ +``` + +ๅช้œ€็ฎ€ๅ•ๅœฐๆŒ‡ๅฎš่‡ชๅฎšไน‰ๅธƒๅฑ€๏ผŒไฝ ๅฐฑๅฏไปฅไฝฟ็”จไฝ ่‡ชๅทฑ็š„ๅ…ณ้”ฎ็‚นๆ ผๅผ่ฟ›่กŒ่ฎญ็ปƒๅ’Œๆต‹่ฏ•ไบ†ใ€‚้€š่ฟ‡่ฟ™็งๆ–นๅผ๏ผŒMMAction2 ไธบ็”จๆˆทๆไพ›ไบ†ๅพˆๅคง็š„็ตๆดปๆ€ง๏ผŒๅ…่ฎธ็”จๆˆท่‡ชๅฎšไน‰ไป–ไปฌ็š„ๆ•ฐๆฎ้›†ๅ’Œๅ…ณ้”ฎ็‚นๆ ผๅผ๏ผŒไปฅๆปก่ถณไป–ไปฌ็‰นๅฎš็š„้œ€ๆฑ‚ใ€‚ + +ไปฅไธŠๅฐฑๆ˜ฏๅ…ณไบŽๅฆ‚ไฝ•่‡ชๅฎšไน‰ไฝ ็š„ๆ•ฐๆฎ้›†็š„ไธ€ไบ›ๆ–นๆณ•ใ€‚ๅธŒๆœ›่ฟ™ไธชๆ•™็จ‹่ƒฝๅธฎๅŠฉไฝ ็†่งฃMMAction2็š„ๆ•ฐๆฎ้›†็ป“ๆž„๏ผŒๅนถๆ•™็ป™ไฝ ๅฆ‚ไฝ•ๆ นๆฎ่‡ชๅทฑ็š„้œ€ๆฑ‚ๅˆ›ๅปบๆ–ฐ็š„ๆ•ฐๆฎ้›†ใ€‚่™ฝ็„ถ่ฟ™ๅฏ่ƒฝ้œ€่ฆไธ€ไบ›็ผ–็จ‹็Ÿฅ่ฏ†๏ผŒไฝ†ๆ˜ฏ MMAction2 ่ฏ•ๅ›พไฝฟ่ฟ™ไธช่ฟ‡็จ‹ๅฐฝๅฏ่ƒฝ็ฎ€ๅ•ใ€‚้€š่ฟ‡ไบ†่งฃ่ฟ™ไบ›ๅŸบๆœฌๆฆ‚ๅฟต๏ผŒไฝ ๅฐ†่ƒฝๅคŸๆ›ดๅฅฝๅœฐๆŽงๅˆถไฝ ็š„ๆ•ฐๆฎ๏ผŒไปŽ่€Œๆ”น่ฟ›ไฝ ็š„ๆจกๅž‹ๆ€ง่ƒฝใ€‚ diff --git a/docs/zh_cn/advanced_guides/customize_logging.md b/docs/zh_cn/advanced_guides/customize_logging.md new file mode 100644 index 0000000000000000000000000000000000000000..c0283ae4d8cad19f340c12b3cfaa0146b57fc613 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_logging.md @@ -0,0 +1,163 @@ +# ่‡ชๅฎšไน‰ๆ—ฅๅฟ— + +MMAction2 ๅœจ่ฟ่กŒ่ฟ‡็จ‹ไธญไผšไบง็”Ÿๅคง้‡็š„ๆ—ฅๅฟ—๏ผŒๅฆ‚ๆŸๅคฑใ€่ฟญไปฃๆ—ถ้—ดใ€ๅญฆไน ็އ็ญ‰ใ€‚ๅœจ่ฟ™ไธ€้ƒจๅˆ†๏ผŒๆˆ‘ไปฌๅฐ†ๅ‘ไฝ ไป‹็ปๅฆ‚ไฝ•่พ“ๅ‡บ่‡ชๅฎšไน‰ๆ—ฅๅฟ—ใ€‚ๆœ‰ๅ…ณๆ—ฅๅฟ—็ณป็ปŸ็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒ [MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html)ใ€‚ + +- [่‡ชๅฎšไน‰ๆ—ฅๅฟ—](#่‡ชๅฎšไน‰ๆ—ฅๅฟ—) + - [็ตๆดป็š„ๆ—ฅๅฟ—็ณป็ปŸ](#็ตๆดป็š„ๆ—ฅๅฟ—็ณป็ปŸ) + - [ๅฎšๅˆถๆ—ฅๅฟ—](#ๅฎšๅˆถๆ—ฅๅฟ—) + - [ๅฏผๅ‡บ่ฐƒ่ฏ•ๆ—ฅๅฟ—](#ๅฏผๅ‡บ่ฐƒ่ฏ•ๆ—ฅๅฟ—) + +## ็ตๆดป็š„ๆ—ฅๅฟ—็ณป็ปŸ + +้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒMMAction2 ็š„ๆ—ฅๅฟ—็ณป็ปŸ็”ฑ [default_runtime](/configs/_base_/default_runtime.py) ไธญ็š„ `LogProcessor` ้…็ฝฎ๏ผš + +```python +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +``` + +้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒ`LogProcessor` ๆ•่Žท `model.forward` ่ฟ”ๅ›ž็š„ๆ‰€ๆœ‰ไปฅ `loss` ๅผ€ๅคด็š„ๅญ—ๆฎตใ€‚ไพ‹ๅฆ‚๏ผŒๅœจไปฅไธ‹ๆจกๅž‹ไธญ๏ผŒ`loss1` ๅ’Œ `loss2` ๅฐ†ๅœจๆฒกๆœ‰ไปปไฝ•้ขๅค–้…็ฝฎ็š„ๆƒ…ๅ†ตไธ‹่‡ชๅŠจ่ฎฐๅฝ•ๅˆฐๆ—ฅๅฟ—ใ€‚ + +```python +from mmengine.model import BaseModel + +class ToyModel(BaseModel): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss1 = (feat - label).pow(2) + loss2 = (feat - label).abs() + return dict(loss1=loss1, loss2=loss2) +``` + +่พ“ๅ‡บๆ—ฅๅฟ—้ตๅพชไปฅไธ‹ๆ ผๅผ๏ผš + +``` +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388 +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290 +``` + +`LogProcessor` ๅฐ†ๆŒ‰ไปฅไธ‹ๆ ผๅผ่พ“ๅ‡บๆ—ฅๅฟ—๏ผš + +- ๆ—ฅๅฟ—็š„ๅ‰็ผ€๏ผš + - epoch ๆจกๅผ(`by_epoch=True`)๏ผš`Epoch(train) [{current_epoch}/{current_iteration}]/{dataloader_length}` + - iteration ๆจกๅผ(`by_epoch=False`)๏ผš`Iter(train) [{current_iteration}/{max_iteration}]` +- ๅญฆไน ็އ (`lr`)๏ผšๆœ€ๅŽไธ€ๆฌก่ฟญไปฃ็š„ๅญฆไน ็އใ€‚ +- ๆ—ถ้—ด๏ผš + - `time`๏ผš่ฟ‡ๅŽป `window_size` ๆฌก่ฟญไปฃ็š„ๆŽจ็†ๅนณๅ‡ๆ—ถ้—ดใ€‚ + - `data_time`๏ผš่ฟ‡ๅŽป `window_size` ๆฌก่ฟญไปฃ็š„ๆ•ฐๆฎๅŠ ่ฝฝๅนณๅ‡ๆ—ถ้—ดใ€‚ + - `eta`๏ผšๅฎŒๆˆ่ฎญ็ปƒ็š„้ข„่ฎกๅˆฐ่พพๆ—ถ้—ดใ€‚ +- ๆŸๅคฑ๏ผš่ฟ‡ๅŽป `window_size` ๆฌก่ฟญไปฃไธญๆจกๅž‹่พ“ๅ‡บ็š„ๅนณๅ‡ๆŸๅคฑใ€‚ + +```{warning} +้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒlog_processor ่พ“ๅ‡บๅŸบไบŽ epoch ็š„ๆ—ฅๅฟ—(`by_epoch=True`)ใ€‚่ฆๅพ—ๅˆฐไธŽ `train_cfg` ๅŒน้…็š„้ข„ๆœŸๆ—ฅๅฟ—๏ผŒๆˆ‘ไปฌๅบ”ๅœจ `train_cfg` ๅ’Œ `log_processor` ไธญ่ฎพ็ฝฎ็›ธๅŒ็š„ `by_epoch` ๅ€ผใ€‚ +``` + +ๆ 
นๆฎไปฅไธŠ่ง„ๅˆ™๏ผŒไปฃ็ ็‰‡ๆฎตๅฐ†ๆฏ20ๆฌก่ฟญไปฃ่ฎก็ฎ— loss1 ๅ’Œ loss2 ็š„ๅนณๅ‡ๅ€ผใ€‚ๆ›ดๅคš็ฑปๅž‹็š„็ปŸ่ฎกๆ–นๆณ•๏ผŒ่ฏทๅ‚่€ƒ [mmengine.runner.LogProcessor](mmengine.runner.LogProcessor)ใ€‚ + +## ๅฎšๅˆถๆ—ฅๅฟ— + +ๆ—ฅๅฟ—็ณป็ปŸไธไป…ๅฏไปฅ่ฎฐๅฝ• `loss`๏ผŒ`lr` ็ญ‰๏ผŒ่ฟ˜ๅฏไปฅๆ”ถ้›†ๅ’Œ่พ“ๅ‡บ่‡ชๅฎšไน‰ๆ—ฅๅฟ—ใ€‚ไพ‹ๅฆ‚๏ผŒๅฆ‚ๆžœๆˆ‘ไปฌๆƒณ่ฆ็ปŸ่ฎกไธญ้—ดๆŸๅคฑ๏ผš + +`ToyModel` ๅœจ forward ไธญ่ฎก็ฎ— `loss_tmp`๏ผŒไฝ†ไธๅฐ†ๅ…ถไฟๅญ˜ๅˆฐ่ฟ”ๅ›žๅญ—ๅ…ธไธญใ€‚ + +```python +from mmengine.logging import MessageHub + +class ToyModel(BaseModel): + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss_tmp = (feat - label).abs() + loss = loss_tmp.pow(2) + + message_hub = MessageHub.get_current_instance() + # ๅœจๆถˆๆฏไธญๅฟƒๆ›ดๆ–ฐไธญ้—ด็š„ `loss_tmp` + message_hub.update_scalar('train/loss_tmp', loss_tmp.sum()) + return dict(loss=loss) +``` + +ๅฐ† `loss_tmp` ๆทปๅŠ ๅˆฐ้…็ฝฎไธญ๏ผš + +```python +log_processor = dict( + type='LogProcessor', + window_size=20, + by_epoch=True, + custom_cfg=[ + # ไฝฟ็”จๅนณๅ‡ๅ€ผ็ปŸ่ฎก loss_tmp + dict( + data_src='loss_tmp', + window_size=20, + method_name='mean') + ]) +``` + +`loss_tmp` ๅฐ†่ขซๆทปๅŠ ๅˆฐ่พ“ๅ‡บๆ—ฅๅฟ—ไธญ๏ผš + +``` +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000 +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000 +``` + +## ๅฏผๅ‡บ่ฐƒ่ฏ•ๆ—ฅๅฟ— + +่ฆๅฐ†่ฐƒ่ฏ•ๆ—ฅๅฟ—ๅฏผๅ‡บๅˆฐ `work_dir`๏ผŒไฝ ๅฏไปฅๅœจ้…็ฝฎๆ–‡ไปถไธญ่ฎพ็ฝฎๆ—ฅๅฟ—็บงๅˆซๅฆ‚ไธ‹๏ผš + +``` +log_level='DEBUG' +``` + +``` +08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend +08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook +08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine" +... +``` + +ๆญคๅค–๏ผŒๅฆ‚ๆžœไฝ ๆญฃๅœจไฝฟ็”จๅ…ฑไบซๅญ˜ๅ‚จ่ฎญ็ปƒไฝ ็š„ๆจกๅž‹๏ผŒ้‚ฃไนˆๅœจ `debug` ๆจกๅผไธ‹๏ผŒไธๅŒๆŽ’ๅ็š„ๆ—ฅๅฟ—ๅฐ†่ขซไฟๅญ˜ใ€‚ๆ—ฅๅฟ—็š„ๅฑ‚็บง็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +```text +./tmp +โ”œโ”€โ”€ tmp.log +โ”œโ”€โ”€ tmp_rank1.log +โ”œโ”€โ”€ tmp_rank2.log +โ”œโ”€โ”€ tmp_rank3.log +โ”œโ”€โ”€ tmp_rank4.log +โ”œโ”€โ”€ tmp_rank5.log +โ”œโ”€โ”€ tmp_rank6.log +โ””โ”€โ”€ tmp_rank7.log +... +โ””โ”€โ”€ tmp_rank63.log +``` + +ๅœจๅ…ทๆœ‰็‹ฌ็ซ‹ๅญ˜ๅ‚จ็š„ๅคšๅฐๆœบๅ™จไธŠ็š„ๆ—ฅๅฟ—๏ผš + +```text +# ่ฎพๅค‡๏ผš0๏ผš +work_dir/ +โ””โ”€โ”€ exp_name_logs + โ”œโ”€โ”€ exp_name.log + โ”œโ”€โ”€ exp_name_rank1.log + โ”œโ”€โ”€ exp_name_rank2.log + โ”œโ”€โ”€ exp_name_rank3.log + ... + โ””โ”€โ”€ exp_name_rank7.log + +# ่ฎพๅค‡๏ผš7๏ผš +work_dir/ +โ””โ”€โ”€ exp_name_logs + โ”œโ”€โ”€ exp_name_rank56.log + โ”œโ”€โ”€ exp_name_rank57.log + โ”œโ”€โ”€ exp_name_rank58.log + ... 
+ โ””โ”€โ”€ exp_name_rank63.log +``` diff --git a/docs/zh_cn/advanced_guides/customize_models.md b/docs/zh_cn/advanced_guides/customize_models.md new file mode 100644 index 0000000000000000000000000000000000000000..32fc255b0097be6fe90ee656efb8f3be6dac35d1 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_models.md @@ -0,0 +1,3 @@ +# ่‡ชๅฎšไน‰ๆจกๅž‹ + +ๅ†…ๅฎนๅปบ่ฎพไธญ... diff --git a/docs/zh_cn/advanced_guides/customize_optimizer.md b/docs/zh_cn/advanced_guides/customize_optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..6e09d100a916788c7d3862f717573fc3f51407e0 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_optimizer.md @@ -0,0 +1,332 @@ +# ่‡ชๅฎšไน‰ไผ˜ๅŒ–ๅ™จ + +ๅœจๆœฌๆ•™็จ‹ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ไป‹็ปไธ€ไบ›ๆž„ๅปบไผ˜ๅŒ–ๅ™จๅ’Œๅญฆไน ็އ็ญ–็•ฅ็š„ๆ–นๆณ•๏ผŒไปฅ็”จไบŽไฝ ็š„ไปปๅŠกใ€‚ + +- [่‡ชๅฎšไน‰ไผ˜ๅŒ–ๅ™จ](#่‡ชๅฎšไน‰ไผ˜ๅŒ–ๅ™จ) + - [ไฝฟ็”จ optim_wrapper ๆž„ๅปบไผ˜ๅŒ–ๅ™จ](#ไฝฟ็”จ-optim_wrapper-ๆž„ๅปบไผ˜ๅŒ–ๅ™จ) + - [ไฝฟ็”จ PyTorch ๆ”ฏๆŒ็š„ไผ˜ๅŒ–ๅ™จ](#ไฝฟ็”จ-pytorch-ๆ”ฏๆŒ็š„ไผ˜ๅŒ–ๅ™จ) + - [ๅ‚ๆ•ฐๅŒ–็ฒพ็ป†้…็ฝฎ](#ๅ‚ๆ•ฐๅŒ–็ฒพ็ป†้…็ฝฎ) + - [ๆขฏๅบฆ่ฃๅ‰ช](#ๆขฏๅบฆ่ฃๅ‰ช) + - [ๆขฏๅบฆ็ดฏ็งฏ](#ๆขฏๅบฆ็ดฏ็งฏ) + - [่‡ชๅฎšไน‰ๅ‚ๆ•ฐ็ญ–็•ฅ](#่‡ชๅฎšไน‰ๅ‚ๆ•ฐ็ญ–็•ฅ) + - [่‡ชๅฎšไน‰ๅญฆไน ็އ็ญ–็•ฅ](#่‡ชๅฎšไน‰ๅญฆไน ็އ็ญ–็•ฅ) + - [่‡ชๅฎšไน‰ๅŠจ้‡็ญ–็•ฅ](#่‡ชๅฎšไน‰ๅŠจ้‡็ญ–็•ฅ) + - [ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆˆ–ๆž„้€ ๅ™จ](#ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆˆ–ๆž„้€ ๅ™จ) + - [ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ](#ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ) + - [1. ๅฎž็Žฐไธ€ไธชๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ](#1-ๅฎž็Žฐไธ€ไธชๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ) + - [2. ๅฏผๅ…ฅไผ˜ๅŒ–ๅ™จ](#2-ๅฏผๅ…ฅไผ˜ๅŒ–ๅ™จ) + - [3. ๅœจ้…็ฝฎๆ–‡ไปถไธญๆŒ‡ๅฎšไผ˜ๅŒ–ๅ™จ](#3-ๅœจ้…็ฝฎๆ–‡ไปถไธญๆŒ‡ๅฎšไผ˜ๅŒ–ๅ™จ) + - [ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆž„้€ ๅ™จ](#ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆž„้€ ๅ™จ) + +## ไฝฟ็”จ optim_wrapper ๆž„ๅปบไผ˜ๅŒ–ๅ™จ + +ๆˆ‘ไปฌไฝฟ็”จ `optim_wrapper` ๅญ—ๆฎตๆฅ้…็ฝฎไผ˜ๅŒ–็ญ–็•ฅ๏ผŒๅ…ถไธญๅŒ…ๆ‹ฌ้€‰ๆ‹ฉไผ˜ๅŒ–ๅ™จใ€ๅ‚ๆ•ฐ้€ไธช้…็ฝฎใ€ๆขฏๅบฆ่ฃๅ‰ชๅ’Œๆขฏๅบฆ็ดฏ็งฏใ€‚ไธ€ไธช็ฎ€ๅ•็š„็คบไพ‹ๅฏไปฅๆ˜ฏ๏ผš + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001) +) +``` + +ๅœจไธŠ้ข็š„็คบไพ‹ไธญ๏ผŒๆˆ‘ไปฌๆž„ๅปบไบ†ไธ€ไธชๅญฆไน ็އไธบ 0.0003๏ผŒๆƒ้‡่กฐๅ‡ไธบ 0.0001 ็š„ SGD ไผ˜ๅŒ–ๅ™จใ€‚ + +### ไฝฟ็”จ PyTorch ๆ”ฏๆŒ็š„ไผ˜ๅŒ–ๅ™จ + +ๆˆ‘ไปฌๆ”ฏๆŒ PyTorch ๅฎž็Žฐ็š„ๆ‰€ๆœ‰ไผ˜ๅŒ–ๅ™จใ€‚่ฆไฝฟ็”จไธๅŒ็š„ไผ˜ๅŒ–ๅ™จ๏ผŒๅช้œ€ๆ›ดๆ”น้…็ฝฎๆ–‡ไปถไธญ็š„ `optimizer` ๅญ—ๆฎตใ€‚ไพ‹ๅฆ‚๏ผŒๅฆ‚ๆžœๆƒณไฝฟ็”จ `torch.optim.Adam`๏ผŒๅฏไปฅๅœจ้…็ฝฎๆ–‡ไปถไธญ่ฟ›่กŒๅฆ‚ไธ‹ไฟฎๆ”นใ€‚ + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer = dict( + type='Adam', + lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False), +) +``` + +้ฆ–ๅ…ˆ๏ผŒๆˆ‘ไปฌ้œ€่ฆๅฐ† `type` ็š„ๅ€ผๆ›ดๆ”นไธบ `torch.optim` ๆ”ฏๆŒ็š„ๆœŸๆœ›ไผ˜ๅŒ–ๅ™จๅ็งฐใ€‚็„ถๅŽ๏ผŒๅฐ†่ฏฅไผ˜ๅŒ–ๅ™จ็š„ๅฟ…่ฆๅ‚ๆ•ฐๆทปๅŠ ๅˆฐ `optimizer` ๅญ—ๆฎตไธญใ€‚ไธŠ่ฟฐ้…็ฝฎๅฐ†ๆž„ๅปบไปฅไธ‹ไผ˜ๅŒ–ๅ™จ๏ผš + +```python +torch.optim.Adam(lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False) +``` + +### ๅ‚ๆ•ฐๅŒ–็ฒพ็ป†้…็ฝฎ + +ไธ€ไบ›ๆจกๅž‹ๅฏ่ƒฝๅฏนไผ˜ๅŒ–ๆœ‰็‰นๅฎš็š„ๅ‚ๆ•ฐ่ฎพ็ฝฎ๏ผŒไพ‹ๅฆ‚ๅฏนไบŽ BatchNorm ๅฑ‚ไธไฝฟ็”จๆƒ้‡่กฐๅ‡๏ผŒๆˆ–่€…ๅฏนไธๅŒ็ฝ‘็ปœๅฑ‚ไฝฟ็”จไธๅŒ็š„ๅญฆไน ็އใ€‚ไธบไบ†ๅฏนๅ…ถ่ฟ›่กŒ็ป†่‡ด้…็ฝฎ๏ผŒๆˆ‘ไปฌๅฏไปฅไฝฟ็”จ `optim_wrapper` ไธญ็š„ `paramwise_cfg` ๅ‚ๆ•ฐใ€‚ + +- **ไธบไธๅŒ็ฑปๅž‹็š„ๅ‚ๆ•ฐ่ฎพ็ฝฎไธๅŒ็š„่ถ…ๅ‚ๆ•ฐๅ€ๆ•ฐใ€‚** + + ไพ‹ๅฆ‚๏ผŒๆˆ‘ไปฌๅฏไปฅๅœจ `paramwise_cfg` ไธญ่ฎพ็ฝฎ `norm_decay_mult=0.`๏ผŒๅฐ†ๅฝ’ไธ€ๅŒ–ๅฑ‚็š„ๆƒ้‡่กฐๅ‡่ฎพ็ฝฎไธบ้›ถใ€‚ + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4), + 
paramwise_cfg=dict(norm_decay_mult=0.)) + ``` + + ่ฟ˜ๆ”ฏๆŒ่ฎพ็ฝฎๅ…ถไป–็ฑปๅž‹็š„ๅ‚ๆ•ฐ๏ผŒๅŒ…ๆ‹ฌ๏ผš + + - `lr_mult`๏ผšๆ‰€ๆœ‰ๅ‚ๆ•ฐ็š„ๅญฆไน ็އไน˜ๆ•ฐใ€‚ + - `decay_mult`๏ผšๆ‰€ๆœ‰ๅ‚ๆ•ฐ็š„ๆƒ้‡่กฐๅ‡ไน˜ๆ•ฐใ€‚ + - `bias_lr_mult`๏ผšๅ็ฝฎ้กน็š„ๅญฆไน ็އไน˜ๆ•ฐ๏ผˆไธๅŒ…ๆ‹ฌๅฝ’ไธ€ๅŒ–ๅฑ‚็š„ๅ็ฝฎ้กนๅ’Œๅฏๅ˜ๅฝขๅท็งฏๅฑ‚็š„ๅ็งป้‡๏ผ‰ใ€‚้ป˜่ฎคไธบ 1ใ€‚ + - `bias_decay_mult`๏ผšๅ็ฝฎ้กน็š„ๆƒ้‡่กฐๅ‡ไน˜ๆ•ฐ๏ผˆไธๅŒ…ๆ‹ฌๅฝ’ไธ€ๅŒ–ๅฑ‚็š„ๅ็ฝฎ้กนๅ’Œๅฏๅ˜ๅฝขๅท็งฏๅฑ‚็š„ๅ็งป้‡๏ผ‰ใ€‚้ป˜่ฎคไธบ 1ใ€‚ + - `norm_decay_mult`๏ผšๅฝ’ไธ€ๅŒ–ๅฑ‚ๆƒ้‡ๅ’Œๅ็ฝฎ้กน็š„ๆƒ้‡่กฐๅ‡ไน˜ๆ•ฐใ€‚้ป˜่ฎคไธบ 1ใ€‚ + - `dwconv_decay_mult`๏ผšๆทฑๅบฆๅท็งฏๅฑ‚็š„ๆƒ้‡่กฐๅ‡ไน˜ๆ•ฐใ€‚้ป˜่ฎคไธบ 1ใ€‚ + - `bypass_duplicate`๏ผšๆ˜ฏๅฆ่ทณ่ฟ‡้‡ๅค็š„ๅ‚ๆ•ฐใ€‚้ป˜่ฎคไธบ `False`ใ€‚ + - `dcn_offset_lr_mult`๏ผšๅฏๅ˜ๅฝขๅท็งฏๅฑ‚็š„ๅญฆไน ็އไน˜ๆ•ฐใ€‚้ป˜่ฎคไธบ 1ใ€‚ + +- **ไธบ็‰นๅฎšๅ‚ๆ•ฐ่ฎพ็ฝฎไธๅŒ็š„่ถ…ๅ‚ๆ•ฐๅ€ๆ•ฐใ€‚** + + MMAction2 ๅฏไปฅไฝฟ็”จ `paramwise_cfg` ไธญ็š„ `custom_keys` ๆฅๆŒ‡ๅฎšไธๅŒ็š„ๅ‚ๆ•ฐไฝฟ็”จไธๅŒ็š„ๅญฆไน ็އๆˆ–ๆƒ้‡่กฐๅ‡ใ€‚ + + ไพ‹ๅฆ‚๏ผŒ่ฆๅฐ† `backbone.layer0` ็š„ๆ‰€ๆœ‰ๅญฆไน ็އๅ’Œๆƒ้‡่กฐๅ‡่ฎพ็ฝฎไธบ 0๏ผŒ่€ŒไฟๆŒ `backbone` ็š„ๅ…ถไฝ™้ƒจๅˆ†ไธŽไผ˜ๅŒ–ๅ™จ็›ธๅŒ๏ผŒๅนถๅฐ† `head` ็š„ๅญฆไน ็އ่ฎพ็ฝฎไธบ 0.001๏ผŒๅฏไปฅไฝฟ็”จไปฅไธ‹้…็ฝฎ๏ผš + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone': dict(lr_mult=1), + 'head': dict(lr_mult=0.1) + })) + ``` + +### ๆขฏๅบฆ่ฃๅ‰ช + +ๅœจ่ฎญ็ปƒ่ฟ‡็จ‹ไธญ๏ผŒๆŸๅคฑๅ‡ฝๆ•ฐๅฏ่ƒฝๆŽฅ่ฟ‘ๆ‚ฌๅด–ๅŒบๅŸŸ๏ผŒๅฏผ่‡ดๆขฏๅบฆ็ˆ†็‚ธใ€‚ๆขฏๅบฆ่ฃๅ‰ชๆœ‰ๅŠฉไบŽ็จณๅฎš่ฎญ็ปƒ่ฟ‡็จ‹ใ€‚ๆขฏๅบฆ่ฃๅ‰ช็š„ๆ›ดๅคšไป‹็ปๅฏไปฅๅœจ[่ฟ™ไธช้กต้ข](https://paperswithcode.com/method/gradient-clipping)ๆ‰พๅˆฐใ€‚ + +็›ฎๅ‰๏ผŒๆˆ‘ไปฌๆ”ฏๆŒ `optim_wrapper` ไธญ็š„ `clip_grad` ้€‰้กน่ฟ›่กŒๆขฏๅบฆ่ฃๅ‰ช๏ผŒๅ‚่€ƒ[PyTorch ๆ–‡ๆกฃ](torch.nn.utils.clip_grad_norm_)ใ€‚ + +ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผš + +```python +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + # norm_type: ไฝฟ็”จ็š„ p-่Œƒๆ•ฐ็š„็ฑปๅž‹๏ผŒ่ฟ™้‡Œ norm_type ไธบ 2ใ€‚ + clip_grad=dict(max_norm=35, norm_type=2)) +``` + +### ๆขฏๅบฆ็ดฏ็งฏ + +ๅฝ“่ฎก็ฎ—่ต„ๆบๆœ‰้™ๆ—ถ๏ผŒๆ‰น้‡ๅคงๅฐๅช่ƒฝ่ฎพ็ฝฎไธบ่พƒๅฐ็š„ๅ€ผ๏ผŒ่ฟ™ๅฏ่ƒฝไผšๅฝฑๅ“ๆจกๅž‹็š„ๆ€ง่ƒฝใ€‚ๅฏไปฅไฝฟ็”จๆขฏๅบฆ็ดฏ็งฏๆฅ่งฃๅ†ณ่ฟ™ไธช้—ฎ้ข˜ใ€‚ๆˆ‘ไปฌๆ”ฏๆŒ `optim_wrapper` ไธญ็š„ `accumulative_counts` ้€‰้กน่ฟ›่กŒๆขฏๅบฆ็ดฏ็งฏใ€‚ + +ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผš + +```python +train_dataloader = dict(batch_size=64) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + accumulative_counts=4) +``` + +่กจ็คบๅœจ่ฎญ็ปƒ่ฟ‡็จ‹ไธญ๏ผŒๆฏ 4 ไธช่ฟญไปฃๆ‰ง่กŒไธ€ๆฌกๅๅ‘ไผ ๆ’ญใ€‚ไธŠ่ฟฐ็คบไพ‹็ญ‰ไปทไบŽ๏ผš + +```python +train_dataloader = dict(batch_size=256) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001)) +``` + +## ๆ–ฐๅขžไผ˜ๅŒ–ๅ™จๆˆ–่€…ไผ˜ๅŒ–ๅ™จๆž„้€ ๅ™จ + +ๅœจ่ฎญ็ปƒไธญ๏ผŒไผ˜ๅŒ–ๅ‚ๆ•ฐ๏ผˆๅฆ‚ๅญฆไน ็އใ€ๅŠจ้‡็ญ‰๏ผ‰้€šๅธธไธๆ˜ฏๅ›บๅฎš็š„๏ผŒ่€Œๆ˜ฏ้š็€่ฟญไปฃๆˆ–ๅ‘จๆœŸ็š„ๅ˜ๅŒ–่€Œๅ˜ๅŒ–ใ€‚PyTorch ๆ”ฏๆŒๅ‡ ็งๅญฆไน ็އ็ญ–็•ฅ๏ผŒไฝ†ๅฏนไบŽๅคๆ‚็š„็ญ–็•ฅๅฏ่ƒฝไธ่ถณๅคŸใ€‚ๅœจ MMAction2 ไธญ๏ผŒๆˆ‘ไปฌๆไพ› `param_scheduler` ๆฅๆ›ดๅฅฝๅœฐๆŽงๅˆถไธๅŒๅ‚ๆ•ฐ็š„ๅญฆไน ็އ็ญ–็•ฅใ€‚ + +### ้…็ฝฎๅญฆไน ็އ่ฐƒๆ•ด็ญ–็•ฅ + +่ฐƒๆ•ดๅญฆไน ็އ็ญ–็•ฅ่ขซๅนฟๆณ›็”จไบŽๆ้ซ˜ๆ€ง่ƒฝใ€‚ๆˆ‘ไปฌๆ”ฏๆŒๅคงๅคšๆ•ฐ PyTorch ๅญฆไน ็އ็ญ–็•ฅ๏ผŒๅŒ…ๆ‹ฌ `ExponentialLR`ใ€`LinearLR`ใ€`StepLR`ใ€`MultiStepLR` ็ญ‰ใ€‚ + +ๆ‰€ๆœ‰ๅฏ็”จ็š„ๅญฆไน ็އ็ญ–็•ฅๅฏไปฅๅœจ[่ฟ™้‡Œ](https://mmaction2.readthedocs.io/en/latest/schedulers.html)ๆ‰พๅˆฐ๏ผŒๅญฆไน ็އ็ญ–็•ฅ็š„ๅ็งฐไปฅ `LR` ็ป“ๅฐพใ€‚ + +- 
**ๅ•ไธ€ๅญฆไน ็އ็ญ–็•ฅ** + + ๅœจๅคงๅคšๆ•ฐๆƒ…ๅ†ตไธ‹๏ผŒๆˆ‘ไปฌๅชไฝฟ็”จไธ€ไธชๅญฆไน ็ญ–็•ฅไปฅ็ฎ€ๅŒ–้—ฎ้ข˜ใ€‚ไพ‹ๅฆ‚๏ผŒ`MultiStepLR` ่ขซ็”จไฝœ ResNet ็š„้ป˜่ฎคๅญฆไน ็އ็ญ–็•ฅใ€‚ๅœจ่ฟ™้‡Œ๏ผŒ`param_scheduler` ๆ˜ฏไธ€ไธชๅญ—ๅ…ธใ€‚ + + ```python + param_scheduler = dict( + type='MultiStepLR', + by_epoch=True, + milestones=[100, 150], + gamma=0.1) + ``` + + ๆˆ–่€…๏ผŒๆˆ‘ไปฌๆƒณไฝฟ็”จ `CosineAnnealingLR` ็ญ–็•ฅๆฅ่กฐๅ‡ๅญฆไน ็އ๏ผš + + ```python + param_scheduler = dict( + type='CosineAnnealingLR', + by_epoch=True, + T_max=num_epochs) + ``` + +- **ๅคšไธชๅญฆไน ็އ็ญ–็•ฅ** + + ๅœจๆŸไบ›่ฎญ็ปƒๆกˆไพ‹ไธญ๏ผŒไธบไบ†ๆ้ซ˜ๅ‡†็กฎๆ€ง๏ผŒไผšๅบ”็”จๅคšไธชๅญฆไน ็އ็ญ–็•ฅใ€‚ไพ‹ๅฆ‚๏ผŒๅœจๆ—ฉๆœŸ้˜ถๆฎต๏ผŒ่ฎญ็ปƒๅฎนๆ˜“ไธ็จณๅฎš๏ผŒ้ข„็ƒญๆ˜ฏไธ€็งๅ‡ๅฐ‘ไธ็จณๅฎšๆ€ง็š„ๆŠ€ๆœฏใ€‚ๅญฆไน ็އๅฐ†ไปŽไธ€ไธช่พƒๅฐ็š„ๅ€ผ้€ๆธๅขžๅŠ ๅˆฐ้ข„ๆœŸๅ€ผ๏ผŒ้€š่ฟ‡้ข„็ƒญ่ฟ›่กŒ่กฐๅ‡ๅ’Œๅ…ถไป–็ญ–็•ฅ่ฟ›่กŒ่กฐๅ‡ใ€‚ + + ๅœจ MMAction2 ไธญ๏ผŒ้€š่ฟ‡ๅฐ†ๆ‰€้œ€็š„็ญ–็•ฅ็ป„ๅˆๆˆ `param_scheduler` ็š„ๅˆ—่กจๅณๅฏๅฎž็Žฐ้ข„็ƒญ็ญ–็•ฅใ€‚ + + ไปฅไธ‹ๆ˜ฏไธ€ไบ›็คบไพ‹๏ผš + + 1. ๅœจๅ‰ 50 ไธช่ฟญไปฃไธญ่ฟ›่กŒ็บฟๆ€ง้ข„็ƒญใ€‚ + + ```python + param_scheduler = [ + # ็บฟๆ€ง้ข„็ƒญ + dict(type='LinearLR', + start_factor=0.001, + by_epoch=False, # ๆŒ‰่ฟญไปฃ + end=50), # ไป…ๅœจๅ‰ 50 ไธช่ฟญไปฃไธญ่ฟ›่กŒ้ข„็ƒญ + # ไธป่ฆ็š„ๅญฆไน ็އ็ญ–็•ฅ + dict(type='MultiStepLR', + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + ] + ``` + + 2. ๅœจๅ‰ 10 ไธชๅ‘จๆœŸไธญ่ฟ›่กŒ็บฟๆ€ง้ข„็ƒญ๏ผŒๅนถๅœจๆฏไธชๅ‘จๆœŸๅ†…ๆŒ‰่ฟญไปฃๆ›ดๆ–ฐๅญฆไน ็އใ€‚ + + ```python + param_scheduler = [ + # ็บฟๆ€ง้ข„็ƒญ [0, 10) ไธชๅ‘จๆœŸ + dict(type='LinearLR', + start_factor=0.001, + by_epoch=True, + end=10, + convert_to_iter_based=True, # ๆŒ‰่ฟญไปฃๆ›ดๆ–ฐๅญฆไน ็އ + ), + # ๅœจ 10 ไธชๅ‘จๆœŸๅŽไฝฟ็”จ CosineAnnealing ็ญ–็•ฅ + dict(type='CosineAnnealingLR', by_epoch=True, begin=10) + ] + ``` + + ๆณจๆ„๏ผŒๆˆ‘ไปฌๅœจ่ฟ™้‡Œไฝฟ็”จ `begin` ๅ’Œ `end` ๅ‚ๆ•ฐๆฅๆŒ‡ๅฎšๆœ‰ๆ•ˆ่Œƒๅ›ด๏ผŒ่ฏฅ่Œƒๅ›ดไธบ \[`begin`, `end`)ใ€‚่Œƒๅ›ด็š„ๅ•ไฝ็”ฑ `by_epoch` ๅ‚ๆ•ฐๅฎšไน‰ใ€‚ๅฆ‚ๆžœๆœชๆŒ‡ๅฎš๏ผŒๅˆ™ `begin` ไธบ 0๏ผŒ`end` ไธบๆœ€ๅคงๅ‘จๆœŸๆˆ–่ฟญไปฃๆฌกๆ•ฐใ€‚ + + ๅฆ‚ๆžœๆ‰€ๆœ‰็ญ–็•ฅ็š„่Œƒๅ›ด้ƒฝไธ่ฟž็ปญ๏ผŒๅˆ™ๅญฆไน ็އๅฐ†ๅœจๅฟฝ็•ฅ็š„่Œƒๅ›ดๅ†…ไฟๆŒไธๅ˜๏ผŒๅฆๅˆ™ๆ‰€ๆœ‰ๆœ‰ๆ•ˆ็š„็ญ–็•ฅๅฐ†ๆŒ‰็‰นๅฎš้˜ถๆฎต็š„้กบๅบๆ‰ง่กŒ๏ผŒ่ฟ™ไธŽ PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler) ็š„่กŒไธบ็›ธๅŒใ€‚ + +### ่‡ชๅฎšไน‰ๅŠจ้‡็ญ–็•ฅ + +ๆˆ‘ไปฌๆ”ฏๆŒไฝฟ็”จๅŠจ้‡็ญ–็•ฅๆ นๆฎๅญฆไน ็އไฟฎๆ”นไผ˜ๅŒ–ๅ™จ็š„ๅŠจ้‡๏ผŒ่ฟ™ๅฏไปฅไฝฟๆŸๅคฑไปฅๆ›ดๅฟซ็š„ๆ–นๅผๆ”ถๆ•›ใ€‚ไฝฟ็”จๆ–นๆณ•ไธŽๅญฆไน ็އ็ญ–็•ฅ็›ธๅŒใ€‚ + +ๆ‰€ๆœ‰ๅฏ็”จ็š„ๅญฆไน ็އ็ญ–็•ฅๅฏไปฅๅœจ[่ฟ™้‡Œ](https://mmaction2.readthedocs.io/en/latest/schedulers.html)ๆ‰พๅˆฐ๏ผŒๅŠจ้‡็ญ–็•ฅ็š„ๅ็งฐไปฅ `Momentum` ็ป“ๅฐพใ€‚ + +ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผš + +```python +param_scheduler = [ + # ๅญฆไน ็އ็ญ–็•ฅ + dict(type='LinearLR', ...), + # ๅŠจ้‡็ญ–็•ฅ + dict(type='LinearMomentum', + start_factor=0.001, + by_epoch=False, + begin=0, + end=1000) +] +``` + +## ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆˆ–ๆž„้€ ๅ™จ + +ๆœฌ้ƒจๅˆ†ๅฐ†ไฟฎๆ”น MMAction2 ๆบไปฃ็ ๆˆ–ๅ‘ MMAction2 ๆก†ๆžถไธญๆทปๅŠ ไปฃ็ ๏ผŒๅˆๅญฆ่€…ๅฏไปฅ่ทณ่ฟ‡ๆญค้ƒจๅˆ†ใ€‚ + +### ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ + +ๅœจๅญฆๆœฏ็ ”็ฉถๅ’Œๅทฅไธšๅฎž่ทตไธญ๏ผŒๅฏ่ƒฝ้œ€่ฆไฝฟ็”จ MMAction2 ๆœชๅฎž็Žฐ็š„ไผ˜ๅŒ–ๆ–นๆณ•๏ผŒๅฏไปฅ้€š่ฟ‡ไปฅไธ‹ๆ–นๆณ•่ฟ›่กŒๆทปๅŠ ใ€‚ + +#### 1. 
ๅฎž็Žฐไธ€ไธชๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ + +ๅ‡่ฎพ่ฆๆทปๅŠ ไธ€ไธชๅไธบ `MyOptimizer` ็š„ไผ˜ๅŒ–ๅ™จ๏ผŒๅฎƒๅ…ทๆœ‰ๅ‚ๆ•ฐ `a`ใ€`b` ๅ’Œ `c`ใ€‚้œ€่ฆๅœจ `mmaction/engine/optimizers` ไธ‹ๅˆ›ๅปบไธ€ไธชๆ–ฐๆ–‡ไปถ๏ผŒๅนถๅœจๆ–‡ไปถไธญๅฎž็Žฐๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ๏ผŒไพ‹ๅฆ‚ๅœจ `mmaction/engine/optimizers/my_optimizer.py` ไธญ๏ผš + +```python +from torch.optim import Optimizer +from mmaction.registry import OPTIMIZERS + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c): + ... + + def step(self, closure=None): + ... +``` + +#### 2. ๅฏผๅ…ฅไผ˜ๅŒ–ๅ™จ + +ไธบไบ†ๆ‰พๅˆฐไธŠ่ฟฐๅฎšไน‰็š„ๆจกๅ—๏ผŒ้œ€่ฆๅœจ่ฟ่กŒๆ—ถๅฏผๅ…ฅ่ฏฅๆจกๅ—ใ€‚้ฆ–ๅ…ˆ๏ผŒๅœจ `mmaction/engine/optimizers/__init__.py` ไธญๅฏผๅ…ฅ่ฏฅๆจกๅ—๏ผŒๅฐ†ๅ…ถๆทปๅŠ ๅˆฐ `mmaction.engine` ๅŒ…ไธญใ€‚ + +```python +# In mmaction/engine/optimizers/__init__.py +... +from .my_optimizer import MyOptimizer # MyOptimizer ๅฏ่ƒฝๆ˜ฏๅ…ถไป–็ฑปๅ + +__all__ = [..., 'MyOptimizer'] +``` + +ๅœจ่ฟ่กŒๆ—ถ๏ผŒๆˆ‘ไปฌๅฐ†่‡ชๅŠจๅฏผๅ…ฅ `mmaction.engine` ๅŒ…๏ผŒๅนถๅŒๆ—ถๆณจๅ†Œ `MyOptimizer`ใ€‚ + +#### 3. ๅœจ้…็ฝฎๆ–‡ไปถไธญๆŒ‡ๅฎšไผ˜ๅŒ–ๅ™จ + +็„ถๅŽ๏ผŒๅฏไปฅๅœจ้…็ฝฎๆ–‡ไปถ็š„ `optim_wrapper.optimizer` ๅญ—ๆฎตไธญไฝฟ็”จ `MyOptimizer`ใ€‚ + +```python +optim_wrapper = dict( + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆž„้€ ๅ™จ + +ไธ€ไบ›ๆจกๅž‹ๅฏ่ƒฝๅฏนไผ˜ๅŒ–ๆœ‰ไธ€ไบ›็‰นๅฎš็š„ๅ‚ๆ•ฐ่ฎพ็ฝฎ๏ผŒไพ‹ๅฆ‚ๆ‰€ๆœ‰ `BatchNorm` ๅฑ‚็š„ไธๅŒๆƒ้‡่กฐๅ‡็އใ€‚ + +ๅฐฝ็ฎกๆˆ‘ไปฌๅทฒ็ปๅฏไปฅไฝฟ็”จ[ไผ˜ๅŒ–ๅ™จๆ•™็จ‹](#ๅ‚ๆ•ฐๅŒ–็ฒพ็ป†้…็ฝฎ)ไธญ็š„ `optim_wrapper.paramwise_cfg` ๅญ—ๆฎตๆฅ้…็ฝฎๅ„็ง็‰นๅฎšๅ‚ๆ•ฐ็š„ไผ˜ๅŒ–ๅ™จ่ฎพ็ฝฎ๏ผŒไฝ†ๅฏ่ƒฝไปๆ— ๆณ•ๆปก่ถณ้œ€ๆฑ‚ใ€‚ + +ๅฝ“็„ถ๏ผŒไฝ ๅฏไปฅไฟฎๆ”นๅฎƒใ€‚้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒๆˆ‘ไปฌไฝฟ็”จ [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) ็ฑปๆฅๅค„็†ไผ˜ๅŒ–ๅ™จ็š„ๆž„้€ ใ€‚ๅœจๆž„้€ ่ฟ‡็จ‹ไธญ๏ผŒๅฎƒๆ นๆฎ `paramwise_cfg` ๅฏนไธๅŒๅ‚ๆ•ฐ็š„ไผ˜ๅŒ–ๅ™จ่ฎพ็ฝฎ่ฟ›่กŒ็ป†่‡ด้…็ฝฎ๏ผŒ่ฟ™ไนŸๅฏไปฅไฝœไธบๆ–ฐไผ˜ๅŒ–ๅ™จๆž„้€ ๅ™จ็š„ๆจกๆฟใ€‚ + +ไฝ ๅฏไปฅ้€š่ฟ‡ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จๆž„้€ ๅ™จๆฅ่ฆ†็›–่ฟ™ไบ›่กŒไธบใ€‚ + +```python +# In mmaction/engine/optimizers/my_optim_constructor.py +from mmengine.optim import DefaultOptimWrapperConstructor +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimWrapperConstructor: + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + ... + + def __call__(self, model): + ... +``` + +็„ถๅŽ๏ผŒๅฏผๅ…ฅๅฎƒๅนถๅ‡ ไนŽๅƒ[ไผ˜ๅŒ–ๅ™จๆ•™็จ‹](#ๆทปๅŠ ๆ–ฐ็š„ไผ˜ๅŒ–ๅ™จ)ไธญ้‚ฃๆ ทไฝฟ็”จๅฎƒใ€‚ + +1. ๅœจ `mmaction/engine/optimizers/__init__.py` ไธญๅฏผๅ…ฅๅฎƒ๏ผŒๅฐ†ๅ…ถๆทปๅŠ ๅˆฐ `mmaction.engine` ๅŒ…ไธญใ€‚ + + ```python + # In mmaction/engine/optimizers/__init__.py + ... + from .my_optim_constructor import MyOptimWrapperConstructor + + __all__ = [..., 'MyOptimWrapperConstructor'] + ``` + +2. 
ๅœจ้…็ฝฎๆ–‡ไปถ็š„ `optim_wrapper.constructor` ๅญ—ๆฎตไธญไฝฟ็”จ `MyOptimWrapperConstructor`ใ€‚ + + ```python + optim_wrapper = dict( + constructor=dict(type='MyOptimWrapperConstructor'), + optimizer=..., + paramwise_cfg=..., + ) + ``` diff --git a/docs/zh_cn/advanced_guides/customize_pipeline.md b/docs/zh_cn/advanced_guides/customize_pipeline.md new file mode 100644 index 0000000000000000000000000000000000000000..f9fe232677775fa19a92512f318fcfe8d9ff903a --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_pipeline.md @@ -0,0 +1,144 @@ +# ่‡ชๅฎšไน‰ๆ•ฐๆฎๆตๆฐด็บฟ + +ๅœจๆœฌๆ•™็จ‹ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ไป‹็ปๅฆ‚ไฝ•ไธบไฝ ็š„ไปปๅŠกๆž„ๅปบๆ•ฐๆฎๆตๆฐด็บฟ๏ผˆๅณ๏ผŒๆ•ฐๆฎ่ฝฌๆข๏ผ‰็š„ไธ€ไบ›ๆ–นๆณ•ใ€‚ + +- [่‡ชๅฎšไน‰ๆ•ฐๆฎๆตๆฐด็บฟ](#่‡ชๅฎšไน‰ๆ•ฐๆฎๆตๆฐด็บฟ) + - [ๆ•ฐๆฎๆตๆฐด็บฟ่ฎพ่ฎก](#ๆ•ฐๆฎๆตๆฐด็บฟ่ฎพ่ฎก) + - [ไฟฎๆ”น่ฎญ็ปƒ/ๆต‹่ฏ•ๆ•ฐๆฎๆตๆฐด็บฟ](#ไฟฎๆ”น่ฎญ็ปƒ/ๆต‹่ฏ•ๆ•ฐๆฎๆตๆฐด็บฟ) + - [ๅŠ ่ฝฝ](#ๅŠ ่ฝฝ) + - [้‡‡ๆ ทๅธงๅ’Œๅ…ถไป–ๅค„็†](#้‡‡ๆ ทๅธงๅ’Œๅ…ถไป–ๅค„็†) + - [ๆ ผๅผๅŒ–](#ๆ ผๅผๅŒ–) + - [ๆทปๅŠ ๆ–ฐ็š„ๆ•ฐๆฎ่ฝฌๆข](#ๆทปๅŠ ๆ–ฐ็š„ๆ•ฐๆฎ่ฝฌๆข) + +## ๆ•ฐๆฎๆตๆฐด็บฟ่ฎพ่ฎก + +ๆ•ฐๆฎๆตๆฐด็บฟๆŒ‡็š„ๆ˜ฏไปŽๆ•ฐๆฎ้›†็ดขๅผ•ๆ ทๆœฌๆ—ถๅค„็†ๆ•ฐๆฎๆ ทๆœฌๅญ—ๅ…ธ็š„่ฟ‡็จ‹๏ผŒๅฎƒๅŒ…ๆ‹ฌไธ€็ณปๅˆ—็š„ๆ•ฐๆฎ่ฝฌๆขใ€‚ๆฏไธชๆ•ฐๆฎ่ฝฌๆขๆŽฅๅ—ไธ€ไธช `dict` ไฝœไธบ่พ“ๅ…ฅ๏ผŒๅฏนๅ…ถ่ฟ›่กŒๅค„็†๏ผŒๅนถไบง็”Ÿไธ€ไธช `dict` ไฝœไธบ่พ“ๅ‡บ๏ผŒไพ›ๅบๅˆ—ไธญ็š„ๅŽ็ปญๆ•ฐๆฎ่ฝฌๆขไฝฟ็”จใ€‚ + +ไปฅไธ‹ๆ˜ฏไธ€ไธชไพ‹ๅญ๏ผŒ็”จไบŽไฝฟ็”จ `VideoDataset` ๅœจ Kinetics ไธŠ่ฎญ็ปƒ SlowFast ็š„ๆ•ฐๆฎๆตๆฐด็บฟใ€‚่ฟ™ไธชๆ•ฐๆฎๆตๆฐด็บฟ้ฆ–ๅ…ˆไฝฟ็”จ [`decord`](https://github.com/dmlc/decord) ่ฏปๅ–ๅŽŸๅง‹่ง†้ข‘ๅนถ้šๆœบ้‡‡ๆ ทไธ€ไธช่ง†้ข‘ๅ‰ช่พ‘๏ผŒ่ฏฅๅ‰ช่พ‘ๅŒ…ๅซ `32` ๅธง๏ผŒๅธง้—ด้š”ไธบ `2`ใ€‚็„ถๅŽ๏ผŒๅฎƒๅฏนๆ‰€ๆœ‰ๅธงๅบ”็”จ้šๆœบๅคงๅฐ่ฐƒๆ•ด็š„่ฃๅ‰ชๅ’Œ้šๆœบๆฐดๅนณ็ฟป่ฝฌ๏ผŒ็„ถๅŽๅฐ†ๆ•ฐๆฎๅฝข็Šถๆ ผๅผๅŒ–ไธบ `NCTHW`๏ผŒๅœจ่ฟ™ไธชไพ‹ๅญไธญ๏ผŒๅฎƒๆ˜ฏ `(1, 3, 32, 224, 224)`ใ€‚ + +```python +train_pipeline = [ + dict(type='DecordInit',), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +MMAction2 ไธญๆ‰€ๆœ‰ๅฏ็”จ็š„ๆ•ฐๆฎ่ฝฌๆข็š„่ฏฆ็ป†ๅˆ—่กจๅฏไปฅๅœจ [mmaction.datasets.transforms](mmaction.datasets.transforms) ไธญๆ‰พๅˆฐใ€‚ + +## ไฟฎๆ”น่ฎญ็ปƒ/ๆต‹่ฏ•ๆ•ฐๆฎๆตๆฐด็บฟ + +MMAction2 ็š„ๆ•ฐๆฎๆตๆฐด็บฟ้žๅธธ็ตๆดป๏ผŒๅ› ไธบๅ‡ ไนŽๆฏไธ€ๆญฅ็š„ๆ•ฐๆฎ้ข„ๅค„็†้ƒฝๅฏไปฅไปŽ้…็ฝฎๆ–‡ไปถไธญ่ฟ›่กŒ้…็ฝฎใ€‚็„ถ่€Œ๏ผŒๅฏนไบŽไธ€ไบ›็”จๆˆทๆฅ่ฏด๏ผŒ่ฟ™็งๅคšๆ ทๆ€งๅฏ่ƒฝไผš่ฎฉไบบๆ„Ÿๅˆฐไธ็Ÿฅๆ‰€ๆŽชใ€‚ + +ไปฅไธ‹ๆ˜ฏไธ€ไบ›็”จไบŽๆž„ๅปบๅŠจไฝœ่ฏ†ๅˆซไปปๅŠกๆ•ฐๆฎๆตๆฐด็บฟ็š„ไธ€่ˆฌๅฎž่ทตๅ’ŒๆŒ‡ๅ—ใ€‚ + +### ๅŠ ่ฝฝ + +ๅœจๆ•ฐๆฎๆตๆฐด็บฟ็š„ๅผ€ๅง‹๏ผŒ้€šๅธธๆ˜ฏๅŠ ่ฝฝ่ง†้ข‘ใ€‚็„ถ่€Œ๏ผŒๅฆ‚ๆžœๅธงๅทฒ็ป่ขซๆๅ–ๅ‡บๆฅ๏ผŒไฝ ๅบ”่ฏฅไฝฟ็”จ `RawFrameDecode` ๅนถไฟฎๆ”นๆ•ฐๆฎ้›†็ฑปๅž‹ไธบ `RawframeDataset`ใ€‚ + +```python +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +ๅฆ‚ๆžœไฝ ้œ€่ฆไปŽๅ…ทๆœ‰ไธๅŒๆ ผๅผ๏ผˆไพ‹ๅฆ‚๏ผŒ`pkl`๏ผŒ`bin`็ญ‰๏ผ‰็š„ๆ–‡ไปถๆˆ–ไปŽ็‰นๅฎšไฝ็ฝฎๅŠ ่ฝฝๆ•ฐๆฎ๏ผŒไฝ ๅฏไปฅๅˆ›ๅปบไธ€ไธชๆ–ฐ็š„ๅŠ 
่ฝฝ่ฝฌๆขๅนถๅฐ†ๅ…ถๅŒ…ๅซๅœจๆ•ฐๆฎๆตๆฐด็บฟ็š„ๅผ€ๅง‹ใ€‚ๆœ‰ๅ…ณๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ๆทปๅŠ ๆ–ฐ็š„ๆ•ฐๆฎ่ฝฌๆข](#ๆทปๅŠ ๆ–ฐ็š„ๆ•ฐๆฎ่ฝฌๆข)ใ€‚ + +### ้‡‡ๆ ทๅธงๅ’Œๅ…ถไป–ๅค„็† + +ๅœจ่ฎญ็ปƒๅ’Œๆต‹่ฏ•่ฟ‡็จ‹ไธญ๏ผŒๆˆ‘ไปฌๅฏ่ƒฝไผšๆœ‰ไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅธง็š„ไธๅŒ็ญ–็•ฅใ€‚ + +ไพ‹ๅฆ‚๏ผŒๅฝ“ๆต‹่ฏ• SlowFast ๆ—ถ๏ผŒๆˆ‘ไปฌไผšๅ‡ๅŒ€ๅœฐ้‡‡ๆ ทๅคšไธชๅ‰ช่พ‘๏ผŒๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +```python +test_pipeline = [ + ... + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + ... +] +``` + +ๅœจไธŠ่ฟฐไพ‹ๅญไธญ๏ผŒๆฏไธช่ง†้ข‘ๅฐ†ๅ‡ๅŒ€ๅœฐ้‡‡ๆ ท10ไธช่ง†้ข‘ๅ‰ช่พ‘๏ผŒๆฏไธชๅ‰ช่พ‘ๅŒ…ๅซ32ๅธงใ€‚ `test_mode=True` ็”จไบŽๅฎž็Žฐ่ฟ™ไธ€็‚น๏ผŒไธŽ่ฎญ็ปƒๆœŸ้—ด็š„้šๆœบ้‡‡ๆ ท็›ธๅใ€‚ + +ๅฆไธ€ไธชไพ‹ๅญๆถ‰ๅŠ `TSN/TSM` ๆจกๅž‹๏ผŒๅฎƒไปฌไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅคšไธช็‰‡ๆฎต๏ผš + +```python +train_pipeline = [ + ... + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + ... +] +``` + +้€šๅธธ๏ผŒๆ•ฐๆฎๆตๆฐด็บฟไธญ็š„ๆ•ฐๆฎๅขžๅผบๅชๅค„็†่ง†้ข‘็บง็š„่ฝฌๆข๏ผŒไพ‹ๅฆ‚่ฐƒๆ•ดๅคงๅฐๆˆ–่ฃๅ‰ช๏ผŒ่€Œไธๅค„็†ๅƒ่ง†้ข‘ๆ ‡ๅ‡†ๅŒ–ๆˆ– mixup/cutmix ่ฟ™ๆ ท็š„่ฝฌๆขใ€‚่ฟ™ๆ˜ฏๅ› ไธบๆˆ‘ไปฌๅฏไปฅๅœจๆ‰น้‡่ง†้ข‘ๆ•ฐๆฎไธŠ่ฟ›่กŒ่ง†้ข‘ๆ ‡ๅ‡†ๅŒ–ๅ’Œ mixup/cutmix๏ผŒไปฅไฝฟ็”จ GPU ๅŠ ้€Ÿๅค„็†ใ€‚่ฆ้…็ฝฎ่ง†้ข‘ๆ ‡ๅ‡†ๅŒ–ๅ’Œ mixup/cutmix๏ผŒ่ฏทไฝฟ็”จ [mmaction.models.utils.data_preprocessor](mmaction.models.utils.data_preprocessor)ใ€‚ + +### ๆ ผๅผๅŒ– + +ๆ ผๅผๅŒ–ๆถ‰ๅŠไปŽๆ•ฐๆฎไฟกๆฏๅญ—ๅ…ธไธญๆ”ถ้›†่ฎญ็ปƒๆ•ฐๆฎ๏ผŒๅนถๅฐ†ๅ…ถ่ฝฌๆขไธบไธŽๆจกๅž‹ๅ…ผๅฎน็š„ๆ ผๅผใ€‚ + +ๅœจๅคงๅคšๆ•ฐๆƒ…ๅ†ตไธ‹๏ผŒไฝ ๅฏไปฅ็ฎ€ๅ•ๅœฐไฝฟ็”จ [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs)๏ผŒๅฎƒๅฐ†ไปฅ `NumPy Array` ๆ ผๅผ็š„ๅ›พๅƒ่ฝฌๆขไธบ `PyTorch Tensor`๏ผŒๅนถๅฐ†ๅœฐ้ข็œŸๅฎž็ฑปๅˆซไฟกๆฏๅ’Œๅ…ถไป–ๅ…ƒไฟกๆฏๆ‰“ๅŒ…ไธบไธ€ไธช็ฑปไผผๅญ—ๅ…ธ็š„ๅฏน่ฑก [`ActionDataSample`](mmaction.structures.ActionDataSample)ใ€‚ + +```python +train_pipeline = [ + ... + dict(type='PackActionInputs'), +] +``` + +## ๆทปๅŠ ๆ–ฐ็š„ๆ•ฐๆฎ่ฝฌๆข + +1. ่ฆๅˆ›ๅปบไธ€ไธชๆ–ฐ็š„ๆ•ฐๆฎ่ฝฌๆข๏ผŒ็ผ–ๅ†™ไธ€ไธชๆ–ฐ็š„่ฝฌๆข็ฑปๅœจไธ€ไธช Python ๆ–‡ไปถไธญ๏ผŒไพ‹ๅฆ‚๏ผŒๅไธบ `my_transforms.py`ใ€‚ๆ•ฐๆฎ่ฝฌๆข็ฑปๅฟ…้กป็ปงๆ‰ฟ [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) ็ฑป๏ผŒๅนถ้‡ๅ†™ `transform` ๆ–นๆณ•๏ผŒ่ฏฅๆ–นๆณ•ๆŽฅๅ—ไธ€ไธช `dict` ไฝœไธบ่พ“ๅ…ฅๅนถ่ฟ”ๅ›žไธ€ไธช `dict`ใ€‚ๆœ€ๅŽ๏ผŒๅฐ† `my_transforms.py` ๆ”พๅœจ `mmaction/datasets/transforms/` ๆ–‡ไปถๅคนไธญใ€‚ + + ```python + from mmcv.transforms import BaseTransform + from mmaction.datasets import TRANSFORMS + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + def __init__(self, msg): + self.msg = msg + + def transform(self, results): + # ไฟฎๆ”นๆ•ฐๆฎไฟกๆฏๅญ—ๅ…ธ `results`ใ€‚ + print(msg, 'MMAction2.') + return results + ``` + +2. ๅœจ `mmaction/datasets/transforms/__init__.py` ไธญๅฏผๅ…ฅๆ–ฐ็ฑปใ€‚ + + ```python + ... + from .my_transform import MyTransform + + __all__ = [ + ..., 'MyTransform' + ] + ``` + +3. ๅœจ้…็ฝฎๆ–‡ไปถไธญไฝฟ็”จๅฎƒใ€‚ + + ```python + train_pipeline = [ + ... + dict(type='MyTransform', msg='Hello!'), + ... + ] + ``` diff --git a/docs/zh_cn/advanced_guides/dataflow.md b/docs/zh_cn/advanced_guides/dataflow.md new file mode 100644 index 0000000000000000000000000000000000000000..c3c7273aff2ce3a3c1c8eac668b9ae2a292ca55e --- /dev/null +++ b/docs/zh_cn/advanced_guides/dataflow.md @@ -0,0 +1,3 @@ +# MMAction2 ็š„ๆ•ฐๆฎๆต + +ๅ†…ๅฎนๅปบ่ฎพไธญ... 
diff --git a/docs/zh_cn/advanced_guides/depoly.md b/docs/zh_cn/advanced_guides/depoly.md new file mode 100644 index 0000000000000000000000000000000000000000..82fab764a856d26c5575a22f24743411b4e54a5f --- /dev/null +++ b/docs/zh_cn/advanced_guides/depoly.md @@ -0,0 +1,3 @@ +# How to deploy MMAction2 models + +coming soon... diff --git a/docs/zh_cn/api.rst b/docs/zh_cn/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..f3f688462bc92067c883eb4c61bc9246c271f659 --- /dev/null +++ b/docs/zh_cn/api.rst @@ -0,0 +1,140 @@ +mmaction.apis +-------------- +.. automodule:: mmaction.apis + :members: + +mmaction.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmaction.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmaction.datasets.transforms + :members: + +mmaction.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmaction.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmaction.engine.runner + :members: + + +mmaction.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmaction.evaluation.metrics + :members: + + +mmaction.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.backbones + :members: + +common +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.common + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.data_preprocessors + :members: + +heads +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.heads + :members: + +localizers +^^^^^^^^^^ +.. automodule:: mmaction.models.localizers + :members: + + +losses +^^^^^^^^^^ +.. automodule:: mmaction.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmaction.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.roi_heads + :members: + +recognizers +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.task_modules + :members: + + +utils +^^^^^^^^^^ +.. automodule:: mmaction.models.utils + :members: + + +mmaction.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmaction.structures.bbox + :members: + + +mmaction.testing +---------------- +.. automodule:: mmaction.testing + :members: + +mmaction.visualization +-------------------- +.. automodule:: mmaction.visualization + :members: + +mmaction.utils +-------------- +.. automodule:: mmaction.utils + :members: diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..8413d472e248c835b8ebf485cf1e5d6fc9bb5961 --- /dev/null +++ b/docs/zh_cn/conf.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'MMAction2' +copyright = '2020, OpenMMLab' +author = 'MMAction2 Authors' +version_file = '../.././mmaction/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_copybutton', + 'sphinx_tabs.tabs', + 'notfound.extension', + 'sphinxcontrib.jquery', +] + +# numpy and torch are required +autodoc_mock_imports = ['mmaction.version', 'PIL'] + +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- +source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'} + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". + +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] +html_theme_options = { + # 'logo_url': 'https://mmaction2.readthedocs.io/en/latest/', + 'menu': [ + { + 'name': + 'Tutorial', + 'url': + 'https://colab.research.google.com/github/' + 'open-mmlab/mmaction2/blob/master/demo/mmaction2_tutorial.ipynb' + }, + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmaction2' + }, + { + 'name': + 'Upstream', + 'children': [{ + 'name': + 'MMCV', + 'url': + 'https://github.com/open-mmlab/mmcv', + 'description': + 'Foundational library for computer vision' + }, { + 'name': + 'MMPreTrain', + 'url': + 'https://github.com/open-mmlab/mmpretrain', + 'description': + 'Open source pre-training toolbox based on PyTorch' + }, { + 'name': + 'MMDetection', + 'url': + 'https://github.com/open-mmlab/mmdetection', + 'description': + 'Object detection toolbox and benchmark' + }, { + 'name': + 'MMPose', + 'url': + 'https://github.com/open-mmlab/mmpose', + 'description': + 'Open-source toolbox for pose estimation based on PyTorch.' 
+ }] + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'en' +} + +language = 'en' +master_doc = 'index' + +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +# The not found page +notfound_template = '404.html' + + +def builder_inited_handler(app): + if subprocess.run(['python', './stat.py']).returncode != 0: + raise RuntimeError('Failed to run the script `stat.py`.') + if subprocess.run(['python', './project_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `project_zoo.py`.') + if subprocess.run(['python', './dataset_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `dataset_zoo.py`.') + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/docs/zh_cn/dataset_zoo.py b/docs/zh_cn/dataset_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..3a637cb21ef8eb3775745153994537356c732e1f --- /dev/null +++ b/docs/zh_cn/dataset_zoo.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +import re +from pathlib import Path + +from utils import replace_link + +DATASETS_ROOT = Path('dataset_zoo') # Path to save generated paper pages. +DATASETZOO_TEMPLATE = """\ +# ๆ•ฐๆฎ้›†็ปŸ่ฎก + +ๅœจๆœฌ้กต้ขไธญ๏ผŒๆˆ‘ไปฌๅˆ—ไธพไบ†ๆˆ‘ไปฌๆ”ฏๆŒ็š„[ๆ‰€ๆœ‰ๆ•ฐๆฎ้›†](#ๆ‰€ๆœ‰ๅทฒๆ”ฏๆŒ็š„ๆ•ฐๆฎ้›†)ใ€‚ไฝ ๅฏไปฅ็‚นๅ‡ป้“พๆŽฅ่ทณ่ฝฌ่‡ณๅฏนๅบ”็š„ๆ•ฐๆฎ้›†่ฏฆๆƒ…้กต้ขใ€‚ + +## ๆ‰€ๆœ‰ๅทฒๆ”ฏๆŒ็š„ๆ•ฐๆฎ้›† + +* ๆ•ฐๆฎ้›†ๆ•ฐ้‡๏ผš{num_datasets} +{dataset_msg} + +""" # noqa: E501 + + +def generate_datasets_pages(): + dataset_list = Path('../../tools/data').glob('*/README.md') + num_datasets = 0 + dataset_msgs = [] + + for file in dataset_list: + num_datasets += 1 + + copy = DATASETS_ROOT / file.parent.with_suffix('.md').name + + title_template = r'^# Preparing (.*)' + # use chinese doc if exist + chinese_readme = Path( + str(file).replace('README.md', 'README_zh-CN.md')) + if chinese_readme.exists(): + file = chinese_readme + title_template = r'^# ๅ‡†ๅค‡(.*)' + with open(file, 'r') as f: + content = f.read() + + title = re.match(title_template, content).group(1) + title = title.lstrip(' ') + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + dataset_msgs.append(f'\t - [{title}]({copy})') + + with open(copy, 'w') as f: + f.write(content) + + dataset_msg = '\n'.join(dataset_msgs) + + modelzoo = DATASETZOO_TEMPLATE.format( + num_datasets=num_datasets, + dataset_msg=dataset_msg, + ) + + with open('datasetzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +DATASETS_ROOT.mkdir(exist_ok=True) +generate_datasets_pages() diff --git a/docs/zh_cn/docutils.conf b/docs/zh_cn/docutils.conf new file mode 100644 index 0000000000000000000000000000000000000000..ddd79c377666db4a615151f0676f7fec32d38359 --- /dev/null +++ b/docs/zh_cn/docutils.conf @@ -0,0 +1,2 @@ +[html writers] +table_style: colwidths-auto diff --git a/docs/zh_cn/get_started/contribution_guide.md b/docs/zh_cn/get_started/contribution_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..439214c520a12b2542ae0dd5560f358e8fe2ebc1 --- /dev/null +++ b/docs/zh_cn/get_started/contribution_guide.md @@ -0,0 +1,62 @@ +# ๅ‚ไธŽ่ดก็Œฎ MMACTION2 + 
+ๆฌข่ฟŽๅ„็งๅฝขๅผ็š„่ดก็Œฎ๏ผŒๅŒ…ๆ‹ฌไฝ†ไธ้™ไบŽไปฅไธ‹ๅ†…ๅฎนใ€‚ + +- ไฟฎๆ”นๆ‹ผๅ†™้”™่ฏฏๆˆ–ไปฃ็ ้”™่ฏฏ +- ๆ–ฐๅŠŸ่ƒฝๅ’Œ็ป„ไปถ +- ๆทปๅŠ ๆ–‡ๆกฃๆˆ–ๅฐ†ๆ–‡ๆกฃ็ฟป่ฏ‘ๆˆๅ…ถไป–่ฏญ่จ€ +- ๆทปๅŠ ๅ…ณไบŽ่ง†้ข‘็†่งฃ็ฎ—ๆณ•็š„ๆ–ฐ้กน็›ฎ๏ผˆๆŽจ่๏ผ‰๏ผŒๅ…ทไฝ“็ป†่Š‚่ฏทๅ‚่€ƒ[่ฟ™้‡Œ](../projectzoo.md) + +## ๅทฅไฝœๆต็จ‹ + +1. Fork ๅนถๆ‹‰ๅ–ๆœ€ๆ–ฐ็š„ mmaction2 +2. ๅˆ›ๅปบไธ€ไธชๆœ‰ๆ„ไน‰็š„ๆ–ฐๅˆ†ๆ”ฏ๏ผˆไธ่ฆไฝฟ็”จไธปๅˆ†ๆ”ฏ่ฟ›่กŒ PR๏ผ‰ +3. ๆไบคไฝ ็š„ๆ›ดๆ”น +4. ๅˆ›ๅปบไธ€ไธช PR + +```{note} +- ๅฆ‚ๆžœไฝ ่ฎกๅˆ’ๆทปๅŠ ไธ€ไบ›ๆถ‰ๅŠๅคง่ง„ๆจกๆ›ดๆ”น็š„ๆ–ฐๅŠŸ่ƒฝ๏ผŒ่ฏท้ฆ–ๅ…ˆๆ‰“ๅผ€ไธ€ไธช issue ่ฟ›่กŒ่ฎจ่ฎบใ€‚ +- ๅฆ‚ๆžœไฝ ๆ˜ฏ่ฎบๆ–‡็š„ไฝœ่€…๏ผŒๅนถๅธŒๆœ›ๅฐ†ไฝ ็š„ๆ–นๆณ•ๅŒ…ๅซๅœจ mmaction2 ไธญ๏ผŒ่ฏทไธŽๆˆ‘ไปฌ่”็ณปใ€‚ๆˆ‘ไปฌๅฐ†้žๅธธๆ„Ÿ่ฐขๆ‚จ็š„่ดก็Œฎใ€‚ +``` + +## ไปฃ็ ้ฃŽๆ ผ + +### Python + +ๆˆ‘ไปฌ้‡‡็”จ [PEP8](https://www.python.org/dev/peps/pep-0008/) ไฝœไธบ้ฆ–้€‰ไปฃ็ ้ฃŽๆ ผใ€‚ + +ๆˆ‘ไปฌไฝฟ็”จไปฅไธ‹ๅทฅๅ…ท่ฟ›่กŒไปฃ็ ๆฃ€ๆŸฅๅ’Œๆ ผๅผๅŒ–๏ผš + +- [flake8](http://flake8.pycqa.org/en/latest/)๏ผšๆฃ€ๆŸฅๅ™จ +- [yapf](https://github.com/google/yapf)๏ผšๆ ผๅผๅŒ–ๅ™จ +- [isort](https://github.com/timothycrosley/isort)๏ผšๆŽ’ๅบๅฏผๅ…ฅ +- [codespell](https://github.com/codespell-project/codespell)๏ผšไธ€ไธช็”จไบŽไฟฎๅคๆ–‡ๆœฌๆ–‡ไปถไธญๅธธ่งๆ‹ผๅ†™้”™่ฏฏ็š„ Python ๅทฅๅ…ทใ€‚ +- [mdformat](https://github.com/executablebooks/mdformat)๏ผšMdformat ๆ˜ฏไธ€ไธช่‡ช็”ฑ่ฃ้‡็š„ Markdown ๆ ผๅผๅŒ–ๅทฅๅ…ท๏ผŒๅฏ็”จไบŽๅผบๅˆถๆ‰ง่กŒไธ€่‡ด็š„ Markdown ๆ–‡ไปถๆ ทๅผใ€‚ +- [docformatter](https://github.com/myint/docformatter)๏ผšไธ€ไธชๆ ผๅผๅŒ–ๅทฅๅ…ท๏ผŒ็”จไบŽๆ ผๅผๅŒ–ๆ–‡ๆกฃๅญ—็ฌฆไธฒใ€‚ + +yapf ๅ’Œ isort ็š„ๆ ทๅผ้…็ฝฎๅฏไปฅๅœจ [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/main/setup.cfg) ไธญๆ‰พๅˆฐใ€‚ + +ๆˆ‘ไปฌไฝฟ็”จ [pre-commit hook](https://pre-commit.com/) ๆฅไฟ่ฏๆฏๆฌกๆไบคๆ—ถ่‡ชๅŠจ่ฟ›่กŒไปฃ็ ๆฃ€ๆŸฅๅ’Œๆ ผๅผๅŒ–๏ผŒๅฏ็”จ็š„ๅŠŸ่ƒฝๅŒ…ๆ‹ฌ `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, ไฟฎๅค `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, ๅฏน `requirments.txt`็š„ๆŽ’ๅบ็ญ‰ใ€‚ +้ข„ๆไบค้’ฉๅญ็š„้…็ฝฎๅญ˜ๅ‚จๅœจ [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/main/.pre-commit-config.yaml) ไธญใ€‚ + +ๅœจๅ…‹้š†ไป“ๅบ“ๅŽ๏ผŒไฝ ้œ€่ฆๅฎ‰่ฃ…ๅˆๅง‹ๅŒ–็š„้ข„ๆไบค้’ฉๅญใ€‚ + +```shell +pip install -U pre-commit +``` + +ไปŽไป“ๅบ“ๆ–‡ไปถๅคนไธญ + +```shell +pre-commit install +``` + +ๅœจๆญคไน‹ๅŽ๏ผŒๆฏๆฌกๆไบค๏ผŒไปฃ็ ่ง„่Œƒๆฃ€ๆŸฅๅ’Œๆ ผๅผๅŒ–ๅทฅๅ…ท้ƒฝๅฐ†่ขซๅผบๅˆถๆ‰ง่กŒใ€‚ + +```{note} +ๅœจๅˆ›ๅปบ PR ไน‹ๅ‰๏ผŒ่ฏท็กฎไฟไฝ ็š„ไปฃ็ ้€š่ฟ‡ไบ† lint ๆฃ€ๆŸฅๅนถ็”ฑ yapf ่ฟ›่กŒไบ†ๆ ผๅผๅŒ–ใ€‚ +``` + +### C++ ๅ’Œ CUDA + +ๆˆ‘ไปฌ้ตๅพช [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)ใ€‚ diff --git a/docs/zh_cn/get_started/faq.md b/docs/zh_cn/get_started/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..ba2cce86675933f3852bdba1a154d8f7a8e9ebbd --- /dev/null +++ b/docs/zh_cn/get_started/faq.md @@ -0,0 +1,125 @@ +# ๅธธ่ง้—ฎ้ข˜่งฃ็ญ” + +## ๆฆ‚่ฟฐ + +ๆˆ‘ไปฌๅœจ่ฟ™้‡Œๅˆ—ๅ‡บไบ†่ฎธๅคš็”จๆˆทๅธธ้‡ๅˆฐ็š„้—ฎ้ข˜ไปฅๅŠ็›ธๅบ”็š„่งฃๅ†ณๆ–นๆกˆใ€‚ + +- [ๅธธ่ง้—ฎ้ข˜่งฃ็ญ”](#ๅธธ่ง้—ฎ้ข˜่งฃ็ญ”) + - [ๆฆ‚่ฟฐ](#ๆฆ‚่ฟฐ) + - [ๅฎ‰่ฃ…](#ๅฎ‰่ฃ…) + - [ๆ•ฐๆฎ](#ๆ•ฐๆฎ) + - [่ฎญ็ปƒ](#่ฎญ็ปƒ) + - [ๆต‹่ฏ•](#ๆต‹่ฏ•) + +ๅฆ‚ๆžœๆ‚จๅ‘็Žฐไปปไฝ•้ข‘็นๅ‡บ็Žฐ็š„้—ฎ้ข˜ๅนถไธ”ๆœ‰่งฃๅ†ณๆ–นๆณ•๏ผŒๆฌข่ฟŽๅœจๅˆ—่กจไธญ่กฅๅ……ใ€‚ๅฆ‚ๆžœ่ฟ™้‡Œ็š„ๅ†…ๅฎนๆฒกๆœ‰ๆถต็›–ๆ‚จ็š„้—ฎ้ข˜๏ผŒ่ฏทไฝฟ็”จ[ๆไพ›็š„ๆจกๆฟ](https://github.com/open-mmlab/mmaction2/tree/main/.github/ISSUE_TEMPLATE/error-report.md)ๅˆ›ๅปบไธ€ไธช้—ฎ้ข˜๏ผŒๅนถ็กฎไฟๅœจๆจกๆฟไธญๅกซๅ†™ๆ‰€ๆœ‰ๅฟ…่ฆ็š„ไฟกๆฏใ€‚ + +## ๅฎ‰่ฃ… + +- **"No module named 
'mmcv.ops'"; "No module named 'mmcv.\_ext'"** + + 1. ไฝฟ็”จ `pip uninstall mmcv` ๅ‘ฝไปคๅธ่ฝฝ็Žฏๅขƒไธญ็š„็Žฐๆœ‰ mmcvใ€‚ + 2. ๅ‚็…ง[ๅฎ‰่ฃ…่ฏดๆ˜Ž](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html#install-mmcv)ๅฎ‰่ฃ… mmcvใ€‚ + +- **"OSError: MoviePy Error: creation of None failed because of the following error"** + + ไฝฟ็”จ `pip install moviepy` ๅฎ‰่ฃ…ใ€‚ๆ›ดๅคšไฟกๆฏๅฏไปฅๅ‚่€ƒ[ๅฎ˜ๆ–นๅฎ‰่ฃ…ๆ–‡ๆกฃ](https://zulko.github.io/moviepy/install.html), ่ฏทๆณจๆ„๏ผˆๆ นๆฎ่ฟ™ไธช [issue](https://github.com/Zulko/moviepy/issues/693)๏ผ‰๏ผš + + 1. ๅฏนไบŽ Windows ็”จๆˆท๏ผŒ[ImageMagick](https://www.imagemagick.org/script/index.php) ไธไผš่‡ชๅŠจ่ขซ MoviePy ๆฃ€ๆต‹ๅˆฐ๏ผŒ้œ€่ฆไฟฎๆ”น `moviepy/config_defaults.py` ๆ–‡ไปถ๏ผŒๆไพ› ImageMagick ไบŒ่ฟ›ๅˆถๆ–‡ไปถ `magick` ็š„่ทฏๅพ„๏ผŒไพ‹ๅฆ‚ `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"` + 2. ๅฏนไบŽ Linux ็”จๆˆท๏ผŒๅฆ‚ๆžœ MoviePy ๆฒกๆœ‰ๆฃ€ๆต‹ๅˆฐ ImageMagick๏ผŒ้œ€่ฆไฟฎๆ”น `/etc/ImageMagick-6/policy.xml` ๆ–‡ไปถ๏ผŒๅฐ† `` ๆณจ้‡Šๆމ๏ผŒๆ”นไธบ ``ใ€‚ + +- **"ๅณไฝฟๆˆ‘ๅทฒ็ปๅฎ‰่ฃ…ไบ† XXCODEBASE๏ผŒไธบไป€ไนˆ่ฟ˜ไผšๆ”ถๅˆฐ 'Please install XXCODEBASE to use XXX' ็š„้”™่ฏฏๆถˆๆฏ?"** + + ๆ‚จๆ”ถๅˆฐ่ฏฅ้”™่ฏฏๆถˆๆฏๆ˜ฏๅ› ไธบๆˆ‘ไปฌ็š„้กน็›ฎๆ— ๆณ•ไปŽ XXCODEBASE ไธญๅฏผๅ…ฅไธ€ไธชๅ‡ฝๆ•ฐๆˆ–็ฑปใ€‚ๆ‚จๅฏไปฅๅฐ่ฏ•่ฟ่กŒ็›ธๅบ”็š„ไปฃ็ ่กŒๆฅๆŸฅ็œ‹ๅ‘็”Ÿไบ†ไป€ไนˆใ€‚ไธ€ไธชๅฏ่ƒฝ็š„ๅŽŸๅ› ๆ˜ฏ๏ผŒๅœจ OpenMMLAB ็š„ๆŸไบ›ไปฃ็ ๅบ“ไธญ๏ผŒๆ‚จ้œ€่ฆๅœจๅฎ‰่ฃ…ๅฎƒไปฌไน‹ๅ‰ๅ…ˆๅฎ‰่ฃ… mmcv ๅ’Œ mmengineใ€‚ๆ‚จๅฏไปฅๆŒ‰็…ง[ๆ•™็จ‹](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html#installation)ๆฅๅฎ‰่ฃ…ๅฎƒไปฌใ€‚ + +## ๆ•ฐๆฎ + +- **FileNotFound ้”™่ฏฏ๏ผŒไพ‹ๅฆ‚ `No such file or directory: xxx/xxx/img_00300.jpg`** + + ๅœจๆˆ‘ไปฌ็š„ไป“ๅบ“ไธญ๏ผŒๆˆ‘ไปฌๅฐ† `start_index=1` ่ฎพ็ฝฎไธบ rawframe ๆ•ฐๆฎ้›†็š„้ป˜่ฎคๅ€ผ๏ผŒๅฐ† `start_index=0` ่ฎพ็ฝฎไธบ่ง†้ข‘ๆ•ฐๆฎ้›†็š„้ป˜่ฎคๅ€ผใ€‚ๅฆ‚ๆžœ็”จๆˆท้‡ๅˆฐๆ•ฐๆฎ็š„็ฌฌไธ€ๅธงๆˆ–ๆœ€ๅŽไธ€ๅธง็š„ FileNotFound ้”™่ฏฏ๏ผŒ้œ€่ฆๆฃ€ๆŸฅไปฅ 0 ๆˆ– 1 ไฝœไธบๅ็งป้‡ๅผ€ๅง‹็š„ๆ–‡ไปถ๏ผŒไพ‹ๅฆ‚ `xxx_00000.jpg` ๆˆ– `xxx_00001.jpg`๏ผŒ็„ถๅŽๅœจ้…็ฝฎๆ–‡ไปถไธญๆ›ดๆ”นๆ•ฐๆฎๅค„็†ๆตๆฐด็บฟ็š„ `start_index` ๅ€ผใ€‚ + +- **ๆˆ‘ไปฌๅบ”่ฏฅๅฆ‚ไฝ•้ข„ๅค„็†ๆ•ฐๆฎ้›†ไธญ็š„่ง†้ข‘๏ผŸๅฐ†ๅฎƒไปฌ่ฐƒๆ•ดไธบๅ›บๅฎšๅคงๅฐ๏ผˆๆ‰€ๆœ‰่ง†้ข‘็š„้ซ˜ๅฎฝๆฏ”็›ธๅŒ๏ผ‰๏ผŒไพ‹ๅฆ‚ `340x256`๏ผˆ1๏ผ‰๏ผŒ่ฟ˜ๆ˜ฏ่ฐƒๆ•ดๅฎƒไปฌไฝฟๅพ—ๆ‰€ๆœ‰่ง†้ข‘็š„็Ÿญ่พนๅ…ทๆœ‰็›ธๅŒ็š„้•ฟๅบฆ๏ผˆ256px ๆˆ– 320px๏ผ‰๏ผˆ2๏ผ‰๏ผŸ** + + ๆˆ‘ไปฌๅฐ่ฏ•่ฟ‡่ฟ™ไธค็ง้ข„ๅค„็†ๆ–นๆณ•๏ผŒๅนถๅ‘็Žฐ๏ผˆ2๏ผ‰้€šๅธธๆ˜ฏๆ›ดๅฅฝ็š„่งฃๅ†ณๆ–นๆกˆ๏ผŒๅ› ๆญคๆˆ‘ไปฌไฝฟ็”จ๏ผˆ2๏ผ‰ไฝœไธบ้ป˜่ฎค็š„้ข„ๅค„็†่ฎพ็ฝฎ๏ผŒ็Ÿญ่พน้•ฟๅบฆไธบ 256pxใ€‚ๆˆ‘ไปฌๅฏน่ฟ™ไบ›้ข„ๅค„็†ๆ–นๆณ•่ฟ›่กŒไบ†ๅŸบๅ‡†ๆต‹่ฏ•๏ผŒๆ‚จๅฏไปฅๅœจ[TSN ๆ•ฐๆฎๅŸบๅ‡†ๆต‹่ฏ•](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn)ๅ’Œ[SlowOnly ๆ•ฐๆฎๅŸบๅ‡†ๆต‹่ฏ•](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/slowonly)ไธญๆ‰พๅˆฐ็ป“ๆžœใ€‚ + +- **ๆ•ฐๆฎๅค„็†ๆตๆฐด็บฟไธญ็š„้กนไธๅŒน้…ๅฏผ่‡ดๅ‡บ็Žฐ็ฑปไผผ `KeyError: 'total_frames'` ็š„้”™่ฏฏ** + + ๆˆ‘ไปฌๆœ‰็”จไบŽๅค„็†่ง†้ข‘ๅ’Œๅธง็š„ไธคไธชๅค„็†ๆตๆฐด็บฟใ€‚ + + **ๅฏนไบŽ่ง†้ข‘**๏ผŒๆˆ‘ไปฌๅบ”่ฏฅๅœจๅค„็†ๆตๆฐด็บฟไธญๅŠจๆ€่งฃ็ ่ง†้ข‘๏ผŒๆ‰€ไปฅๅœจ่ฟ™็งๆƒ…ๅ†ตไธ‹ๅบ”่ฏฅไฝฟ็”จ `DecordInit & DecordDecode`ใ€`OpenCVInit & OpenCVDecode` ๆˆ– `PyAVInit & PyAVDecode` ่ฟ™ๆ ท็š„้…ๅฏน๏ผŒไพ‹ๅฆ‚[่ฟ™ไธช็คบไพ‹](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py#L14-L16)ใ€‚ + + **ๅฏนไบŽๅธง**๏ผŒๅ›พๅƒๅทฒ็ปๅœจ็ฆป็บฟ็Šถๆ€ไธ‹่งฃ็ ๏ผŒๆ‰€ไปฅๅœจ่ฟ™็งๆƒ…ๅ†ตไธ‹ๅบ”่ฏฅไฝฟ็”จ `RawFrameDecode` ่ฟ™ๆ 
ท็š„ๅค„็†ๆตๆฐด็บฟ้กน๏ผŒไพ‹ๅฆ‚[่ฟ™ไธช็คบไพ‹](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py#L17)ใ€‚ + + `KeyError: 'total_frames'` ๆ˜ฏ็”ฑไบŽ้”™่ฏฏๅœฐๅฐ† `RawFrameDecode` ๆญฅ้ชค็”จไบŽ่ง†้ข‘๏ผŒๅ› ไธบๅฝ“่พ“ๅ…ฅๆ˜ฏ่ง†้ข‘ๆ—ถ๏ผŒๆ— ๆณ•้ข„ๅ…ˆ่Žทๅ– `total_frames`ใ€‚ + +## ่ฎญ็ปƒ + +- **ๅฆ‚ไฝ•ๅชไฝฟ็”จ่ฎญ็ปƒๅฅฝ็š„่ฏ†ๅˆซๆจกๅž‹่ฟ›่กŒไธปๅนฒ็ฝ‘็ปœ็š„้ข„่ฎญ็ปƒ๏ผŸ** + + ไธบไบ†ไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹่ฟ›่กŒๆ•ดไธช็ฝ‘็ปœ็š„่ฎญ็ปƒ๏ผŒๆ–ฐ็š„้…็ฝฎๆ–‡ไปถๅœจ `load_from` ไธญๆทปๅŠ ไบ†้ข„่ฎญ็ปƒๆจกๅž‹็š„้“พๆŽฅใ€‚ + + ่ฆไฝฟ็”จไธปๅนฒ่ฟ›่กŒ้ข„่ฎญ็ปƒ๏ผŒๅฏไปฅๅฐ†้…็ฝฎๆ–‡ไปถไธญไธปๅนฒ้ƒจๅˆ†็š„ `pretrained` ๅ€ผๆ›ดๆ”นไธบๆƒ้‡่ทฏๅพ„/URLใ€‚ๅœจ่ฎญ็ปƒๆ—ถ๏ผŒๆœช้ข„ๆ–™ๅˆฐ็š„้”ฎๅฐ†่ขซๅฟฝ็•ฅใ€‚ + +- **ๅœจๅพฎ่ฐƒๆจกๅž‹ๆ—ถๅฆ‚ไฝ•ๅ›บๅฎšไธปๅนฒ็š„ๆŸไบ›้˜ถๆฎต๏ผŸ** + + ๆ‚จๅฏไปฅๅ‚่€ƒ [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L791) ๅ’Œ [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L369-L370)ใ€‚ + ๆ้†’ๅœจ้…็ฝฎๆ–‡ไปถไธญ่ฎพ็ฝฎ `find_unused_parameters = True`๏ผŒไปฅ่ฟ›่กŒๅˆ†ๅธƒๅผ่ฎญ็ปƒๆˆ–ๆต‹่ฏ•ใ€‚ + + ๅฎž้™…ไธŠ๏ผŒ้™คไบ†ๅฐ‘ๆ•ฐๆจกๅž‹๏ผŒๅฆ‚ C3D ็ญ‰๏ผŒ็”จๆˆทๅฏไปฅ่ฎพ็ฝฎ `frozen_stages` ๆฅๅ†ป็ป“ไธปๅนฒ็š„้˜ถๆฎต๏ผŒๅ› ไธบๅ‡ ไนŽๆ‰€ๆœ‰็ปงๆ‰ฟ่‡ช `ResNet` ๅ’Œ `ResNet3D` ็š„ไธปๅนฒ้ƒฝๆ”ฏๆŒๅ†…้ƒจๅ‡ฝๆ•ฐ `_freeze_stages()`ใ€‚ + +- **ๅฆ‚ไฝ•ๅœจ้…็ฝฎๆ–‡ไปถไธญ่ฎพ็ฝฎ memcached ๏ผŸ** + + ๅœจ MMAction2 ไธญ๏ผŒๆ‚จๅฏไปฅๅฐ† memcached ็š„ๅ‚ๆ•ฐไผ ้€’็ป™็”จไบŽ่ง†้ข‘ๆ•ฐๆฎ้›†็š„ `class DecordInit` ๆˆ–็”จไบŽๅŽŸๅง‹ๅธงๆ•ฐๆฎ้›†็š„ `RawFrameDecode`ใ€‚ๆœ‰ๅ…ณๆ›ดๅคš็ป†่Š‚๏ผŒ่ฏทๅ‚้˜… MMEngine ไธญ็š„ [`class FileClient`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/data/pipelines/file_client.py)ใ€‚ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผŒๆผ”็คบๅฆ‚ไฝ•ๅœจๅŽŸๅง‹ๅธงๆ•ฐๆฎ้›†ไธญไฝฟ็”จ memcached๏ผš + + ```python + mc_cfg = dict(server_list_cfg='server_list_cfg', client_cfg='client_cfg', sys_path='sys_path') + + train_pipeline = [ + ... + dict(type='RawFrameDecode', io_backend='memcached', **mc_cfg), + ... + ] + ``` + +- **ๅฆ‚ไฝ•ๅœจ้…็ฝฎๆ–‡ไปถไธญ่ฎพ็ฝฎ `load_from` ็š„ๅ€ผไปฅๅพฎ่ฐƒๆจกๅž‹๏ผŸ** + + ๅœจ MMAction2 ไธญ๏ผŒๆˆ‘ไปฌๅฐ† `load_from=None` ่ฎพ็ฝฎไธบ `configs/_base_/default_runtime.py` ไธญ็š„้ป˜่ฎคๅ€ผ๏ผŒๅนถไธ”็”ฑไบŽ[็ปงๆ‰ฟ่ฎพ่ฎก](https://github.com/open-mmlab/mmaction2/tree/main/docs/en/user_guides/config.md)๏ผŒ็”จๆˆทๅฏไปฅ็›ดๆŽฅ้€š่ฟ‡ๅœจๅ…ถ้…็ฝฎๆ–‡ไปถไธญ่ฎพ็ฝฎ `load_from` ๆฅๆ›ดๆ”นๅฎƒใ€‚ + +- **ๅฆ‚ไฝ•ๅœจ่ฎญ็ปƒๆ—ถไฝฟ็”จ `RawFrameDataset`๏ผŸ** + + ๅœจ MMAction2 1.x ็‰ˆๆœฌไธญ๏ผŒๅคงๅคšๆ•ฐ้…็ฝฎๆ–‡ไปถ้ป˜่ฎคไฝฟ็”จ `VideoDataset` ไฝœไธบๆ•ฐๆฎ้›†็ฑปๅž‹๏ผŒ่ฟ™ๅฏนไบŽๆ–‡ไปถๅญ˜ๅ‚จๆ›ดๅŠ ๅ‹ๅฅฝใ€‚ๅฆ‚ๆžœๆ‚จๆƒณไฝฟ็”จ `RawFrameDataset`๏ผŒ้œ€่ฆ่ฟ›่กŒไธคไธชไฟฎๆ”นๆญฅ้ชค๏ผš + + - `dataset` ็›ธๅ…ณ๏ผš + ๅฐ† `train_dataloader`/`val_dataloader`/`test_dataloader` ไธญ็š„ `dataset` ไปŽ + + ``` + dataset=dict( + type=VideoDataset, + data_prefix=dict(video=xxx), + ...) + ``` + + ไฟฎๆ”นไธบ + + ``` + dataset=dict( + type=RawFrameDataset, + data_prefix=dict(img=xxx), + filename_tmpl='{:05}.jpg', + ...) 
+ ``` + + ๆ•ฐๆฎ้›†็š„ๅ…ถไป–ๅญ—ๆฎตไธ้œ€่ฆไฟฎๆ”นใ€‚่ฏท็กฎไฟ `filename_tmpl` ไธŽๅธงๆ•ฐๆฎๅŒน้…๏ผŒๅนถๅ‚่€ƒ[้…็ฝฎๆ–‡ไปถๆ–‡ๆกฃ](../user_guides/config.md)ไบ†่งฃๆ›ดๅคšๅ…ณไบŽ้…็ฝฎๆ–‡ไปถ็š„่ฏฆ็ป†ไฟกๆฏใ€‚ + + - `transform` ็›ธๅ…ณ๏ผšๅœจ `train_pipeline`/`val_pipeline`/`test_pipeline` ไธญๅˆ ้™ค `dict(type='DecordInit', **file_client_args)`๏ผŒๅฐ† `dict(type='DecordDecode')` ไฟฎๆ”นไธบ `dict(type='RawFrameDecode', **file_client_args)`๏ผŒๅนถ็กฎไฟๅœจ้…็ฝฎๆ–‡ไปถไธญๅฎšไน‰ไบ† `file_client_args = dict(io_backend='disk')`ใ€‚ + + ๆœ‰ๅ…ณ่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†็š„ๆ›ดๅคšไฟฎๆ”น๏ผŒ่ฏทๅ‚่€ƒ[ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](../user_guides/prepare_dataset.md)ๅ’Œ[่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](../advanced_guides/customize_dataset.md)ใ€‚ + +## ๆต‹่ฏ• + +- **ๅฆ‚ไฝ•ไฝฟ้ข„ๆต‹ๅพ—ๅˆ†ๅœจ softmax ๅ†…ๅฝ’ไธ€ๅŒ–ๅˆฐ \[0, 1\] ?** + + ๅœจ้…็ฝฎๆ–‡ไปถไธญๅฐ† `model.cls_head.average_clips` ่ฎพ็ฝฎไธบ `'prob'`ใ€‚ + +- **ๅฆ‚ๆžœๆจกๅž‹่ฟ‡ๅคง๏ผŒGPU ๅ†…ๅญ˜ๆ— ๆณ•ๅฎน็บณ็”š่‡ณๅชๆœ‰ไธ€ไธชๆต‹่ฏ•ๆ ทๆœฌๆ€ŽไนˆๅŠž๏ผŸ** + + ้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒ3D ๆจกๅž‹ไฝฟ็”จ 10 ไธช clips x 3 ไธช crops ่ฟ›่กŒๆต‹่ฏ•๏ผŒๆ€ปๅ…ฑๆœ‰ 30 ไธช่ง†ๅ›พใ€‚ๅฏนไบŽ้žๅธธๅคง็š„ๆจกๅž‹๏ผŒๅณไฝฟๅชๆœ‰ไธ€ไธชๆต‹่ฏ•ๆ ทๆœฌ๏ผŒGPU ๅ†…ๅญ˜ไนŸๆ— ๆณ•ๅฎน็บณ๏ผˆๅ› ไธบๆœ‰ 30 ไธช่ง†ๅ›พ๏ผ‰ใ€‚ไธบไบ†่งฃๅ†ณ่ฟ™ไธช้—ฎ้ข˜๏ผŒๆ‚จๅฏไปฅๅœจ้…็ฝฎๆ–‡ไปถ็š„ `model['test_cfg']` ไธญ่ฎพ็ฝฎ `max_testing_views=n`ใ€‚่ฟ™ๆ ท๏ผŒๅœจๅ‰ๅ‘ไผ ๆ’ญ่ฟ‡็จ‹ไธญ๏ผŒไผšไฝฟ็”จ n ไธช่ง†ๅ›พไฝœไธบไธ€ไธชๆ‰นๆฌก๏ผŒไปฅ่Š‚็œ GPU ๅ†…ๅญ˜็š„ไฝฟ็”จใ€‚ diff --git a/docs/zh_cn/get_started/guide_to_framework.md b/docs/zh_cn/get_started/guide_to_framework.md new file mode 100644 index 0000000000000000000000000000000000000000..63264365f17a66f41c2862d06fe370acd4f08dbc --- /dev/null +++ b/docs/zh_cn/get_started/guide_to_framework.md @@ -0,0 +1,761 @@ +# 20ๅˆ†้’Ÿไบ†่งฃ MMAction2 ๆก†ๆžถ่ฎพ่ฎก + +ๅœจๆœฌๆ•™็จ‹ไธญ๏ผŒๆˆ‘ไปฌๅฐ†้€š่ฟ‡ไธ€ไธช่ง†้ข‘ๅŠจไฝœ่ฏ†ๅˆซ็š„ๆ‰‹ๆŠŠๆ‰‹ๆ•™็จ‹ๆฅๆผ”็คบ `MMACTION2 1.0` ็š„ๆ•ดไฝ“ๆžถๆž„ใ€‚ + +ๆœฌๆ•™็จ‹็š„็›ฎๅฝ•ๅฆ‚ไธ‹: + +- [20ๅˆ†้’Ÿไบ†่งฃ MMAction2 ๆก†ๆžถ่ฎพ่ฎก](#20ๅˆ†้’Ÿไบ†่งฃ-mmaction2-ๆก†ๆžถ่ฎพ่ฎก) + - [ๆญฅ้ชค0๏ผšๅ‡†ๅค‡ๆ•ฐๆฎ](#ๆญฅ้ชค0ๅ‡†ๅค‡ๆ•ฐๆฎ) + - [ๆญฅ้ชค1๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎๆตๆฐด็บฟ](#ๆญฅ้ชค1ๆž„ๅปบไธ€ไธชๆ•ฐๆฎๆตๆฐด็บฟ) + - [ๆญฅ้ชค2๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎ้›†ๅ’Œๆ•ฐๆฎๅŠ ่ฝฝๅ™จ](#ๆญฅ้ชค2ๆž„ๅปบไธ€ไธชๆ•ฐๆฎ้›†ๅ’Œๆ•ฐๆฎๅŠ ่ฝฝๅ™จ) + - [ๆญฅ้ชค3๏ผšๆž„ๅปบไธ€ไธช่ฏ†ๅˆซๅ™จ](#ๆญฅ้ชค3ๆž„ๅปบไธ€ไธช่ฏ†ๅˆซๅ™จ) + - [ๆญฅ้ชค4๏ผšๆž„ๅปบไธ€ไธช่ฏ„ไผฐๆŒ‡ๆ ‡](#ๆญฅ้ชค4ๆž„ๅปบไธ€ไธช่ฏ„ไผฐๆŒ‡ๆ ‡) + - [ๆญฅ้ชค5๏ผšไฝฟ็”จๆœฌๅœฐ PyTorch ่ฎญ็ปƒๅ’Œๆต‹่ฏ•](#ๆญฅ้ชค5ไฝฟ็”จๆœฌๅœฐ-pytorch-่ฎญ็ปƒๅ’Œๆต‹่ฏ•) + - [ๆญฅ้ชค6๏ผšไฝฟ็”จ MMEngine ่ฎญ็ปƒๅ’Œๆต‹่ฏ•๏ผˆๆŽจ่๏ผ‰](#ๆญฅ้ชค6ไฝฟ็”จ-mmengine-่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆŽจ่) + +้ฆ–ๅ…ˆ๏ผŒๆˆ‘ไปฌ้œ€่ฆๅˆๅง‹ๅŒ–ๆณจๅ†Œ่กจ็š„ `scope` ๏ผŒไปฅ็กฎไฟๆฏไธชๆจกๅ—้ƒฝๅœจ `mmaction` ่Œƒๅ›ดไธ‹ๆณจๅ†Œใ€‚ๆœ‰ๅ…ณๆณจๅ†Œ่กจ็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒ[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html) ใ€‚ + +```python +from mmaction.utils import register_all_modules + +register_all_modules(init_default_scope=True) +``` + +## ๆญฅ้ชค0๏ผšๅ‡†ๅค‡ๆ•ฐๆฎ + +่ฏทไธ‹่ฝฝๆˆ‘ไปฌๅ‡†ๅค‡็š„[็ฒพ็ฎ€็‰ˆ kinetics400](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) ๆ•ฐๆฎ้›†๏ผŒๅนถๅฐ†ๅ…ถๆๅ–ๅˆฐ `$MMACTION2/data` ็›ฎๅฝ•ใ€‚ + +่งฃๅŽ‹ๅŽ็š„็›ฎๅฝ•็ป“ๆž„ๅบ”ๅฆ‚ไธ‹ๆ‰€็คบ: + +``` +mmaction2 +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ kinetics400_tiny +โ”‚ โ”‚ โ”œโ”€โ”€ kinetics_tiny_train_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ kinetics_tiny_val_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ train +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 27_CSXByd3s.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 34XczvTaRiI.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ A-wiliK50Zw.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... 
+โ”‚ โ”‚ โ””โ”€โ”€ val +โ”‚ โ”‚ โ”œโ”€โ”€ 0pVGiAU6XEA.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ AQrbRSnRt8M.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ ... +``` + +ไปฅไธ‹ๆ˜ฏๆ ‡ๆณจๆ–‡ไปถ `kinetics_tiny_train_video.txt` ไธญ็š„ไธ€ไบ›็คบไพ‹: + +``` +D32_1gwq35E.mp4 0 +iRuyZSKhHRg.mp4 1 +oXy-e_P_cAI.mp4 0 +34XczvTaRiI.mp4 1 +h2YqqUhnR34.mp4 0 +``` + +ๆ–‡ไปถไธญ็š„ๆฏไธ€่กŒ่กจ็คบๆฏไธ€ไธช่ง†้ข‘็š„ๆ ‡ๆณจ๏ผŒๅ…ถไธญ็ฌฌไธ€้กน่กจ็คบ่ง†้ข‘ๆ–‡ไปถๅ(ๅฆ‚ `D32_1gwq35E.mp4` )๏ผŒ็ฌฌไบŒ้กน่กจ็คบ็›ธๅบ”็š„ๆ ‡็ญพ(ๅฆ‚ `D32_1gwq35E.mp4` ็š„ๆ ‡็ญพๆ˜ฏ `0` )ใ€‚ๅœจ่ฟ™ไธชๆ•ฐๆฎ้›†ไธญ๏ผŒๅชๆœ‰ `ไธคไธช` ็ฑปๅˆซใ€‚ + +## ๆญฅ้ชค1๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎๆตๆฐด็บฟ + +ไธบไบ†ๅฎž็Žฐ `่งฃ็ `ใ€`้‡‡ๆ ท`ใ€`่ฐƒๆ•ดๅคงๅฐ`ใ€`่ฃๅ‰ช`ใ€`ๆ ผๅผๅŒ–` ๅ’Œ `ๆ‰“ๅŒ…` ่ง†้ข‘ๆ•ฐๆฎๅ’Œ็›ธๅบ”็š„ๆ ‡็ญพ๏ผŒๆˆ‘ไปฌ้œ€่ฆ่ฎพ่ฎกไธ€ไธชๆ•ฐๆฎๆตๆฐด็บฟๆฅๅค„็†่ฟ™ไบ›่ฟ‡็จ‹ใ€‚ๅ…ทไฝ“ๆฅ่ฏด๏ผŒๆˆ‘ไปฌ่ฎพ่ฎกไบ†7ไธช `Transform` ็ฑปๆฅๆž„ๅปบ่ฟ™ไธช่ง†้ข‘ๅค„็†ๆตๆฐด็บฟใ€‚ๆณจๆ„๏ผŒOpenMMLab ไธญ็š„ๆ‰€ๆœ‰`Transform` ็ฑป้ƒฝๅฟ…้กป็ปงๆ‰ฟ่‡ช `mmcv` ไธญ็š„ `BaseTransform` ็ฑป๏ผŒๅฎž็ŽฐๆŠฝ่ฑกๆ–นๆณ• `transform`๏ผŒๅนถๆณจๅ†Œๅˆฐ `TRANSFORMS` ๆณจๅ†Œ่กจใ€‚ๆœ‰ๅ…ณๆ•ฐๆฎ่ฝฌๆข็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html) ใ€‚ + +```python +import mmcv +import decord +import numpy as np +from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class VideoInit(BaseTransform): + def transform(self, results): + container = decord.VideoReader(results['filename']) + results['total_frames'] = len(container) + results['video_reader'] = container + return results + + +@TRANSFORMS.register_module() +class VideoSample(BaseTransform): + def __init__(self, clip_len, num_clips, test_mode=False): + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def transform(self, results): + total_frames = results['total_frames'] + interval = total_frames // self.clip_len + + if self.test_mode: + # ไฝฟๆต‹่ฏ•ๆœŸ้—ด็š„้‡‡ๆ ทๅ…ทๆœ‰็กฎๅฎšๆ€ง + np.random.seed(42) + + inds_of_all_clips = [] + for i in range(self.num_clips): + bids = np.arange(self.clip_len) * interval + offset = np.random.randint(interval, size=bids.shape) + inds = bids + offset + inds_of_all_clips.append(inds) + + results['frame_inds'] = np.concatenate(inds_of_all_clips) + results['clip_len'] = self.clip_len + results['num_clips'] = self.num_clips + return results + + +@TRANSFORMS.register_module() +class VideoDecode(BaseTransform): + def transform(self, results): + frame_inds = results['frame_inds'] + container = results['video_reader'] + + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoResize(BaseTransform): + def __init__(self, r_size): + self.r_size = (np.inf, r_size) + + def transform(self, results): + img_h, img_w = results['img_shape'] + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size) + + imgs = [mmcv.imresize(img, (new_w, new_h)) + for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoCrop(BaseTransform): + def __init__(self, c_size): + self.c_size = c_size + + def transform(self, results): + img_h, img_w = results['img_shape'] + center_x, center_y = img_w // 2, img_h // 2 + x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2 + y1, 
y2 = center_y - self.c_size // 2, center_y + self.c_size // 2 + imgs = [img[y1:y2, x1:x2] for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoFormat(BaseTransform): + def transform(self, results): + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + # [num_clips*clip_len, H, W, C] + imgs = np.array(imgs) + # [num_clips, clip_len, H, W, C] + imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:]) + # [num_clips, C, clip_len, H, W] + imgs = imgs.transpose(0, 4, 1, 2, 3) + + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class VideoPack(BaseTransform): + def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')): + self.meta_keys = meta_keys + + def transform(self, results): + packed_results = dict() + inputs = to_tensor(results['imgs']) + data_sample = ActionDataSample().set_gt_label(results['label']) + metainfo = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(metainfo) + packed_results['inputs'] = inputs + packed_results['data_samples'] = data_sample + return packed_results +``` + +ไธ‹้ข๏ผŒๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไธชไปฃ็ ็‰‡ๆฎต(ไฝฟ็”จๆ ‡ๆณจๆ–‡ไปถไธญ็š„ `D32_1gwq35E.mp4 0` )ๆฅๆผ”็คบๅฆ‚ไฝ•ไฝฟ็”จๆ•ฐๆฎๆตๆฐด็บฟใ€‚ + +```python +import os.path as osp +from mmengine.dataset import Compose + +pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +pipeline = Compose(pipeline_cfg) +data_prefix = 'data/kinetics400_tiny/train' +results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0) +packed_results = pipeline(results) + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# ่Žทๅ–่พ“ๅ…ฅ็š„ไฟกๆฏ +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# ่Žทๅ–่พ“ๅ…ฅ็š„ๆ ‡็ญพ +print('label: ', data_sample.gt_label) +``` + +``` +shape of the inputs: torch.Size([1, 3, 16, 224, 224]) +image_shape: (224, 224) +num_clips: 1 +clip_len: 16 +label: tensor([0]) +``` + +## ๆญฅ้ชค2๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎ้›†ๅ’Œๆ•ฐๆฎๅŠ ่ฝฝๅ™จ + +OpenMMLabไธญ็š„ๆ‰€ๆœ‰ `Dataset` ็ฑป้ƒฝๅฟ…้กป็ปงๆ‰ฟ่‡ช `mmengine` ไธญ็š„ `BaseDataset` ็ฑปใ€‚ๆˆ‘ไปฌๅฏไปฅ้€š่ฟ‡่ฆ†็›– `load_data_list` ๆ–นๆณ•ๆฅๅฎšๅˆถๆณจ้‡ŠๅŠ ่ฝฝ่ฟ‡็จ‹ใ€‚ๆญคๅค–๏ผŒๆˆ‘ไปฌๅฏไปฅ้€š่ฟ‡่ฆ†็›– `get_data_info` ๆ–นๆณ•๏ผŒๅ‘ `results` ๅญ—ๅ…ธๆทปๅŠ ๆ›ดๅคšๅญ—ๆฎต๏ผŒๅฎƒๅฐ†ไฝœไธบ่พ“ๅ…ฅไผ ็ป™ `pipeline` ใ€‚ๆœ‰ๅ…ณ `BaseDataset` ็ฑป็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) ใ€‚ + +```python +import os.path as osp +from mmengine.fileio import list_from_file +from mmengine.dataset import BaseDataset +from mmaction.registry import DATASETS + + +@DATASETS.register_module() +class DatasetZelda(BaseDataset): + def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''), + test_mode=False, modality='RGB', **kwargs): + self.modality = modality + super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root, + data_prefix=data_prefix, test_mode=test_mode, + **kwargs) + + def load_data_list(self): + data_list = [] + fin = 
list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + filename, label = line_split + label = int(label) + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list + + def get_data_info(self, idx: int) -> dict: + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + return data_info +``` + +ๆŽฅไธ‹ๆฅ๏ผŒๆˆ‘ไปฌๅฐ†ๆผ”็คบๅฆ‚ไฝ•ไฝฟ็”จ dataset ๅ’Œ dataloader ๆฅ็ดขๅผ•ๆ•ฐๆฎใ€‚ๆˆ‘ไปฌๅฐ†ไฝฟ็”จ `Runner.build_dataloader` ๆ–นๆณ•ๆฅๆž„้€  dataloaderใ€‚ๆœ‰ๅ…ณ dataloader ็š„ๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader) ใ€‚ + +```python +from mmaction.registry import DATASETS + +train_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +val_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +train_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_train_video.txt', + pipeline=train_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='train')) + +val_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_val_video.txt', + pipeline=val_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='val')) + +train_dataset = DATASETS.build(train_dataset_cfg) + +packed_results = train_dataset[0] + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# ่Žทๅ–่พ“ๅ…ฅ็š„ไฟกๆฏ +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# ่Žทๅ–่พ“ๅ…ฅ็š„ๆ ‡็ญพ +print('label: ', data_sample.gt_label) + +from mmengine.runner import Runner + +BATCH_SIZE = 2 + +train_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_cfg) + +val_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=val_dataset_cfg) + +train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg) +val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg) + +batched_packed_results = next(iter(train_data_loader)) + +batched_inputs = batched_packed_results['inputs'] +batched_data_sample = batched_packed_results['data_samples'] + +assert len(batched_inputs) == BATCH_SIZE +assert len(batched_data_sample) == BATCH_SIZE +``` + +็ปˆ็ซฏ่พ“ๅ‡บๅบ”่ฏฅไธŽ[ๆญฅ้ชค1๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎๆตๆฐด็บฟ](#ๆญฅ้ชค1๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎๆตๆฐด็บฟ)ไธญ็š„่พ“ๅ‡บ็›ธๅŒใ€‚ + +## ๆญฅ้ชค3๏ผšๆž„ๅปบไธ€ไธช่ฏ†ๅˆซๅ™จ + +ๆŽฅไธ‹ๆฅ๏ผŒๆˆ‘ไปฌๅฐ†ๆž„ๅปบ `recognizer`๏ผŒๅฎƒไธป่ฆ็”ฑไธ‰้ƒจๅˆ†็ป„ๆˆ๏ผš็”จไบŽๆ‰นๅค„็†ๅ’Œ่ง„่ŒƒๅŒ–ๆ•ฐๆฎ็š„ `data preprocessor`๏ผŒ็”จไบŽ็‰นๅพๆๅ–็š„ `backbone` ๅ’Œ็”จไบŽๅˆ†็ฑป็š„ `cls_head` ใ€‚ + +`data_preprocessor` ็š„ๅฎž็Žฐๅฆ‚ไธ‹: + +```python +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch +from 
mmaction.registry import MODELS + + +@MODELS.register_module() +class DataPreprocessorZelda(BaseDataPreprocessor): + def __init__(self, mean, std): + super().__init__() + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1), + False) + + def forward(self, data, training=False): + data = self.cast_data(data) + inputs = data['inputs'] + batch_inputs = stack_batch(inputs) # ๆ‰นๅค„็† + batch_inputs = (batch_inputs - self.mean) / self.std # ๅฝ’ไธ€ๅŒ– + data['inputs'] = batch_inputs + return data +``` + +ไปฅไธ‹ๆ˜ฏ data_preprocessor ็š„็”จๆณ•๏ผšๅฐ†ไปŽ[ๆญฅ้ชค2๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎ้›†ๅ’Œๆ•ฐๆฎๅŠ ่ฝฝๅ™จ](#ๆญฅ้ชค2๏ผšๆž„ๅปบไธ€ไธชๆ•ฐๆฎ้›†ๅ’Œๆ•ฐๆฎๅŠ ่ฝฝๅ™จ)ไธญ่Žทๅพ—็š„ `batched_packed_results` ๆไพ›็ป™ `data_preprocessor` ่ฟ›่กŒๆ‰นๅค„็†ๅ’Œๅฝ’ไธ€ๅŒ–ใ€‚ + +```python +from mmaction.registry import MODELS + +data_preprocessor_cfg = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) + +data_preprocessor = MODELS.build(data_preprocessor_cfg) + +preprocessed_inputs = data_preprocessor(batched_packed_results) +print(preprocessed_inputs['inputs'].shape) +``` + +``` +torch.Size([2, 1, 3, 16, 224, 224]) +``` + +`backbone`ใ€`cls_head` ๅ’Œ `recognizer` ็š„ๅฎž็Žฐๅฆ‚ไธ‹: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule, Sequential +from mmengine.structures import LabelData +from mmaction.registry import MODELS + + +@MODELS.register_module() +class BackBoneZelda(BaseModule): + def __init__(self, init_cfg=None): + if init_cfg is None: + init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"), + dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)] + + super(BackBoneZelda, self).__init__(init_cfg=init_cfg) + + self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7), + stride=(1, 2, 2), padding=(1, 3, 3)), + nn.BatchNorm3d(64), nn.ReLU()) + self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1), + nn.BatchNorm3d(128), nn.ReLU()) + + def forward(self, imgs): + # imgs: [batch_size*num_views, 3, T, H, W] + # features: [batch_size*num_views, 128, T/2, H//8, W//8] + features = self.conv(self.maxpool(self.conv1(imgs))) + return features + + +@MODELS.register_module() +class ClsHeadZelda(BaseModule): + def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None): + if init_cfg is None: + init_cfg = dict(type='Normal', layer='Linear', std=0.01) + + super(ClsHeadZelda, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.in_channels = in_channels + self.average_clips = average_clips + + if dropout != 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + self.fc = nn.Linear(self.in_channels, self.num_classes) + self.pool = nn.AdaptiveAvgPool3d(1) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x): + N, C, T, H, W = x.shape + x = self.pool(x) + x = x.view(N, C) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores + + def loss(self, feats, data_samples): + cls_scores = self(feats) + labels = torch.stack([x.gt_label for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = 
labels.unsqueeze(0) + + loss_cls = self.loss_fn(cls_scores, labels) + return dict(loss_cls=loss_cls) + + def predict(self, feats, data_samples): + cls_scores = self(feats) + num_views = cls_scores.shape[0] // len(data_samples) + # assert num_views == data_samples[0].num_clips + cls_scores = self.average_clip(cls_scores, num_views) + + for ds, sc in zip(data_samples, cls_scores): + pred = LabelData(item=sc) + ds.pred_scores = pred + return data_samples + + def average_clip(self, cls_scores, num_views): + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. ' + f'Currently supported ones are ' + f'["score", "prob", None]') + + total_views = cls_scores.shape[0] + cls_scores = cls_scores.view(total_views // num_views, num_views, -1) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores + + +@MODELS.register_module() +class RecognizerZelda(BaseModel): + def __init__(self, backbone, cls_head, data_preprocessor): + super().__init__(data_preprocessor=data_preprocessor) + + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def extract_feat(self, inputs): + inputs = inputs.view((-1, ) + inputs.shape[2:]) + return self.backbone(inputs) + + def loss(self, inputs, data_samples): + feats = self.extract_feat(inputs) + loss = self.cls_head.loss(feats, data_samples) + return loss + + def predict(self, inputs, data_samples): + feats = self.extract_feat(inputs) + predictions = self.cls_head.predict(feats, data_samples) + return predictions + + def forward(self, inputs, data_samples=None, mode='tensor'): + if mode == 'tensor': + return self.extract_feat(inputs) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode: {mode}') +``` + +`init_cfg` ็”จไบŽๆจกๅž‹ๆƒ้‡ๅˆๅง‹ๅŒ–ใ€‚ๆœ‰ๅ…ณๆจกๅž‹ๆƒ้‡ๅˆๅง‹ๅŒ–็š„ๆ›ดๅคšไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html) ใ€‚ไธŠ่ฟฐๆจกๅ—็š„็”จๆณ•ๅฆ‚ไธ‹: + +```python +import torch +import copy +from mmaction.registry import MODELS + +model_cfg = dict( + type='RecognizerZelda', + backbone=dict(type='BackBoneZelda'), + cls_head=dict( + type='ClsHeadZelda', + num_classes=2, + in_channels=128, + average_clips='prob'), + data_preprocessor = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375])) + +model = MODELS.build(model_cfg) + +# ่ฎญ็ปƒ +model.train() +model.init_weights() +data_batch_train = copy.deepcopy(batched_packed_results) +data = model.data_preprocessor(data_batch_train, training=True) +loss = model(**data, mode='loss') +print('loss dict: ', loss) + +# ้ชŒ่ฏ +with torch.no_grad(): + model.eval() + data_batch_test = copy.deepcopy(batched_packed_results) + data = model.data_preprocessor(data_batch_test, training=False) + predictions = model(**data, mode='predict') +print('Label of Sample[0]', predictions[0].gt_label) +print('Scores of Sample[0]', predictions[0].pred_score) +``` + +```shell +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.bias - torch.Size([64]): +KaimingInit: 
a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.weight - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.bias - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.bias - torch.Size([128]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.weight - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.bias - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.weight - torch.Size([2, 128]): +NormalInit: mean=0, std=0.01, bias=0 + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.bias - torch.Size([2]): +NormalInit: mean=0, std=0.01, bias=0 + +loss dict: {'loss_cls': tensor(0.6853, grad_fn=)} +Label of Sample[0] tensor([0]) +Scores of Sample[0] tensor([0.5240, 0.4760]) +``` + +## ๆญฅ้ชค4๏ผšๆž„ๅปบไธ€ไธช่ฏ„ไผฐๆŒ‡ๆ ‡ + +่ฏทๆณจๆ„๏ผŒ`OpenMMLab` ไธญ็š„ๆ‰€ๆœ‰ `Metric` ็ฑป้ƒฝๅฟ…้กป็ปงๆ‰ฟ่‡ช `mmengine` ไธญ็š„ `BaseMetric` ็ฑป๏ผŒๅนถๅฎž็ŽฐๆŠฝ่ฑกๆ–นๆณ• `process` ๅ’Œ`compute_metrics`ใ€‚ๆœ‰ๅ…ณ่ฏ„ไผฐ็š„ๆ›ดๅคšไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html) ใ€‚ + +```python +import copy +from collections import OrderedDict +from mmengine.evaluator import BaseMetric +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import METRICS + + +@METRICS.register_module() +class AccuracyMetric(BaseMetric): + def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topk = topk + + def process(self, data_batch, data_samples): + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + scores = data_sample['pred_score'].cpu().numpy() + label = data_sample['gt_label'].item() + result['scores'] = scores + result['label'] = label + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + eval_results = OrderedDict() + labels = [res['label'] for res in results] + scores = [res['scores'] for res in results] + topk_acc = top_k_accuracy(scores, labels, self.topk) + for k, acc in zip(self.topk, topk_acc): + eval_results[f'topk{k}'] = acc + return eval_results +``` + +```python +from mmaction.registry import METRICS + +metric_cfg = dict(type='AccuracyMetric', topk=(1, 5)) + +metric = METRICS.build(metric_cfg) + +data_samples = [d.to_dict() for d in predictions] + +metric.process(batched_packed_results, data_samples) +acc = metric.compute_metrics(metric.results) +print(acc) +``` + +```shell +OrderedDict([('topk1', 0.5), ('topk5', 1.0)]) +``` + +## ๆญฅ้ชค5๏ผšไฝฟ็”จๆœฌๅœฐ PyTorch ่ฎญ็ปƒๅ’Œๆต‹่ฏ• + +```python +import torch.optim as optim +from mmengine import track_iter_progress + + +device = 'cuda' # or 'cpu' +max_epochs = 10 + +optimizer = optim.Adam(model.parameters(), lr=0.01) + +for epoch in range(max_epochs): + model.train() + losses = [] + for data_batch in 
track_iter_progress(train_data_loader): + data = model.data_preprocessor(data_batch, training=True) + loss_dict = model(**data, mode='loss') + loss = loss_dict['loss_cls'] + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses.append(loss.item()) + + print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader)) + + with torch.no_grad(): + model.eval() + for data_batch in track_iter_progress(val_data_loader): + data = model.data_preprocessor(data_batch, training=False) + predictions = model(**data, mode='predict') + data_samples = [d.to_dict() for d in predictions] + metric.process(data_batch, data_samples) + + acc = metric.acc = metric.compute_metrics(metric.results) + for name, topk in acc.items(): + print(f'{name}: ', topk) +``` + +## ๆญฅ้ชค6๏ผšไฝฟ็”จ MMEngine ่ฎญ็ปƒๅ’Œๆต‹่ฏ•๏ผˆๆŽจ่๏ผ‰ + +ๅ…ณไบŽ่ฎญ็ปƒๅ’Œๆต‹่ฏ•็š„ๆ›ดๅคš็ป†่Š‚๏ผŒไฝ ๅฏไปฅๅ‚่€ƒ[ MMAction2 ๆ•™็จ‹](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html) ใ€‚ๆœ‰ๅ…ณ `Runner` ็š„ๆ›ดๅคšไฟกๆฏ๏ผŒ่ฏทๅ‚้˜…[ MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html) ใ€‚ + +```python +from mmengine.runner import Runner + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +val_cfg = dict(type='ValLoop') + +optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01)) + +runner = Runner(model=model_cfg, work_dir='./work_dirs/guide', + train_dataloader=train_dataloader_cfg, + train_cfg=train_cfg, + val_dataloader=val_dataloader_cfg, + val_cfg=val_cfg, + optim_wrapper=optim_wrapper, + val_evaluator=[metric_cfg], + default_scope='mmaction') +runner.train() +``` diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..91d95181fffd201beed2dbdd20d84942a71ee069 --- /dev/null +++ b/docs/zh_cn/get_started/installation.md @@ -0,0 +1,198 @@ +# ๅฎ‰่ฃ… + +## ๅ‰็ฝฎๆกไปถ + +ๅœจๆœฌ่Š‚ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ๆผ”็คบๅฆ‚ไฝ•ๅ‡†ๅค‡ PyTorch ็›ธๅ…ณ็š„ไพ่ต–็Žฏๅขƒใ€‚ + +MMAction2 ้€‚็”จไบŽ Linuxใ€Windows ๅ’Œ MacOSใ€‚ๅฎƒ้œ€่ฆ Python 3.7+๏ผŒCUDA 10.2+ ๅ’Œ PyTorch 1.8+ใ€‚ + +```{note} +ๅฆ‚ๆžœๆ‚จ็†Ÿๆ‚‰ PyTorch ๅนถไธ”ๅทฒ็ปๅฎ‰่ฃ…ไบ†ๅฎƒ๏ผŒๅฏไปฅ่ทณ่ฟ‡่ฟ™้ƒจๅˆ†ๅ†…ๅฎน๏ผŒ็›ดๆŽฅ่ฝฌๅˆฐ[ไธ‹ไธ€่Š‚](#installation)ใ€‚ๅฆๅˆ™๏ผŒๆ‚จๅฏไปฅๆŒ‰็…งไปฅไธ‹ๆญฅ้ชค่ฟ›่กŒๅ‡†ๅค‡ๅทฅไฝœใ€‚ +``` + +**็ฌฌไธ€ๆญฅใ€‚** ไปŽ[ๅฎ˜ๆ–น็ฝ‘็ซ™](https://docs.conda.io/en/latest/miniconda.html)ไธ‹่ฝฝๅนถๅฎ‰่ฃ… Minicondaใ€‚ + +**็ฌฌไบŒๆญฅใ€‚** ๅˆ›ๅปบไธ€ไธช conda ็Žฏๅขƒๅนถๆฟ€ๆดปๅฎƒใ€‚ + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**็ฌฌไธ‰ๆญฅใ€‚** ๅฎ‰่ฃ… PyTorch๏ผŒๆŒ‰็…ง[ๅฎ˜ๆ–น่ฏดๆ˜Ž](https://pytorch.org/get-started/locally/)่ฟ›่กŒๆ“ไฝœ๏ผŒไพ‹ๅฆ‚๏ผš + +ๅœจ GPU ๅนณๅฐไธŠ๏ผš + +```shell +conda install pytorch torchvision -c pytorch +``` + +```{warning} +ๆญคๅ‘ฝไปคๅฐ†่‡ชๅŠจๅฎ‰่ฃ…ๆœ€ๆ–ฐ็‰ˆๆœฌ็š„ PyTorch ๅ’Œ cudatoolkit๏ผŒ่ฏท็กฎไฟๅฎƒไปฌไธŽๆ‚จ็š„็ŽฏๅขƒๅŒน้…ใ€‚ +``` + +ๅœจ CPU ๅนณๅฐไธŠ๏ผš + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +## ๆœ€ไฝณๅฎž่ทต + +ๆˆ‘ไปฌๅปบ่ฎฎ็”จๆˆท้ตๅพชๆˆ‘ไปฌ็š„ๆœ€ไฝณๅฎž่ทตๆฅๅฎ‰่ฃ… MMAction2ใ€‚็„ถ่€Œ๏ผŒๆ•ดไธช่ฟ‡็จ‹ๆ˜ฏ้ซ˜ๅบฆๅฏๅฎšๅˆถ็š„ใ€‚ๆ›ดๅคšไฟกๆฏ่ฏทๅ‚่ง[่‡ชๅฎšไน‰ๅฎ‰่ฃ…](#customize-installation)้ƒจๅˆ†ใ€‚ + +**็ฌฌไธ€ๆญฅใ€‚** ไฝฟ็”จ [MIM](https://github.com/open-mmlab/mim) ๅฎ‰่ฃ… [MMEngine](https://github.com/open-mmlab/mmengine)ใ€[MMCV](https://github.com/open-mmlab/mmcv)ใ€[MMDetection](https://github.com/open-mmlab/mmdetection)๏ผˆๅฏ้€‰๏ผ‰ๅ’Œ [MMPose](https://github.com/open-mmlab/mmpose)๏ผˆๅฏ้€‰๏ผ‰ใ€‚ + +```shell +pip 
install -U openmim +mim install mmengine +mim install mmcv +mim install mmdet +mim install mmpose +``` + +**็ฌฌไบŒๆญฅใ€‚** ๅฎ‰่ฃ… MMAction2ใ€‚ + +ๆ นๆฎๆ‚จ็š„้œ€ๆฑ‚๏ผŒๆˆ‘ไปฌๆ”ฏๆŒไธค็งๅฎ‰่ฃ…ๆจกๅผ๏ผš + +- [ไปŽๆบไปฃ็ ๆž„ๅปบ MMAction2๏ผˆๆŽจ่๏ผ‰](#build-mmaction2-from-source)๏ผšๆ‚จๆƒณๅœจ MMAction2 ๆก†ๆžถไธŠๅผ€ๅ‘่‡ชๅทฑ็š„ๅŠจไฝœ่ฏ†ๅˆซไปปๅŠกๆˆ–ๆ–ฐๅŠŸ่ƒฝใ€‚ไพ‹ๅฆ‚๏ผŒๆทปๅŠ ๆ–ฐ็š„ๆ•ฐๆฎ้›†ๆˆ–ๆ–ฐ็š„ๆจกๅž‹ใ€‚ๅ› ๆญค๏ผŒๆ‚จๅฏไปฅไฝฟ็”จๆˆ‘ไปฌๆไพ›็š„ๆ‰€ๆœ‰ๅทฅๅ…ทใ€‚ +- [ๅฎ‰่ฃ…ไธบ Python ๅŒ…](#install-as-a-python-package)๏ผšๆ‚จๅชๆƒณๅœจ้กน็›ฎไธญ่ฐƒ็”จ MMAction2 ็š„ API ๆˆ–ๅฏผๅ…ฅ MMAction2 ็š„ๆจกๅ—ใ€‚ + +### ไปŽๆบไปฃ็ ๆž„ๅปบ MMAction2 + +ๅœจ่ฟ™็งๆƒ…ๅ†ตไธ‹๏ผŒไปŽๆบไปฃ็ ๅฎ‰่ฃ… mmaction2๏ผš + +```shell +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +pip install -v -e . +# "-v" ่กจ็คบ่พ“ๅ‡บๆ›ดๅคšๅฎ‰่ฃ…็›ธๅ…ณ็š„ไฟกๆฏ +# "-e" ่กจ็คบไปฅๅฏ็ผ–่พ‘ๅฝขๅผๅฎ‰่ฃ…๏ผŒ่ฟ™ๆ ทๅฏไปฅๅœจไธ้‡ๆ–ฐๅฎ‰่ฃ…็š„ๆƒ…ๅ†ตไธ‹๏ผŒ่ฎฉๆœฌๅœฐไฟฎๆ”น็›ดๆŽฅ็”Ÿๆ•ˆใ€‚ +``` + +ๅฏ้€‰ๅœฐ๏ผŒๅฆ‚ๆžœๆ‚จๅธŒๆœ›ไธบ MMAction2 ๅšๅ‡บ่ดก็Œฎๆˆ–ไฝ“้ชŒๅฎž้ชŒๅŠŸ่ƒฝ๏ผŒ่ฏทๅˆ‡ๆขๅˆฐ `dev-1.x` ๅˆ†ๆ”ฏ๏ผš + +```shell +git checkout dev-1.x +``` + +### ๅฎ‰่ฃ…ไธบ Python ๅŒ… + +ๅช้œ€ไฝฟ็”จ pip ๅฎ‰่ฃ…ๅณๅฏใ€‚ + +```shell +pip install mmaction2 +``` + +## ้ชŒ่ฏๅฎ‰่ฃ… + +ไธบไบ†้ชŒ่ฏ MMAction2 ๆ˜ฏๅฆๅฎ‰่ฃ…ๆญฃ็กฎ๏ผŒๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไบ›็คบไพ‹ไปฃ็ ๆฅ่ฟ่กŒๆŽจ็†ๆผ”็คบใ€‚ + +**็ฌฌไธ€ๆญฅใ€‚** ไธ‹่ฝฝ้…็ฝฎๆ–‡ไปถๅ’Œๆƒ้‡ๆ–‡ไปถใ€‚ + +```shell +mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest . +``` + +**็ฌฌไบŒๆญฅใ€‚** ้ชŒ่ฏๆŽจ็†ๆผ”็คบใ€‚ + +้€‰้กน๏ผˆa๏ผ‰ใ€‚ๅฆ‚ๆžœๆ‚จๆ˜ฏไปŽๆบไปฃ็ ๅฎ‰่ฃ…็š„ mmaction2๏ผŒๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค๏ผš + +```shell +# demo.mp4 ๅ’Œ label_map_k400.txt ้ƒฝๆฅ่‡ชไบŽ Kinetics-400 +python demo/demo.py tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \ + tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt +``` + +ๆ‚จๅฐ†ๅœจ็ปˆ็ซฏ็œ‹ๅˆฐๅ‰5ไธชๆ ‡็ญพๅŠๅ…ถๅฏนๅบ”็š„ๅˆ†ๆ•ฐใ€‚ + +้€‰้กน๏ผˆb๏ผ‰ใ€‚ๅฆ‚ๆžœๆ‚จๅฐ† mmaction2 ๅฎ‰่ฃ…ไธบไธ€ไธช Python ๅŒ…๏ผŒๅฏไปฅๅœจ Python ่งฃ้‡Šๅ™จไธญ่ฟ่กŒไปฅไธ‹ไปฃ็ ๏ผŒ่ฟ™ๅฐ†่ฟ›่กŒ็ฑปไผผ็š„้ชŒ่ฏ๏ผš + +```python +from operator import itemgetter +from mmaction.apis import init_recognizer, inference_recognizer + +config_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' +video_file = 'demo/demo.mp4' +label_file = 'tools/data/kinetics/label_map_k400.txt' +model = init_recognizer(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +pred_result = inference_recognizer(model, video_file) + +pred_scores = pred_result.pred_score.tolist() +score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) +score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) +top5_label = score_sorted[:5] + +labels = open(label_file).readlines() +labels = [x.strip() for x in labels] +results = [(labels[k[0]], k[1]) for k in top5_label] + +print('The top-5 labels with corresponding scores are:') +for result in results: + print(f'{result[0]}: ', result[1]) +``` + +## ่‡ชๅฎšไน‰ๅฎ‰่ฃ… + +### CUDA ็‰ˆๆœฌ + +ๅœจๅฎ‰่ฃ… PyTorch ๆ—ถ๏ผŒๆ‚จๅฏ่ƒฝ้œ€่ฆๆŒ‡ๅฎš CUDA ็š„็‰ˆๆœฌใ€‚ๅฆ‚ๆžœๆ‚จไธ็กฎๅฎš้€‰ๆ‹ฉๅ“ชไธช็‰ˆๆœฌ๏ผŒ่ฏท้ตๅพชๆˆ‘ไปฌ็š„ๅปบ่ฎฎ๏ผš + +- ๅฏนไบŽ Ampere ๆžถๆž„็š„ NVIDIA GPU๏ผŒไพ‹ๅฆ‚ GeForce 30 series ไปฅๅŠ NVIDIA A100๏ผŒCUDA 11 ๆ˜ฏๅฟ…้œ€็š„ใ€‚ +- ๅฏนไบŽๆ›ดๆ—ฉ็š„ NVIDIA GPU๏ผŒCUDA 11 ๆ˜ฏๅ‘ๅ‰ๅ…ผๅฎน็š„๏ผŒไฝ† CUDA 10.2 
่ƒฝๅคŸๆไพ›ๆ›ดๅฅฝ็š„ๅ…ผๅฎนๆ€ง๏ผŒไนŸๆ›ดๅŠ ่ฝป้‡ใ€‚ + +่ฏท็กฎไฟ GPU ้ฉฑๅŠจ็จ‹ๅบๆปก่ถณๆœ€ไฝŽ็‰ˆๆœฌ่ฆๆฑ‚ใ€‚ๆœ‰ๅ…ณๆ›ดๅคšไฟกๆฏ๏ผŒ่ฏทๅ‚่ง[ๆญค่กจๆ ผ](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)ใ€‚ + +```{note} +ๅฆ‚ๆžœๆŒ‰็…งๆˆ‘ไปฌ็š„ๆœ€ไฝณๅฎž่ทต่ฟ›่กŒๅฎ‰่ฃ…๏ผŒไป…ๅฎ‰่ฃ… CUDA ่ฟ่กŒๆ—ถๅบ“ๅฐฑ่ถณๅคŸไบ†๏ผŒๅ› ไธบไธไผšๅœจๆœฌๅœฐ็ผ–่ฏ‘ไปปไฝ• CUDA ไปฃ็ ใ€‚็„ถ่€Œ๏ผŒๅฆ‚ๆžœๆ‚จๅธŒๆœ›ไปŽๆบไปฃ็ ็ผ–่ฏ‘ MMCV ๆˆ–ๅผ€ๅ‘ๅ…ถไป– CUDA ่ฟ็ฎ—็ฌฆ๏ผŒๆ‚จ้œ€่ฆไปŽ NVIDIA ็š„[็ฝ‘็ซ™](https://developer.nvidia.com/cuda-downloads)ๅฎ‰่ฃ…ๅฎŒๆ•ด็š„ CUDA ๅทฅๅ…ทๅŒ…๏ผŒๅนถไธ”ๅ…ถ็‰ˆๆœฌๅบ”ไธŽ PyTorch ็š„ CUDA ็‰ˆๆœฌๅŒน้…๏ผŒๅณ `conda install` ๅ‘ฝไปคไธญๆŒ‡ๅฎš็š„ cudatoolkit ็š„็‰ˆๆœฌใ€‚ +``` + +### ไธไฝฟ็”จ MIM ๅฎ‰่ฃ… MMCV + +MMCV ๅŒ…ๅซ C++ ๅ’Œ CUDA ๆ‰ฉๅฑ•๏ผŒๅ› ๆญคๅฎƒไธŽ PyTorch ็š„ๅ…ณ็ณปๆฏ”่พƒๅคๆ‚ใ€‚MIM ๅฏไปฅ่‡ชๅŠจ่งฃๅ†ณ่ฟ™ไบ›ไพ่ต–ๅ…ณ็ณป๏ผŒไฝฟๅฎ‰่ฃ…ๅ˜ๅพ—ๆ›ดๅŠ ๅฎนๆ˜“ใ€‚ไฝ†่ฟ™ไธๆ˜ฏๅฟ…้กป็š„ใ€‚ + +ๅฆ‚ๆžœๆ‚จๅธŒๆœ›ไฝฟ็”จ pip ่€Œไธๆ˜ฏ MIM ๅฎ‰่ฃ… MMCV๏ผŒ่ฏทๅ‚่€ƒ[MMCV ๅฎ‰่ฃ…ๆŒ‡ๅ—](https://mmcv.readthedocs.io/en/latest/get_started/installation.html)ใ€‚่ฟ™้œ€่ฆๆ‰‹ๅŠจๆŒ‡ๅฎšๅŸบไบŽ PyTorch ็‰ˆๆœฌๅ’Œๅ…ถ CUDA ็‰ˆๆœฌ็š„ find-urlใ€‚ + +ไพ‹ๅฆ‚๏ผŒไปฅไธ‹ๅ‘ฝไปคๅฎ‰่ฃ…ไบ†ไธบ PyTorch 1.10.x ๅ’Œ CUDA 11.3 ๆž„ๅปบ็š„ mmcvใ€‚ + +```shell +pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +``` + +### ๅœจ CPU ็Žฏๅขƒไธญๅฎ‰่ฃ… + +MMAction2 ๅฏไปฅไป…ๅœจ CPU ็Žฏๅขƒไธญๅฎ‰่ฃ…ใ€‚ๅœจ CPU ๆจกๅผไธ‹๏ผŒไฝ ๅฏไปฅๅฎŒๆˆ่ฎญ็ปƒใ€ๆต‹่ฏ•ๅ’Œๆจกๅž‹ๆŽจ็†็ญ‰ๆ‰€ๆœ‰ๆ“ไฝœใ€‚ + +ๅœจ CPU ๆจกๅผไธ‹๏ผŒMMCV ็š„้ƒจๅˆ†ๅŠŸ่ƒฝๅฐ†ไธๅฏ็”จ๏ผŒ้€šๅธธๆ˜ฏไธ€ไบ› GPU ็ผ–่ฏ‘็š„็ฎ—ๅญใ€‚ไธ่ฟ‡ไธ็”จๆ‹…ๅฟƒ๏ผŒ MMAction2 ไธญๅ‡ ไนŽๆ‰€ๆœ‰็š„ๆจกๅž‹้ƒฝไธไผšไพ่ต–่ฟ™ไบ›็ฎ—ๅญใ€‚ + +### ้€š่ฟ‡ Docker ไฝฟ็”จ MMAction2 + +ๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไธช[Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/docker/Dockerfile)ๆฅๆž„ๅปบ้•œๅƒใ€‚็กฎไฟๆ‚จ็š„[docker ็‰ˆๆœฌ](https://docs.docker.com/engine/install/) >=19.03ใ€‚ + +```shell +# ๆž„ๅปบไธ€ไธชๅŸบไบŽ PyTorch 1.6.0ใ€CUDA 10.1 ๅ’Œ CUDNN 7 ็š„้•œๅƒใ€‚ +# ๅฆ‚ๆžœๆ‚จๅ–œๆฌขๅ…ถไป–็‰ˆๆœฌ๏ผŒ่ฏทไฟฎๆ”น Dockerfileใ€‚ +docker build -f ./docker/Dockerfile --rm -t mmaction2 . +``` + +ไฝฟ็”จไปฅไธ‹ๅ‘ฝไปค่ฟ่กŒๅฎƒ๏ผš + +```shell +# ไพ‹ๅฆ‚ๆž„ๅปบPyTorch 1.6.0, CUDA 10.1, CUDNN 7็š„้•œๅƒ +# ๅฆ‚ๆžœไฝ ๅ–œๆฌขๅ…ถไป–็‰ˆๆœฌ,ๅช่ฆไฟฎๆ”นDockerfile +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2 +``` + +## ๆ•…้šœๆŽ’้™ค + +1. 
ๅฝ“ไปŽๆ—ง็‰ˆๆœฌ `0.x` ่ฟ็งปๅˆฐๆ–ฐ็‰ˆๆœฌ `1.x` ๆ—ถ๏ผŒๆ‚จๅฏ่ƒฝไผš้‡ๅˆฐไพ่ต–ๅบ“็‰ˆๆœฌไธๅŒน้…็š„้—ฎ้ข˜ใ€‚ไธ‹้ขๆ˜ฏๅœจๆŒ‰็…งไธŠ่ฟฐๅฎ‰่ฃ…่ฟ‡็จ‹ๆ‰ง่กŒๅŽ๏ผŒ้€š่ฟ‡ `pip list` ๅ‘ฝไปคๆ˜พ็คบ็š„ๆฏไธชไพ่ต–ๅบ“็š„็‰ˆๆœฌใ€‚่ฏท็กฎไฟๅœจ็ปˆ็ซฏไธญๆ˜พ็คบ็š„ๆฏไธชไพ่ต–ๅบ“็‰ˆๆœฌ้ƒฝๅคงไบŽๆˆ–็ญ‰ไบŽ๏ผˆๅณ `>=`๏ผ‰ไธ‹้ขๆฏไธชไพ่ต–ๅบ“็š„็‰ˆๆœฌใ€‚ + +```shell +mmaction2 1.0.0 +mmcv 2.0.0 +mmdet 3.0.0 +mmengine 0.7.2 +mmpose 1.0.0 +``` diff --git a/docs/zh_cn/get_started/overview.md b/docs/zh_cn/get_started/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..74e72027e69b1157443ce81f3a81cddc1bb16e76 --- /dev/null +++ b/docs/zh_cn/get_started/overview.md @@ -0,0 +1,97 @@ +# ๆฆ‚่ฟฐ + +## ไป€ไนˆๆ˜ฏ MMAction2 + +MMAction2 ๆ˜ฏไธ€ไธชๅŸบไบŽ PyTorch ็š„ๅผ€ๆบๅทฅๅ…ทๅŒ…๏ผŒๆ”ฏๆŒไบ†ๅคง้‡็š„่ง†้ข‘็†่งฃๆจกๅž‹๏ผŒๅŒ…ๆ‹ฌ**่กŒไธบ่ฏ†ๅˆซใ€ๅŸบไบŽ้ชจๆžถ็š„่กŒไธบ่ฏ†ๅˆซใ€ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๅ’Œๆ—ถๅบๅŠจไฝœๅฎšไฝ**็ญ‰ๅคšไธชไธป่ฆๆ–นๅ‘ใ€‚ๅฎƒ่ฟ˜ๆ”ฏๆŒไบ†ๅคงๅคšๆ•ฐๆต่กŒ็š„ๅญฆๆœฏๆ•ฐๆฎ้›†๏ผŒๅนถๆไพ›ไบ†่ฎธๅคšๅฎž็”จๅทฅๅ…ทๅธฎๅŠฉ็”จๆˆทๅฏนๆ•ฐๆฎ้›†ๅ’Œๆจกๅž‹่ฟ›่กŒๅคšๆ–น้ข็š„ๆŽข็ดขๅ’Œ่ฐƒ่ฏ•ใ€‚ๅฎƒๅ…ทๆœ‰ไปฅไธ‹็‰น็‚น๏ผš + +**ๅ…จๆต็จ‹๏ผŒๅคšๆจกๅž‹**๏ผšMMAction2 ๆ”ฏๆŒๅ„็ง่ง†้ข‘็†่งฃไปปๅŠก๏ผŒๅฎž็Žฐไบ†ๆœ€ๅ…ˆ่ฟ›็š„่กŒไธบ่ฏ†ๅˆซใ€ๅฎšไฝใ€ๆฃ€ๆต‹ๆจกๅž‹ใ€‚ + +**ๆจกๅ—ๅŒ–่ฎพ่ฎก**๏ผšMMAction2 ็š„ๆจกๅ—ๅŒ–่ฎพ่ฎกไฝฟ็”จๆˆทๅฏไปฅๆ นๆฎ้œ€่ฆๅฎšไน‰ๅ’Œ้‡็”จๆจกๅž‹ไธญ็š„ๆจกๅ—ใ€‚ + +**ๅฎž็”จๅทฅๅ…ทไผ—ๅคš**๏ผšMMAction2 ๆไพ›ไบ†ไธ€็ณปๅˆ—็š„ๅˆ†ๆžๅทฅๅ…ท๏ผŒๅฆ‚ๅฏ่ง†ๅŒ–ๅ™จใ€้ชŒ่ฏ่„šๆœฌใ€่ฏ„ไผฐๅ™จ็ญ‰๏ผŒไปฅๅธฎๅŠฉ็”จๆˆท่ฟ›่กŒๆ•…้šœๆŽ’้™คใ€ๅพฎ่ฐƒๆˆ–ๆฏ”่พƒๆจกๅž‹ใ€‚ + +**็”ฑ OpenMMLab ๅผบๅŠ›้ฉฑๅŠจ**๏ผšไธŽๅฎถๆ—ๅ†…็š„ๅ…ถๅฎƒ็ฎ—ๆณ•ๅบ“ไธ€ๆ ท๏ผŒMMAction2 ้ตๅพช็€ OpenMMLab ไธฅ่ฐจ็š„ๅผ€ๅ‘ๅ‡†ๅˆ™ๅ’ŒๆŽฅๅฃ็บฆๅฎš๏ผŒๆžๅคงๅœฐ้™ไฝŽไบ†็”จๆˆทๅˆ‡ๆขๅ„็ฎ—ๆณ•ๅบ“ๆ—ถ็š„ๅญฆไน ๆˆๆœฌใ€‚ๅŒๆ—ถ๏ผŒMMAction2 ไนŸๅฏไปฅ้žๅธธไพฟๆทๅœฐไธŽๅฎถๆ—ๅ†…ๅ…ถไป–็ฎ—ๆณ•ๅบ“่ทจๅบ“่”ๅŠจ๏ผŒไปŽ่€Œๆปก่ถณ็”จๆˆท่ทจ้ข†ๅŸŸ็ ”็ฉถๅ’Œ่ฝๅœฐ็š„้œ€ๆฑ‚ใ€‚ + + + + +
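+ไฝœไธบไธ€ไธช็คบๆ„๏ผŒไธŠ่ฟฐโ€œๆจกๅ—ๅŒ–่ฎพ่ฎกโ€ๅœจ้…็ฝฎๆ–‡ไปถไธญๅคง่‡ดไฝ“็Žฐไธบไธ‹้ข่ฟ™ๆ ท็š„็ป“ๆž„๏ผš่ฏ†ๅˆซๅ™จ็”ฑ backboneใ€cls_head ๅ’Œ data_preprocessor ็ญ‰ๅฏๆ›ฟๆข็š„ๆจกๅ—ๅœจ้…็ฝฎไธญ็ป„ๅˆ่€Œๆˆใ€‚ไธ‹้ข็š„็‰‡ๆฎตๆ˜ฏไปฅ TSN ้…็ฝฎไธบๅŽŸๅž‹็š„็ฎ€ๅŒ–็คบไพ‹๏ผŒๅญ—ๆฎตๅ–ๅ€ผไป…ไพ›ๅ‚่€ƒ๏ผŒๅฎž้™…ๅ‚ๆ•ฐ่ฏทไปฅ `configs/_base_/models/tsn_r50.py` ็ญ‰ๅฎ˜ๆ–น้…็ฝฎๆ–‡ไปถไธบๅ‡†ใ€‚
+
+```python
+# ็คบๆ„๏ผšไธ€ไธช่ฏ†ๅˆซๅ™จๅœจ้…็ฝฎไธญ็”ฑๅ„ไธชๅฏๆ›ฟๆข็š„ๆจกๅ—็ป„ๅˆ่€Œๆˆ๏ผˆ็ฎ€ๅŒ–ๅ†™ๆณ•๏ผŒไป…ไพ›ๅ‚่€ƒ๏ผ‰
+model = dict(
+    type='Recognizer2D',                     # ่ฏ†ๅˆซๅ™จ
+    backbone=dict(type='ResNet', depth=50),  # ็‰นๅพๆๅ–ไธปๅนฒ็ฝ‘็ปœ
+    cls_head=dict(                           # ๅˆ†็ฑปๅคด
+        type='TSNHead', num_classes=400, in_channels=2048),
+    data_preprocessor=dict(                  # ๆ‰นๅค„็†ไธŽๅฝ’ไธ€ๅŒ–
+        type='ActionDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375]))
+```
+
+ๅช้œ€ๆ›ฟๆขๅ…ถไธญๆŸไธ€ไธชๆจกๅ—๏ผˆไพ‹ๅฆ‚ๆขไธ€ไธชๅทฒๆณจๅ†Œ็š„ backbone ๆˆ– head๏ผ‰ๅณๅฏๅพ—ๅˆฐๆ–ฐ็š„ๆจกๅž‹๏ผŒๅ…ถไฝ™้ƒจๅˆ†ๆ— ้œ€ไฟฎๆ”นใ€‚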
+๏ผˆๆผ”็คบๅŠจๅ›พ๏ผš่กŒไธบ่ฏ†ๅˆซใ€ๅŸบไบŽ้ชจๆžถ็š„่กŒไธบ่ฏ†ๅˆซใ€ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹๏ผ‰
+ +## ๅฆ‚ไฝ•ไฝฟ็”จๆ–‡ๆกฃ + +้’ˆๅฏนไธๅŒ็ฑปๅž‹็š„็”จๆˆท๏ผŒๆˆ‘ไปฌๅ‡†ๅค‡ไบ†่ฏฆ็ป†็š„ๆŒ‡ๅ—๏ผš + +
+**MMAction2 ็š„ๅŸบ็ก€็”จๆณ•**
+
+- [ๅฎ‰่ฃ…](installation.md)
+- [ๅฟซ้€Ÿ่ฟ่กŒ](quick_run.md)
+- [ๅˆฉ็”จ็Žฐๆœ‰ๆจกๅž‹่ฟ›่กŒๆŽจ็†](../user_guides/inference.md)
+
+**ๅ…ณไบŽๅœจๅทฒๆ”ฏๆŒ็š„ๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒ่ฎญ็ปƒ**
+
+- [ไบ†่งฃ้…็ฝฎๆ–‡ไปถ](../user_guides/config.md)
+- [ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](../user_guides/prepare_dataset.md)
+- [่ฎญ็ปƒไธŽๆต‹่ฏ•](../user_guides/train_test.md)
+
+**ๅ…ณไบŽไฝฟ็”จ่ฟ‡็จ‹ไธญ็š„ๅธธ่ง้—ฎ้ข˜**
+
+- [ๅธธ่ง้—ฎ้ข˜่งฃ็ญ”](faq.md)
+- [ๆœ‰็”จ็š„ๅทฅๅ…ท](../useful_tools.md)
+
+**ๅ…ณไบŽ MMAction2 ็š„ๆก†ๆžถ่ฎพ่ฎก**
+
+- [20ๅˆ†้’Ÿ MMAction2 ๆก†ๆžถๆŒ‡ๅ—](guide_to_framework.md)
+- [MMAction2 ไธญ็š„ๆ•ฐๆฎๆต](../advanced_guides/dataflow.md)
+
+**ๅ…ณไบŽ่‡ชๅฎšไน‰่ฎญ็ปƒ็š„้ซ˜็บง็”จๆณ•**
+
+- [่‡ชๅฎšไน‰ๆจกๅž‹](../advanced_guides/customize_models.md)
+- [่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](../advanced_guides/customize_dataset.md)
+- [่‡ชๅฎšไน‰ๆ•ฐๆฎ็ฎก้“](../advanced_guides/customize_pipeline.md)
+- [่‡ชๅฎšไน‰ไผ˜ๅŒ–ๅ™จ](../advanced_guides/customize_optimizer.md)
+- [่‡ชๅฎšไน‰ๆ—ฅๅฟ—่ฎฐๅฝ•](../advanced_guides/customize_logging.md)
+
+**ๅ…ณไบŽๆ”ฏๆŒ็š„ๆจกๅž‹ๅบ“ๅ’Œๆ•ฐๆฎ้›†**
+
+- [ๆจกๅž‹ๅบ“](../modelzoo_statistics.md)
+- [ๆ•ฐๆฎ้›†](../datasetzoo_statistics.md)
+
+**ๅ…ณไบŽไปŽ MMAction2 0.x ่ฟ็งป**
+
+- [ไปŽ MMAction2 0.x ่ฟ็งป](../migration.md)
+
+**ๅฏนไบŽๅธŒๆœ›ๅŠ ๅ…ฅๅผ€ๆบ็คพๅŒบ๏ผŒๅ‘ MMAction2 ่ดก็Œฎไปฃ็ ็š„็ ”็ฉถ่€…ๅ’Œๅผ€ๅ‘่€…**
+
+- [ๅฆ‚ไฝ•ไธบ MMAction2 ๅšๅ‡บ่ดก็Œฎ](contribution_guide.md)
diff --git a/docs/zh_cn/get_started/quick_run.md b/docs/zh_cn/get_started/quick_run.md new file mode 100644 index 0000000000000000000000000000000000000000..b7faa7f92f2180ec0bfbfad0579de371bfd51d1f --- /dev/null +++ b/docs/zh_cn/get_started/quick_run.md @@ -0,0 +1,219 @@ +# ๅฟซ้€Ÿ่ฟ่กŒ + +ๆœฌ็ซ ๅฐ†ไป‹็ป MMAction2 ็š„ๅŸบๆœฌๅŠŸ่ƒฝใ€‚ๆˆ‘ไปฌๅ‡่ฎพไฝ ๅทฒ็ป[ๆบ็ ๅฎ‰่ฃ… MMAction2](installation.md#best-practices)ใ€‚ + +- [ๅฟซ้€Ÿ่ฟ่กŒ](#ๅฟซ้€Ÿ่ฟ่กŒ) + - [ๆŽจ็†](#ๆŽจ็†) + - [ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](#ๅ‡†ๅค‡ๆ•ฐๆฎ้›†) + - [ไฟฎๆ”น้…็ฝฎ](#ไฟฎๆ”น้…็ฝฎ) + - [ไฟฎๆ”นๆ•ฐๆฎ้›†](#ไฟฎๆ”นๆ•ฐๆฎ้›†) + - [ไฟฎๆ”น่ฟ่กŒ้…็ฝฎ](#ไฟฎๆ”น่ฟ่กŒ้…็ฝฎ) + - [ไฟฎๆ”นๆจกๅž‹้…็ฝฎ](#ไฟฎๆ”นๆจกๅž‹้…็ฝฎ) + - [ๆต่งˆๆ•ฐๆฎ้›†](#ๆต่งˆๆ•ฐๆฎ้›†) + - [่ฎญ็ปƒ](#่ฎญ็ปƒ) + - [ๆต‹่ฏ•](#ๆต‹่ฏ•) + +## ๆŽจ็† + +ๅœจ MMAction2 ็š„ๆ น็›ฎๅฝ•ไธ‹ๆ‰ง่กŒๅฆ‚ไธ‹ๅ‘ฝไปค: + +```shell +python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn --print-result \ + --label-file tools/data/kinetics/label_map_k400.txt +``` + +ๆ‚จๅบ”่ฏฅ่ƒฝๅคŸ็œ‹ๅˆฐๅผนๅ‡บ็š„่ง†้ข‘็ช—ๅฃ๏ผŒๅ’ŒๅœจๆŽงๅˆถๅฐไธญๆ‰“ๅฐ็š„ๆŽจๆ–ญ็ป“ๆžœใ€‚ + +
+ +```bash +# ๆŽจ็†็ป“ๆžœ +{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]} +``` + +```{note} +ๅฆ‚ๆžœๆ‚จๅœจๆฒกๆœ‰ GUI ็š„ๆœๅŠกๅ™จไธŠ่ฟ่กŒ MMAction2๏ผŒๆˆ–่€…้€š่ฟ‡็ฆ็”จ X11 ่ฝฌๅ‘็š„ SSH ้šง้“่ฟ่กŒ MMAction2๏ผŒๅˆ™ๅฏ่ƒฝไธไผš็œ‹ๅˆฐๅผนๅ‡บ็ช—ๅฃใ€‚ +``` + +ๅ…ณไบŽ MMAction2 ๆŽจ็†ๆŽฅๅฃ็š„่ฏฆ็ป†ๆ่ฟฐๅฏไปฅๅœจ[่ฟ™้‡Œ](/demo/README.md#inferencer)ๆ‰พๅˆฐ. + +้™คไบ†ไฝฟ็”จๆˆ‘ไปฌๆไพ›็š„้ข„่ฎญ็ปƒๆจกๅž‹๏ผŒๆ‚จ่ฟ˜ๅฏไปฅๅœจ่‡ชๅทฑ็š„ๆ•ฐๆฎ้›†ไธŠ่ฎญ็ปƒๆจกๅž‹ใ€‚ๅœจไธ‹ไธ€่Š‚ไธญ๏ผŒๆˆ‘ไปฌๅฐ†้€š่ฟ‡ๅœจ็ฒพ็ฎ€็‰ˆ [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) ๆ•ฐๆฎ้›†ไธŠ่ฎญ็ปƒ TSN ไธบไพ‹๏ผŒๅธฆๆ‚จไบ†่งฃ MMAction2 ็š„ๅŸบๆœฌๅŠŸ่ƒฝใ€‚ + +## ๅ‡†ๅค‡ๆ•ฐๆฎ้›† + +็”ฑไบŽ่ง†้ข‘ๆ•ฐๆฎ้›†ๆ ผๅผ็š„ๅคšๆ ทๆ€งไธๅˆฉไบŽๆ•ฐๆฎ้›†็š„ๅˆ‡ๆข๏ผŒMMAction2 ๆๅ‡บไบ†็ปŸไธ€็š„[ๆ•ฐๆฎๆ ผๅผ](../user_guides/prepare_dataset.md) ๏ผŒๅนถไธบๅธธ็”จ็š„่ง†้ข‘ๆ•ฐๆฎ้›†ๆไพ›ไบ†[ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆŒ‡ๅ—](../user_guides/data_prepare/dataset_prepare.md)ใ€‚้€šๅธธ๏ผŒ่ฆๅœจ MMAction2 ไธญไฝฟ็”จ่ฟ™ไบ›ๆ•ฐๆฎ้›†๏ผŒไฝ ๅช้œ€่ฆๆŒ‰็…งๆญฅ้ชค่ฟ›่กŒๅ‡†ๅค‡ใ€‚ + +```{็ฌ”่ฎฐ} +ไฝ†ๅœจ่ฟ™้‡Œ๏ผŒๆ•ˆ็އๆ„ๅ‘ณ็€ไธ€ๅˆ‡ใ€‚ +``` + +้ฆ–ๅ…ˆ๏ผŒ่ฏทไธ‹่ฝฝๆˆ‘ไปฌ้ข„ๅ…ˆๅ‡†ๅค‡ๅฅฝ็š„ [kinetics400_tiny.zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) ๏ผŒๅนถๅฐ†ๅ…ถ่งฃๅŽ‹ๅˆฐ MMAction2 ๆ น็›ฎๅฝ•ไธ‹็š„ `data/` ็›ฎๅฝ•ใ€‚่ฟ™ๅฐ†ไธบๆ‚จๆไพ›ๅฟ…่ฆ็š„่ง†้ข‘ๅ’Œๆณจ้‡Šๆ–‡ไปถใ€‚ + +```Bash +wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip +mkdir -p data/ +unzip kinetics400_tiny.zip -d data/ +``` + +## ไฟฎๆ”น้…็ฝฎ + +ๅ‡†ๅค‡ๅฅฝๆ•ฐๆฎ้›†ไน‹ๅŽ๏ผŒไธ‹ไธ€ๆญฅๆ˜ฏไฟฎๆ”น้…็ฝฎๆ–‡ไปถ๏ผŒไปฅๆŒ‡ๅฎš่ฎญ็ปƒ้›†ๅ’Œ่ฎญ็ปƒๅ‚ๆ•ฐ็š„ไฝ็ฝฎใ€‚ + +ๅœจๆœฌไพ‹ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ไฝฟ็”จ resnet50 ไฝœไธบไธปๅนฒ็ฝ‘็ปœๆฅ่ฎญ็ปƒ TSNใ€‚็”ฑไบŽ MMAction2 ๅทฒ็ปๆœ‰ไบ†ๅฎŒๆ•ด็š„ Kinetics400 ๆ•ฐๆฎ้›†็š„้…็ฝฎๆ–‡ไปถ (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`)๏ผŒๆˆ‘ไปฌๅช้œ€่ฆๅœจๅ…ถๅŸบ็ก€ไธŠ่ฟ›่กŒไธ€ไบ›ไฟฎๆ”นใ€‚ + +### ไฟฎๆ”นๆ•ฐๆฎ้›† + +ๆˆ‘ไปฌ้ฆ–ๅ…ˆ้œ€่ฆไฟฎๆ”นๆ•ฐๆฎ้›†็š„่ทฏๅพ„ใ€‚ๆ‰“ๅผ€ `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ๏ผŒๆŒ‰ๅฆ‚ไธ‹ๆ›ฟๆขๅ…ณ้”ฎๅญ—: + +```Python +data_root = 'data/kinetics400_tiny/train' +data_root_val = 'data/kinetics400_tiny/val' +ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt' +ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt' +``` + +### ไฟฎๆ”น่ฟ่กŒ้…็ฝฎ + +ๆญคๅค–๏ผŒ็”ฑไบŽๆ•ฐๆฎ้›†็š„ๅคงๅฐๅ‡ๅฐ‘๏ผŒๆˆ‘ไปฌๅปบ่ฎฎๅฐ†่ฎญ็ปƒๆ‰นๅคงๅฐๅ‡ๅฐ‘ๅˆฐ4ไธช๏ผŒ่ฎญ็ปƒepoch็š„ๆ•ฐ้‡็›ธๅบ”ๅ‡ๅฐ‘ๅˆฐ10ไธชใ€‚ๆญคๅค–๏ผŒๆˆ‘ไปฌๅปบ่ฎฎๅฐ†้ชŒ่ฏๅ’Œๆƒๅ€ผๅญ˜ๅ‚จ้—ด้š”็ผฉ็Ÿญไธบ1่ฝฎ๏ผŒๅนถไฟฎๆ”นๅญฆไน ็އ่กฐๅ‡็ญ–็•ฅใ€‚ไฟฎๆ”น `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ไธญๅฏนๅบ”็š„ๅ…ณ้”ฎๅญ—๏ผŒๅฆ‚ไธ‹ๆ‰€็คบ็”Ÿๆ•ˆใ€‚ + +```python +# ่ฎพ็ฝฎ่ฎญ็ปƒๆ‰นๅคงๅฐไธบ 4 +train_dataloader['batch_size'] = 4 + +# ๆฏ่ฝฎ้ƒฝไฟๅญ˜ๆƒ้‡๏ผŒๅนถไธ”ๅชไฟ็•™ๆœ€ๆ–ฐ็š„ๆƒ้‡ +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1)) +# ๅฐ†ๆœ€ๅคง epoch ๆ•ฐ่ฎพ็ฝฎไธบ 10๏ผŒๅนถๆฏ 1 ไธช epoch้ชŒ่ฏๆจกๅž‹ +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +#ๆ นๆฎ 10 ไธช epoch่ฐƒๆ•ดๅญฆไน ็އ่ฐƒๅบฆ +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[4, 8], + gamma=0.1) +] +``` + +### ไฟฎๆ”นๆจกๅž‹้…็ฝฎ + +ๆญคๅค–๏ผŒ็”ฑไบŽ็ฒพ็ฎ€็‰ˆ Kinetics ๆ•ฐๆฎ้›†่ง„ๆจก่พƒๅฐ๏ผŒๅปบ่ฎฎๅŠ ่ฝฝๅŽŸๅง‹ Kinetics ๆ•ฐๆฎ้›†ไธŠ็š„้ข„่ฎญ็ปƒๆจกๅž‹ใ€‚ๆญคๅค–๏ผŒๆจกๅž‹้œ€่ฆๆ นๆฎๅฎž้™…็ฑปๅˆซๆ•ฐ่ฟ›่กŒไฟฎๆ”นใ€‚่ฏท็›ดๆŽฅๅฐ†ไปฅไธ‹ไปฃ็ ๆทปๅŠ ๅˆฐ 
`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ไธญใ€‚ + +```python +model = dict( + cls_head=dict(num_classes=2)) +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' +``` + +ๅœจ่ฟ™้‡Œ๏ผŒๆˆ‘ไปฌ็›ดๆŽฅ้€š่ฟ‡็ปงๆ‰ฟ ({external+mmengine:doc} `MMEngine: Config `) ๆœบๅˆถ้‡ๅ†™ไบ†ๅŸบๆœฌ้…็ฝฎไธญ็š„็›ธๅบ”ๅ‚ๆ•ฐใ€‚ๅŽŸๅง‹ๅญ—ๆฎตๅˆ†ๅธƒๅœจ `configs/_base_/models/tsn_r50.py`ใ€`configs/_base_/schedules/sgd_100e.py` ๅ’Œ `configs/_base_/default_runtime.py`ไธญใ€‚ + +```{note} +ๅ…ณไบŽ้…็ฝฎ็š„ๆ›ด่ฏฆ็ป†็š„ๆ่ฟฐ๏ผŒ่ฏทๅ‚่€ƒ[่ฟ™้‡Œ](../user_guides/config.md)ใ€‚ +``` + +## ๆต่งˆๆ•ฐๆฎ้›† + +ๅœจๅผ€ๅง‹่ฎญ็ปƒไน‹ๅ‰๏ผŒๆˆ‘ไปฌ่ฟ˜ๅฏไปฅๅฐ†่ฎญ็ปƒๆ—ถๆ•ฐๆฎ่ฝฌๆขๅค„็†็š„ๅธงๅฏ่ง†ๅŒ–ใ€‚่ฟ™ๅพˆ็ฎ€ๅ•๏ผšไผ ้€’ๆˆ‘ไปฌ้œ€่ฆๅฏ่ง†ๅŒ–็š„้…็ฝฎๆ–‡ไปถๅˆฐ [browse_dataset.py](/tools/analysis_tools/browse_dataset.py)่„šๆœฌไธญใ€‚ + +```Bash +python tools/visualizations/browse_dataset.py \ + configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + browse_out --mode pipeline +``` + +่ฝฌๆขๅŽ็š„่ง†้ข‘ๅฐ†่ขซไฟๅญ˜ๅˆฐ `browse_out` ๆ–‡ไปถๅคนไธญใ€‚ + +
+ +
+ +```{note} +ๆœ‰ๅ…ณ่ฏฅ่„šๆœฌ็š„ๅ‚ๆ•ฐๅ’Œไฝฟ็”จๆ–นๆณ•็š„่ฏฆ็ป†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒ[่ฟ™้‡Œ](../user_guides/useful_tools.md)ใ€‚ +``` + +```{tip} +้™คไบ†ๆปก่ถณๆˆ‘ไปฌ็š„ๅฅฝๅฅ‡ๅฟƒ๏ผŒๅฏ่ง†ๅŒ–่ฟ˜ๅฏไปฅๅธฎๅŠฉๆˆ‘ไปฌๅœจ่ฎญ็ปƒๅ‰ๆฃ€ๆŸฅๅฏ่ƒฝๅฝฑๅ“ๆจกๅž‹ๆ€ง่ƒฝ็š„้ƒจๅˆ†๏ผŒไพ‹ๅฆ‚้…็ฝฎใ€ๆ•ฐๆฎ้›†ๅ’Œๆ•ฐๆฎ่ฝฌๆขไธญ็š„้—ฎ้ข˜ใ€‚ +``` + +ๆˆ‘ไปฌๅฏไปฅ้€š่ฟ‡ไปฅไธ‹่„šๆœฌ่ฟ›ไธ€ๆญฅๅฏ่ง†ๅŒ–ๅญฆไน ็އ่ฐƒๅบฆ๏ผŒไปฅ็กฎไฟ้…็ฝฎ็ฌฆๅˆ้ข„ๆœŸ: + +```Bash +python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +่ฎญ็ปƒๅญฆไน ็އๆ—ถ้—ด่กจๅฐ†ๆ˜พ็คบๅœจๅผนๅ‡บ็ช—ๅฃไธญใ€‚ + +
+ +
+ +```{note} +ๅญฆไน ็އๆ นๆฎๅฎž้™…ๆ‰นๆ•ฐๆฎๅคงๅฐ่‡ชๅŠจ็ผฉๆ”พใ€‚ +``` + +## ่ฎญ็ปƒ + +่ฟ่กŒๅฆ‚ไธ‹ๅ‘ฝไปคๅฏๅŠจ่ฎญ็ปƒ: + +```Bash +python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +ๆ นๆฎ็ณป็ปŸ็Žฏๅขƒ๏ผŒMMAction2 ๅฐ†่‡ชๅŠจไฝฟ็”จๆœ€ไฝณ่ฎพๅค‡่ฟ›่กŒ่ฎญ็ปƒใ€‚ๅฆ‚ๆžœๆœ‰GPU๏ผŒๅˆ™้ป˜่ฎคๅฏๅŠจๅ•ไธชGPU่ฎญ็ปƒใ€‚ๅฝ“ไฝ ๅผ€ๅง‹็œ‹ๅˆฐ loss ็š„่พ“ๅ‡บๆ—ถ๏ผŒๅฐฑ่ฏดๆ˜Žไฝ ๅทฒ็ปๆˆๅŠŸๅฏๅŠจไบ†่ฎญ็ปƒใ€‚ + +```Bash +03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267 +03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130 +03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900 +03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs +03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658 +03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth. 
+``` + +ๅœจๆฒกๆœ‰้ขๅค–้…็ฝฎ็š„ๆƒ…ๅ†ตไธ‹๏ผŒๆจกๅž‹ๆƒ้‡ๅฐ†่ขซไฟๅญ˜ๅˆฐ `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`๏ผŒ่€Œๆ—ฅๅฟ—ๅฐ†่ขซๅญ˜ๅ‚จๅˆฐ `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`ใ€‚ๆŽฅไธ‹ๆฅ๏ผŒๆˆ‘ไปฌๅช้œ€่ฆ่€ๅฟƒ็ญ‰ๅพ…่ฎญ็ปƒๅฎŒๆˆใ€‚ + +```{note} +่ฎญ็ปƒ็š„้ซ˜็บง็”จๆณ•๏ผŒๅฆ‚ CPU ่ฎญ็ปƒใ€ๅคšๅก่ฎญ็ปƒๅŠ้›†็พค่ฎญ็ปƒ๏ผŒ่ฏทๅ‚่€ƒ[training and Testing](../user_guides/train_test.md) +``` + +## ๆต‹่ฏ• + +็ป่ฟ‡ 10 ไธช epoch ๅŽ๏ผŒๆˆ‘ไปฌ่ง‚ๅฏŸๅˆฐ TSN ๅœจ็ฌฌ 6 ไธช epoch ่กจ็Žฐๆœ€ๅฅฝ๏ผŒ`acc/top1` ่พพๅˆฐ1.0000: + +```Bash +03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091 +``` + +```{note} +็”ฑไบŽๅœจๅŽŸๅง‹ Kinetics400 ไธŠ่ฟ›่กŒไบ†้ข„่ฎญ็ปƒ๏ผŒ็ป“ๆžœ้žๅธธ้ซ˜๏ผŒๆ‚จๅฏ่ƒฝไผš็œ‹ๅˆฐไธๅŒ็š„็ป“ๆžœ +``` + +็„ถ่€Œ๏ผŒ่ฏฅๅ€ผไป…ๅๆ˜ ไบ† TSN ๅœจ็ฒพ็ฎ€็‰ˆ Kinetics ๆ•ฐๆฎ้›†ไธŠ็š„้ชŒ่ฏๆ€ง่ƒฝ๏ผŒ่€Œๆต‹่ฏ•็ป“ๆžœ้€šๅธธๆ›ด้ซ˜๏ผŒๅ› ไธบๅœจๆต‹่ฏ•ๆ•ฐๆฎๆตๆฐด็บฟไธญๅขžๅŠ ไบ†ๆ›ดๅคš็š„ๆ•ฐๆฎๅขžๅผบใ€‚ + +ๅผ€ๅง‹ๆต‹่ฏ•๏ผš + +```Bash +python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth +``` + +ๅนถๅพ—ๅˆฐๅฆ‚ไธ‹่พ“ๅ‡บ: + +```Bash +03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795 +``` + +่ฏฅๆจกๅž‹ๅœจ่ฏฅๆ•ฐๆฎ้›†ไธŠๅฎž็Žฐไบ† 1.000 ็š„ top1 ๅ‡†็กฎ็އใ€‚ + +```{note} +ๆต‹่ฏ•็š„้ซ˜็บง็”จๆณ•๏ผŒๅฆ‚CPUๆต‹่ฏ•ใ€ๅคšgpuๆต‹่ฏ•ใ€้›†็พคๆต‹่ฏ•๏ผŒ่ฏทๅ‚่€ƒ[Training and testing](../user_guides/train_test.md) +``` diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ab1afe741606f68ed26f43b79ec186d0db47dd91 --- /dev/null +++ b/docs/zh_cn/index.rst @@ -0,0 +1,96 @@ +ๆฌข่ฟŽๆฅๅˆฐ MMAction2 ไธญๆ–‡ๆ•™็จ‹! +===================================== + +You can switch between Chinese and English documents in the lower-left corner of the layout. + +.. toctree:: + :maxdepth: 1 + :caption: ๆ–ฐๆ‰‹ๅ…ฅ้—จ + + get_started/overview.md + get_started/installation.md + get_started/quick_run.md + get_started/guide_to_framework.md + get_started/contribution_guide.md + get_started/faq.md + +.. toctree:: + :maxdepth: 1 + :caption: ็”จๆˆทๆŒ‡ๅ— + + user_guides/inference.md + user_guides/config.md + user_guides/train_test.md + user_guides/prepare_dataset.md + user_guides/finetune.md + +.. toctree:: + :maxdepth: 1 + :caption: ่ฟ›้˜ถๆ•™็จ‹ + + advanced_guides/dataflow.md + advanced_guides/customize_models.md + advanced_guides/customize_dataset.md + advanced_guides/customize_pipeline.md + advanced_guides/customize_optimizer.md + advanced_guides/customize_logging.md + advanced_guides/deploy.md + useful_tools.md + + +.. toctree:: + :maxdepth: 1 + :caption: ๆจกๅž‹ๆ”ฏๆŒ + + modelzoo_statistics.md + model_zoo/recognition.md + model_zoo/recognition_audio.md + model_zoo/skeleton.md + model_zoo/detection.md + model_zoo/retrieval.md + model_zoo/localization.md + +.. toctree:: + :maxdepth: 1 + :caption: ๆ•ฐๆฎ้›†ๆ”ฏๆŒ + :glob: + + datasetzoo_statistics.md + dataset_zoo/* + +.. toctree:: + :maxdepth: 1 + :caption: ็›ธๅ…ณ้กน็›ฎ + + projectzoo.md + +.. toctree:: + :maxdepth: 1 + :caption: MMAction2 0.x ่ฟ็งปๆŒ‡ๅ— + + migration.md + +.. toctree:: + :maxdepth: 1 + :caption: API ๅ‚่€ƒๆ–‡ๆกฃ + + api.rst + +.. toctree:: + :maxdepth: 1 + :caption: ๅ…ถไป–่ฏดๆ˜Ž + + notes/ecosystem.md + +.. 
toctree:: + :caption: ๅˆ‡ๆข่ฏญ่จ€ + + switch_language.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/zh_cn/make.bat b/docs/zh_cn/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..2119f51099bf37e4fdb6071dce9f451ea44c62dd --- /dev/null +++ b/docs/zh_cn/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/zh_cn/migration.md b/docs/zh_cn/migration.md new file mode 100644 index 0000000000000000000000000000000000000000..37c5fb3172f369de1db6046f292eb44dc5ddbc2d --- /dev/null +++ b/docs/zh_cn/migration.md @@ -0,0 +1,488 @@ +# ไปŽ MMAction2 0.x ่ฟ็งป + +MMAction2 1.x ๅผ•ๅ…ฅไบ†ไธ€ไบ›้‡ๆž„ๅ’Œไฟฎๆ”น๏ผŒๅŒ…ๆ‹ฌไธ€ไบ›ๅ‘ๅŽไธๅ…ผๅฎน็š„ๆ›ดๆ”นใ€‚ๆˆ‘ไปฌๆไพ›่ฟ™ไธชๆ•™็จ‹๏ผŒๅธฎๅŠฉๆ‚จไปŽ MMAction2 0.x ่ฟ็งปๆ‚จ็š„้กน็›ฎใ€‚ + +## ๆ–ฐ็š„ไพ่ต–้กน + +MMAction2 1.x ไพ่ต–ไบŽไปฅไธ‹ๅบ“ใ€‚ๅปบ่ฎฎๆ‚จๅ‡†ๅค‡ไธ€ไธชๆ–ฐ็š„่ฟ่กŒ็Žฏๅขƒ๏ผŒๅนถๆ นๆฎ[ๅฎ‰่ฃ…ๆ•™็จ‹](./get_started/installation.md)่ฟ›่กŒๅฎ‰่ฃ…ใ€‚ + +1. [MMEngine](https://github.com/open-mmlab/mmengine)๏ผšMMEngine ๆ˜ฏๅผ•ๅ…ฅไบŽ OpenMMLab 2.0 ๆžถๆž„ไธญ็š„็”จไบŽ่ฎญ็ปƒๆทฑๅบฆๅญฆไน ๆจกๅž‹็š„ๅŸบ็ก€ๅบ“ใ€‚ +2. [MMCV](https://github.com/open-mmlab/mmcv)๏ผšMMCV ๆ˜ฏ็”จไบŽ่ฎก็ฎ—ๆœบ่ง†่ง‰็š„ๅŸบ็ก€ๅบ“ใ€‚MMAction2 1.x ้œ€่ฆ `mmcv>=2.0.0`๏ผŒๅฎƒๆฏ” `mmcv-full==2.0.0` ๆ›ด็ดงๅ‡‘ๅ’Œ้ซ˜ๆ•ˆใ€‚ + +## ้…็ฝฎๆ–‡ไปถ + +ๅœจ MMAction2 1.x ไธญ๏ผŒๆˆ‘ไปฌ้‡ๆž„ไบ†้…็ฝฎๆ–‡ไปถ็š„็ป“ๆž„ใ€‚ๆ—ง้ฃŽๆ ผ็š„้…็ฝฎๆ–‡ไปถๅฐ†ไธๅ…ผๅฎนใ€‚ + +ๅœจๆœฌ่Š‚ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ไป‹็ป้…็ฝฎๆ–‡ไปถ็š„ๆ‰€ๆœ‰ๆ›ดๆ”นใ€‚ๆˆ‘ไปฌๅ‡่ฎพๆ‚จๅทฒ็ป็†Ÿๆ‚‰[้…็ฝฎๆ–‡ไปถ](./user_guides/config.md)ใ€‚ + +### ๆจกๅž‹่ฎพ็ฝฎ + +`model.backbone` ๅ’Œ `model.neck` ๆฒกๆœ‰ๆ›ดๆ”นใ€‚ๅฏนไบŽ `model.cls_head`๏ผŒๆˆ‘ไปฌๅฐ† `average_clips` ็งปๅˆฐๅ…ถไธญ๏ผŒๅŽŸๆœฌ่ฎพ็ฝฎๅœจ `model.test_cfg` ไธญใ€‚ + +### ๆ•ฐๆฎ่ฎพ็ฝฎ + +#### **`data`** ไธญ็š„ๆ›ดๆ”น + +- ๅŽŸๅง‹็š„ `data` ๅญ—ๆฎต่ขซๆ‹†ๅˆ†ไธบ `train_dataloader`ใ€`val_dataloader` ๅ’Œ `test_dataloader`ใ€‚่ฟ™ๆ ทๅฏไปฅๅฏนๅฎƒไปฌ่ฟ›่กŒ็ป†็ฒ’ๅบฆ็š„้…็ฝฎใ€‚ไพ‹ๅฆ‚๏ผŒๆ‚จๅฏไปฅๅœจ่ฎญ็ปƒๅ’Œๆต‹่ฏ•่ฟ‡็จ‹ไธญๆŒ‡ๅฎšไธๅŒ็š„้‡‡ๆ ทๅ™จๅ’Œๆ‰นๅคงๅฐใ€‚ +- `videos_per_gpu` ๆ”นๅไธบ `batch_size`ใ€‚ +- `workers_per_gpu` ๆ”นๅไธบ `num_workers`ใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict(...), + val=dict(...), + test=dict(...), +) +``` + +
ๆ–ฐ็‰ˆๆœฌ + +```python +train_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=True) # ๅฟ…่ฆ +) + +val_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=False) # ๅฟ…่ฆ +) + +test_dataloader = val_dataloader +``` + +
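
ๆ‹†ๅˆ†ๅŽ็š„ไธ‰ไธช dataloader ๅฏไปฅๅˆ†ๅˆซ่ฐƒๆ•ดใ€‚ไธ‹้ขๆ˜ฏไธ€ไธช็คบๆ„็‰‡ๆฎต๏ผˆๅญ—ๆฎตๅ–ๅ€ผไป…ไธบไธพไพ‹๏ผ‰๏ผŒๅฑ•็คบๅฆ‚ไฝ•ๅœจๆต‹่ฏ•ๆ—ถๅ•็‹ฌไฝฟ็”จๆ›ดๅคง็š„ๆ‰นๅค„็†ๅคงๅฐ๏ผš

```python
# ไป…ไธบๆต‹่ฏ•้˜ถๆฎตๅ•็‹ฌ่ฎพ็ฝฎๆ›ดๅคง็š„ batch_size๏ผŒๅ…ถไฝ™่ฎพ็ฝฎไธŽ้ชŒ่ฏไธ€่‡ด๏ผˆ็คบๆ„๏ผ‰
test_dataloader = dict(
    batch_size=64,
    num_workers=2,
    dataset=dict(...),  # ไธŽ val_dataloader ็›ธๅŒ็š„ๆ•ฐๆฎ้›†้…็ฝฎ
    sampler=dict(type='DefaultSampler', shuffle=False))
```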
+ +#### **`pipeline`** ไธญ็š„ๆ›ดๆ”น + +- ๅŽŸๆฅ็š„ๆ ผๅผๅŒ–ๅ˜ๆข **`ToTensor`**ใ€**`Collect`** ่ขซๅˆๅนถไธบ `PackActionInputs`ใ€‚ +- ๆˆ‘ไปฌไธๅปบ่ฎฎๅœจๆ•ฐๆฎ้›†ๆตๆฐด็บฟไธญ่ฟ›่กŒ **`Normalize`**ใ€‚่ฏทไปŽๆตๆฐด็บฟไธญ็งป้™คๅฎƒ๏ผŒๅนถๅœจ `model.data_preprocessor` ๅญ—ๆฎตไธญ่ฎพ็ฝฎใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +``` + +
ๆ–ฐ็‰ˆๆœฌ + +```python +model.data_preprocessor = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +``` + +
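
ไธŠไพ‹ไธญ็š„ `data_preprocessor` ้’ˆๅฏน 2D ่ฏ†ๅˆซๅ™จใ€‚ๅฏนไบŽ 3D ่ฏ†ๅˆซๅ™จ๏ผŒ้€šๅธธ่ฟ˜้œ€่ฆๆŒ‡ๅฎš `format_shape='NCTHW'`๏ผˆไธ‹้ขไธบ็คบๆ„็‰‡ๆฎต๏ผŒๅ…ทไฝ“ๅ–ๅ€ผ่ฏทไปฅๅฏนๅบ”ๆจกๅž‹็š„้…็ฝฎๆ–‡ไปถไธบๅ‡†๏ผ‰๏ผš

```python
# 3D ่ฏ†ๅˆซๅ™จ็š„ data_preprocessor ็คบๆ„๏ผšๅฝ’ไธ€ๅŒ–ๅ‚ๆ•ฐไธๅ˜๏ผŒ่พ“ๅ‡บๅฝข็Šถๆ”นไธบ NCTHW
model = dict(
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'))
```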
+ +#### **`evaluation`** ไธญ็š„ๆ›ดๆ”น + +- **`evaluation`** ๅญ—ๆฎต่ขซๆ‹†ๅˆ†ไธบ `val_evaluator` ๅ’Œ `test_evaluator`ใ€‚ไธๅ†ๆ”ฏๆŒ `interval` ๅ’Œ `save_best` ๅ‚ๆ•ฐใ€‚ +- `interval` ็งปๅˆฐ `train_cfg.val_interval`๏ผŒ`save_best` ็งปๅˆฐ `default_hooks.checkpoint.save_best`ใ€‚ +- 'mean_average_precision'ใ€'mean_class_accuracy'ใ€'mmit_mean_average_precision'ใ€'top_k_accuracy' ่ขซๅˆๅนถไธบ `AccMetric`๏ผŒๆ‚จๅฏไปฅไฝฟ็”จ `metric_list` ๆŒ‡ๅฎš่ฆ่ฎก็ฎ—็š„ๆŒ‡ๆ ‡ใ€‚ +- `AVAMetric` ็”จไบŽ่ฏ„ไผฐ AVA ๆ•ฐๆฎ้›†ใ€‚ +- `ANetMetric` ็”จไบŽ่ฏ„ไผฐ ActivityNet ๆ•ฐๆฎ้›†ใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python +evaluation = dict( + interval=5, + metrics=['top_k_accuracy', 'mean_class_accuracy']) +``` + +
ๆ–ฐ็‰ˆๆœฌ + +```python +val_evaluator = dict( + type='AccMetric', + metric_list=('top_k_accuracy', 'mean_class_accuracy')) +test_evaluator = val_evaluator +``` + +
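
ไธŠ้ข็š„ๅฏนๆฏ”ๅชๆถ‰ๅŠ่ฏ„ไผฐๅ™จๆœฌ่บซใ€‚ๆ—ง็‰ˆ `interval` ไธŽ `save_best` ็š„ๅŽปๅ‘ๅฏๅ‚่€ƒไธ‹้ข็š„็คบๆ„็‰‡ๆฎต๏ผˆๆ•ฐๅ€ผๆฒฟ็”จๆ—ง็‰ˆ็คบไพ‹๏ผŒ`save_best='auto'` ไป…ไธบไธพไพ‹๏ผ‰๏ผš

```python
# ๆ—ง็‰ˆ evaluation.interval -> train_cfg.val_interval
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=5)
# ๆ—ง็‰ˆ evaluation.save_best -> default_hooks.checkpoint.save_best
default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'))
# ๆŒ‡ๆ ‡็ปŸไธ€็”ฑ AccMetric ่ฎก็ฎ—
val_evaluator = dict(
    type='AccMetric',
    metric_list=('top_k_accuracy', 'mean_class_accuracy'))
test_evaluator = val_evaluator
```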
+ +### ๅญฆไน ็އ็ญ–็•ฅ่ฎพ็ฝฎ + +#### **`optimizer`** ๅ’Œ **`optimizer_config`** ไธญ็š„ๆ›ดๆ”น + +- ็Žฐๅœจๆˆ‘ไปฌไฝฟ็”จ `optim_wrapper` ๅญ—ๆฎตๆฅ้…็ฝฎไผ˜ๅŒ–่ฟ‡็จ‹ใ€‚`optimizer` ๆˆไธบ `optim_wrapper` ็š„ๅญๅญ—ๆฎตใ€‚ +- `paramwise_cfg` ไนŸๆ˜ฏ `optim_wrapper` ็š„ๅญๅญ—ๆฎต๏ผŒไธŽ `optimizer` ๅนณ่กŒใ€‚ +- ็Žฐๅœจๅทฒๅˆ ้™ค `optimizer_config`๏ผŒๅ…ถไธญ็š„ๆ‰€ๆœ‰้…็ฝฎ้ƒฝ็งปๅŠจๅˆฐ `optim_wrapper`ใ€‚ +- `grad_clip` ๆ”นๅไธบ `clip_grad`ใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python +optimizer = dict( + type='AdamW', + lr=0.0015, + weight_decay=0.3, + paramwise_cfg = dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + )) + +optimizer_config = dict(grad_clip=dict(max_norm=1.0)) +``` + +
ๆ–ฐ็‰ˆๆœฌ

```python
optim_wrapper = dict(
    optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3),
    paramwise_cfg = dict(
        norm_decay_mult=0.0,
        bias_decay_mult=0.0,
    ),
    clip_grad=dict(max_norm=1.0),
)
```
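
ๅฆ‚้œ€ๆททๅˆ็ฒพๅบฆ่ฎญ็ปƒ๏ผŒๅช้œ€ๅฐ†ๅŒ…่ฃ…ๅ™จ็ฑปๅž‹ๆขไธบ `AmpOptimWrapper`๏ผŒๅ…ถไฝ™ๅญ—ๆฎตไฟๆŒไธๅ˜๏ผˆไธ‹้ขไธบ็คบๆ„็‰‡ๆฎต๏ผ‰๏ผš

```python
# ๅฐ† OptimWrapper ๆขไธบ AmpOptimWrapper ๅณๅฏๅฏ็”จๆททๅˆ็ฒพๅบฆ่ฎญ็ปƒ
optim_wrapper = dict(
    type='AmpOptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3),
    clip_grad=dict(max_norm=1.0))
```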
+ +#### **`lr_config`** ไธญ็š„ๆ›ดๆ”น + +- ๅˆ ้™คไบ† `lr_config` ๅญ—ๆฎต๏ผŒๆˆ‘ไปฌไฝฟ็”จๆ–ฐ็š„ `param_scheduler` ๆฅๆ›ฟไปฃๅฎƒใ€‚ +- ๅˆ ้™คไบ†ไธŽ warmup ็›ธๅ…ณ็š„ๅ‚ๆ•ฐ๏ผŒๅ› ไธบๆˆ‘ไปฌไฝฟ็”จ็ญ–็•ฅ็ป„ๅˆๆฅๅฎž็Žฐ่ฟ™ไธชๅŠŸ่ƒฝใ€‚ + +ๆ–ฐ็š„็ป„ๅˆๆœบๅˆถ้žๅธธ็ตๆดป๏ผŒๆ‚จๅฏไปฅไฝฟ็”จๅฎƒๆฅ่ฎพ่ฎกๅคš็งๅญฆไน ็އ/ๅŠจ้‡ๆ›ฒ็บฟใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_iters=5, + warmup_ratio=0.01, + warmup_by_epoch=True) +``` + +
ๆ–ฐ็‰ˆๆœฌ + +```python +param_scheduler = [ + # ๅญฆไน ็އ้ข„็ƒญ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + end=5, + # ๅœจๆฏไธช่ฟญไปฃๅŽๆ›ดๆ–ฐๅญฆไน ็އใ€‚ + convert_to_iter_based=True), + # ไธป่ฆ็š„ๅญฆไน ็އ็ญ–็•ฅ + dict(type='CosineAnnealingLR', by_epoch=True, begin=5), +] +``` + +
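
่ฟ™็ง็ป„ๅˆไธๆ˜ฏๅ”ฏไธ€ๅ†™ๆณ•ใ€‚ไพ‹ๅฆ‚๏ผŒๅฐ†ไธปๅญฆไน ็އ็ญ–็•ฅๆขๆˆๅˆ†ๆฎต่กฐๅ‡ๅชๆ˜ฏๆ›ฟๆขๅˆ—่กจไธญ็š„ไธ€้กน๏ผˆไธ‹้ขไธบ็คบๆ„็‰‡ๆฎต๏ผŒ้‡Œ็จ‹็ข‘็ญ‰ๆ•ฐๅ€ผ้œ€ๆŒ‰ๅฎž้™…ไปปๅŠก่ฐƒๆ•ด๏ผ‰๏ผš

```python
# ็บฟๆ€ง้ข„็ƒญ + MultiStepLR ๅˆ†ๆฎต่กฐๅ‡็š„ๅฆไธ€็ง็ป„ๅˆ
param_scheduler = [
    # ๅ‰ 5 ไธช epoch ่ฟ›่กŒ็บฟๆ€ง้ข„็ƒญ
    dict(type='LinearLR', start_factor=0.01, by_epoch=True,
         begin=0, end=5, convert_to_iter_based=True),
    # ไน‹ๅŽๅœจ็ฌฌ 40ใ€80 ไธช epoch ๅฐ†ๅญฆไน ็އไน˜ไปฅ 0.1
    dict(type='MultiStepLR', by_epoch=True, begin=5, end=100,
         milestones=[40, 80], gamma=0.1),
]
```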
+ +#### **`runner`** ไธญ็š„ๆ›ดๆ”น + +ๅŽŸๅง‹ `runner` ๅญ—ๆฎตไธญ็š„ๅคงๅคšๆ•ฐ้…็ฝฎๅทฒ็งป่‡ณ `train_cfg`ใ€`val_cfg` ๅ’Œ `test_cfg`๏ผŒ็”จไบŽ้…็ฝฎ่ฎญ็ปƒใ€้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„ๅพช็Žฏใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python +runner = dict(type='EpochBasedRunner', max_epochs=100) +``` + +
ๆ–ฐ็‰ˆๆœฌ + +```python +# `val_interval` ๆ˜ฏๅŽŸ `evaluation.interval`ใ€‚ +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') # ไฝฟ็”จ้ป˜่ฎค้ชŒ่ฏๅพช็Žฏใ€‚ +test_cfg = dict(type='TestLoop') # ไฝฟ็”จ้ป˜่ฎคๆต‹่ฏ•ๅพช็Žฏใ€‚ +``` + +
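
ๅฆ‚ๆžœๆ—ง้…็ฝฎไฝฟ็”จ็š„ๆ˜ฏ `IterBasedRunner`๏ผŒๅˆ™ๅฏไปฅๆ”น็”จๅŸบไบŽ่ฟญไปฃ็š„่ฎญ็ปƒๅพช็Žฏ๏ผˆไธ‹้ขไธบ็คบๆ„็‰‡ๆฎต๏ผŒ`max_iters` ็ญ‰ๆ•ฐๅ€ผ้œ€ๆŒ‰ๅฎž้™…ไปปๅŠกๅกซๅ†™๏ผ‰๏ผš

```python
# ๅŸบไบŽ่ฟญไปฃ็š„่ฎญ็ปƒๅพช็Žฏ๏ผŒval_interval ไปฅ่ฟญไปฃๆ•ฐไธบๅ•ไฝ
train_cfg = dict(type='IterBasedTrainLoop', max_iters=10000, val_interval=1000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
```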
+ +ไบ‹ๅฎžไธŠ๏ผŒๅœจ OpenMMLab 2.0 ไธญ๏ผŒๆˆ‘ไปฌๅผ•ๅ…ฅไบ† `Loop` ๆฅๆŽงๅˆถ่ฎญ็ปƒใ€้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„่กŒไธบใ€‚`Runner` ็š„ๅŠŸ่ƒฝไนŸๅ‘็”Ÿไบ†ๅ˜ๅŒ–ใ€‚ๆ‚จๅฏไปฅๅœจ[MMEngine ๆ•™็จ‹](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html)ไธญๆ‰พๅˆฐๆ›ดๅคš่ฏฆ็ป†ไฟกๆฏใ€‚ + +### ่ฟ่กŒๆ—ถ่ฎพ็ฝฎ + +#### **`checkpoint_config`** ๅ’Œ **`log_config`** ไธญ็š„ๆ›ดๆ”น + +`checkpoint_config` ็งปๅˆฐ `default_hooks.checkpoint`๏ผŒ`log_config` ็งปๅˆฐ `default_hooks.logger`ใ€‚ๆˆ‘ไปฌๅฐ†่ฎธๅคš้’ฉๅญ็š„่ฎพ็ฝฎไปŽ่„šๆœฌไปฃ็ ไธญ็งปๅŠจๅˆฐ่ฟ่กŒๆ—ถ้…็ฝฎ็š„ `default_hooks` ๅญ—ๆฎตไธญใ€‚ + +```python +default_hooks = dict( + # ๆ›ดๆ–ฐ่ฟ่กŒๆ—ถไฟกๆฏ๏ผŒๅฆ‚ๅฝ“ๅ‰่ฟญไปฃๅ’Œๅญฆไน ็އใ€‚ + runtime_info=dict(type='RuntimeInfoHook'), + + # ่ฎฐๅฝ•ๆฏไธช่ฟญไปฃ็š„ๆ—ถ้—ดใ€‚ + timer=dict(type='IterTimerHook'), + + # ๆฏ 100 ๆฌก่ฟญไปฃๆ‰“ๅฐๆ—ฅๅฟ—ใ€‚ + logger=dict(type='LoggerHook', interval=100), + + # ๅฏ็”จๅ‚ๆ•ฐ็ญ–็•ฅๅ™จใ€‚ + param_scheduler=dict(type='ParamSchedulerHook'), + + # ๆฏไธช epoch ไฟๅญ˜ไธ€ๆฌกๆƒ้‡๏ผŒๅนถ่‡ชๅŠจไฟๅญ˜ๆœ€ไฝณๆƒ้‡ใ€‚ + checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'), + + # ๅœจๅˆ†ๅธƒๅผ็Žฏๅขƒไธญ่ฎพ็ฝฎ้‡‡ๆ ทๅ™จ็งๅญใ€‚ + sampler_seed=dict(type='DistSamplerSeedHook'), + + # ๅœจๆฏไธช epoch ็ป“ๆŸๆ—ถๅŒๆญฅๆจกๅž‹็ผ“ๅ†ฒๅŒบใ€‚ + sync_buffers=dict(type='SyncBuffersHook') +) +``` + +ๆญคๅค–๏ผŒๆˆ‘ไปฌๅฐ†ๅŽŸๆฅ็š„ logger ๆ‹†ๅˆ†ไธบ logger ๅ’Œ visualizerใ€‚logger ็”จไบŽ่ฎฐๅฝ•ไฟกๆฏ๏ผŒvisualizer ็”จไบŽๅœจไธๅŒ็š„ๅŽ็ซฏ๏ผˆๅฆ‚็ปˆ็ซฏใ€TensorBoard ๅ’Œ Wandb๏ผ‰ไธญๆ˜พ็คบ loggerใ€‚ + + + + + + + + + +
ๆ—ง็‰ˆๆœฌ + +```python +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + ]) +``` + +
ๆ–ฐ็‰ˆๆœฌ + +```python +default_hooks = dict( + ... + logger=dict(type='LoggerHook', interval=100), +) + +visualizer = dict( + type='ActionVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], +) +``` + +
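
ๅฆ‚้œ€ๅŒๆ—ถๅฐ†ๆ—ฅๅฟ—ๅ†™ๅ…ฅ Wandb๏ผŒๅฏๅœจ `vis_backends` ไธญๅ†ๆทปๅŠ ไธ€ไธชๅŽ็ซฏ๏ผˆไธ‹้ขไธบ็คบๆ„็‰‡ๆฎต๏ผŒ้œ€ๅ…ˆๅฎ‰่ฃ…ๅนถ็™ปๅฝ• wandb๏ผ‰๏ผš

```python
# ๅŒๆ—ถไฝฟ็”จๆœฌๅœฐใ€TensorBoard ไธŽ Wandb ไธ‰็งๅฏ่ง†ๅŒ–ๅŽ็ซฏ
visualizer = dict(
    type='ActionVisualizer',
    vis_backends=[
        dict(type='LocalVisBackend'),
        dict(type='TensorboardVisBackend'),
        dict(type='WandbVisBackend'),
    ])
```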
+ +#### **`load_from`** ๅ’Œ **`resume_from`** ไธญ็š„ๆ›ดๆ”น + +- ๅˆ ้™คไบ† `resume_from`ใ€‚็Žฐๅœจๆˆ‘ไปฌไฝฟ็”จ `resume` ๅ’Œ `load_from` ๆฅๆ›ฟไปฃๅฎƒใ€‚ + - ๅฆ‚ๆžœ `resume=True` ๅนถไธ” `load_from` ไธไธบ None๏ผŒๅˆ™ไปŽ `load_from` ไธญ็š„ๆƒ้‡ๆขๅค่ฎญ็ปƒใ€‚ + - ๅฆ‚ๆžœ `resume=True` ๅนถไธ” `load_from` ไธบ None๏ผŒๅˆ™ๅฐ่ฏ•ไปŽๅทฅไฝœ็›ฎๅฝ•ไธญ็š„ๆœ€ๆ–ฐๆƒ้‡ๆขๅคใ€‚ + - ๅฆ‚ๆžœ `resume=False` ๅนถไธ” `load_from` ไธไธบ None๏ผŒๅˆ™ๅชๅŠ ่ฝฝๆƒ้‡ๆ–‡ไปถ๏ผŒไธๆขๅค่ฎญ็ปƒใ€‚ + - ๅฆ‚ๆžœ `resume=False` ๅนถไธ” `load_from` ไธบ None๏ผŒๅˆ™ๆ—ขไธๅŠ ่ฝฝไนŸไธๆขๅคใ€‚ + +#### **`dist_params`** ไธญ็š„ๆ›ดๆ”น + +`dist_params` ๅญ—ๆฎต็Žฐๅœจๆ˜ฏ `env_cfg` ็š„ๅญๅญ—ๆฎตใ€‚`env_cfg` ไธญ่ฟ˜ๆœ‰ไธ€ไบ›ๆ–ฐ็š„้…็ฝฎใ€‚ + +```python +env_cfg = dict( + # ๆ˜ฏๅฆๅฏ็”จ cudnn benchmark + cudnn_benchmark=False, + + # ่ฎพ็ฝฎๅคš่ฟ›็จ‹ๅ‚ๆ•ฐ + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + + # ่ฎพ็ฝฎๅˆ†ๅธƒๅผๅ‚ๆ•ฐ + dist_cfg=dict(backend='nccl'), +) +``` + +#### **`workflow`** ไธญ็š„ๆ›ดๆ”น + +ๅˆ ้™คไบ†ไธŽ `workflow` ็›ธๅ…ณ็š„ๅŠŸ่ƒฝใ€‚ + +#### ๆ–ฐๅญ—ๆฎต **`visualizer`** + +visualizer ๆ˜ฏ OpenMMLab 2.0 ๆžถๆž„ไธญ็š„ๆ–ฐ่ฎพ่ฎกใ€‚ๆˆ‘ไปฌๅœจ runner ไธญไฝฟ็”จไธ€ไธช visualizer ๅฎžไพ‹ๆฅๅค„็†็ป“ๆžœๅ’Œๆ—ฅๅฟ—็š„ๅฏ่ง†ๅŒ–๏ผŒๅนถไฟๅญ˜ๅˆฐไธๅŒ็š„ๅŽ็ซฏ๏ผŒๅฆ‚็ปˆ็ซฏใ€TensorBoard ๅ’Œ Wandbใ€‚ + +```python +visualizer = dict( + type='ActionVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + # ๅ–ๆถˆไธ‹้ขไธ€่กŒ็š„ๆณจ้‡Š๏ผŒๅฐ†ๆ—ฅๅฟ—ๅ’Œๅฏ่ง†ๅŒ–็ป“ๆžœไฟๅญ˜ๅˆฐ TensorBoardใ€‚ + # dict(type='TensorboardVisBackend') + ] +) +``` + +#### ๆ–ฐๅญ—ๆฎต **`default_scope`** + +ๆ‰€ๆœ‰ๆณจๅ†Œ่กจๅœจไธๅŒๅŒ…ไธญ็š„ๅฎšไน‰ๅทฒ็งปๅŠจๅˆฐ `mmaction.registry` ๅŒ…ไธญใ€‚ + +## Packages + +### `mmaction.apis` + +ๆ–‡ๆกฃๅฏไปฅๅœจ[่ฟ™้‡Œ](mmaction.apis)ๆ‰พๅˆฐใ€‚ + +| ๅ‡ฝๆ•ฐ | ๆ›ดๆ”น | +| :--------------------: | :------------------------------------------: | +| `init_recognizer` | ๆ— ้œ€ๆ›ดๆ”น | +| `inference_recognizer` | ๆ— ้œ€ๆ›ดๆ”น | +| `train_model` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `runner.train` ่ฟ›่กŒ่ฎญ็ปƒ | +| `multi_gpu_test` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `runner.test` ่ฟ›่กŒๆต‹่ฏ• | +| `single_gpu_test` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `runner.test` ่ฟ›่กŒๆต‹่ฏ• | +| `set_random_seed` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmengine.runner.set_random_seed` | +| `init_random_seed` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmengine.dist.sync_random_seed` | + +### `mmaction.core` + +`mmaction.core` ๅŒ…ๅทฒ่ขซ้‡ๅ‘ฝๅไธบ [`mmaction.engine`](mmaction.engine)ใ€‚ + +| ๅญๅŒ… | ๆ›ดๆ”น | +| :----------: | :-------------------------------------------------------: | +| `evaluation` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmaction.evaluation` ไธญ็š„ๆŒ‡ๆ ‡ | +| `hooks` | ็งปๅŠจๅˆฐ `mmaction.engine.hooks` | +| `optimizer` | ็งปๅŠจๅˆฐ `mmaction.engine.optimizers` | +| `utils` | ๅˆ ้™ค๏ผŒๅˆ†ๅธƒๅผ็Žฏๅขƒ็›ธๅ…ณ็š„ๅ‡ฝๆ•ฐๅฏไปฅๅœจ `mmengine.dist` ๅŒ…ไธญๆ‰พๅˆฐ | + +### `mmaction.datasets` + +ๆ–‡ๆกฃๅฏไปฅๅœจ[่ฟ™้‡Œ](mmaction.datasets)ๆ‰พๅˆฐใ€‚ + +#### [`BaseActionDataset`](mmaction.datasets.BaseActionDataset) ไธญ็š„ๆ›ดๆ”น๏ผš + +| ๆ–นๆณ• | ๆ›ดๆ”น | +| :--------------------: | :-----------------------------------------: | +| `prepare_train_frames` | ็”ฑ `get_data_info` ๆ›ฟๆข | +| `preprare_test_frames` | ็”ฑ `get_data_info` ๆ›ฟๆข | +| `evaluate` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmengine.evaluator.Evaluator` | +| `dump_results` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmengine.evaluator.DumpResults` | +| `load_annotations` | ๆ›ฟๆขไธบ `load_data_list` | + +็Žฐๅœจ๏ผŒๆ‚จๅฏไปฅ็ผ–ๅ†™ไธ€ไธช็ปงๆ‰ฟ่‡ช `BaseActionDataset` ็š„ๆ–ฐ Dataset ็ฑป๏ผŒๅนถไป…้‡ๅ†™ `load_data_list`ใ€‚่ฆๅŠ ่ฝฝๆ›ดๅคš็š„ๆ•ฐๆฎไฟกๆฏ๏ผŒๆ‚จๅฏไปฅๅƒ `RawframeDataset` ๅ’Œ `AVADataset` ้‚ฃๆ ท้‡ๅ†™ `get_data_info`ใ€‚ +`mmaction.datasets.pipelines` 
่ขซ้‡ๅ‘ฝๅไธบ `mmaction.datasets.transforms`๏ผŒ`mmaction.datasets.pipelines.augmentations` ่ขซ้‡ๅ‘ฝๅไธบ `mmaction.datasets.pipelines.processing`ใ€‚ + +### `mmaction.models` + +ๆ–‡ๆกฃๅฏไปฅๅœจ[่ฟ™้‡Œ](mmaction.models)ๆ‰พๅˆฐใ€‚ๆ‰€ๆœ‰ **backbones**ใ€**necks** ๅ’Œ **losses** ็š„ๆŽฅๅฃๆฒกๆœ‰ๆ›ดๆ”นใ€‚ + +[`BaseRecognizer`](mmaction.models.BaseRecognizer) ไธญ็š„ๆ›ดๆ”น๏ผš + +| ๆ–นๆณ• | ๆ›ดๆ”น | +| :-------------: | :----------------------------------------------------------------------------------------------------------------------------: | +| `extract_feat` | ๅขžๅผบ็š„ๆ–นๆณ•๏ผŒ็Žฐๅœจๆ”ฏๆŒไธ‰ไธช้˜ถๆฎต๏ผˆ`backbone`ใ€`neck`ใ€`head`๏ผ‰็š„่พ“ๅ‡บ็‰นๅพ๏ผŒๅนถไธ”ๅฏไปฅๅค„็†ไธๅŒ็š„ๆจกๅผ๏ผŒๅฆ‚ `train_mode` ๅ’Œ `test_mode`ใ€‚ | +| `forward` | ็ŽฐๅœจๅชๆŽฅๅ—ไธ‰ไธชๅ‚ๆ•ฐ๏ผš`inputs`ใ€`data_samples` ๅ’Œ `mode`ใ€‚่ฏฆ็ป†ไฟกๆฏ่ฏทๅ‚้˜…[ๆ–‡ๆกฃ](mmaction.models.BaseRecognizer)ใ€‚ | +| `forward_train` | ๅทฒๆ›ฟๆขไธบ `loss`ใ€‚ | +| `forward_test` | ๅทฒๆ›ฟๆขไธบ `predict`ใ€‚ | +| `train_step` | `optimizer` ๅ‚ๆ•ฐ่ขซๆ›ฟๆขไธบ `optim_wrapper`๏ผŒๅฎƒๆŽฅๅ— [`OptimWrapper`](mmengine.optim.OptimWrapper)ใ€‚ | +| `val_step` | ๅŽŸ `val_step` ไธŽ `train_step` ็›ธๅŒ๏ผŒ็Žฐๅœจ่ฐƒ็”จ `predict`ใ€‚ | +| `test_step` | ๆ–ฐๆ–นๆณ•๏ผŒไธŽ `val_step` ็›ธๅŒใ€‚ | + +[BaseHead](mmaction.models.BaseHead) ไธญ็š„ๆ›ดๆ”น๏ผš + +| ๆ–นๆณ• | ๆ›ดๆ”น | +| :-------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| `forward` | ๆ— ้œ€ๆ›ดๆ”น | +| `loss` | ๆŽฅๅ— `feats` ๅ’Œ `data_samples`๏ผŒ่€Œไธๆ˜ฏ `cls_score` ๅ’Œ `labels` ๆฅ่ฎก็ฎ—ๆŸๅคฑใ€‚`data_samples` ๆ˜ฏ [ActionDataSample](mmaction.structures.ActionDataSample) ็š„ๅˆ—่กจใ€‚ | +| `predict` | ๆŽฅๅ— `feats` ๅ’Œ `data_samples` ๆฅ้ข„ๆต‹ๅˆ†็ฑปๅˆ†ๆ•ฐใ€‚ | + +### `mmaction.utils` + +| ๅ‡ฝๆ•ฐ | ๆ›ดๆ”น | +| :---------------------: | :--------------------------------------------------------: | +| `collect_env` | ๆ— ้œ€ๆ›ดๆ”น | +| `get_root_logger` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmengine.MMLogger.get_current_instance` | +| `setup_multi_processes` | ๅˆ ้™ค๏ผŒไฝฟ็”จ `mmengine.utils.dl_utils.setup_multi_processes` | + +### ๅ…ถไป–ๆ›ดๆ”น + +- ๆˆ‘ไปฌๅฐ†ๆ‰€ๆœ‰ๆณจๅ†Œๅ™จ็š„ๅฎšไน‰ไปŽๅ„ไธชๅŒ…็งปๅŠจๅˆฐไบ† `mmaction.registry` ใ€‚ diff --git a/docs/zh_cn/notes/ecosystem.md b/docs/zh_cn/notes/ecosystem.md new file mode 100644 index 0000000000000000000000000000000000000000..7fff5e41f0d22ef826b14d269f5994c4ea60c9d4 --- /dev/null +++ b/docs/zh_cn/notes/ecosystem.md @@ -0,0 +1,23 @@ +# ๅŸบไบŽ MMAction2 ็š„็”Ÿๆ€้กน็›ฎ + +ๆœ‰่ฎธๅคš็ ”็ฉถๅทฅไฝœๅ’Œ้กน็›ฎๆ˜ฏๅŸบไบŽ MMAction2 ๆž„ๅปบ็š„ใ€‚ +ๆˆ‘ไปฌๅˆ—ไธพไบ†ไธ€ไบ›ไพ‹ๅญ๏ผŒๅฑ•็คบไบ†ๅฆ‚ไฝ•ๆ‰ฉๅฑ• MMAction2 ๆฅ้€‚็”จไบŽๆ‚จ่‡ชๅทฑ็š„้กน็›ฎใ€‚ +็”ฑไบŽ้กต้ขๅฏ่ƒฝๅฐšๆœชๅฎŒๆˆ๏ผŒๆ‰€ไปฅ่ฏท้šๆ—ถ้€š่ฟ‡ๆไบคPRๆฅๆ›ดๆ–ฐๆญค้กต้ขใ€‚ + +## ไฝœไธบๆ‰ฉๅฑ•็š„้กน็›ฎ + +- [OTEAction2](https://github.com/openvinotoolkit/mmaction2)๏ผš็”จไบŽๅŠจไฝœ่ฏ†ๅˆซ็š„ OpenVINO ่ฎญ็ปƒๆ‰ฉๅฑ•ใ€‚ +- [PYSKL](https://github.com/kennymckormick/pyskl)๏ผšไธ€ไธชไธ“ๆณจไบŽๅŸบไบŽ้ชจ้ชผ็‚นๅŠจไฝœ่ฏ†ๅˆซ็š„ๅทฅๅ…ท็ฎฑใ€‚ + +## ่ฎบๆ–‡็›ธๅ…ณ็š„้กน็›ฎ + +่ฟ˜ๆœ‰ไธ€ไบ›ไธŽ่ฎบๆ–‡ไธ€่ตทๅ‘ๅธƒ็š„้กน็›ฎใ€‚ +ๅ…ถไธญไธ€ไบ›่ฎบๆ–‡ๅ‘่กจๅœจ้กถ็บงไผš่ฎฎ๏ผˆCVPRใ€ICCV ๅ’Œ ECCV๏ผ‰ไธŠ๏ผŒๅ…ถไป–ไธ€ไบ›ไนŸๅ…ทๆœ‰ๅพˆ้ซ˜็š„ๅฝฑๅ“ๅŠ›ใ€‚ +ๆˆ‘ไปฌๆŒ‰็…งไผš่ฎฎๆ—ถ้—ดๅˆ—ๅ‡บๅฎƒไปฌ๏ผŒๆ–นไพฟ็คพๅŒบๅ‚่€ƒใ€‚ + +- Video Swin Transformer๏ผŒCVPR 2022 [\[่ฎบๆ–‡\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition๏ผŒICCV 2021 Oral 
[\[่ฎบๆ–‡\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective๏ผŒICCV 2021 Oral [\[่ฎบๆ–‡\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) +- MGSampler: An Explainable Sampling Strategy for Video Action Recognition๏ผŒICCV 2021 [\[่ฎบๆ–‡\]](https://arxiv.org/abs/2104.09952)[\[github\]](https://github.com/MCG-NJU/MGSampler) +- MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions๏ผŒICCV 2021 [\[่ฎบๆ–‡\]](https://arxiv.org/abs/2105.07404) +- Long Short-Term Transformer for Online Action Detection๏ผŒNeurIPS 2021 [\[่ฎบๆ–‡\]](https://arxiv.org/abs/2107.03377)[\[github\]](https://github.com/amazon-research/long-short-term-transformer) diff --git a/docs/zh_cn/notes/pytorch2.0.md b/docs/zh_cn/notes/pytorch2.0.md new file mode 100644 index 0000000000000000000000000000000000000000..09499beacd30f21384ebf64ab62e2607a2675d11 --- /dev/null +++ b/docs/zh_cn/notes/pytorch2.0.md @@ -0,0 +1,21 @@ +# PyTorch 2.0 Compatibility and Benchmark + +PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speedup trainning & validation. We provide a benchmark result and compatibility of typical models in MMAction2. Except for one model (MViT) that fails to compile, the performance of other models remains consistent before and after compilation. + +| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric | +| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 | +| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible | diff --git a/docs/zh_cn/project_zoo.py b/docs/zh_cn/project_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..88cd0484baf5c4de0ecca8c55abb2e59ad62dd6d --- /dev/null +++ b/docs/zh_cn/project_zoo.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +from pathlib import Path + +from utils import replace_link + +# This script reads /projects/*/README.md and generate projectzoo.md + +all_files = 
list(Path('../../projects/').glob('*/README.md')) +example_project = '../../projects/example_project/README.md' +all_files.remove(Path(example_project)) +all_files.insert(0, Path(example_project)) + +project_zoo = open('../../projects/README.md').read() +for file in all_files: + chinese_readme = Path(str(file).replace('README.md', 'README_zh-CN.md')) + if chinese_readme.exists(): + file = chinese_readme + with open(file) as f: + content = f.read() + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + + project_zoo += content + +with open('projectzoo.md', 'w') as f: + f.write(project_zoo) diff --git a/docs/zh_cn/stat.py b/docs/zh_cn/stat.py new file mode 100644 index 0000000000000000000000000000000000000000..1ee25119c162e39d7f5a33c0c5a0748bd53ca1d8 --- /dev/null +++ b/docs/zh_cn/stat.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +import re +import shutil +from collections import defaultdict +from pathlib import Path + +from modelindex.load_model_index import load +from modelindex.models.Result import Result +from tabulate import tabulate +from utils import replace_link + +MMACT_ROOT = Path(__file__).absolute().parents[2] +PAPERS_ROOT = Path('model_zoo') # Path to save generated paper pages. +GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' +MODELZOO_TEMPLATE = """\ +# ๆจกๅž‹ๅบ“็ปŸ่ฎก + +ๅœจๆœฌ้กต้ขไธญ๏ผŒๆˆ‘ไปฌๅˆ—ไธพไบ†ๆˆ‘ไปฌๆ”ฏๆŒ็š„[ๆ‰€ๆœ‰็ฎ—ๆณ•](#ๆ‰€ๆœ‰ๅทฒๆ”ฏๆŒ็š„็ฎ—ๆณ•)ใ€‚ไฝ ๅฏไปฅ็‚นๅ‡ป้“พๆŽฅ่ทณ่ฝฌ่‡ณๅฏนๅบ”็š„ๆจกๅž‹่ฏฆๆƒ…้กต้ขใ€‚ + +ๅฆๅค–๏ผŒๆˆ‘ไปฌ่ฟ˜ๅˆ—ๅ‡บไบ†ๆˆ‘ไปฌๆไพ›็š„ๆ‰€ๆœ‰ๆจกๅž‹ๆƒ้‡ๆ–‡ไปถใ€‚ไฝ ๅฏไปฅไฝฟ็”จๆŽ’ๅบๅ’Œๆœ็ดขๅŠŸ่ƒฝๆ‰พๅˆฐ้œ€่ฆ็š„ๆจกๅž‹ๆƒ้‡๏ผŒๅนถไฝฟ็”จ้“พๆŽฅ่ทณ่ฝฌ่‡ณๆจกๅž‹่ฏฆๆƒ…้กต้ขใ€‚ + +## ๆ‰€ๆœ‰ๅทฒๆ”ฏๆŒ็š„็ฎ—ๆณ• + +* ่ฎบๆ–‡ๆ•ฐ้‡๏ผš{num_papers} +{type_msg} + +* ๆจกๅž‹ๆƒ้‡ๆ–‡ไปถๆ•ฐ้‡๏ผš{num_ckpts} +{paper_msg} + +""" # noqa: E501 + +METRIC_ALIAS = { + 'Top 1 Accuracy': 'Top-1 (%)', + 'Top 5 Accuracy': 'Top-5 (%)', +} + +TASK_MAP = dict( + detection='ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๆจกๅž‹', + localization='ๆ—ถๅบๅŠจไฝœๅฎšไฝๆจกๅž‹', + recognition='่กŒไธบ่ฏ†ๅˆซๆจกๅž‹', + skeleton='ๅŸบไบŽ้ชจ้ชผ็‚น็š„่กŒไธบ่ฏ†ๅˆซๆจกๅž‹', + retrieval='่ง†้ข‘ๆฃ€็ดขๆจกๅž‹', + recognition_audio='ๅŸบไบŽๅฃฐ้Ÿณ็š„่กŒไธบ่ฏ†ๅˆซๆจกๅž‹') + +model_index = load(str(MMACT_ROOT / 'model-index.yml')) + + +def build_collections(model_index): + # add models for collections + col_by_name = {} + for col in model_index.collections: + setattr(col, 'models', []) + col_by_name[col.name] = col + + for model in model_index.models: + col = col_by_name[model.in_collection] + col.models.append(model) + setattr(model, 'collection', col) + if model.results is None: + setattr(model, 'tasks', []) + else: + setattr(model, 'tasks', [result.task for result in model.results]) + + +build_collections(model_index) + +# save a map from model name to title in README +model2title = dict() + + +def count_papers(collections): + total_num_ckpts = 0 + type_count = defaultdict(int) + paper_msgs = [] + + for collection in collections: + with open(MMACT_ROOT / collection.readme) as f: + readme = f.read() + + ckpts = set(x.lower().strip() + for x in re.findall(r'\[ckpt.*\]\((https?.*)\)', readme)) + total_num_ckpts += len(ckpts) + title = collection.paper['Title'] + papertype = collection.data.get('type', 'Algorithm') + type_count[papertype] += 1 + + readme_title = re.search(r'^#\s+.+', readme) + + readme = Path(collection.filepath).parents[1].with_suffix('.md').name + model = Path(collection.filepath).parent.name 
+ model2title[model] = readme_title.group()[2:].replace(' ', '-') + paper_msgs.append(f'\t- [{papertype}] [{title}]({PAPERS_ROOT / readme}' + f'#{model2title[model]}) ({len(ckpts)} ckpts)') + + type_msg = '\n'.join( + [f'\t- {type_}: {count}' for type_, count in type_count.items()]) + paper_msg = '\n'.join(paper_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_papers=len(collections), + num_ckpts=total_num_ckpts, + type_msg=type_msg, + paper_msg=paper_msg, + ) + + with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +count_papers(model_index.collections) + + +def generate_paper_page(collection): + + # Write a copy of README + with open(MMACT_ROOT / collection.readme) as f: + content = f.read() + readme_path = Path(collection.filepath) + copy = PAPERS_ROOT / readme_path.parents[1].with_suffix('.md').name + if not copy.exists(): + with open(copy, 'w') as copy_file: + task = readme_path.parents[1].name + head_content = f'# {TASK_MAP[task]}\n' + copy_file.write(head_content) + + def lower_heading(match): + return '#' + match.group() + + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + Path(collection.readme)) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + Path(collection.readme)) + + content = re.sub(r'^#+\s+.+', lower_heading, content, flags=re.M) + + with open(copy, 'a') as copy_file: + copy_file.write(content) + + +if PAPERS_ROOT.exists(): + shutil.rmtree(PAPERS_ROOT) +PAPERS_ROOT.mkdir(exist_ok=True) +for collection in model_index.collections: + generate_paper_page(collection) + + +def scatter_results(models): + model_result_pairs = [] + for model in models: + if model.results is None: + result = Result(task=None, dataset=None, metrics={}) + model_result_pairs.append((model, result)) + else: + for result in model.results: + model_result_pairs.append((model, result)) + return model_result_pairs + + +def generate_summary_table(task, model_result_pairs, title=None): + metrics = set() + for model, result in model_result_pairs: + if result.task == task: + metrics = metrics.union(result.metrics.keys()) + metrics = sorted(list(metrics)) + + rows = [] + + def convert2float(number): + units = {'M': 1e6, 'G': 1e9, 'T': 1e12} + if isinstance(number, str): + num = float(number.rstrip('MGT')) + number = num * units[number[-1]] + return number + + for model, result in model_result_pairs: + if result.task != task: + continue + name = model.name + if model.metadata.parameters is not None: + params = convert2float(model.metadata.parameters) + params = f'{params / 1e6:.2f}' # Params + else: + params = None + if model.metadata.flops is not None: + flops = convert2float(model.metadata.flops) + flops = f'{flops / 1e9:.2f}' # Flops + else: + flops = None + + readme = Path( + model.collection.filepath).parents[1].with_suffix('.md').name + model = Path(model.collection.filepath).parent.name + page = f'[้“พๆŽฅ]({PAPERS_ROOT / readme}#{model2title[model]})' + model_metrics = [] + for metric in metrics: + model_metrics.append(str(result.metrics.get(metric, ''))) + + rows.append([name, params, flops, *model_metrics, page]) + + with open('modelzoo_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: model-summary\n""") + header = [ + 'ๆจกๅž‹', + 'ๅ‚ๆ•ฐ้‡ (M)', + 'Flops (G)', + *[METRIC_ALIAS.get(metric, metric) for metric in metrics], + 'Readme', + ] + table_cfg = dict( + tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(rows, header, 
**table_cfg)) + f.write('\n```\n') + + +def generate_dataset_wise_table(task, model_result_pairs, title=None): + dataset_rows = defaultdict(list) + for model, result in model_result_pairs: + if result.task == task: + dataset_rows[result.dataset].append((model, result)) + + if title is not None: + with open('modelzoo_statistics.md', 'a') as f: + f.write(f'\n{title}') + for dataset, pairs in dataset_rows.items(): + generate_summary_table(task, pairs, title=f'### {dataset}') + + +model_result_pairs = scatter_results(model_index.models) + +# Generate Action Recognition Summary +generate_dataset_wise_table( + task='Action Recognition', + model_result_pairs=model_result_pairs, + title='## ่กŒไธบ่ฏ†ๅˆซ', +) + +# Generate Action Detection Summary +generate_dataset_wise_table( + task='Action Detection', + model_result_pairs=model_result_pairs, + title='## ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹', +) + +# Generate Skeleton-based Action Recognition Summary +generate_dataset_wise_table( + task='Skeleton-based Action Recognition', + model_result_pairs=model_result_pairs, + title='## ้ชจ้ชผ็‚น่กŒไธบ่ฏ†ๅˆซ', +) + +# Generate Video Retrieval Summary +generate_dataset_wise_table( + task='Video Retrieval', + model_result_pairs=model_result_pairs, + title='## ่ง†้ข‘ๆฃ€็ดข', +) + +# Generate Temporal Action Localization Summary +generate_dataset_wise_table( + task='Temporal Action Localization', + model_result_pairs=model_result_pairs, + title='## ๆ—ถๅบๅŠจไฝœๅฎšไฝ', +) diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md new file mode 100644 index 0000000000000000000000000000000000000000..88b3a3777af732797f98e5cba78c68808fa655c2 --- /dev/null +++ b/docs/zh_cn/switch_language.md @@ -0,0 +1,3 @@ +## English + +## ็ฎ€ไฝ“ไธญๆ–‡ diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..986153fd75700fff514570e2bee8f94761cce264 --- /dev/null +++ b/docs/zh_cn/useful_tools.md @@ -0,0 +1,91 @@ +# ๅˆ†ๆžๅทฅๅ…ท + +้™คไบ†่ฎญ็ปƒ/ๆต‹่ฏ•่„šๆœฌๅค–๏ผŒๆˆ‘ไปฌๅœจ `tools/` ็›ฎๅฝ•ไธ‹่ฟ˜ๆไพ›ไบ†่ฎธๅคšๆœ‰็”จ็š„ๅทฅๅ…ทใ€‚ + +## ๅˆ†ๆžๅทฅๅ…ท้“พๆŽฅ + + + +- [](#ๅˆ†ๆžๅทฅๅ…ท) + - [ๅˆ†ๆžๅทฅๅ…ท](#ๅˆ†ๆžๅทฅๅ…ท) + - [ๆจกๅž‹่ฝฌๆข](#ๆจกๅž‹่ฝฌๆข) + - [ๅ‡†ๅค‡ๆจกๅž‹่ฟ›่กŒๅ‘ๅธƒ](#ๅ‡†ๅค‡ๆจกๅž‹่ฟ›่กŒๅ‘ๅธƒ) + - [ๆ‚้กน](#ๆ‚้กน) + - [่ฏ„ไผฐๆŒ‡ๆ ‡](#่ฏ„ไผฐๆŒ‡ๆ ‡) + - [ๆ‰“ๅฐๅฎŒๆ•ด้…็ฝฎ](#ๆ‰“ๅฐๅฎŒๆ•ด้…็ฝฎ) + - [ๆฃ€ๆŸฅ่ง†้ข‘](#ๆฃ€ๆŸฅ่ง†้ข‘) + - [ๅคšๆต่žๅˆ](#ๅคšๆต่žๅˆ) + + + +## ๆจกๅž‹่ฝฌๆข + +### ๅ‡†ๅค‡ๆจกๅž‹่ฟ›่กŒๅ‘ๅธƒ + +`tools/deployment/publish_model.py` ๅธฎๅŠฉ็”จๆˆทๅ‡†ๅค‡ไป–ไปฌ็š„ๆจกๅž‹่ฟ›่กŒๅ‘ๅธƒใ€‚ + +ๅœจๅฐ†ๆจกๅž‹ไธŠไผ ๅˆฐ AWS ไน‹ๅ‰๏ผŒๆ‚จๅฏ่ƒฝๆƒณ่ฆ๏ผš + +๏ผˆ1๏ผ‰ๅฐ†ๆจกๅž‹ๆƒ้‡่ฝฌๆขไธบ CPU ๅผ ้‡ใ€‚ +๏ผˆ2๏ผ‰ๅˆ ้™คไผ˜ๅŒ–ๅ™จ็Šถๆ€ไฟกๆฏใ€‚ +๏ผˆ3๏ผ‰่ฎก็ฎ—ๆƒ้‡ๆ–‡ไปถ็š„ๅ“ˆๅธŒๅ€ผ๏ผŒๅนถๅฐ†ๅ“ˆๅธŒๅ€ผๆทปๅŠ ๅˆฐๆ–‡ไปถๅไธญใ€‚ + +```shell +python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +ไพ‹ๅฆ‚๏ผŒ + +```shell +python tools/deployment/publish_model.py work_dirs/tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth +``` + +ๆœ€็ปˆ่พ“ๅ‡บ็š„ๆ–‡ไปถๅๅฐ†ๆ˜ฏ `tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb-{hash id}.pth`ใ€‚ + +## ๆ‚้กน + +### ่ฏ„ไผฐๆŒ‡ๆ ‡ + +`tools/analysis_tools/eval_metric.py` ๆ นๆฎ้…็ฝฎๆ–‡ไปถ่ฏ„ไผฐไฟๅญ˜ๅœจๆ–‡ไปถไธญ็š„็ป“ๆžœ็š„ๆŸไบ›ๆŒ‡ๆ ‡ใ€‚ + +ไฟๅญ˜็š„็ป“ๆžœๆ–‡ไปถๆ˜ฏ้€š่ฟ‡ๅœจ `tools/test.py` ไธญ่ฎพ็ฝฎๅ‚ๆ•ฐ `--out ${RESULT_FILE}` ๆฅๅˆ›ๅปบ็š„๏ผŒไปฅๆŒ‡็คบ็ป“ๆžœๆ–‡ไปถ๏ผŒๅ…ถไธญๅญ˜ๅ‚จไบ†ๆ•ดไธชๆจกๅž‹็š„ๆœ€็ปˆ่พ“ๅ‡บใ€‚ + +```shell +python tools/analysis/eval_metric.py 
${CONFIG_FILE} ${RESULT_FILE} [--eval ${EVAL_METRICS}] [--cfg-options ${CFG_OPTIONS}] [--eval-options ${EVAL_OPTIONS}] +``` + +### ๆ‰“ๅฐๅฎŒๆ•ด้…็ฝฎ + +`tools/analysis_tools/print_config.py` ้€ๅญ—ๆ‰“ๅฐๆ•ดไธช้…็ฝฎ๏ผŒๅฑ•ๅผ€ๆ‰€ๆœ‰ๅฏผๅ…ฅ้กนใ€‚ + +```shell +python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] +``` + +### ๆฃ€ๆŸฅ่ง†้ข‘ + +`tools/analysis_tools/check_videos.py` ไฝฟ็”จๆŒ‡ๅฎš็š„่ง†้ข‘็ผ–็ ๅ™จ่ฟญไปฃ็”ฑ่พ“ๅ…ฅ้…็ฝฎๆ–‡ไปถๆŒ‡ๅฎš็š„ๆ‰€ๆœ‰ๆ ทๆœฌ๏ผŒๆŸฅๆ‰พๆ— ๆ•ˆ็š„่ง†้ข‘๏ผˆๆŸๅๆˆ–็ผบๅคฑ๏ผ‰๏ผŒๅนถๅฐ†็›ธๅบ”็š„ๆ–‡ไปถ่ทฏๅพ„ไฟๅญ˜ๅˆฐ่พ“ๅ‡บๆ–‡ไปถไธญใ€‚่ฏทๆณจๆ„๏ผŒๅˆ ้™คๆ— ๆ•ˆ่ง†้ข‘ๅŽ๏ผŒ็”จๆˆท้œ€่ฆ้‡ๆ–ฐ็”Ÿๆˆ่ง†้ข‘ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +python tools/analysis_tools/check_videos.py ${CONFIG} [-h] [--options OPTIONS [OPTIONS ...]] [--cfg-options CFG_OPTIONS [CFG_OPTIONS ...]] [--output-file OUTPUT_FILE] [--split SPLIT] [--decoder DECODER] [--num-processes NUM_PROCESSES] [--remove-corrupted-videos] +``` + +### ๅคšๆต่žๅˆ + +`tools/analysis_tools/report_accuracy.py` ไฝฟ็”จๆŽจ็†ไฟๅญ˜็š„็ป“ๆžœ๏ผˆๅœจๆต‹่ฏ•ๆ—ถ่ฎพ็ฝฎ `--dump res.pkl`๏ผ‰ๆฅ่žๅˆๅคšๆต้ข„ๆต‹ๅˆ†ๆ•ฐ๏ผŒๅณๅŽ่žๅˆ๏ผˆlate fusion๏ผ‰ใ€‚ + +```shell +python tools/analysis_tools/report_accuracy.py [--preds ${RESULT_PKL_1 [RESULT_PKL_2 ...]}] [--coefficients ${COEFFICIENT_1 [COEFFICIENT_2, ...]}] [--apply-softmax] +``` + +ไปฅ joint-bone ่žๅˆไธบไพ‹๏ผŒ่ฟ™ๆ˜ฏๅŸบไบŽ้ชจ้ชผๅŠจไฝœ่ฏ†ๅˆซไปปๅŠก็š„ไธ€็งๅธธ่งๅฎž่ทตใ€‚ + +```shell +python tools/analysis_tools/report_accuracy.py --preds demo/fuse/joint.pkl demo/fuse/bone.pkl --coefficients 1.0 1.0 +``` + +``` +Mean Class Accuracy: 0.9180 +Top 1 Accuracy: 0.9333 +Top 5 Accuracy: 0.9833 +``` diff --git a/docs/zh_cn/user_guides/config.md b/docs/zh_cn/user_guides/config.md new file mode 100644 index 0000000000000000000000000000000000000000..798a9f8884449437550d86eebb7ead7a02a51607 --- /dev/null +++ b/docs/zh_cn/user_guides/config.md @@ -0,0 +1,711 @@ +# ๅญฆไน ้…็ฝฎๆ–‡ไปถ + +ๆˆ‘ไปฌไฝฟ็”จ Python ๆ–‡ไปถไฝœไธบ้…็ฝฎๆ–‡ไปถ๏ผŒๅฐ†ๆจกๅ—ๅŒ–ๅ’Œ็ปงๆ‰ฟ่ฎพ่ฎก่žๅ…ฅๆˆ‘ไปฌ็š„้…็ฝฎ็ณป็ปŸไธญ๏ผŒ่ฟ™ๆ–นไพฟ่ฟ›่กŒๅ„็งๅฎž้ชŒใ€‚ +ๆ‚จๅฏไปฅๅœจ `$MMAction2/configs` ็›ฎๅฝ•ไธ‹ๆ‰พๅˆฐๆ‰€ๆœ‰ๆไพ›็š„้…็ฝฎๆ–‡ไปถใ€‚ๅฆ‚ๆžœๆ‚จๆƒณ่ฆๆŸฅ็œ‹้…็ฝฎๆ–‡ไปถ๏ผŒ +ๆ‚จๅฏไปฅ่ฟ่กŒ `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` ๆฅๆŸฅ็œ‹ๅฎŒๆ•ด็š„้…็ฝฎๆ–‡ไปถใ€‚ + + + +- [ๅญฆไน ้…็ฝฎๆ–‡ไปถ](#ๅญฆไน ้…็ฝฎๆ–‡ไปถ) + - [้€š่ฟ‡่„šๆœฌๅ‚ๆ•ฐไฟฎๆ”น้…็ฝฎ](#้€š่ฟ‡่„šๆœฌๅ‚ๆ•ฐไฟฎๆ”น้…็ฝฎ) + - [้…็ฝฎๆ–‡ไปถ็ป“ๆž„](#้…็ฝฎๆ–‡ไปถ็ป“ๆž„) + - [้…็ฝฎๆ–‡ไปถๅ‘ฝๅ็บฆๅฎš](#้…็ฝฎๆ–‡ไปถๅ‘ฝๅ็บฆๅฎš) + - [ๅŠจไฝœ่ฏ†ๅˆซ็š„้…็ฝฎ็ณป็ปŸ](#ๅŠจไฝœ่ฏ†ๅˆซ็š„้…็ฝฎ็ณป็ปŸ) + - [ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹็š„้…็ฝฎ็ณป็ปŸ](#ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹็š„้…็ฝฎ็ณป็ปŸ) + - [ๅŠจไฝœๅฎšไฝ็š„้…็ฝฎ็ณป็ปŸ](#ๅŠจไฝœๅฎšไฝ็š„้…็ฝฎ็ณป็ปŸ) + + + +## ้€š่ฟ‡่„šๆœฌๅ‚ๆ•ฐไฟฎๆ”น้…็ฝฎ + +ๅœจไฝฟ็”จ `tools/train.py` ๆˆ– `tools/test.py` ๆไบคไฝœไธšๆ—ถ๏ผŒๆ‚จๅฏไปฅ้€š่ฟ‡ๆŒ‡ๅฎš `--cfg-options` ๆฅๅŽŸๅœฐไฟฎๆ”น้…็ฝฎใ€‚ + +- ๆ›ดๆ–ฐๅญ—ๅ…ธ็š„้…็ฝฎ้”ฎใ€‚ + + ๅฏไปฅๆŒ‰็…งๅŽŸๅง‹้…็ฝฎไธญๅญ—ๅ…ธ้”ฎ็š„้กบๅบๆฅๆŒ‡ๅฎš้…็ฝฎ้€‰้กนใ€‚ + ไพ‹ๅฆ‚๏ผŒ`--cfg-options model.backbone.norm_eval=False` ๅฐ†ๆจกๅž‹้ชจๅนฒไธญ็š„ๆ‰€ๆœ‰ BN ๆจกๅ—ๆ›ดๆ”นไธบ `train` ๆจกๅผใ€‚ + +- ๆ›ดๆ–ฐ้…็ฝฎๅˆ—่กจไธญ็š„้”ฎใ€‚ + + ไธ€ไบ›้…็ฝฎๅญ—ๅ…ธๅœจ้…็ฝฎๆ–‡ไปถไธญไปฅๅˆ—่กจๅฝขๅผ็ป„ๆˆใ€‚ไพ‹ๅฆ‚๏ผŒ่ฎญ็ปƒๆต็จ‹ `train_pipeline` ้€šๅธธๆ˜ฏไธ€ไธชๅˆ—่กจ๏ผŒ + ไพ‹ๅฆ‚ `[dict(type='SampleFrames'), ...]`ใ€‚ๅฆ‚ๆžœๆ‚จๆƒณ่ฆๅœจๆต็จ‹ไธญๅฐ† `'SampleFrames'` ๆ›ดๆ”นไธบ `'DenseSampleFrames'`๏ผŒ + ๆ‚จๅฏไปฅๆŒ‡ๅฎš `--cfg-options train_pipeline.0.type=DenseSampleFrames`ใ€‚ + +- ๆ›ดๆ–ฐๅˆ—่กจ/ๅ…ƒ็ป„็š„ๅ€ผใ€‚ + + 
ๅฆ‚ๆžœ่ฆๆ›ดๆ–ฐ็š„ๅ€ผๆ˜ฏๅˆ—่กจๆˆ–ๅ…ƒ็ป„ใ€‚ไพ‹ๅฆ‚๏ผŒ้…็ฝฎๆ–‡ไปถ้€šๅธธ่ฎพ็ฝฎ `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`ใ€‚ๅฆ‚ๆžœๆ‚จๆƒณ่ฆ + ๆ›ดๆ”นๆญค้”ฎ๏ผŒๆ‚จๅฏไปฅๆŒ‡ๅฎš `--cfg-options model.data_preprocessor.mean="[128,128,128]"`ใ€‚่ฏทๆณจๆ„๏ผŒๅผ•ๅท " ๆ˜ฏๆ”ฏๆŒๅˆ—่กจ/ๅ…ƒ็ป„ๆ•ฐๆฎ็ฑปๅž‹็š„ๅฟ…้œ€ๅ†…ๅฎนใ€‚ + +## ้…็ฝฎๆ–‡ไปถ็ป“ๆž„ + +`configs/_base_` ไธ‹ๆœ‰ 3 ็งๅŸบๆœฌ็ป„ไปถ็ฑปๅž‹๏ผŒๅณ modelsใ€schedules ๅ’Œ default_runtimeใ€‚ +่ฎธๅคšๆ–นๆณ•ๅช้œ€่ฆไธ€ไธชๆจกๅž‹ใ€ไธ€ไธช่ฎญ็ปƒ่ฎกๅˆ’ๅ’Œไธ€ไธช้ป˜่ฎค่ฟ่กŒๆ—ถ็ป„ไปถๅฐฑๅฏไปฅ่ฝปๆพๆž„ๅปบ๏ผŒๅฆ‚ TSNใ€I3Dใ€SlowOnly ็ญ‰ใ€‚ +็”ฑ `_base_` ็ป„ไปถ็ป„ๆˆ็š„้…็ฝฎๆ–‡ไปถ่ขซ็งฐไธบ _primitive_ใ€‚ + +ๅฏนไบŽๅŒไธ€ๆ–‡ไปถๅคนไธ‹็š„ๆ‰€ๆœ‰้…็ฝฎๆ–‡ไปถ๏ผŒๅปบ่ฎฎๅชๆœ‰**ไธ€ไธช** _primitive_ ้…็ฝฎๆ–‡ไปถใ€‚ๅ…ถไป–ๆ‰€ๆœ‰้…็ฝฎๆ–‡ไปถ้ƒฝๅบ”่ฏฅ็ปงๆ‰ฟ่‡ช _primitive_ ้…็ฝฎๆ–‡ไปถใ€‚่ฟ™ๆ ท๏ผŒ็ปงๆ‰ฟ็บงๅˆซ็š„ๆœ€ๅคงๅ€ผไธบ 3ใ€‚ + +ไธบไบ†ๆ–นไพฟ็†่งฃ๏ผŒๆˆ‘ไปฌๅปบ่ฎฎ่ดก็Œฎ่€…็ปงๆ‰ฟ็Žฐๆœ‰ๆ–นๆณ•ใ€‚ +ไพ‹ๅฆ‚๏ผŒๅฆ‚ๆžœๅŸบไบŽ TSN ่ฟ›่กŒไบ†ไธ€ไบ›ไฟฎๆ”น๏ผŒ็”จๆˆทๅฏไปฅ้ฆ–ๅ…ˆ้€š่ฟ‡ๆŒ‡ๅฎš `_base_ = ../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ๆฅ็ปงๆ‰ฟๅŸบๆœฌ็š„ TSN ็ป“ๆž„๏ผŒ็„ถๅŽๅœจ้…็ฝฎๆ–‡ไปถไธญไฟฎๆ”นๅฟ…่ฆ็š„ๅญ—ๆฎตใ€‚ + +ๅฆ‚ๆžœๆ‚จๆญฃๅœจๆž„ๅปบไธ€ไธชไธŽไปปไฝ•็Žฐๆœ‰ๆ–นๆณ•็š„็ป“ๆž„ไธๅ…ฑไบซ็š„ๅ…จๆ–ฐๆ–นๆณ•๏ผŒๅฏไปฅๅœจ `configs/TASK` ไธ‹ๅˆ›ๅปบไธ€ไธชๆ–‡ไปถๅคนใ€‚ + +่ฏทๅ‚่€ƒ [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) ่Žทๅ–่ฏฆ็ป†ๆ–‡ๆกฃใ€‚ + +## ้…็ฝฎๆ–‡ไปถๅ‘ฝๅ็บฆๅฎš + +ๆˆ‘ไปฌ้ตๅพชไปฅไธ‹ๆ ทๅผๆฅๅ‘ฝๅ้…็ฝฎๆ–‡ไปถใ€‚ๅปบ่ฎฎ่ดก็Œฎ่€…้ตๅพช็›ธๅŒ็š„ๆ ทๅผใ€‚้…็ฝฎๆ–‡ไปถๅๅˆ†ไธบๅ‡ ไธช้ƒจๅˆ†๏ผŒไธๅŒ้ƒจๅˆ†้€ป่พ‘ไธŠ็”จไธ‹ๅˆ’็บฟ `'_'` ่ฟžๆŽฅ๏ผŒๅŒไธ€้ƒจๅˆ†็š„่ฎพ็ฝฎ็”จ็ ดๆŠ˜ๅท `'-'` ่ฟžๆŽฅใ€‚ + +``` +{็ฎ—ๆณ•ไฟกๆฏ}_{ๆจกๅ—ไฟกๆฏ}_{่ฎญ็ปƒไฟกๆฏ}_{ๆ•ฐๆฎไฟกๆฏ}.py +``` + +`{xxx}` ๆ˜ฏๅฟ…ๅกซๅญ—ๆฎต๏ผŒ`[yyy]` ๆ˜ฏๅฏ้€‰ๅญ—ๆฎตใ€‚ + +- `{็ฎ—ๆณ•ไฟกๆฏ}`: + - `{ๆจกๅž‹}`: ๆจกๅž‹็ฑปๅž‹๏ผŒไพ‹ๅฆ‚ `tsn`ใ€`i3d`ใ€`swin`ใ€`vit` ็ญ‰ใ€‚ + - `[ๆจกๅž‹่ฎพ็ฝฎ]`: ๆŸไบ›ๆจกๅž‹็š„็‰นๅฎš่ฎพ็ฝฎ๏ผŒไพ‹ๅฆ‚ `base`ใ€`p16`ใ€`w877` ็ญ‰ใ€‚ +- `{ๆจกๅ—ไฟกๆฏ}`: + - `[้ข„่ฎญ็ปƒไฟกๆฏ]`: ้ข„่ฎญ็ปƒไฟกๆฏ๏ผŒไพ‹ๅฆ‚ `kinetics400-pretrained`ใ€`in1k-pre` ็ญ‰ใ€‚ + - `{้ชจๅนฒ็ฝ‘็ปœ}`: ้ชจๅนฒ็ฝ‘็ปœ็ฑปๅž‹๏ผŒไพ‹ๅฆ‚ `r50`๏ผˆResNet-50๏ผ‰็ญ‰ใ€‚ + - `[้ชจๅนฒ็ฝ‘็ปœ่ฎพ็ฝฎ]`: ๆŸไบ›้ชจๅนฒ็ฝ‘็ปœ็š„็‰นๅฎš่ฎพ็ฝฎ๏ผŒไพ‹ๅฆ‚ `nl-dot-product`ใ€`bnfrozen`ใ€`nopool` ็ญ‰ใ€‚ +- `{่ฎญ็ปƒไฟกๆฏ}`: + - `{gpu x batch_per_gpu]}`: GPU ๅ’Œๆฏไธช GPU ไธŠ็š„ๆ ทๆœฌๆ•ฐใ€‚ + - `{pipeline่ฎพ็ฝฎ}`: ๅธง้‡‡ๆ ท่ฎพ็ฝฎ๏ผŒไพ‹ๅฆ‚ `dense`ใ€`{clip_len}x{frame_interval}x{num_clips}`ใ€`u48` ็ญ‰ใ€‚ + - `{schedule}`: ่ฎญ็ปƒ่ฎกๅˆ’๏ผŒไพ‹ๅฆ‚ `coslr-20e`ใ€‚ +- `{ๆ•ฐๆฎไฟกๆฏ}`: + - `{ๆ•ฐๆฎ้›†}`: ๆ•ฐๆฎ้›†ๅ็งฐ๏ผŒไพ‹ๅฆ‚ `kinetics400`ใ€`mmit` ็ญ‰ใ€‚ + - `{ๆจกๆ€}`: ๆ•ฐๆฎๆจกๆ€๏ผŒไพ‹ๅฆ‚ `rgb`ใ€`flow`ใ€`keypoint-2d` ็ญ‰ใ€‚ + +### ๅŠจไฝœ่ฏ†ๅˆซ็š„้…็ฝฎ็ณป็ปŸ + +ๆˆ‘ไปฌๅฐ†ๆจกๅ—ๅŒ–่ฎพ่ฎก่žๅ…ฅๆˆ‘ไปฌ็š„้…็ฝฎ็ณป็ปŸไธญ๏ผŒ +่ฟ™ๆ–นไพฟ่ฟ›่กŒๅ„็งๅฎž้ชŒใ€‚ + +- TSN ็š„็คบไพ‹ + + ไธบไบ†ๅธฎๅŠฉ็”จๆˆทๅฏนๅฎŒๆ•ด็š„้…็ฝฎ็ป“ๆž„ๅ’ŒๅŠจไฝœ่ฏ†ๅˆซ็ณป็ปŸไธญ็š„ๆจกๅ—ๆœ‰ไธ€ไธชๅŸบๆœฌ็š„ไบ†่งฃ๏ผŒ + ๆˆ‘ไปฌๅฏน TSN ็š„้…็ฝฎ่ฟ›่กŒ็ฎ€่ฆๆณจ้‡Šๅฆ‚ไธ‹ใ€‚ๆœ‰ๅ…ณๆฏไธชๆจกๅ—ไธญๆฏไธชๅ‚ๆ•ฐ็š„ๆ›ด่ฏฆ็ป†็”จๆณ•ๅ’Œๆ›ฟไปฃๆ–นๆณ•๏ผŒ่ฏทๅ‚้˜… API ๆ–‡ๆกฃใ€‚ + + ```python + # ๆจกๅž‹่ฎพ็ฝฎ + model = dict( # ๆจกๅž‹็š„้…็ฝฎ + type='Recognizer2D', # ่ฏ†ๅˆซๅ™จ็š„็ฑปๅ + backbone=dict( # ้ชจๅนฒ็ฝ‘็ปœ็š„้…็ฝฎ + type='ResNet', # ้ชจๅนฒ็ฝ‘็ปœ็š„ๅ็งฐ + pretrained='torchvision://resnet50', # ้ข„่ฎญ็ปƒๆจกๅž‹็š„ URL/็ฝ‘็ซ™ + depth=50, # ResNet ๆจกๅž‹็š„ๆทฑๅบฆ + norm_eval=False), # ๆ˜ฏๅฆๅœจ่ฎญ็ปƒๆ—ถๅฐ† BN ๅฑ‚่ฎพ็ฝฎไธบ่ฏ„ไผฐๆจกๅผ + cls_head=dict( # ๅˆ†็ฑปๅคด็š„้…็ฝฎ 
+ type='TSNHead', # ๅˆ†็ฑปๅคด็š„ๅ็งฐ + num_classes=400, # ่ฆๅˆ†็ฑป็š„็ฑปๅˆซๆ•ฐ้‡ใ€‚ + in_channels=2048, # ๅˆ†็ฑปๅคด็š„่พ“ๅ…ฅ้€š้“ๆ•ฐใ€‚ + spatial_type='avg', # ็ฉบ้—ด็ปดๅบฆๆฑ ๅŒ–็š„็ฑปๅž‹ + consensus=dict(type='AvgConsensus', dim=1), # ไธ€่‡ดๆ€งๆจกๅ—็š„้…็ฝฎ + dropout_ratio=0.4, # dropout ๅฑ‚ไธญ็š„ๆฆ‚็އ + init_std=0.01, # ็บฟๆ€งๅฑ‚ๅˆๅง‹ๅŒ–็š„ๆ ‡ๅ‡†ๅทฎๅ€ผ + average_clips='prob'), # ๅนณๅ‡ๅคšไธชๅ‰ช่พ‘็ป“ๆžœ็š„ๆ–นๆณ• + data_preprocessor=dict( # ๆ•ฐๆฎ้ข„ๅค„็†ๅ™จ็š„้…็ฝฎ + type='ActionDataPreprocessor', # ๆ•ฐๆฎ้ข„ๅค„็†ๅ™จ็š„ๅ็งฐ + mean=[123.675, 116.28, 103.53], # ไธๅŒ้€š้“็š„ๅ‡ๅ€ผ็”จไบŽๅฝ’ไธ€ๅŒ– + std=[58.395, 57.12, 57.375], # ไธๅŒ้€š้“็š„ๆ ‡ๅ‡†ๅทฎ็”จไบŽๅฝ’ไธ€ๅŒ– + format_shape='NCHW'), # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + # ๆจกๅž‹่ฎญ็ปƒๅ’Œๆต‹่ฏ•่ฎพ็ฝฎ + train_cfg=None, # TSN ็š„่ฎญ็ปƒ่ถ…ๅ‚ๆ•ฐ็š„้…็ฝฎ + test_cfg=None) # TSN ็š„ๆต‹่ฏ•่ถ…ๅ‚ๆ•ฐ็š„้…็ฝฎ + + # ๆ•ฐๆฎ้›†่ฎพ็ฝฎ + dataset_type = 'RawframeDataset' # ็”จไบŽ่ฎญ็ปƒใ€้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„ๆ•ฐๆฎ้›†็ฑปๅž‹ + data_root = 'data/kinetics400/rawframes_train/' # ็”จไบŽ่ฎญ็ปƒ็š„ๆ•ฐๆฎ็š„ๆ น่ทฏๅพ„ + data_root_val = 'data/kinetics400/rawframes_val/' # ็”จไบŽ้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„ๆ•ฐๆฎ็š„ๆ น่ทฏๅพ„ + ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # ็”จไบŽ่ฎญ็ปƒ็š„ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # ็”จไบŽ้ชŒ่ฏ็š„ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # ็”จไบŽๆต‹่ฏ•็š„ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + + train_pipeline = [ # ่ฎญ็ปƒๆ•ฐๆฎๅค„็†ๆต็จ‹ + dict( # SampleFrames ็š„้…็ฝฎ + type='SampleFrames', # ้‡‡ๆ ทๅธง็š„ๆต็จ‹๏ผŒไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅธง + clip_len=1, # ๆฏไธช้‡‡ๆ ท่พ“ๅ‡บๅ‰ช่พ‘็š„ๅธงๆ•ฐ + frame_interval=1, # ็›ธ้‚ป้‡‡ๆ ทๅธง็š„ๆ—ถ้—ด้—ด้š” + num_clips=3), # ่ฆ้‡‡ๆ ท็š„ๅ‰ช่พ‘ๆ•ฐ + dict( # RawFrameDecode ็š„้…็ฝฎ + type='RawFrameDecode'), # ๅŠ ่ฝฝๅ’Œ่งฃ็ ๅธง็š„ๆต็จ‹๏ผŒ้€‰ๆ‹ฉ็ป™ๅฎš็ดขๅผ•็š„ๅŽŸๅง‹ๅธง + dict( # Resize ็š„้…็ฝฎ + type='Resize', # ่ฐƒๆ•ดๅคงๅฐ็š„ๆต็จ‹ + scale=(-1, 256)), # ่ฆ่ฐƒๆ•ดๅ›พๅƒ็š„ๆฏ”ไพ‹ + dict( # MultiScaleCrop ็š„้…็ฝฎ + type='MultiScaleCrop', # ๅคšๅฐบๅบฆ่ฃๅ‰ช็š„ๆต็จ‹๏ผŒๆ นๆฎ้šๆœบ้€‰ๆ‹ฉ็š„ๅฐบๅบฆๅˆ—่กจ่ฃๅ‰ชๅ›พๅƒ + input_size=224, # ็ฝ‘็ปœ็š„่พ“ๅ…ฅๅคงๅฐ + scales=(1, 0.875, 0.75, 0.66), # ่ฆ้€‰ๆ‹ฉ็š„ๅฎฝๅบฆๅ’Œ้ซ˜ๅบฆ็š„ๅฐบๅบฆ + random_crop=False, # ๆ˜ฏๅฆ้šๆœบ้‡‡ๆ ท่ฃๅ‰ชๆก† + max_wh_scale_gap=1), # ๅฎฝๅบฆๅ’Œ้ซ˜ๅบฆๅฐบๅบฆ็บงๅˆซ็š„ๆœ€ๅคงๅทฎ่ท + dict( # Resize ็š„้…็ฝฎ + type='Resize', # ่ฐƒๆ•ดๅคงๅฐ็š„ๆต็จ‹ + scale=(224, 224), # ่ฆ่ฐƒๆ•ดๅ›พๅƒ็š„ๆฏ”ไพ‹ + keep_ratio=False), # ๆ˜ฏๅฆไฟๆŒ็บตๆจชๆฏ”่ฟ›่กŒ่ฐƒๆ•ดๅคงๅฐ + dict( # Flip ็š„้…็ฝฎ + type='Flip', # ็ฟป่ฝฌ็š„ๆต็จ‹ + flip_ratio=0.5), # ๅฎžๆ–ฝ็ฟป่ฝฌ็š„ๆฆ‚็އ + dict( # FormatShape ็š„้…็ฝฎ + type='FormatShape', # ๆ ผๅผๅŒ–ๅฝข็Šถ็š„ๆต็จ‹๏ผŒๅฐ†ๆœ€็ปˆๅ›พๅƒๅฝข็Šถๆ ผๅผๅŒ–ไธบ็ป™ๅฎš็š„ input_format + input_format='NCHW'), # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + dict(type='PackActionInputs') # PackActionInputs ็š„้…็ฝฎ + ] + val_pipeline = [ # ้ชŒ่ฏๆ•ฐๆฎๅค„็†ๆต็จ‹ + dict( # SampleFrames ็š„้…็ฝฎ + type='SampleFrames', # ้‡‡ๆ ทๅธง็š„ๆต็จ‹๏ผŒไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅธง + clip_len=1, # ๆฏไธช้‡‡ๆ ท่พ“ๅ‡บๅ‰ช่พ‘็š„ๅธงๆ•ฐ + frame_interval=1, # ็›ธ้‚ป้‡‡ๆ ทๅธง็š„ๆ—ถ้—ด้—ด้š” + num_clips=3, # ่ฆ้‡‡ๆ ท็š„ๅ‰ช่พ‘ๆ•ฐ + test_mode=True), # ๆ˜ฏๅฆๅœจ้‡‡ๆ ทๆ—ถ่ฎพ็ฝฎไธบๆต‹่ฏ•ๆจกๅผ + dict( # RawFrameDecode ็š„้…็ฝฎ + type='RawFrameDecode'), # ๅŠ ่ฝฝๅ’Œ่งฃ็ ๅธง็š„ๆต็จ‹๏ผŒ้€‰ๆ‹ฉ็ป™ๅฎš็ดขๅผ•็š„ๅŽŸๅง‹ๅธง + dict( # Resize ็š„้…็ฝฎ + type='Resize', # ่ฐƒๆ•ดๅคงๅฐ็š„ๆต็จ‹ + scale=(-1, 256)), # ่ฆ่ฐƒๆ•ดๅ›พๅƒ็š„ๆฏ”ไพ‹ + dict( # CenterCrop 
็š„้…็ฝฎ + type='CenterCrop', # ไธญๅฟƒ่ฃๅ‰ช็š„ๆต็จ‹๏ผŒไปŽๅ›พๅƒไธญ่ฃๅ‰ชไธญๅฟƒๅŒบๅŸŸ + crop_size=224), # ่ฆ่ฃๅ‰ช็š„ๅ›พๅƒๅคงๅฐ + dict( # Flip ็š„้…็ฝฎ + type='Flip', # ็ฟป่ฝฌ็š„ๆต็จ‹ + flip_ratio=0), # ๅฎžๆ–ฝ็ฟป่ฝฌ็š„ๆฆ‚็އ + dict( # FormatShape ็š„้…็ฝฎ + type='FormatShape', # ๆ ผๅผๅŒ–ๅฝข็Šถ็š„ๆต็จ‹๏ผŒๅฐ†ๆœ€็ปˆๅ›พๅƒๅฝข็Šถๆ ผๅผๅŒ–ไธบ็ป™ๅฎš็š„ input_format + input_format='NCHW'), # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + dict(type='PackActionInputs') # PackActionInputs ็š„้…็ฝฎ + ] + test_pipeline = [ # ๆต‹่ฏ•ๆ•ฐๆฎๅค„็†ๆต็จ‹ + dict( # SampleFrames ็š„้…็ฝฎ + type='SampleFrames', # ้‡‡ๆ ทๅธง็š„ๆต็จ‹๏ผŒไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅธง + clip_len=1, # ๆฏไธช้‡‡ๆ ท่พ“ๅ‡บๅ‰ช่พ‘็š„ๅธงๆ•ฐ + frame_interval=1, # ็›ธ้‚ป้‡‡ๆ ทๅธง็š„ๆ—ถ้—ด้—ด้š” + num_clips=25, # ่ฆ้‡‡ๆ ท็š„ๅ‰ช่พ‘ๆ•ฐ + test_mode=True), # ๆ˜ฏๅฆๅœจ้‡‡ๆ ทๆ—ถ่ฎพ็ฝฎไธบๆต‹่ฏ•ๆจกๅผ + dict( # RawFrameDecode ็š„้…็ฝฎ + type='RawFrameDecode'), # ๅŠ ่ฝฝๅ’Œ่งฃ็ ๅธง็š„ๆต็จ‹๏ผŒ้€‰ๆ‹ฉ็ป™ๅฎš็ดขๅผ•็š„ๅŽŸๅง‹ๅธง + dict( # Resize ็š„้…็ฝฎ + type='Resize', # ่ฐƒๆ•ดๅคงๅฐ็š„ๆต็จ‹ + scale=(-1, 256)), # ่ฆ่ฐƒๆ•ดๅ›พๅƒ็š„ๆฏ”ไพ‹ + dict( # TenCrop ็š„้…็ฝฎ + type='TenCrop', # ๅๆฌก่ฃๅ‰ช็š„ๆต็จ‹๏ผŒไปŽๅ›พๅƒไธญ่ฃๅ‰ชๅไธชๅŒบๅŸŸ + crop_size=224), # ่ฆ่ฃๅ‰ช็š„ๅ›พๅƒๅคงๅฐ + dict( # Flip ็š„้…็ฝฎ + type='Flip', # ็ฟป่ฝฌ็š„ๆต็จ‹ + flip_ratio=0), # ๅฎžๆ–ฝ็ฟป่ฝฌ็š„ๆฆ‚็އ + dict( # FormatShape ็š„้…็ฝฎ + type='FormatShape', # ๆ ผๅผๅŒ–ๅฝข็Šถ็š„ๆต็จ‹๏ผŒๅฐ†ๆœ€็ปˆๅ›พๅƒๅฝข็Šถๆ ผๅผๅŒ–ไธบ็ป™ๅฎš็š„ input_format + input_format='NCHW'), # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + dict(type='PackActionInputs') # PackActionInputs ็š„้…็ฝฎ + ] + + train_dataloader = dict( # ่ฎญ็ปƒๆ•ฐๆฎๅŠ ่ฝฝๅ™จ็š„้…็ฝฎ + batch_size=32, # ่ฎญ็ปƒๆ—ถๆฏไธชๅ•ไธช GPU ็š„ๆ‰น้‡ๅคงๅฐ + num_workers=8, # ่ฎญ็ปƒๆ—ถๆฏไธชๅ•ไธช GPU ็š„ๆ•ฐๆฎ้ข„ๅ–่ฟ›็จ‹ๆ•ฐ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญๅทฅไฝœ่ฟ›็จ‹๏ผŒ่ฟ™ๅฏไปฅๅŠ ้€Ÿ่ฎญ็ปƒ้€Ÿๅบฆ + sampler=dict( + type='DefaultSampler', # ๆ”ฏๆŒๅˆ†ๅธƒๅผๅ’Œ้žๅˆ†ๅธƒๅผ่ฎญ็ปƒ็š„ DefaultSamplerใ€‚ๅ‚่€ƒ https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # ๆฏไธช epoch ้šๆœบๆ‰“ไนฑ่ฎญ็ปƒๆ•ฐๆฎ + dataset=dict( # ่ฎญ็ปƒๆ•ฐๆฎ้›†็š„้…็ฝฎ + type=dataset_type, + ann_file=ann_file_train, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(img=data_root), # ๅธง่ทฏๅพ„็š„ๅ‰็ผ€ + pipeline=train_pipeline)) + val_dataloader = dict( # ้ชŒ่ฏๆ•ฐๆฎๅŠ ่ฝฝๅ™จ็š„้…็ฝฎ + batch_size=1, # ้ชŒ่ฏๆ—ถๆฏไธชๅ•ไธช GPU ็š„ๆ‰น้‡ๅคงๅฐ + num_workers=8, # ้ชŒ่ฏๆ—ถๆฏไธชๅ•ไธช GPU ็š„ๆ•ฐๆฎ้ข„ๅ–่ฟ›็จ‹ๆ•ฐ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญๅทฅไฝœ่ฟ›็จ‹ + sampler=dict( + type='DefaultSampler', + shuffle=False), # ้ชŒ่ฏๅ’Œๆต‹่ฏ•ๆ—ถไธ่ฟ›่กŒ้šๆœบๆ‰“ไนฑ + dataset=dict( # ้ชŒ่ฏๆ•ฐๆฎ้›†็š„้…็ฝฎ + type=dataset_type, + ann_file=ann_file_val, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(img=data_root_val), # ๅธง่ทฏๅพ„็š„ๅ‰็ผ€ + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = dict( # ๆต‹่ฏ•ๆ•ฐๆฎๅŠ ่ฝฝๅ™จ็š„้…็ฝฎ + batch_size=32, # ๆต‹่ฏ•ๆ—ถๆฏไธชๅ•ไธช GPU ็š„ๆ‰น้‡ๅคงๅฐ + num_workers=8, # ๆต‹่ฏ•ๆ—ถๆฏไธชๅ•ไธช GPU ็š„ๆ•ฐๆฎ้ข„ๅ–่ฟ›็จ‹ๆ•ฐ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญๅทฅไฝœ่ฟ›็จ‹ + sampler=dict( + type='DefaultSampler', + shuffle=False), # ้ชŒ่ฏๅ’Œๆต‹่ฏ•ๆ—ถไธ่ฟ›่กŒ้šๆœบๆ‰“ไนฑ + dataset=dict( # ๆต‹่ฏ•ๆ•ฐๆฎ้›†็š„้…็ฝฎ + type=dataset_type, + ann_file=ann_file_val, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(img=data_root_val), # 
ๅธง่ทฏๅพ„็š„ๅ‰็ผ€ + pipeline=test_pipeline, + test_mode=True)) + + # ่ฏ„ไผฐ่ฎพ็ฝฎ + val_evaluator = dict(type='AccMetric') # ้ชŒ่ฏ่ฏ„ไผฐๅ™จ็š„้…็ฝฎ + test_evaluator = val_evaluator # ๆต‹่ฏ•่ฏ„ไผฐๅ™จ็š„้…็ฝฎ + + train_cfg = dict( # ่ฎญ็ปƒๅพช็Žฏ็š„้…็ฝฎ + type='EpochBasedTrainLoop', # ่ฎญ็ปƒๅพช็Žฏ็š„ๅ็งฐ + max_epochs=100, # ๆ€ป็š„่ฎญ็ปƒๅ‘จๆœŸๆ•ฐ + val_begin=1, # ๅผ€ๅง‹้ชŒ่ฏ็š„่ฎญ็ปƒๅ‘จๆœŸ + val_interval=1) # ้ชŒ่ฏ้—ด้š” + val_cfg = dict( # ้ชŒ่ฏๅพช็Žฏ็š„้…็ฝฎ + type='ValLoop') # ้ชŒ่ฏๅพช็Žฏ็š„ๅ็งฐ + test_cfg = dict( # ๆต‹่ฏ•ๅพช็Žฏ็š„้…็ฝฎ + type='TestLoop') # ๆต‹่ฏ•ๅพช็Žฏ็š„ๅ็งฐ + + # ๅญฆไน ็ญ–็•ฅ + param_scheduler = [ # ๆ›ดๆ–ฐไผ˜ๅŒ–ๅ™จๅ‚ๆ•ฐ็š„ๅญฆไน ็އๆต‹็އ๏ผŒๆ”ฏๆŒๅญ—ๅ…ธๆˆ–ๅˆ—่กจ + dict(type='MultiStepLR', # ่พพๅˆฐไธ€ไธช้‡Œ็จ‹็ข‘ๆ—ถ่กฐๅ‡ๅญฆไน ็އ + begin=0, # ๅผ€ๅง‹ๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + end=100, # ็ป“ๆŸๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + by_epoch=True, # ๆ˜ฏๅฆๆŒ‰ epoch ๆ›ดๆ–ฐๅญฆไน ็އ + milestones=[40, 80], # ่กฐๅ‡ๅญฆไน ็އ็š„ๆญฅ้ชค + gamma=0.1)] # ๅญฆไน ็އ่กฐๅ‡็š„ไน˜ๆณ•ๅ› ๅญ + + # ไผ˜ๅŒ–ๅ™จ + optim_wrapper = dict( # ไผ˜ๅŒ–ๅ™จๅŒ…่ฃ…ๅ™จ็š„้…็ฝฎ + type='OptimWrapper', # ไผ˜ๅŒ–ๅ™จๅŒ…่ฃ…ๅ™จ็š„ๅ็งฐ๏ผŒๅˆ‡ๆขๅˆฐ AmpOptimWrapper ๅฏไปฅๅฏ็”จๆททๅˆ็ฒพๅบฆ่ฎญ็ปƒ + optimizer=dict( # ไผ˜ๅŒ–ๅ™จ็š„้…็ฝฎใ€‚ๆ”ฏๆŒ PyTorch ไธญ็š„ๅ„็งไผ˜ๅŒ–ๅ™จใ€‚ๅ‚่€ƒ https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # ไผ˜ๅŒ–ๅ™จ็š„ๅ็งฐ + lr=0.01, # ๅญฆไน ็އ + momentum=0.9, # ๅŠจ้‡ๅ› ๅญ + weight_decay=0.0001), # ๆƒ้‡่กฐๅ‡ + clip_grad=dict(max_norm=40, norm_type=2)) # ๆขฏๅบฆ่ฃๅ‰ช็š„้…็ฝฎ + + # ่ฟ่กŒๆ—ถ่ฎพ็ฝฎ + default_scope = 'mmaction' # ็”จไบŽๆŸฅๆ‰พๆจกๅ—็š„้ป˜่ฎคๆณจๅ†Œ่กจไฝœ็”จๅŸŸใ€‚ๅ‚่€ƒ https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # ๆ‰ง่กŒ้ป˜่ฎคๆ“ไฝœ็š„้’ฉๅญ๏ผŒๅฆ‚ๆ›ดๆ–ฐๆจกๅž‹ๅ‚ๆ•ฐๅ’Œไฟๅญ˜ๆƒ้‡ใ€‚ + runtime_info=dict(type='RuntimeInfoHook'), # ๅฐ†่ฟ่กŒๆ—ถไฟกๆฏๆ›ดๆ–ฐๅˆฐๆถˆๆฏไธญๅฟƒ็š„้’ฉๅญ + timer=dict(type='IterTimerHook'), # ็”จไบŽ่ฎฐๅฝ•่ฟญไปฃ่ฟ‡็จ‹ไธญ่Šฑ่ดน็š„ๆ—ถ้—ด็š„ๆ—ฅๅฟ—่ฎฐๅฝ•ๅ™จ + logger=dict( + type='LoggerHook', # ็”จไบŽ่ฎฐๅฝ•่ฎญ็ปƒ/้ชŒ่ฏ/ๆต‹่ฏ•้˜ถๆฎต็š„ๆ—ฅๅฟ—่ฎฐๅฝ•ๅ™จ + interval=20, # ๆ‰“ๅฐๆ—ฅๅฟ—็š„้—ด้š” + ignore_last=False), # ๅฟฝ็•ฅๆฏไธช epoch ไธญๆœ€ๅŽๅ‡ ไธช่ฟญไปฃ็š„ๆ—ฅๅฟ— + param_scheduler=dict(type='ParamSchedulerHook'), # ๆ›ดๆ–ฐไผ˜ๅŒ–ๅ™จไธญๆŸไบ›่ถ…ๅ‚ๆ•ฐ็š„้’ฉๅญ + checkpoint=dict( + type='CheckpointHook', # ๅฎšๆœŸไฟๅญ˜ๆƒ้‡็š„้’ฉๅญ + interval=3, # ไฟๅญ˜็š„ๅ‘จๆœŸ + save_best='auto', # ็”จไบŽ่ฏ„ไผฐๆœ€ไฝณๆƒ้‡็š„ๆŒ‡ๆ ‡ + max_keep_ckpts=3), # ไฟ็•™็š„ๆœ€ๅคงๆƒ้‡ๆ–‡ไปถๆ•ฐ้‡ + sampler_seed=dict(type='DistSamplerSeedHook'), # ็”จไบŽๅˆ†ๅธƒๅผ่ฎญ็ปƒ็š„ๆ•ฐๆฎๅŠ ่ฝฝ้‡‡ๆ ทๅ™จ + sync_buffers=dict(type='SyncBuffersHook')) # ๅœจๆฏไธช epoch ็ป“ๆŸๆ—ถๅŒๆญฅๆจกๅž‹็ผ“ๅ†ฒๅŒบ + + env_cfg = dict( # ่ฎพ็ฝฎ็Žฏๅขƒ็š„ๅญ—ๅ…ธ + cudnn_benchmark=False, # ๆ˜ฏๅฆๅฏ็”จ cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # ่ฎพ็ฝฎๅคš่ฟ›็จ‹็š„ๅ‚ๆ•ฐ + dist_cfg=dict(backend='nccl')) # ่ฎพ็ฝฎๅˆ†ๅธƒๅผ็Žฏๅขƒ็š„ๅ‚ๆ•ฐ๏ผŒไนŸๅฏไปฅ่ฎพ็ฝฎ็ซฏๅฃๅท + + log_processor = dict( + type='LogProcessor', # ็”จไบŽๆ ผๅผๅŒ–ๆ—ฅๅฟ—ไฟกๆฏ็š„ๆ—ฅๅฟ—ๅค„็†ๅ™จ + window_size=20, # ้ป˜่ฎค็š„ๅนณๆป‘้—ด้š” + by_epoch=True) # ๆ˜ฏๅฆไฝฟ็”จ epoch ็ฑปๅž‹ๆ ผๅผๅŒ–ๆ—ฅๅฟ— + vis_backends = [ # ๅฏ่ง†ๅŒ–ๅŽ็ซฏ็š„ๅˆ—่กจ + dict(type='LocalVisBackend')] # ๆœฌๅœฐๅฏ่ง†ๅŒ–ๅŽ็ซฏ + visualizer = dict( # ๅฏ่ง†ๅŒ–ๅ™จ็š„้…็ฝฎ + type='ActionVisualizer', # ๅฏ่ง†ๅŒ–ๅ™จ็š„ๅ็งฐ + vis_backends=vis_backends) + log_level = 'INFO' # ๆ—ฅๅฟ—่ฎฐๅฝ•็š„็บงๅˆซ + load_from = None # ไปŽ็ป™ๅฎš่ทฏๅพ„ๅŠ 
่ฝฝๆจกๅž‹ๆƒ้‡ไฝœไธบ้ข„่ฎญ็ปƒๆจกๅž‹ใ€‚่ฟ™ไธไผšๆขๅค่ฎญ็ปƒใ€‚ + resume = False # ๆ˜ฏๅฆไปŽ `load_from` ไธญๅฎšไน‰็š„ๆƒ้‡ๆขๅคใ€‚ๅฆ‚ๆžœ `load_from` ไธบ None๏ผŒๅˆ™ไผšไปŽ `work_dir` ไธญๆขๅคๆœ€ๆ–ฐ็š„ๆƒ้‡ใ€‚ + ``` + +### ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹็š„้…็ฝฎ็ณป็ปŸ + +ๆˆ‘ไปฌๅฐ†ๆจกๅ—ๅŒ–่ฎพ่ฎก่žๅ…ฅๆˆ‘ไปฌ็š„้…็ฝฎ็ณป็ปŸไธญ๏ผŒ่ฟ™ๆ–นไพฟ่ฟ›่กŒๅ„็งๅฎž้ชŒใ€‚ + +- FastRCNN ็š„็คบไพ‹ + + ไธบไบ†ๅธฎๅŠฉ็”จๆˆทๅฏนๅฎŒๆ•ด็š„้…็ฝฎ็ป“ๆž„ๅ’Œๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹็ณป็ปŸไธญ็š„ๆจกๅ—ๆœ‰ไธ€ไธชๅŸบๆœฌ็š„ไบ†่งฃ๏ผŒ + ๆˆ‘ไปฌๅฏน FastRCNN ็š„้…็ฝฎ่ฟ›่กŒ็ฎ€่ฆๆณจ้‡Šๅฆ‚ไธ‹ใ€‚ๆœ‰ๅ…ณๆฏไธชๆจกๅ—ไธญๆฏไธชๅ‚ๆ•ฐ็š„ๆ›ด่ฏฆ็ป†็”จๆณ•ๅ’Œๆ›ฟไปฃๆ–นๆณ•๏ผŒ่ฏทๅ‚้˜… API ๆ–‡ๆกฃใ€‚ + + ```python + # ๆจกๅž‹่ฎพ็ฝฎ + model = dict( # ๆจกๅž‹็š„้…็ฝฎ + type='FastRCNN', # ๆฃ€ๆต‹ๅ™จ็š„็ฑปๅ + _scope_='mmdet', # ๅฝ“ๅ‰้…็ฝฎ็š„่Œƒๅ›ด + backbone=dict( # ้ชจๅนฒ็ฝ‘็ปœ็š„้…็ฝฎ + type='ResNet3dSlowOnly', # ้ชจๅนฒ็ฝ‘็ปœ็š„ๅ็งฐ + depth=50, # ResNet ๆจกๅž‹็š„ๆทฑๅบฆ + pretrained=None, # ้ข„่ฎญ็ปƒๆจกๅž‹็š„ URL/็ฝ‘็ซ™ + pretrained2d=False, # ๅฆ‚ๆžœ้ข„่ฎญ็ปƒๆจกๅž‹ๆ˜ฏ 2D ็š„ + lateral=False, # ๅฆ‚ๆžœ้ชจๅนฒ็ฝ‘็ปœๅธฆๆœ‰ๆจชๅ‘่ฟžๆŽฅ + num_stages=4, # ResNet ๆจกๅž‹็š„้˜ถๆฎตๆ•ฐ + conv1_kernel=(1, 7, 7), # Conv1 ็š„ๅท็งฏๆ ธๅคงๅฐ + conv1_stride_t=1, # Conv1 ็š„ๆ—ถ้—ดๆญฅ้•ฟ + pool1_stride_t=1, # Pool1 ็š„ๆ—ถ้—ดๆญฅ้•ฟ + spatial_strides=(1, 2, 2, 1)), # ๆฏไธช ResNet ้˜ถๆฎต็š„็ฉบ้—ดๆญฅ้•ฟ + roi_head=dict( # roi_head ็š„้…็ฝฎ + type='AVARoIHead', # roi_head ็š„ๅ็งฐ + bbox_roi_extractor=dict( # bbox_roi_extractor ็š„้…็ฝฎ + type='SingleRoIExtractor3D', # bbox_roi_extractor ็š„ๅ็งฐ + roi_layer_type='RoIAlign', # RoI ๆ“ไฝœ็š„็ฑปๅž‹ + output_size=8, # RoI ๆ“ไฝœ็š„่พ“ๅ‡บ็‰นๅพๅคงๅฐ + with_temporal_pool=True), # ๆ˜ฏๅฆ่ฟ›่กŒๆ—ถ้—ด็ปดๅบฆ็š„ๆฑ ๅŒ– + bbox_head=dict( # bbox_head ็š„้…็ฝฎ + type='BBoxHeadAVA', # bbox_head ็š„ๅ็งฐ + in_channels=2048, # ่พ“ๅ…ฅ็‰นๅพ็š„้€š้“ๆ•ฐ + num_classes=81, # ๅŠจไฝœ็ฑปๅˆซๆ•ฐ + 1 + multilabel=True, # ๆ•ฐๆฎ้›†ๆ˜ฏๅฆไธบๅคšๆ ‡็ญพ + dropout_ratio=0.5), # ไฝฟ็”จ็š„ dropout ๆฏ”ไพ‹ + data_preprocessor=dict( # ๆ•ฐๆฎ้ข„ๅค„็†ๅ™จ็š„้…็ฝฎ + type='ActionDataPreprocessor', # ๆ•ฐๆฎ้ข„ๅค„็†ๅ™จ็š„ๅ็งฐ + mean=[123.675, 116.28, 103.53], # ไธๅŒ้€š้“็š„ๅ‡ๅ€ผ็”จไบŽๅฝ’ไธ€ๅŒ– + std=[58.395, 57.12, 57.375], # ไธๅŒ้€š้“็š„ๆ ‡ๅ‡†ๅทฎ็”จไบŽๅฝ’ไธ€ๅŒ– + format_shape='NCHW')) # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', # ๅˆ†้…ๅ™จ็š„ๅ็งฐ + pos_iou_thr=0.9, # ๆญฃๆ ทๆœฌ็š„ IoU ้˜ˆๅ€ผ๏ผŒ> pos_iou_thr -> ๆญฃๆ ทๆœฌ + neg_iou_thr=0.9, # ่ดŸๆ ทๆœฌ็š„ IoU ้˜ˆๅ€ผ๏ผŒ< neg_iou_thr -> ่ดŸๆ ทๆœฌ + min_pos_iou=0.9), # ๆญฃๆ ทๆœฌ็š„ๆœ€ๅฐๅฏๆŽฅๅ— IoU + sampler=dict( + type='RandomSampler', # ้‡‡ๆ ทๅ™จ็š„ๅ็งฐ + num=32, # ้‡‡ๆ ทๅ™จ็š„ๆ‰นๅค„็†ๅคงๅฐ + pos_fraction=1, # ้‡‡ๆ ทๅ™จ็š„ๆญฃๆ ทๆœฌๆฏ”ไพ‹ + neg_pos_ub=-1, # ่ดŸๆ ทๆœฌไธŽๆญฃๆ ทๆœฌๆ•ฐ้‡ๆฏ”็އ็š„ไธŠ้™ + add_gt_as_proposals=True), # ๅฐ† gt ่พน็•Œๆก†ๆทปๅŠ ๅˆฐ proposals ไธญ + pos_weight=1.0)), # ๆญฃๆ ทๆœฌ็š„ๆŸๅคฑๆƒ้‡ + test_cfg=dict(rcnn=None)) # ๆต‹่ฏ•็š„้…็ฝฎ + + # ๆ•ฐๆฎ้›†่ฎพ็ฝฎ + dataset_type = 'AVADataset' # ่ฎญ็ปƒใ€้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„ๆ•ฐๆฎ้›†็ฑปๅž‹ + data_root = 'data/ava/rawframes' # ๆ•ฐๆฎ็š„ๆ น็›ฎๅฝ• + anno_root = 'data/ava/annotations' # ๆณจ้‡Š็š„ๆ น็›ฎๅฝ• + + ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # ่ฎญ็ปƒๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # ้ชŒ่ฏๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + + exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # ่ฎญ็ปƒๆŽ’้™คๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # 
้ชŒ่ฏๆŽ’้™คๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + + label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # ๆ ‡็ญพๆ–‡ไปถ็š„่ทฏๅพ„ + + proposal_file_train = f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # ่ฎญ็ปƒ็คบไพ‹็š„ไบบไฝ“ๆฃ€ๆต‹ proposals ๆ–‡ไปถ็š„่ทฏๅพ„ + proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # ้ชŒ่ฏ็คบไพ‹็š„ไบบไฝ“ๆฃ€ๆต‹ proposals ๆ–‡ไปถ็š„่ทฏๅพ„ + + train_pipeline = [ + dict( + type='AVASampleFrames', # ไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅธง็š„็ฎก้“ + clip_len=4, # ๆฏไธช้‡‡ๆ ท่พ“ๅ‡บ็š„ๅธงๆ•ฐ + frame_interval=16), # ็›ธ้‚ป้‡‡ๆ ทๅธงไน‹้—ด็š„ๆ—ถ้—ด้—ด้š” + dict( + type='RawFrameDecode'), # ๅŠ ่ฝฝๅ’Œ่งฃ็ ๅธง็š„็ฎก้“๏ผŒไฝฟ็”จ็ป™ๅฎš็š„็ดขๅผ•้€‰ๆ‹ฉๅŽŸๅง‹ๅธง + dict( + type='RandomRescale', # ้šๆœบ็ผฉๆ”พ็Ÿญ่พน + scale_range=(256, 320)), # ้šๆœบ็ผฉๆ”พ็š„็Ÿญ่พนๅฐบๅฏธ่Œƒๅ›ด + dict( + type='RandomCrop', # ้šๆœบ่ฃๅ‰ช็ป™ๅฎšๅคงๅฐ็š„่กฅไธ + size=256), # ่ฃๅ‰ช่กฅไธ็š„ๅคงๅฐ + dict( + type='Flip', # ็ฟป่ฝฌ็ฎก้“ + flip_ratio=0.5), # ็ฟป่ฝฌ็š„ๆฆ‚็އ + dict( + type='FormatShape', # ๆ ผๅผๅŒ–ๅฝข็Šถ็š„็ฎก้“๏ผŒๅฐ†ๆœ€็ปˆๅ›พๅƒๅฝข็Šถๆ ผๅผๅŒ–ไธบ็ป™ๅฎš็š„่พ“ๅ…ฅๆ ผๅผ + input_format='NCTHW', # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + collapse=True), # ๅฆ‚ๆžœ N == 1๏ผŒๅˆ™ๅ‡ๅฐ‘็ปดๅบฆ N + dict(type='PackActionInputs') # ๆ‰“ๅŒ…่พ“ๅ…ฅๆ•ฐๆฎ + ] + + val_pipeline = [ + dict( + type='AVASampleFrames', # ไปŽ่ง†้ข‘ไธญ้‡‡ๆ ทๅธง็š„็ฎก้“ + clip_len=4, # ๆฏไธช้‡‡ๆ ท่พ“ๅ‡บ็š„ๅธงๆ•ฐ + frame_interval=16), # ็›ธ้‚ป้‡‡ๆ ทๅธงไน‹้—ด็š„ๆ—ถ้—ด้—ด้š” + dict( + type='RawFrameDecode'), # ๅŠ ่ฝฝๅ’Œ่งฃ็ ๅธง็š„็ฎก้“๏ผŒไฝฟ็”จ็ป™ๅฎš็š„็ดขๅผ•้€‰ๆ‹ฉๅŽŸๅง‹ๅธง + dict( + type='Resize', # ่ฐƒๆ•ดๅคงๅฐ็š„็ฎก้“ + scale=(-1, 256)), # ่ฐƒๆ•ดๅ›พๅƒ็š„ๅฐบๅบฆ + dict( + type='FormatShape', # ๆ ผๅผๅŒ–ๅฝข็Šถ็š„็ฎก้“๏ผŒๅฐ†ๆœ€็ปˆๅ›พๅƒๅฝข็Šถๆ ผๅผๅŒ–ไธบ็ป™ๅฎš็š„่พ“ๅ…ฅๆ ผๅผ + input_format='NCTHW', # ๆœ€็ปˆๅ›พๅƒๅฝข็Šถ็š„ๆ ผๅผ + collapse=True), # ๅฆ‚ๆžœ N == 1๏ผŒๅˆ™ๅ‡ๅฐ‘็ปดๅบฆ N + dict(type='PackActionInputs') # ๆ‰“ๅŒ…่พ“ๅ…ฅๆ•ฐๆฎ + ] + + train_dataloader = dict( + batch_size=32, # ๆฏไธชๅ• GPU ่ฎญ็ปƒ็š„ๆ‰นๅค„็†ๅคงๅฐ + num_workers=8, # ๆฏไธชๅ• GPU ่ฎญ็ปƒๆ—ถ้ข„ๅ–ๆ•ฐๆฎ็š„ worker ๆ•ฐ้‡ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญ worker ่ฟ›็จ‹๏ผŒ่ฟ™ๅฏไปฅๅŠ ๅฟซ่ฎญ็ปƒ้€Ÿๅบฆ + sampler=dict( + type='DefaultSampler', # ้ป˜่ฎค้‡‡ๆ ทๅ™จ๏ผŒๆ”ฏๆŒๅˆ†ๅธƒๅผๅ’Œ้žๅˆ†ๅธƒๅผ่ฎญ็ปƒใ€‚ๅ‚่€ƒ https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # ๅœจๆฏไธช epoch ไธญ้šๆœบๆ‰“ไนฑ่ฎญ็ปƒๆ•ฐๆฎ + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + exclude_file=exclude_file_train, # ๆŽ’้™คๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + label_file=label_file, # ๆ ‡็ญพๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(img=data_root), # ๅธง่ทฏๅพ„็š„ๅ‰็ผ€ + proposal_file=proposal_file_train, # ไบบไฝ“ๆฃ€ๆต‹ proposals ็š„่ทฏๅพ„ + pipeline=train_pipeline) + ) + val_dataloader = dict( + batch_size=1, # ๆฏไธชๅ• GPU ่ฏ„ไผฐ็š„ๆ‰นๅค„็†ๅคงๅฐ + num_workers=8, # ๆฏไธชๅ• GPU ่ฏ„ไผฐๆ—ถ้ข„ๅ–ๆ•ฐๆฎ็š„ worker ๆ•ฐ้‡ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญ worker ่ฟ›็จ‹ + sampler=dict( + type='DefaultSampler', + shuffle=False), # ๅœจ้ชŒ่ฏๅ’Œๆต‹่ฏ•ๆ—ถไธๆ‰“ไนฑๆ•ฐๆฎ + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + exclude_file=exclude_file_val, # ๆŽ’้™คๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + label_file=label_file, # ๆ ‡็ญพๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(img=data_root_val), # ๅธง่ทฏๅพ„็š„ๅ‰็ผ€ + proposal_file=proposal_file_val, # ไบบไฝ“ๆฃ€ๆต‹ 
proposals ็š„่ทฏๅพ„ + pipeline=val_pipeline, + test_mode=True) + ) + test_dataloader = val_dataloader # ๆต‹่ฏ•ๆ•ฐๆฎๅŠ ่ฝฝๅ™จ็š„้…็ฝฎ + + # ่ฏ„ไผฐ่ฎพ็ฝฎ + val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) + test_evaluator = val_evaluator # ๆต‹่ฏ•่ฏ„ไผฐๅ™จ็š„้…็ฝฎ + + train_cfg = dict( + type='EpochBasedTrainLoop', # ่ฎญ็ปƒๅพช็Žฏ็š„ๅ็งฐ + max_epochs=20, # ๆ€ป็š„่ฎญ็ปƒ epoch ๆ•ฐ้‡ + val_begin=1, # ๅผ€ๅง‹้ชŒ่ฏ็š„ epoch + val_interval=1) # ้ชŒ่ฏ็š„้—ด้š” + val_cfg = dict( + type='ValLoop') # ้ชŒ่ฏๅพช็Žฏ็š„ๅ็งฐ + test_cfg = dict( + type='TestLoop') # ๆต‹่ฏ•ๅพช็Žฏ็š„ๅ็งฐ + + # ๅญฆไน ็ญ–็•ฅ + param_scheduler = [ + dict( + type='LinearLR', # ็บฟๆ€งๅ‡ๅฐ‘ๆฏไธชๅ‚ๆ•ฐ็ป„็š„ๅญฆไน ็އ + start_factor=0.1, # ็ฌฌไธ€ไธช epoch ไธญๅญฆไน ็އ็š„ไน˜ๆณ•ๅ› ๅญ + by_epoch=True, # ๆ˜ฏๅฆๆŒ‰ epoch ๆ›ดๆ–ฐๅญฆไน ็އ + begin=0, # ๅผ€ๅง‹ๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + end=5), # ๅœๆญขๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + dict( + type='MultiStepLR', # ๅฝ“ epoch ๆ•ฐ่พพๅˆฐ้‡Œ็จ‹็ข‘ๆ—ถ๏ผŒๅ‡ๅฐ‘ๅญฆไน ็އ + begin=0, # ๅผ€ๅง‹ๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + end=20, # ๅœๆญขๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + by_epoch=True, # ๆ˜ฏๅฆๆŒ‰ epoch ๆ›ดๆ–ฐๅญฆไน ็އ + milestones=[10, 15], # ๅญฆไน ็އ่กฐๅ‡็š„ๆญฅ้ชค + gamma=0.1) # ๅญฆไน ็އ่กฐๅ‡็š„ไน˜ๆณ•ๅ› ๅญ + ] + + # ไผ˜ๅŒ–ๅ™จ + optim_wrapper = dict( + type='OptimWrapper', # ไผ˜ๅŒ–ๅ™จๅŒ…่ฃ…ๅ™จ็š„ๅ็งฐ๏ผŒๅˆ‡ๆขๅˆฐ AmpOptimWrapper ไปฅๅฏ็”จๆททๅˆ็ฒพๅบฆ่ฎญ็ปƒ + optimizer=dict( + type='SGD', # ไผ˜ๅŒ–ๅ™จ็š„ๅ็งฐ + lr=0.2, # ๅญฆไน ็އ + momentum=0.9, # ๅŠจ้‡ๅ› ๅญ + weight_decay=0.0001), # ๆƒ้‡่กฐๅ‡ + clip_grad=dict(max_norm=40, norm_type=2)) # ๆขฏๅบฆๅ‰ช่ฃ็š„้…็ฝฎ + + # ่ฟ่กŒๆ—ถ่ฎพ็ฝฎ + default_scope = 'mmaction' # ้ป˜่ฎคๆณจๅ†Œ่กจ่Œƒๅ›ด๏ผŒ็”จไบŽๆŸฅๆ‰พๆจกๅ—ใ€‚ๅ‚่€ƒ https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), # ๅฐ†่ฟ่กŒๆ—ถไฟกๆฏๆ›ดๆ–ฐๅˆฐๆถˆๆฏไธญๅฟƒ็š„้’ฉๅญ + timer=dict(type='IterTimerHook'), # ็”จไบŽ่ฎฐๅฝ•่ฟญไปฃ่ฟ‡็จ‹ไธญ่Šฑ่ดน็š„ๆ—ถ้—ด็š„ๆ—ฅๅฟ—่ฎฐๅฝ•ๅ™จ + logger=dict( + type='LoggerHook', # ็”จไบŽ่ฎฐๅฝ•่ฎญ็ปƒ/้ชŒ่ฏ/ๆต‹่ฏ•้˜ถๆฎต็š„ๆ—ฅๅฟ—็š„ๆ—ฅๅฟ—่ฎฐๅฝ•ๅ™จ + interval=20, # ๆ‰“ๅฐๆ—ฅๅฟ—็š„้—ด้š” + ignore_last=False), # ๅฟฝ็•ฅๆฏไธช epoch ไธญๆœ€ๅŽๅ‡ ๆฌก่ฟญไปฃ็š„ๆ—ฅๅฟ— + param_scheduler=dict(type='ParamSchedulerHook'), # ๆ›ดๆ–ฐไผ˜ๅŒ–ๅ™จไธญ็š„ๆŸไบ›่ถ…ๅ‚ๆ•ฐ็š„้’ฉๅญ + checkpoint=dict( + type='CheckpointHook', # ๅฎšๆœŸไฟๅญ˜ๆƒ้‡็š„้’ฉๅญ + interval=3, # ไฟๅญ˜ๅ‘จๆœŸ + save_best='auto', # ๅœจ่ฏ„ไผฐ่ฟ‡็จ‹ไธญๆต‹้‡ๆœ€ไฝณๆƒ้‡็š„ๆŒ‡ๆ ‡ + max_keep_ckpts=3), # ไฟ็•™็š„ๆœ€ๅคงๆƒ้‡ๆ–‡ไปถๆ•ฐ้‡ + sampler_seed=dict(type='DistSamplerSeedHook'), # ็”จไบŽๅˆ†ๅธƒๅผ่ฎญ็ปƒ็š„ๆ•ฐๆฎๅŠ ่ฝฝ้‡‡ๆ ทๅ™จ + sync_buffers=dict(type='SyncBuffersHook')) # ๅœจๆฏไธช epoch ็ป“ๆŸๆ—ถๅŒๆญฅๆจกๅž‹็ผ“ๅ†ฒๅŒบ็š„้’ฉๅญ + env_cfg = dict( + cudnn_benchmark=False, # ๆ˜ฏๅฆๅฏ็”จ cudnn ็š„ๅŸบๅ‡†ๆต‹่ฏ• + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # ่ฎพ็ฝฎๅคš่ฟ›็จ‹็š„ๅ‚ๆ•ฐ + dist_cfg=dict(backend='nccl')) # ่ฎพ็ฝฎๅˆ†ๅธƒๅผ็Žฏๅขƒ็š„ๅ‚ๆ•ฐ๏ผŒไนŸๅฏไปฅ่ฎพ็ฝฎ็ซฏๅฃ + + log_processor = dict( + type='LogProcessor', # ็”จไบŽๆ ผๅผๅŒ–ๆ—ฅๅฟ—ไฟกๆฏ็š„ๆ—ฅๅฟ—ๅค„็†ๅ™จ + window_size=20, # ้ป˜่ฎคๅนณๆป‘้—ด้š” + by_epoch=True) # ๆ˜ฏๅฆไฝฟ็”จ epoch ็ฑปๅž‹ๆ ผๅผๅŒ–ๆ—ฅๅฟ— + vis_backends = [ + dict(type='LocalVisBackend')] # ๅฏ่ง†ๅŒ–ๅŽ็ซฏ็š„ๅˆ—่กจ + visualizer = dict( + type='ActionVisualizer', # ๅฏ่ง†ๅŒ–ๅ™จ็š„ๅ็งฐ + vis_backends=vis_backends) + log_level = 'INFO' # ๆ—ฅๅฟ—็บงๅˆซ + load_from = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 
'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth') # ไปŽ็ป™ๅฎš่ทฏๅพ„ๅŠ ่ฝฝๆจกๅž‹ๆƒ้‡ไฝœไธบ้ข„่ฎญ็ปƒๆจกๅž‹ใ€‚่ฟ™ไธไผšๆขๅค่ฎญ็ปƒใ€‚ + resume = False # ๆ˜ฏๅฆไปŽ `load_from` ไธญๅฎšไน‰็š„ๆƒ้‡ๆขๅค่ฎญ็ปƒใ€‚ๅฆ‚ๆžœ `load_from` ไธบ None๏ผŒๅˆ™ไผšไปŽ `work_dir` ไธญๆขๅคๆœ€ๆ–ฐ็š„ๆƒ้‡ใ€‚ + ``` + +### ๅŠจไฝœๅฎšไฝ็š„้…็ฝฎ็ณป็ปŸ + +ๆˆ‘ไปฌๅฐ†ๆจกๅ—ๅŒ–่ฎพ่ฎกๅผ•ๅ…ฅไบ†้…็ฝฎ็ณป็ปŸไธญ๏ผŒๆ–นไพฟ่ฟ›่กŒๅ„็งๅฎž้ชŒใ€‚ + +- BMN ็š„็คบไพ‹ + + ไธบไบ†ๅธฎๅŠฉ็”จๆˆทๅฏนๅฎŒๆ•ด็š„้…็ฝฎ็ป“ๆž„ๅ’ŒๅŠจไฝœๅฎšไฝ็ณป็ปŸไธญ็š„ๆจกๅ—ๆœ‰ไธ€ไธชๅŸบๆœฌ็š„ไบ†่งฃ๏ผŒๆˆ‘ไปฌๅฏน BMN ็š„้…็ฝฎ่ฟ›่กŒไบ†็ฎ€่ฆๆณจ้‡Š๏ผŒๅ…ทไฝ“ๅฆ‚ไธ‹ๆ‰€็คบใ€‚ๆœ‰ๅ…ณๆฏไธชๆจกๅ—ไธญๆฏไธชๅ‚ๆ•ฐ็š„ๆ›ด่ฏฆ็ป†็”จๆณ•ๅ’Œๆ›ฟไปฃๆ–นๆณ•๏ผŒ่ฏทๅ‚้˜… [API ๆ–‡ๆกฃ](https://mmaction2.readthedocs.io/en/latest/api.html)ใ€‚ + + ```python + # ๆจกๅž‹่ฎพ็ฝฎ + model = dict( + type='BMN', # ๅฎšไฝๅ™จ็š„็ฑปๅ + temporal_dim=100, # ๆฏไธช่ง†้ข‘้€‰ๅ–็š„ๆ€ปๅธงๆ•ฐ + boundary_ratio=0.5, # ็กฎๅฎš่ง†้ข‘่พน็•Œ็š„ๆฏ”็އ + num_samples=32, # ๆฏไธช proposal ็š„้‡‡ๆ ทๆ•ฐ้‡ + num_samples_per_bin=3, # ๆฏไธช้‡‡ๆ ท็š„ bin ็š„้‡‡ๆ ทๆ•ฐ้‡ + feat_dim=400, # ็‰นๅพ็š„็ปดๅบฆ + soft_nms_alpha=0.4, # Soft NMS ็š„ alpha ๅ€ผ + soft_nms_low_threshold=0.5, # Soft NMS ็š„ไฝŽ้˜ˆๅ€ผ + soft_nms_high_threshold=0.9, # Soft NMS ็š„้ซ˜้˜ˆๅ€ผ + post_process_top_k=100) # ๅŽๅค„็†ไธญ็š„ top-k proposal ๆ•ฐ้‡ + + # ๆ•ฐๆฎ้›†่ฎพ็ฝฎ + dataset_type = 'ActivityNetDataset' # ็”จไบŽ่ฎญ็ปƒใ€้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„ๆ•ฐๆฎ้›†็ฑปๅž‹ + data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # ็”จไบŽ่ฎญ็ปƒ็š„ๆ•ฐๆฎ็š„ๆ น็›ฎๅฝ• + data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # ็”จไบŽ้ชŒ่ฏๅ’Œๆต‹่ฏ•็š„ๆ•ฐๆฎ็š„ๆ น็›ฎๅฝ• + ann_file_train = 'data/ActivityNet/anet_anno_train.json' # ็”จไบŽ่ฎญ็ปƒ็š„ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + ann_file_val = 'data/ActivityNet/anet_anno_val.json' # ็”จไบŽ้ชŒ่ฏ็š„ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + ann_file_test = 'data/ActivityNet/anet_anno_test.json' # ็”จไบŽๆต‹่ฏ•็š„ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + + train_pipeline = [ + dict(type='LoadLocalizationFeature'), # ๅŠ ่ฝฝๅฎšไฝ็‰นๅพ็š„็ฎก้“ + dict(type='GenerateLocalizationLabels'), # ็”Ÿๆˆๅฎšไฝๆ ‡็ญพ็š„็ฎก้“ + dict( + type='PackLocalizationInputs', # ๆ‰“ๅŒ…ๅฎšไฝๆ•ฐๆฎ + keys=('gt_bbox'), # ่พ“ๅ…ฅ็š„้”ฎ + meta_keys=('video_name'))] # ่พ“ๅ…ฅ็š„ๅ…ƒ้”ฎ + val_pipeline = [ + dict(type='LoadLocalizationFeature'), # ๅŠ ่ฝฝๅฎšไฝ็‰นๅพ็š„็ฎก้“ + dict(type='GenerateLocalizationLabels'), # ็”Ÿๆˆๅฎšไฝๆ ‡็ญพ็š„็ฎก้“ + dict( + type='PackLocalizationInputs', # ๆ‰“ๅŒ…ๅฎšไฝๆ•ฐๆฎ + keys=('gt_bbox'), # ่พ“ๅ…ฅ็š„้”ฎ + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # ่พ“ๅ…ฅ็š„ๅ…ƒ้”ฎ + test_pipeline = [ + dict(type='LoadLocalizationFeature'), # ๅŠ ่ฝฝๅฎšไฝ็‰นๅพ็š„็ฎก้“ + dict( + type='PackLocalizationInputs', # ๆ‰“ๅŒ…ๅฎšไฝๆ•ฐๆฎ + keys=('gt_bbox'), # ่พ“ๅ…ฅ็š„้”ฎ + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # ่พ“ๅ…ฅ็š„ๅ…ƒ้”ฎ + train_dataloader = dict( + batch_size=8, # ๆฏไธชๅ• GPU ่ฎญ็ปƒ็š„ๆ‰นๅค„็†ๅคงๅฐ + num_workers=8, # ๆฏไธชๅ• GPU ่ฎญ็ปƒๆ—ถ้ข„ๅ–ๆ•ฐๆฎ็š„ worker ๆ•ฐ้‡ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญ worker ่ฟ›็จ‹๏ผŒ่ฟ™ๅฏไปฅๅŠ ๅฟซ่ฎญ็ปƒ้€Ÿๅบฆ + sampler=dict( + type='DefaultSampler', # ้ป˜่ฎค้‡‡ๆ ทๅ™จ๏ผŒๆ”ฏๆŒๅˆ†ๅธƒๅผๅ’Œ้žๅˆ†ๅธƒๅผ่ฎญ็ปƒใ€‚ๅ‚่€ƒ https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # ๅœจๆฏไธช epoch 
ไธญ้šๆœบๆ‰“ไนฑ่ฎญ็ปƒๆ•ฐๆฎ + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(video=data_root), # ่ง†้ข‘่ทฏๅพ„็š„ๅ‰็ผ€ + pipeline=train_pipeline) + ) + val_dataloader = dict( + batch_size=1, # ๆฏไธชๅ• GPU ่ฏ„ไผฐ็š„ๆ‰นๅค„็†ๅคงๅฐ + num_workers=8, # ๆฏไธชๅ• GPU ่ฏ„ไผฐๆ—ถ้ข„ๅ–ๆ•ฐๆฎ็š„ worker ๆ•ฐ้‡ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญ worker ่ฟ›็จ‹ + sampler=dict( + type='DefaultSampler', + shuffle=False), # ๅœจ้ชŒ่ฏๅ’Œๆต‹่ฏ•ๆ—ถไธๆ‰“ไนฑๆ•ฐๆฎ + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(video=data_root_val), # ่ง†้ข‘่ทฏๅพ„็š„ๅ‰็ผ€ + pipeline=val_pipeline, + test_mode=True) + ) + test_dataloader = dict( + batch_size=1, # ๆฏไธชๅ• GPU ๆต‹่ฏ•็š„ๆ‰นๅค„็†ๅคงๅฐ + num_workers=8, # ๆฏไธชๅ• GPU ๆต‹่ฏ•ๆ—ถ้ข„ๅ–ๆ•ฐๆฎ็š„ worker ๆ•ฐ้‡ + persistent_workers=True, # ๅฆ‚ๆžœไธบ `True`๏ผŒๅˆ™ๆ•ฐๆฎๅŠ ่ฝฝๅ™จๅœจไธ€ไธช epoch ็ป“ๆŸๅŽไธไผšๅ…ณ้—ญ worker ่ฟ›็จ‹ + sampler=dict( + type='DefaultSampler', + shuffle=False), # ๅœจ้ชŒ่ฏๅ’Œๆต‹่ฏ•ๆ—ถไธๆ‰“ไนฑๆ•ฐๆฎ + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, # ๆณจ้‡Šๆ–‡ไปถ็š„่ทฏๅพ„ + data_prefix=dict(video=data_root_val), # ่ง†้ข‘่ทฏๅพ„็š„ๅ‰็ผ€ + pipeline=test_pipeline, + test_mode=True) + ) + + # ่ฏ„ไผฐ่ฎพ็ฝฎ + work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # ไฟๅญ˜ๅฝ“ๅ‰ๅฎž้ชŒ็š„ๆจกๅž‹ๆƒ้‡ๅ’Œๆ—ฅๅฟ—็š„็›ฎๅฝ• + val_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict( + out=f'{work_dir}/results.json', # ่พ“ๅ‡บๆ–‡ไปถ็š„่ทฏๅพ„ + output_format='json')) # ่พ“ๅ‡บๆ–‡ไปถ็š„ๆ ผๅผ + test_evaluator = val_evaluator # ๅฐ† test_evaluator ่ฎพ็ฝฎไธบ val_evaluator + + max_epochs = 9 # ่ฎญ็ปƒๆจกๅž‹็š„ๆ€ป epoch ๆ•ฐ้‡ + train_cfg = dict( + type='EpochBasedTrainLoop', # ่ฎญ็ปƒๅพช็Žฏ็š„ๅ็งฐ + max_epochs=max_epochs, # ๆ€ป็š„่ฎญ็ปƒ epoch ๆ•ฐ้‡ + val_begin=1, # ๅผ€ๅง‹้ชŒ่ฏ็š„ epoch + val_interval=1) # ้ชŒ่ฏ็š„้—ด้š” + val_cfg = dict( + type='ValLoop') # ้ชŒ่ฏๅพช็Žฏ็š„ๅ็งฐ + test_cfg = dict( + type='TestLoop') # ๆต‹่ฏ•ๅพช็Žฏ็š„ๅ็งฐ + + # ๅญฆไน ็ญ–็•ฅ + param_scheduler = [ + dict( + type='MultiStepLR', # ๅฝ“ epoch ๆ•ฐ่พพๅˆฐ้‡Œ็จ‹็ข‘ๆ—ถ๏ผŒๅ‡ๅฐ‘ๅญฆไน ็އ + begin=0, # ๅผ€ๅง‹ๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + end=max_epochs, # ๅœๆญขๆ›ดๆ–ฐๅญฆไน ็އ็š„ๆญฅ้ชค + by_epoch=True, # ๆ˜ฏๅฆๆŒ‰ epoch ๆ›ดๆ–ฐๅญฆไน ็އ + milestones=[7, ], # ๅญฆไน ็އ่กฐๅ‡็š„ๆญฅ้ชค + gamma=0.1) # ๅญฆไน ็އ่กฐๅ‡็š„ไน˜ๆณ•ๅ› ๅญ + ] + + # ไผ˜ๅŒ–ๅ™จ + optim_wrapper = dict( + type='OptimWrapper', # ไผ˜ๅŒ–ๅ™จๅŒ…่ฃ…ๅ™จ็š„ๅ็งฐ๏ผŒๅˆ‡ๆขๅˆฐ AmpOptimWrapper ไปฅๅฏ็”จๆททๅˆ็ฒพๅบฆ่ฎญ็ปƒ + optimizer=dict( + type='Adam', # ไผ˜ๅŒ–ๅ™จ็š„ๅ็งฐ + lr=0.001, # ๅญฆไน ็އ + weight_decay=0.0001), # ๆƒ้‡่กฐๅ‡ + clip_grad=dict(max_norm=40, norm_type=2)) # ๆขฏๅบฆๅ‰ช่ฃ็š„้…็ฝฎ + + # ่ฟ่กŒๆ—ถ่ฎพ็ฝฎ + default_scope = 'mmaction' # ้ป˜่ฎคๆณจๅ†Œ่กจ่Œƒๅ›ด๏ผŒ็”จไบŽๆŸฅๆ‰พๆจกๅ—ใ€‚ๅ‚่€ƒ https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), # ๅฐ†่ฟ่กŒๆ—ถไฟกๆฏๆ›ดๆ–ฐๅˆฐๆถˆๆฏไธญๅฟƒ็š„้’ฉๅญ + timer=dict(type='IterTimerHook'), # ็”จไบŽ่ฎฐๅฝ•่ฟญไปฃ่ฟ‡็จ‹ไธญ่Šฑ่ดน็š„ๆ—ถ้—ด็š„ๆ—ฅๅฟ—่ฎฐๅฝ•ๅ™จ + logger=dict( + type='LoggerHook', # ็”จไบŽ่ฎฐๅฝ•่ฎญ็ปƒ/้ชŒ่ฏ/ๆต‹่ฏ•้˜ถๆฎต็š„ๆ—ฅๅฟ—็š„ๆ—ฅๅฟ—่ฎฐๅฝ•ๅ™จ + interval=20, # ๆ‰“ๅฐๆ—ฅๅฟ—็š„้—ด้š” + ignore_last=False), # ๅฟฝ็•ฅๆฏไธช epoch ไธญๆœ€ๅŽๅ‡ ๆฌก่ฟญไปฃ็š„ๆ—ฅๅฟ— + param_scheduler=dict(type='ParamSchedulerHook'), # ๆ›ดๆ–ฐไผ˜ๅŒ–ๅ™จไธญ็š„ๆŸไบ›่ถ…ๅ‚ๆ•ฐ็š„้’ฉๅญ + checkpoint=dict( 
+ type='CheckpointHook', # ๅฎšๆœŸไฟๅญ˜ๆƒ้‡็š„้’ฉๅญ + interval=3, # ไฟๅญ˜ๅ‘จๆœŸ + save_best='auto', # ๅœจ่ฏ„ไผฐ่ฟ‡็จ‹ไธญๆต‹้‡ๆœ€ไฝณๆƒ้‡็š„ๆŒ‡ๆ ‡ + max_keep_ckpts=3), # ไฟ็•™็š„ๆœ€ๅคงๆƒ้‡ๆ–‡ไปถๆ•ฐ้‡ + sampler_seed=dict(type='DistSamplerSeedHook'), # ็”จไบŽๅˆ†ๅธƒๅผ่ฎญ็ปƒ็š„ๆ•ฐๆฎๅŠ ่ฝฝ้‡‡ๆ ทๅ™จ + sync_buffers=dict(type='SyncBuffersHook')) # ๅœจๆฏไธช epoch ็ป“ๆŸๆ—ถๅŒๆญฅๆจกๅž‹็ผ“ๅ†ฒๅŒบ็š„้’ฉๅญ + env_cfg = dict( + cudnn_benchmark=False, # ๆ˜ฏๅฆๅฏ็”จ cudnn ็š„ๅŸบๅ‡†ๆต‹่ฏ• + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # ่ฎพ็ฝฎๅคš่ฟ›็จ‹็š„ๅ‚ๆ•ฐ + dist_cfg=dict(backend='nccl')) # ่ฎพ็ฝฎๅˆ†ๅธƒๅผ็Žฏๅขƒ็š„ๅ‚ๆ•ฐ๏ผŒไนŸๅฏไปฅ่ฎพ็ฝฎ็ซฏๅฃ + + log_processor = dict( + type='LogProcessor', # ็”จไบŽๆ ผๅผๅŒ–ๆ—ฅๅฟ—ไฟกๆฏ็š„ๆ—ฅๅฟ—ๅค„็†ๅ™จ + window_size=20, # ้ป˜่ฎคๅนณๆป‘้—ด้š” + by_epoch=True) # ๆ˜ฏๅฆไฝฟ็”จ epoch ็ฑปๅž‹ๆ ผๅผๅŒ–ๆ—ฅๅฟ— + vis_backends = [ + dict(type='LocalVisBackend')] # ๅฏ่ง†ๅŒ–ๅŽ็ซฏ็š„ๅˆ—่กจ + visualizer = dict( + type='ActionVisualizer', # ๅฏ่ง†ๅŒ–ๅ™จ็š„ๅ็งฐ + vis_backends=vis_backends) + log_level = 'INFO' # ๆ—ฅๅฟ—็บงๅˆซ + load_from = None # ไปŽ็ป™ๅฎš่ทฏๅพ„ๅŠ ่ฝฝๆจกๅž‹ๆƒ้‡ไฝœไธบ้ข„่ฎญ็ปƒๆจกๅž‹ใ€‚่ฟ™ไธไผšๆขๅค่ฎญ็ปƒใ€‚ + resume = False # ๆ˜ฏๅฆไปŽ `load_from` ไธญๅฎšไน‰็š„ๆƒ้‡ๆขๅค่ฎญ็ปƒใ€‚ๅฆ‚ๆžœ `load_from` ไธบ None๏ผŒๅˆ™ไผšไปŽ `work_dir` ไธญๆขๅคๆœ€ๆ–ฐ็š„ๆƒ้‡ใ€‚ + ``` diff --git a/docs/zh_cn/user_guides/finetune.md b/docs/zh_cn/user_guides/finetune.md new file mode 100644 index 0000000000000000000000000000000000000000..9c5e674e5afab07c532dfed5b08bb04dec05e7fe --- /dev/null +++ b/docs/zh_cn/user_guides/finetune.md @@ -0,0 +1,320 @@ +# ๆจกๅž‹ๅพฎ่ฐƒ + +ๆœฌๆ•™็จ‹ๆไพ›ไบ†ไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹ๅœจๅ…ถไป–ๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒๅพฎ่ฐƒ็š„ๆŒ‡ๅฏผใ€‚้€š่ฟ‡ๅพฎ่ฐƒ๏ผŒๅฏไปฅ่Žทๅพ—ๆ›ดๅฅฝ็š„ๆ€ง่ƒฝใ€‚ + +- [ๆจกๅž‹ๅพฎ่ฐƒ](#ๆจกๅž‹ๅพฎ่ฐƒ) + - [ๆฆ‚่ฟฐ](#ๆฆ‚่ฟฐ) + - [้€‰ๆ‹ฉๆจกๆฟ้…็ฝฎ](#้€‰ๆ‹ฉๆจกๆฟ้…็ฝฎ) + - [ไฟฎๆ”น Head](#ไฟฎๆ”น-head) + - [ไฟฎๆ”นๆ•ฐๆฎ้›†](#ไฟฎๆ”นๆ•ฐๆฎ้›†) + - [ไฟฎๆ”น่ฎญ็ปƒ่ฎกๅˆ’](#ไฟฎๆ”น่ฎญ็ปƒ่ฎกๅˆ’) + - [ไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹](#ไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹) + - [ๅผ€ๅง‹่ฎญ็ปƒ](#ๅผ€ๅง‹่ฎญ็ปƒ) + +## ๆฆ‚่ฟฐ + +ๅœจๆ–ฐๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒๆจกๅž‹ๅพฎ่ฐƒๆœ‰ไธคไธชๆญฅ้ชคใ€‚ + +1. ๆทปๅŠ ๅฏนๆ–ฐๆ•ฐๆฎ้›†็š„ๆ”ฏๆŒใ€‚่ฏทๅ‚่€ƒ[ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](prepare_dataset.md)ๅ’Œ[่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](../advanced_guides/customize_dataset.md)ใ€‚ +2. 
ไฟฎๆ”น้…็ฝฎๆ–‡ไปถใ€‚ๆœฌๆ•™็จ‹ๅฐ†่ฎจ่ฎบ่ฟ™ไธ€้ƒจๅˆ†ใ€‚ + +## ้€‰ๆ‹ฉๆจกๆฟ้…็ฝฎ + +่ฟ™้‡Œๆˆ‘ไปฌไปฅ `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ไธบไพ‹ใ€‚ๆˆ‘ไปฌ้ฆ–ๅ…ˆๅฐ†่ฏฅ้…็ฝฎๆ–‡ไปถๅคๅˆถๅˆฐๅŒไธ€ๆ–‡ไปถๅคน๏ผŒๅนถๅฐ†ๅ…ถ้‡ๅ‘ฝๅไธบ `tsn_ucf101.py`๏ผŒ็„ถๅŽ้œ€่ฆๆณจๆ„้…็ฝฎไธญ็š„ๅ››ไธช้ƒจๅˆ†๏ผŒๅ…ทไฝ“ๆฅ่ฏด๏ผŒไธบไธๅญ˜ๅœจ็š„้”ฎๆทปๅŠ ๆ–ฐ้”ฎ๏ผŒๅนถไฟฎๆ”น็Žฐๆœ‰้”ฎ็š„ๅŽŸๅง‹้”ฎใ€‚ + +## ไฟฎๆ”น Head + +`cls_head` ไธญ็š„ `num_classes` ้œ€่ฆๆ›ดๆ”นไธบๆ–ฐๆ•ฐๆฎ้›†็š„็ฑปๅˆซๆ•ฐใ€‚้ข„่ฎญ็ปƒๆจกๅž‹็š„ๆƒ้‡ไผš่ขซ้‡็”จ๏ผŒ้™คไบ†ๆœ€ๅŽ็š„้ข„ๆต‹ๅฑ‚ใ€‚ๅ› ๆญค๏ผŒๆ›ดๆ”น็ฑปๅˆซๆ•ฐๆ˜ฏๅฎ‰ๅ…จ็š„ใ€‚ๅœจๆˆ‘ไปฌ็š„ไพ‹ๅญไธญ๏ผŒUCF101 ๆœ‰ 101 ไธช็ฑปๅˆซใ€‚ๆ‰€ไปฅๆˆ‘ไปฌๅฐ†ๅ…ถไปŽ 400๏ผˆKinetics-400 ็š„็ฑปๅˆซๆ•ฐ๏ผ‰ๆ”นไธบ 101ใ€‚ + +```python +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # ๅฐ† 400 ไฟฎๆ”นไธบ 101 + )) +``` + +## ไฟฎๆ”นๆ•ฐๆฎ้›† + +MMAction2 ๆ”ฏๆŒ UCF101ใ€Kinetics-400ใ€Moments in Timeใ€Multi-Moments in Timeใ€THUMOS14ใ€Something-Something V1&V2ใ€ActivityNet ๆ•ฐๆฎ้›†ใ€‚็”จๆˆทๅฏ่ƒฝ้œ€่ฆๅฐ†ไธŠ่ฟฐๅ…ถไธญไธ€ไธชๆ•ฐๆฎ้›†้€‚ๅบ”ๅˆฐไป–ไปฌ็š„็‰นๆฎŠๆ•ฐๆฎ้›†ไธŠใ€‚ไฝ ๅฏไปฅๅ‚่€ƒ[ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](prepare_dataset.md)ๅ’Œ[่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](../advanced_guides/customize_dataset.md)ไบ†่งฃๆ›ดๅคš็ป†่Š‚ใ€‚ๅœจๆˆ‘ไปฌ็š„ไพ‹ๅญไธญ๏ผŒUCF101 ๅทฒ็ป็”ฑๅ„็งๆ•ฐๆฎ้›†็ฑปๅž‹ๆ”ฏๆŒ๏ผŒไพ‹ๅฆ‚ `VideoDataset`๏ผŒๅ› ๆญคๆˆ‘ไปฌๅฐ†้…็ฝฎไฟฎๆ”นๅฆ‚ไธ‹ใ€‚ + +```python +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' +``` + +## ไฟฎๆ”น่ฎญ็ปƒ่ฎกๅˆ’ + +ๅพฎ่ฐƒ้€šๅธธ้œ€่ฆ่พƒๅฐ็š„ๅญฆไน ็އๅ’Œ่พƒๅฐ‘็š„่ฎญ็ปƒๅ‘จๆœŸใ€‚ + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # ๅฐ† 100 ไฟฎๆ”นไธบ 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # ๅฐ† 100 ไฟฎๆ”นไธบ 50 + by_epoch=True, + milestones=[20, 40], # ไฟฎๆ”น milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # ๅฐ† 0.01 ไฟฎๆ”นไธบ 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) +``` + +## ไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹ + +ไธบไบ†ๅœจๆ•ดไธช็ฝ‘็ปœไธŠไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹๏ผŒๆ–ฐ้…็ฝฎๆ–‡ไปถๅœจ `load_from` ไธญๆทปๅŠ ไบ†้ข„่ฎญ็ปƒๆจกๅž‹็š„้“พๆŽฅใ€‚ๆˆ‘ไปฌๅœจ `configs/_base_/default_runtime.py` ไธญ่ฎพ็ฝฎ `load_from=None` ไฝœไธบ้ป˜่ฎคๅ€ผ๏ผŒๅนถไธ”ๆ นๆฎ[็ปงๆ‰ฟ่ฎพ่ฎก](config.md)๏ผŒ็”จๆˆทๅฏไปฅ้€š่ฟ‡ๅœจๅ…ถ้…็ฝฎไธญ่ฎพ็ฝฎ `load_from` ๆฅ็›ดๆŽฅๆ›ดๆ”นๅฎƒใ€‚ + +```python +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' # ๆจกๅž‹่ทฏๅพ„ๅฏไปฅๅœจๆจกๅž‹ๅบ“ไธญๆ‰พๅˆฐ +``` + +## ๅผ€ๅง‹่ฎญ็ปƒ + +็Žฐๅœจ๏ผŒๆˆ‘ไปฌๅทฒ็ปๅฎŒๆˆไบ†ๅพฎ่ฐƒ็š„้…็ฝฎๆ–‡ไปถ๏ผŒๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +```python +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # ๅฐ† 400 ไฟฎๆ”นไธบ 101 + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 
'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # ๅฐ† 100 ไฟฎๆ”นไธบ 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # ๅฐ† 100 ไฟฎๆ”นไธบ 50 + by_epoch=True, + milestones=[20, 40], # ไฟฎๆ”น milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # ๅฐ† 0.01 ไฟฎๆ”นไธบ 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=256) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +ๅฆไธ€็งๆ›ด็ฎ€ๅ•็š„ๆ–นๆณ•ๆ˜ฏ็ปงๆ‰ฟ kinetics400 ้…็ฝฎ๏ผŒๅนถๅชๆŒ‡ๅฎšไฟฎๆ”น็š„้”ฎใ€‚่ฏท็กฎไฟ่‡ชๅฎšไน‰้…็ฝฎไธŽ `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ๅœจๅŒไธ€ไธชๆ–‡ไปถๅคนไธญใ€‚ + +```python +_base_ = [ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # ็ปงๆ‰ฟๆจกๆฟ้…็ฝฎ +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101)) # ๅฐ† 400 ไฟฎๆ”นไธบ 101 + + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +train_dataloader = dict( + dataset=dict( + ann_file=ann_file_train, + data_prefix=dict(video=data_root))) +val_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) +test_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # ๅฐ† 100 ไฟฎๆ”นไธบ 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # ๅฐ† 100 ไฟฎๆ”นไธบ 50 + by_epoch=True, + milestones=[20, 40], # ไฟฎๆ”น milestones + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # ๅฐ† 0.01 ไฟฎๆ”นไธบ 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +ไฝ ๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๅœจไฝ ็š„ๆ•ฐๆฎ้›†ไธŠๅพฎ่ฐƒๆจกๅž‹ใ€‚ + +```shell +python tools/train.py ${CONFIG_FILE} [ๅฏ้€‰ๅ‚ๆ•ฐ] +``` + +ไพ‹ๅฆ‚๏ผšๅœจ็กฎๅฎšๆ€ง้€‰้กนไธ‹๏ผŒๅœจ Kinetics-400 ๆ•ฐๆฎ้›†ไธŠ่ฎญ็ปƒ TSN ๆจกๅž‹ใ€‚ + +```shell +python tools/train.py configs/recognition/tsn/tsn_ucf101.py \ + --seed=0 --deterministic +``` + +ๆ›ดๅคš็ป†่Š‚๏ผŒ่ฏทๅ‚่€ƒ[่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](train_test.md)ไธญ็š„**่ฎญ็ปƒ**้ƒจๅˆ†ใ€‚ diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md new file mode 100644 index 0000000000000000000000000000000000000000..d6b61091d074885c583234b113eb13b87f3d0360 --- /dev/null +++ b/docs/zh_cn/user_guides/inference.md @@ -0,0 +1,39 @@ +# ไฝฟ็”จ็Žฐๆœ‰ๆจกๅž‹่ฟ›่กŒๆŽจ็† + +MMAction2 ๅœจ[ๆจกๅž‹ๅบ“](../modelzoo.md)ไธญๆไพ›ไบ†้ข„่ฎญ็ปƒ็š„่ง†้ข‘็†่งฃๆจกๅž‹ใ€‚ๆœฌๆ–‡ๅฐ†ๅฑ•็คบๅฆ‚ไฝ•ไฝฟ็”จ็Žฐๆœ‰ๆจกๅž‹ๅฏน็ป™ๅฎš็š„่ง†้ข‘่ฟ›่กŒๆŽจ็†ใ€‚ + +ๅ…ณไบŽๅฆ‚ไฝ•ๅœจๆ ‡ๅ‡†ๆ•ฐๆฎ้›†ไธŠๆต‹่ฏ•็Žฐๆœ‰ๆจกๅž‹๏ผŒ่ฏทๅ‚่€ƒ่ฟ™ไธช[ๆŒ‡ๅ—](./train_test.md#test)ใ€‚ + +## ๅฏน็ป™ๅฎš่ง†้ข‘่ฟ›่กŒๆŽจ็† + +MMAction2 ๆไพ›ไบ†็”จไบŽๅฏน็ป™ๅฎš่ง†้ข‘่ฟ›่กŒๆŽจ็†็š„้ซ˜็บง Python API๏ผš + +- [init_recognizer](mmaction.apis.init_recognizer): ไฝฟ็”จ้…็ฝฎๆ–‡ไปถๅ’Œๆƒ้‡ๆ–‡ไปถๅˆๅง‹ๅŒ–ไธ€ไธช่ฏ†ๅˆซๅ™จ +- [inference_recognizer](mmaction.apis.inference_recognizer): ๅฏน็ป™ๅฎš่ง†้ข‘่ฟ›่กŒๆŽจ็† + +ไธ‹้ขๆ˜ฏไธ€ไธชไฝฟ็”จ Kinitics-400 
้ข„่ฎญ็ปƒๆƒ้‡ๆž„ๅปบๆจกๅž‹ๅนถๅฏน็ป™ๅฎš่ง†้ข‘่ฟ›่กŒๆŽจ็†็š„็คบไพ‹ใ€‚ + +```{note} +ๅฆ‚ๆžœๆ‚จๅฐ† mmaction2 ็”จไฝœ็ฌฌไธ‰ๆ–นๅŒ…๏ผŒๆ‚จ้œ€่ฆไธ‹่ฝฝ็คบไพ‹ไธญ็š„้…็ฝฎๆ–‡ไปถๅ’Œๆผ”็คบ่ง†้ข‘ใ€‚ + +่ฟ่กŒ 'mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .' ไธ‹่ฝฝๆ‰€้œ€็š„้…็ฝฎๆ–‡ไปถใ€‚ + +่ฟ่กŒ 'wget https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.mp4' ไธ‹่ฝฝๆ‰€้œ€็š„ๆผ”็คบ่ง†้ข‘ใ€‚ +``` + +```python +from mmaction.apis import inference_recognizer, init_recognizer + +config_path = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' # ๅฏไปฅๆ˜ฏๆœฌๅœฐ่ทฏๅพ„ +img_path = 'demo/demo.mp4' # ๆ‚จๅฏไปฅๆŒ‡ๅฎš่‡ชๅทฑ็š„ๅ›พ็‰‡่ทฏๅพ„ + +# ไปŽ้…็ฝฎๆ–‡ไปถๅ’Œๆƒ้‡ๆ–‡ไปถไธญๆž„ๅปบๆจกๅž‹ +model = init_recognizer(config_path, checkpoint_path, device="cpu") # device ๅฏไปฅๆ˜ฏ 'cuda:0' +# ๅฏนๅ•ไธช่ง†้ข‘่ฟ›่กŒๆต‹่ฏ• +result = inference_recognizer(model, img_path) +``` + +`result` ๆ˜ฏไธ€ไธชๅŒ…ๅซ `pred_scores` ็š„ๅญ—ๅ…ธใ€‚ + +็คบไพ‹ไธญ็š„ๅŠจไฝœ่ฏ†ๅˆซๆผ”็คบๅฏไปฅๅœจ[demo/demo.py](https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.py)ไธญๆ‰พๅˆฐใ€‚ diff --git a/docs/zh_cn/user_guides/prepare_dataset.md b/docs/zh_cn/user_guides/prepare_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..c8cd66fb432cdfa44fc05da7e532ac96e30fafdc --- /dev/null +++ b/docs/zh_cn/user_guides/prepare_dataset.md @@ -0,0 +1,295 @@ +# ๅ‡†ๅค‡ๆ•ฐๆฎ้›† + +MMAction2 ๆ”ฏๆŒ่ฎธๅคš็Žฐๆœ‰็š„ๆ•ฐๆฎ้›†ใ€‚ๅœจๆœฌ็ซ ไธญ๏ผŒๆˆ‘ไปฌๅฐ†ๅผ•ๅฏผๆ‚จๅ‡†ๅค‡ MMAction2 ็š„ๆ•ฐๆฎ้›†ใ€‚ + +- [ๅ‡†ๅค‡ๆ•ฐๆฎ้›†](#ๅ‡†ๅค‡ๆ•ฐๆฎ้›†) + - [ๅ…ณไบŽ่ง†้ข‘ๆ•ฐๆฎๆ ผๅผ็š„่ฏดๆ˜Ž](#ๅ…ณไบŽ่ง†้ข‘ๆ•ฐๆฎๆ ผๅผ็š„่ฏดๆ˜Ž) + - [ไฝฟ็”จๅ†…็ฝฎๆ•ฐๆฎ้›†](#ไฝฟ็”จๅ†…็ฝฎๆ•ฐๆฎ้›†) + - [ไฝฟ็”จ่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](#ไฝฟ็”จ่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†) + - [ๅŠจไฝœ่ฏ†ๅˆซ](#ๅŠจไฝœ่ฏ†ๅˆซ) + - [ๅŸบไบŽ้ชจ้ชผ็š„ๅŠจไฝœ่ฏ†ๅˆซ](#ๅŸบไบŽ้ชจ้ชผ็š„ๅŠจไฝœ่ฏ†ๅˆซ) + - [ๅŸบไบŽ้Ÿณ้ข‘็š„ๅŠจไฝœ่ฏ†ๅˆซ](#ๅŸบไบŽ้Ÿณ้ข‘็š„ๅŠจไฝœ่ฏ†ๅˆซ) + - [ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹](#ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹) + - [ๆ—ถๅบๅŠจไฝœๅฎšไฝ](#ๆ—ถๅบๅŠจไฝœๅฎšไฝ) + - [ไฝฟ็”จๆททๅˆๆ•ฐๆฎ้›†่ฟ›่กŒ่ฎญ็ปƒ](#ไฝฟ็”จๆททๅˆๆ•ฐๆฎ้›†่ฟ›่กŒ่ฎญ็ปƒ) + - [้‡ๅคๆ•ฐๆฎ้›†](#้‡ๅคๆ•ฐๆฎ้›†) + - [ๆต่งˆๆ•ฐๆฎ้›†](#ๆต่งˆๆ•ฐๆฎ้›†) + +## ๅ…ณไบŽ่ง†้ข‘ๆ•ฐๆฎๆ ผๅผ็š„่ฏดๆ˜Ž + +MMAction2 ๆ”ฏๆŒไธค็ง็ฑปๅž‹็š„ๆ•ฐๆฎๆ ผๅผ๏ผšๅŽŸๅง‹ๅธงๅ’Œ่ง†้ข‘ใ€‚ๅ‰่€…ๅœจไน‹ๅ‰็š„้กน็›ฎ๏ผˆๅฆ‚ [TSN](https://github.com/yjxiong/temporal-segment-networks)๏ผ‰ไธญ่ขซๅนฟๆณ›ไฝฟ็”จใ€‚ๅฝ“ SSD ๅฏ็”จๆ—ถ๏ผŒ่ฟ™็งๆ–นๆณ•่ฟ่กŒ้€Ÿๅบฆๅพˆๅฟซ๏ผŒไฝ†ๆ— ๆณ•ๆปก่ถณๆ—ฅ็›Šๅขž้•ฟ็š„ๆ•ฐๆฎ้›†้œ€ๆฑ‚๏ผˆไพ‹ๅฆ‚๏ผŒๆœ€ๆ–ฐ็š„ [Kinetics](https://www.deepmind.com/open-source/kinetics) ๆ•ฐๆฎ้›†ๆœ‰ 65 ไธ‡ไธช่ง†้ข‘๏ผŒๆ€ปๅธงๆ•ฐๅฐ†ๅ ็”จๅ‡  TB ็š„็ฉบ้—ด๏ผ‰ใ€‚ๅŽ่€…ๅฏไปฅ่Š‚็œ็ฉบ้—ด๏ผŒไฝ†ๅฟ…้กปๅœจๆ‰ง่กŒๆ—ถ่ฟ›่กŒ่ฎก็ฎ—ๅฏ†้›†ๅž‹็š„่ง†้ข‘่งฃ็ ใ€‚ไธบไบ†ๅŠ ๅฟซ่ง†้ข‘่งฃ็ ้€Ÿๅบฆ๏ผŒๆˆ‘ไปฌๆ”ฏๆŒๅ‡ ็ง้ซ˜ๆ•ˆ็š„่ง†้ข‘ๅŠ ่ฝฝๅบ“๏ผŒๅฆ‚ [decord](https://github.com/zhreshold/decord)ใ€[PyAV](https://github.com/PyAV-Org/PyAV) ็ญ‰ใ€‚ + +## ไฝฟ็”จๅ†…็ฝฎๆ•ฐๆฎ้›† + +MMAction2 ๅทฒ็ปๆ”ฏๆŒ่ฎธๅคšๆ•ฐๆฎ้›†๏ผŒๆˆ‘ไปฌๅœจ่ทฏๅพ„ `$MMACTION2/tools/data/` ไธ‹ๆไพ›ไบ†็”จไบŽๆ•ฐๆฎๅ‡†ๅค‡็š„ shell ่„šๆœฌ๏ผŒ่ฏทๅ‚่€ƒ[ๆ”ฏๆŒ็š„ๆ•ฐๆฎ้›†](https://mmaction2.readthedocs.io/zh_CN/latest/datasetzoo_statistics.html)ไปฅ่Žทๅ–ๅ‡†ๅค‡็‰นๅฎšๆ•ฐๆฎ้›†็š„่ฏฆ็ป†ไฟกๆฏใ€‚ + +## ไฝฟ็”จ่‡ชๅฎšไน‰ๆ•ฐๆฎ้›† + 
+ๆœ€็ฎ€ๅ•็š„ๆ–นๆณ•ๆ˜ฏๅฐ†ๆ‚จ็š„ๆ•ฐๆฎ้›†่ฝฌๆขไธบ็Žฐๆœ‰็š„ๆ•ฐๆฎ้›†ๆ ผๅผ๏ผš + +- `RawFrameDataset` ๅ’Œ `VideoDataset` ็”จไบŽ[ๅŠจไฝœ่ฏ†ๅˆซ](#ๅŠจไฝœ่ฏ†ๅˆซ) +- `PoseDataset` ็”จไบŽ[ๅŸบไบŽ้ชจ้ชผ็š„ๅŠจไฝœ่ฏ†ๅˆซ](#ๅŸบไบŽ้ชจ้ชผ็š„ๅŠจไฝœ่ฏ†ๅˆซ) +- `AudioDataset` ็”จไบŽ[ๅŸบไบŽ้Ÿณ้ข‘ๅŠจไฝœ่ฏ†ๅˆซ](#ๅŸบไบŽ้Ÿณ้ข‘ๅŠจไฝœ่ฏ†ๅˆซ) +- `AVADataset` ็”จไบŽ[ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹](#ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹) +- `ActivityNetDataset` ็”จไบŽ[ๆ—ถๅบๅŠจไฝœๅฎšไฝ](#ๆ—ถๅบๅŠจไฝœๅฎšไฝ) + +ๅœจๆ•ฐๆฎ้ข„ๅค„็†ไน‹ๅŽ๏ผŒ็”จๆˆท้œ€่ฆ่ฟ›ไธ€ๆญฅไฟฎๆ”น้…็ฝฎๆ–‡ไปถไปฅไฝฟ็”จๆ•ฐๆฎ้›†ใ€‚ไปฅไธ‹ๆ˜ฏๅœจๅŽŸๅง‹ๅธงๆ ผๅผไธญไฝฟ็”จ่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†็š„็คบไพ‹ใ€‚ + +ๅœจ `configs/task/method/my_custom_config.py` ไธญ๏ผš + +```python +... +# ๆ•ฐๆฎ้›†่ฎพ็ฝฎ +dataset_type = 'RawframeDataset' +data_root = 'path/to/your/root' +data_root_val = 'path/to/your/root_val' +ann_file_train = 'data/custom/custom_train_list.txt' +ann_file_val = 'data/custom/custom_val_list.txt' +ann_file_test = 'data/custom/custom_val_list.txt' +... +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + ...), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + ...), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + ...)) +... +``` + +### ๅŠจไฝœ่ฏ†ๅˆซ + +ๅŠจไฝœ่ฏ†ๅˆซๆœ‰ไธค็ง็ฑปๅž‹็š„ๆณจ้‡Šๆ–‡ไปถใ€‚ + +- `RawFrameDataset` ็š„ๅŽŸๅง‹ๅธงๆณจ้‡Š + + ๅŽŸๅง‹ๅธงๆ•ฐๆฎ้›†็š„ๆณจ้‡Šๆ˜ฏไธ€ไธชๅŒ…ๅซๅคš่กŒ็š„ๆ–‡ๆœฌๆ–‡ไปถ๏ผŒๆฏไธ€่กŒ่กจ็คบไธ€ไธช่ง†้ข‘็š„ `frame_directory`๏ผˆ็›ธๅฏน่ทฏๅพ„๏ผ‰ใ€่ง†้ข‘็š„ `total_frames` ๅ’Œ่ง†้ข‘็š„ `label`๏ผŒ็”จ็ฉบๆ ผๅˆ†้š”ใ€‚ + + ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹ใ€‚ + + ``` + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + ``` + +- `VideoDataset` ็š„่ง†้ข‘ๆณจ้‡Š + + ่ง†้ข‘ๆ•ฐๆฎ้›†็š„ๆณจ้‡Šๆ˜ฏไธ€ไธชๅŒ…ๅซๅคš่กŒ็š„ๆ–‡ๆœฌๆ–‡ไปถ๏ผŒๆฏไธ€่กŒ่กจ็คบไธ€ไธชๆ ทๆœฌ่ง†้ข‘๏ผŒๅŒ…ๆ‹ฌ `filepath`๏ผˆ็›ธๅฏน่ทฏๅพ„๏ผ‰ๅ’Œ `label`๏ผŒ็”จ็ฉบๆ ผๅˆ†้š”ใ€‚ + + ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹ใ€‚ + + ``` + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + ``` + +### ๅŸบไบŽ้ชจ้ชผ็‚น็š„ๅŠจไฝœ่ฏ†ๅˆซ + +่ฏฅไปปๅŠกๅŸบไบŽ้ชจ้ชผๅบๅˆ—๏ผˆๅ…ณ้”ฎ็‚น็š„ๆ—ถ้—ดๅบๅˆ—๏ผ‰่ฏ†ๅˆซๅŠจไฝœ็ฑปๅˆซใ€‚ๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไบ›ๆ–นๆณ•ๆฅๆž„ๅปบ่‡ชๅฎšไน‰็š„้ชจ้ชผๆ•ฐๆฎ้›†ใ€‚ + +- ไปŽ RGB ่ง†้ข‘ๆ•ฐๆฎๆž„ๅปบ + + ๆ‚จ้œ€่ฆไปŽ่ง†้ข‘ไธญๆๅ–ๅ…ณ้”ฎ็‚นๆ•ฐๆฎ๏ผŒๅนถๅฐ†ๅ…ถ่ฝฌๆขไธบๆ”ฏๆŒ็š„ๆ ผๅผใ€‚ๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไธช[ๆ•™็จ‹](https://github.com/open-mmlab/mmaction2/tree/main/configs/skeleton/posec3d/custom_dataset_training.md)๏ผŒ่ฏฆ็ป†ไป‹็ปไบ†ๅฆ‚ไฝ•ๆ‰ง่กŒใ€‚ + +- ไปŽ็Žฐๆœ‰ๅ…ณ้”ฎ็‚นๆ•ฐๆฎๆž„ๅปบ + + ๅ‡่ฎพๆ‚จๅทฒ็ปๆœ‰ไบ† coco ๆ ผๅผ็š„ๅ…ณ้”ฎ็‚นๆ•ฐๆฎ๏ผŒๆ‚จๅฏไปฅๅฐ†ๅฎƒไปฌๆ”ถ้›†ๅˆฐไธ€ไธช pickle ๆ–‡ไปถไธญใ€‚ + + ๆฏไธช pickle ๆ–‡ไปถๅฏนๅบ”ไธ€ไธชๅŠจไฝœ่ฏ†ๅˆซๆ•ฐๆฎ้›†ใ€‚pickle ๆ–‡ไปถ็š„ๅ†…ๅฎนๆ˜ฏไธ€ไธชๅญ—ๅ…ธ๏ผŒๅŒ…ๅซไธคไธชๅญ—ๆฎต๏ผš`split` ๅ’Œ `annotations` + + 1. Split๏ผš`split` ๅญ—ๆฎต็š„ๅ€ผๆ˜ฏไธ€ไธชๅญ—ๅ…ธ๏ผš้”ฎๆ˜ฏๆ‹†ๅˆ†ๅ็งฐ๏ผŒๅ€ผๆ˜ฏๅฑžไบŽ็‰นๅฎšๅ‰ช่พ‘็š„่ง†้ข‘ๆ ‡่ฏ†็ฌฆๅˆ—่กจใ€‚ + 2. 
Annotations๏ผš`annotations` ๅญ—ๆฎต็š„ๅ€ผๆ˜ฏไธ€ไธช้ชจ้ชผๆณจ้‡Šๅˆ—่กจ๏ผŒๆฏไธช้ชจ้ชผๆณจ้‡Šๆ˜ฏไธ€ไธชๅญ—ๅ…ธ๏ผŒๅŒ…ๅซไปฅไธ‹ๅญ—ๆฎต๏ผš + - `frame_dir`๏ผˆstr๏ผ‰๏ผšๅฏนๅบ”่ง†้ข‘็š„ๆ ‡่ฏ†็ฌฆใ€‚ + - `total_frames`๏ผˆint๏ผ‰๏ผšๆญค่ง†้ข‘ไธญ็š„ๅธงๆ•ฐใ€‚ + - `img_shape`๏ผˆtuple\[int\]๏ผ‰๏ผš่ง†้ข‘ๅธง็š„ๅฝข็Šถ๏ผŒไธ€ไธชๅŒ…ๅซไธคไธชๅ…ƒ็ด ็š„ๅ…ƒ็ป„๏ผŒๆ ผๅผไธบ `(height, width)`ใ€‚ไป…ๅฏน 2D ้ชจ้ชผ้œ€่ฆใ€‚ + - `original_shape`๏ผˆtuple\[int\]๏ผ‰๏ผšไธŽ `img_shape` ็›ธๅŒใ€‚ + - `label`๏ผˆint๏ผ‰๏ผšๅŠจไฝœๆ ‡็ญพใ€‚ + - `keypoint`๏ผˆnp.ndarray๏ผŒๅฝข็Šถไธบ `[M x T x V x C]`๏ผ‰๏ผšๅ…ณ้”ฎ็‚นๆณจ้‡Šใ€‚ + - M๏ผšไบบๆ•ฐ๏ผ› + - T๏ผšๅธงๆ•ฐ๏ผˆไธŽ `total_frames` ็›ธๅŒ๏ผ‰๏ผ› + - V๏ผšๅ…ณ้”ฎ็‚นๆ•ฐ้‡๏ผˆNTURGB+D 3D ้ชจ้ชผไธบ 25๏ผŒCoco ไธบ 17๏ผŒOpenPose ไธบ 18 ็ญ‰๏ผ‰๏ผ› + - C๏ผšๅ…ณ้”ฎ็‚นๅๆ ‡็š„็ปดๆ•ฐ๏ผˆ2D ๅ…ณ้”ฎ็‚นไธบ C=2๏ผŒ3D ๅ…ณ้”ฎ็‚นไธบ C=3๏ผ‰ใ€‚ + - `keypoint_score`๏ผˆnp.ndarray๏ผŒๅฝข็Šถไธบ `[M x T x V]`๏ผ‰๏ผšๅ…ณ้”ฎ็‚น็š„็ฝฎไฟกๅบฆๅˆ†ๆ•ฐใ€‚ไป…ๅฏน 2D ้ชจ้ชผ้œ€่ฆใ€‚ + + ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผš + + ``` + { + "split": + { + 'xsub_train': + ['S001C001P001R001A001', ...], + 'xsub_val': + ['S001C001P003R001A001', ...], + ... + } + + "annotations: + [ + { + { + 'frame_dir': 'S001C001P001R001A001', + 'label': 0, + 'img_shape': (1080, 1920), + 'original_shape': (1080, 1920), + 'total_frames': 103, + 'keypoint': array([[[[1032. , 334.8], ...]]]) + 'keypoint_score': array([[[0.934 , 0.9766, ...]]]) + }, + { + 'frame_dir': 'S001C001P003R001A001', + ... + }, + ... + + } + ] + } + ``` + + ๆ”ฏๆŒๅ…ถไป–ๅ…ณ้”ฎ็‚นๆ ผๅผ้œ€่ฆ่ฟ›่กŒ่ฟ›ไธ€ๆญฅไฟฎๆ”น๏ผŒ่ฏทๅ‚่€ƒ[่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](../advanced_guides/customize_dataset.md)ใ€‚ + +### ๅŸบไบŽ้Ÿณ้ข‘็š„ๅŠจไฝœ่ฏ†ๅˆซ + +MMAction2 ๆ”ฏๆŒๅŸบไบŽ `AudioDataset` ็š„้Ÿณ้ข‘ๅŠจไฝœ่ฏ†ๅˆซไปปๅŠกใ€‚่ฏฅไปปๅŠกไฝฟ็”จๆข…ๅฐ”้ข‘่ฐฑ็‰นๅพไฝœไธบ่พ“ๅ…ฅ, ๆณจ้‡Šๆ–‡ไปถๆ ผๅผ็คบไพ‹ๅฆ‚ไธ‹๏ผš + +``` +ihWykL5mYRI.npy 300 153 +lumzQD42AN8.npy 240 321 +sWFRmD9Of4s.npy 250 250 +w_IpfgRsBVA.npy 300 356 +``` + +ๆฏไธ€่กŒไปฃ่กจไธ€ไธช่ฎญ็ปƒๆ ทๆœฌ๏ผŒไปฅ็ฌฌไธ€่กŒไธบไพ‹๏ผŒ`ihWykL5mYRI.npy` ไธบๆข…ๅฐ”้ข‘่ฐฑ็‰นๅพ็š„ๆ–‡ไปถๅ๏ผŒ`300` ไธบ่ฏฅๆข…ๅฐ”้ข‘่ฐฑ็‰นๅพๆ–‡ไปถๅฏนๅบ”็š„ๅŽŸ่ง†้ข‘ๆ–‡ไปถ็š„ๆ€ปๅธงๆ•ฐ๏ผŒ`153` ไธบ็ฑปๅˆซๆ ‡็ญพใ€‚ๆˆ‘ไปฌๅˆ†ไปฅไธ‹ไธค้˜ถๆฎต็”Ÿๆˆๆ‰€้œ€่ฆ็š„ๆข…ๅฐ”้ข‘่ฐฑ็‰นๅพๆ–‡ไปถๆ•ฐๆฎ๏ผš + +้ฆ–ๅ…ˆ๏ผŒ้€š่ฟ‡่ง†้ข‘ๆ–‡ไปถๆๅ–`้Ÿณ้ข‘ๆ–‡ไปถ`: + +``` +cd $MMACTION2 +python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \ + [--level ${LEVEL}] +``` + +- `ROOT`: ่ง†้ข‘็š„ๆ น็›ฎๅฝ•ใ€‚ +- `DST_ROOT`: ๅญ˜ๆ”พ็”Ÿๆˆ้Ÿณ้ข‘็š„ๆ น็›ฎๅฝ•ใ€‚ +- `EXT`: ่ง†้ข‘็š„ๅŽ็ผ€ๅ๏ผŒๅฆ‚ `mp4`ใ€‚ +- `N_WORKERS`: ไฝฟ็”จ็š„่ฟ›็จ‹ๆ•ฐ้‡ใ€‚ + +ไธ‹ไธ€ๆญฅ๏ผŒไปŽ้Ÿณ้ข‘ๆ–‡ไปถ็”Ÿๆˆ`ๆข…ๅฐ”้ข‘่ฐฑ็‰นๅพ`: + +``` +cd $MMACTION2 +python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \ + [--ext $EXT] [--num-workers $N_WORKERS] [--part $PART] +``` + +- `AUDIO_HOME_PATH`: ้Ÿณ้ข‘ๆ–‡ไปถ็š„ๆ น็›ฎๅฝ•ใ€‚ +- `SPECTROGRAM_SAVE_PATH`: ๅญ˜ๆ”พ็”Ÿๆˆ้Ÿณ้ข‘็‰นๅพ็š„ๆ น็›ฎๅฝ•ใ€‚ +- `EXT`: ้Ÿณ้ข‘็š„ๅŽ็ผ€ๅ๏ผŒๅฆ‚ `m4a`ใ€‚ +- `N_WORKERS`: ไฝฟ็”จ็š„่ฟ›็จ‹ๆ•ฐ้‡ใ€‚ +- `PART`: ๅฐ†ๅฎŒๆ•ด็š„่งฃ็ ไปปๅŠกๅˆ†ไธบๅ‡ ้ƒจๅˆ†ๅนถๆ‰ง่กŒๅ…ถไธญไธ€ไปฝใ€‚ๅฆ‚ `2/5` ่กจ็คบๅฐ†ๆ‰€ๆœ‰ๅพ…่งฃ็ ๆ•ฐๆฎๅˆ†ๆˆ 5 ไปฝ๏ผŒๅนถๅฏนๅ…ถไธญ็š„็ฌฌ 2 ไปฝ่ฟ›่กŒ่งฃ็ ใ€‚่ฟ™ไธ€้€‰้กนๅœจ็”จๆˆทๆœ‰ๅคšๅฐๆœบๅ™จๆ—ถๅ‘ๆŒฅไฝœ็”จใ€‚ + +### ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹ + +MMAction2 ๆ”ฏๆŒๅŸบไบŽ `AVADataset` ็š„ๆ—ถ็ฉบๅŠจไฝœๆฃ€ๆต‹ไปปๅŠกใ€‚ๆณจ้‡ŠๅŒ…ๅซ็œŸๅฎž่พน็•Œๆก†ๅ’Œๆ่ฎฎ่พน็•Œๆก†ใ€‚ + +- ็œŸๅฎž่พน็•Œๆก† + ็œŸๅฎž่พน็•Œๆก†ๆ˜ฏไธ€ไธชๅŒ…ๅซๅคš่กŒ็š„ csv ๆ–‡ไปถ๏ผŒๆฏไธ€่กŒๆ˜ฏไธ€ไธชๅธง็š„ๆฃ€ๆต‹ๆ ทๆœฌ๏ผŒๆ ผๅผๅฆ‚ไธ‹๏ผš + + video_identifier, time_stamp, lt_x, lt_y, rb_x, 
rb_y, label, entity_id
+  ๆฏไธชๅญ—ๆฎต็š„ๅซไน‰ๅฆ‚ไธ‹๏ผš
+  `video_identifier`๏ผšๅฏนๅบ”่ง†้ข‘็š„ๆ ‡่ฏ†็ฌฆ
+  `time_stamp`๏ผšๅฝ“ๅ‰ๅธง็š„ๆ—ถ้—ดๆˆณ
+  `lt_x`๏ผšๅทฆไธŠ่ง’็‚น็š„่ง„่ŒƒๅŒ– x ๅๆ ‡
+  `lt_y`๏ผšๅทฆไธŠ่ง’็‚น็š„่ง„่ŒƒๅŒ– y ๅๆ ‡
+  `rb_x`๏ผšๅณไธ‹่ง’็‚น็š„่ง„่ŒƒๅŒ– x ๅๆ ‡
+  `rb_y`๏ผšๅณไธ‹่ง’็‚น็š„่ง„่ŒƒๅŒ– y ๅๆ ‡
+  `label`๏ผšๅŠจไฝœๆ ‡็ญพ
+  `entity_id`๏ผšไธ€ไธชๅ”ฏไธ€็š„ๆ•ดๆ•ฐ๏ผŒๅ…่ฎธๅฐ†ๆญคๆก†ไธŽ่ฏฅ่ง†้ข‘็›ธ้‚ปๅธงไธญๆ็ป˜ๅŒไธ€ไธชไบบ็š„ๅ…ถไป–ๆก†่ฟžๆŽฅ่ตทๆฅ
+
+  ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผš
+
+  ```
+  _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0
+  _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0
+  ...
+  ```
+
+- ๆ่ฎฎ่พน็•Œๆก†
+  ๆ่ฎฎ่พน็•Œๆก†ๆ˜ฏ็”ฑไธ€ไธชไบบไฝ“ๆฃ€ๆต‹ๅ™จ็”Ÿๆˆ็š„ pickle ๆ–‡ไปถ๏ผŒ้€šๅธธ้œ€่ฆๅœจ็›ฎๆ ‡ๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒๅพฎ่ฐƒใ€‚pickle ๆ–‡ไปถๅŒ…ๅซไธ€ไธชๅธฆๆœ‰ไปฅไธ‹ๆ•ฐๆฎ็ป“ๆž„็š„ๅญ—ๅ…ธ๏ผš
+
+  `{'video_identifier,time_stamp': bbox_info}`
+
+  video_identifier๏ผˆstr๏ผ‰๏ผšๅฏนๅบ”่ง†้ข‘็š„ๆ ‡่ฏ†็ฌฆ
+  time_stamp๏ผˆint๏ผ‰๏ผšๅฝ“ๅ‰ๅธง็š„ๆ—ถ้—ดๆˆณ
+  bbox_info๏ผˆnp.ndarray๏ผŒๅฝข็Šถไธบ `[n, 5]`๏ผ‰๏ผšๆฃ€ๆต‹ๅˆฐ็š„่พน็•Œๆก†๏ผŒๆฏไธ€่กŒไธบ \<x1\> \<y1\> \<x2\> \<y2\> \<score\>ใ€‚x1ใ€x2ใ€y1ใ€y2 ๆ˜ฏ็›ธๅฏนไบŽๅธงๅคงๅฐๅฝ’ไธ€ๅŒ–็š„ๅ€ผ๏ผŒ่Œƒๅ›ดไธบ 0.0-1.0ใ€‚
+
+### ๆ—ถๅบๅŠจไฝœๅฎšไฝ
+
+ๆˆ‘ไปฌๆ”ฏๆŒๅŸบไบŽ `ActivityNetDataset` ็š„ๆ—ถๅบๅŠจไฝœๅฎšไฝใ€‚ActivityNet ๆ•ฐๆฎ้›†็š„ๆณจ้‡Šๆ˜ฏไธ€ไธช json ๆ–‡ไปถใ€‚ๆฏไธช้”ฎๆ˜ฏไธ€ไธช่ง†้ข‘ๅ๏ผŒ็›ธๅบ”็š„ๅ€ผๆ˜ฏ่ง†้ข‘็š„ๅ…ƒๆ•ฐๆฎๅ’Œๆณจ้‡Šใ€‚
+
+ไปฅไธ‹ๆ˜ฏไธ€ไธช็คบไพ‹๏ผš
+
+```
+{
+  "video1": {
+      "duration_second": 211.53,
+      "duration_frame": 6337,
+      "annotations": [
+          {
+              "segment": [
+                  30.025882995319815,
+                  205.2318595943838
+              ],
+              "label": "Rock climbing"
+          }
+      ],
+      "feature_frame": 6336,
+      "fps": 30.0,
+      "rfps": 29.9579255898
+  },
+  "video2": {...
+  }
+  ...
+}
+```
+
+## ไฝฟ็”จๆททๅˆๆ•ฐๆฎ้›†่ฟ›่กŒ่ฎญ็ปƒ
+
+MMAction2 ่ฟ˜ๆ”ฏๆŒๆททๅˆๆ•ฐๆฎ้›†่ฟ›่กŒ่ฎญ็ปƒใ€‚็›ฎๅ‰๏ผŒๅฎƒๆ”ฏๆŒ้‡ๅคๆ•ฐๆฎ้›†ใ€‚
+
+### ้‡ๅคๆ•ฐๆฎ้›†
+
+ๆˆ‘ไปฌไฝฟ็”จ `RepeatDataset` ไฝœไธบๅŒ…่ฃ…ๅ™จๆฅ้‡ๅคๆ•ฐๆฎ้›†ใ€‚ไพ‹ๅฆ‚๏ผŒๅ‡่ฎพๅŽŸๅง‹ๆ•ฐๆฎ้›†ไธบ `Dataset_A`๏ผŒ่ฆ้‡ๅคๅฎƒ๏ผŒ้…็ฝฎๅฆ‚ไธ‹ๆ‰€็คบ๏ผš
+
+```python
+dataset_A_train = dict(
+    type='RepeatDataset',
+    times=N,
+    dataset=dict(  # ่ฟ™ๆ˜ฏ Dataset_A ็š„ๅŽŸๅง‹้…็ฝฎ
+        type='Dataset_A',
+        ...
+        pipeline=train_pipeline
+    )
+)
+```
+
+## ๆต่งˆๆ•ฐๆฎ้›†
+
+ๅณๅฐ†ๆŽจๅ‡บ...
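+
+ๅœจๆต่งˆๅทฅๅ…ทๆŽจๅ‡บไน‹ๅ‰๏ผŒๅฏไปฅ็”จไธ€ๆฎต็ฎ€็Ÿญ็š„่„šๆœฌๅฟซ้€ŸๆŠฝๆŸฅๆ•ฐๆฎ้›†ๆ˜ฏๅฆๅ‡†ๅค‡ๅฐฑ็ปชใ€‚ไธ‹้ขๆ˜ฏไธ€ไธชๆœ€ๅฐ็š„็คบๆ„่„šๆœฌ๏ผˆไป…ไพ›ๅ‚่€ƒ๏ผŒๅ…ถไธญ็š„้…็ฝฎๆ–‡ไปถ่ทฏๅพ„ไธบๅ‡่ฎพ๏ผŒ่ฏทๆ›ฟๆขไธบๆ‚จๅฎž้™…ไฝฟ็”จ็š„้…็ฝฎ๏ผ‰๏ผšๅฎƒๆ นๆฎ้…็ฝฎๆž„ๅปบ่ฎญ็ปƒๆ•ฐๆฎ้›†ๅนถๅ–ๅ‡บ็ฌฌไธ€ไธชๆ ทๆœฌ๏ผŒ็”จไบŽๅœจๆญฃๅผ่ฎญ็ปƒๅ‰้ชŒ่ฏๆ ‡ๆณจๆ–‡ไปถๅ’Œๆ•ฐๆฎๆตๆฐด็บฟ่ƒฝๅฆ่ขซๆญฃ็กฎ่งฃๆžใ€‚
+
+```python
+from mmengine.config import Config
+from mmengine.registry import init_default_scope
+
+from mmaction.registry import DATASETS
+
+# ้…็ฝฎๆ–‡ไปถ่ทฏๅพ„ไป…ไธบ็คบไพ‹๏ผŒ่ฏทๆ›ฟๆขไธบๅฎž้™…ไฝฟ็”จ็š„้…็ฝฎ
+cfg = Config.fromfile('configs/recognition/tsn/tsn_ucf101.py')
+init_default_scope(cfg.get('default_scope', 'mmaction'))
+
+# ๆ นๆฎ้…็ฝฎไธญ็š„ๆ•ฐๆฎ้›†ๅญ—ๆฎตๆž„ๅปบ่ฎญ็ปƒๆ•ฐๆฎ้›†
+dataset = DATASETS.build(cfg.train_dataloader.dataset)
+print(f'ๆ ทๆœฌๆ€ปๆ•ฐ: {len(dataset)}')
+
+# ๅ–ๅ‡บ็ฌฌไธ€ไธชๆ ทๆœฌ๏ผŒ่ฟ™ไผšๆ‰ง่กŒๅฎŒๆ•ด็š„ๆ•ฐๆฎๆตๆฐด็บฟ๏ผˆ่งฃ็ ใ€้‡‡ๆ ทใ€ๆ‰“ๅŒ…็ญ‰๏ผ‰
+sample = dataset[0]
+print(sample['inputs'].shape)  # ็ฝ‘็ปœ่พ“ๅ…ฅ็š„ๅฝข็Šถ
+print(sample['data_samples'])  # ๅฏนๅบ”็š„ๆ ‡ๆณจไฟกๆฏ
+```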
diff --git a/docs/zh_cn/user_guides/train_test.md b/docs/zh_cn/user_guides/train_test.md new file mode 100644 index 0000000000000000000000000000000000000000..1da2e3cf3b1c8c1f84e366f825fd09721c5e9294 --- /dev/null +++ b/docs/zh_cn/user_guides/train_test.md @@ -0,0 +1,248 @@ +# ่ฎญ็ปƒไธŽๆต‹่ฏ• + +- [่ฎญ็ปƒไธŽๆต‹่ฏ•](#่ฎญ็ปƒไธŽๆต‹่ฏ•) + - [่ฎญ็ปƒ](#่ฎญ็ปƒ) + - [ไฝฟ็”จๅ•ไธช GPU ่ฟ›่กŒ่ฎญ็ปƒ](#ไฝฟ็”จๅ•ไธช-gpu-่ฟ›่กŒ่ฎญ็ปƒ) + - [ไฝฟ็”จๅคšไธช GPU ่ฟ›่กŒ่ฎญ็ปƒ](#ไฝฟ็”จๅคšไธช-gpu-่ฟ›่กŒ่ฎญ็ปƒ) + - [ไฝฟ็”จๅคšๅฐๆœบๅ™จ่ฟ›่กŒ่ฎญ็ปƒ](#ไฝฟ็”จๅคšๅฐๆœบๅ™จ่ฟ›่กŒ่ฎญ็ปƒ) + - [ๅŒไธ€็ฝ‘็ปœไธญ็š„ๅคšๅฐๆœบๅ™จ](#ๅŒไธ€็ฝ‘็ปœไธญ็š„ๅคšๅฐๆœบๅ™จ) + - [ไฝฟ็”จ slurm ็ฎก็†็š„ๅคšๅฐๆœบๅ™จ](#ไฝฟ็”จ-slurm-็ฎก็†็š„ๅคšๅฐๆœบๅ™จ) + - [ๆต‹่ฏ•](#ๆต‹่ฏ•) + - [ไฝฟ็”จๅ•ไธช GPU ่ฟ›่กŒๆต‹่ฏ•](#ไฝฟ็”จๅ•ไธช-gpu-่ฟ›่กŒๆต‹่ฏ•) + - [ไฝฟ็”จๅคšไธช GPU ่ฟ›่กŒๆต‹่ฏ•](#ไฝฟ็”จๅคšไธช-gpu-่ฟ›่กŒๆต‹่ฏ•) + - [ไฝฟ็”จๅคšๅฐๆœบๅ™จ่ฟ›่กŒๆต‹่ฏ•](#ไฝฟ็”จๅคšๅฐๆœบๅ™จ่ฟ›่กŒๆต‹่ฏ•) + - [ๅŒไธ€็ฝ‘็ปœไธญ็š„ๅคšๅฐๆœบๅ™จ](#ๅŒไธ€็ฝ‘็ปœไธญ็š„ๅคšๅฐๆœบๅ™จ-1) + - [ไฝฟ็”จ slurm ็ฎก็†็š„ๅคšๅฐๆœบๅ™จ](#ไฝฟ็”จ-slurm-็ฎก็†็š„ๅคšๅฐๆœบๅ™จ-1) + +## ่ฎญ็ปƒ + +### ไฝฟ็”จๅ•ไธช GPU ่ฟ›่กŒ่ฎญ็ปƒ + +ๆ‚จๅฏไปฅไฝฟ็”จ `tools/train.py` ๅœจไธ€ๅฐๅธฆๆœ‰ CPU ๅ’Œ GPU(ๅฏ้€‰) ็š„ๅ•ๆœบไธŠ่ฎญ็ปƒๆจกๅž‹ใ€‚ + +ไธ‹้ขๆ˜ฏ่„šๆœฌ็š„ๅฎŒๆ•ด็”จๆณ•๏ผš + +```shell +python tools/train.py ${CONFIG_FILE} [ARGS] +``` + +````{note} +้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒMMAction2 ๆ›ดๅ€พๅ‘ไบŽไฝฟ็”จ GPU ่€Œไธๆ˜ฏ CPU ่ฟ›่กŒ่ฎญ็ปƒใ€‚ๅฆ‚ๆžœๆ‚จๆƒณๅœจ CPU ไธŠ่ฎญ็ปƒๆจกๅž‹๏ผŒ่ฏทๆธ…็ฉบ `CUDA_VISIBLE_DEVICES` ๆˆ–ๅฐ†ๅ…ถ่ฎพ็ฝฎไธบ -1 ไปฅไฝฟ GPU ๅฏน็จ‹ๅบไธๅฏ่งใ€‚ + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS] +``` +```` + +| ๅ‚ๆ•ฐ | ๆ่ฟฐ | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | ้…็ฝฎๆ–‡ไปถ็š„่ทฏๅพ„ใ€‚ | +| `--work-dir WORK_DIR` | ไฟๅญ˜ๆ—ฅๅฟ—ๅ’Œๆƒ้‡็š„็›ฎๆ ‡ๆ–‡ไปถๅคนใ€‚้ป˜่ฎคไธบไธŽ้…็ฝฎๆ–‡ไปถ็›ธๅŒๅ็งฐ็š„ๆ–‡ไปถๅคน๏ผŒไฝไบŽ `./work_dirs` ไธ‹ใ€‚ | +| `--resume [RESUME]` | ๆขๅค่ฎญ็ปƒใ€‚ๅฆ‚ๆžœๆŒ‡ๅฎšไบ†่ทฏๅพ„๏ผŒๅˆ™ไปŽ่ฏฅ่ทฏๅพ„ๆขๅค๏ผŒๅฆ‚ๆžœๆœชๆŒ‡ๅฎš๏ผŒๅˆ™ๅฐ่ฏ•ไปŽๆœ€ๆ–ฐ็š„ๆƒ้‡่‡ชๅŠจๆขๅคใ€‚ | +| `--amp` | ๅฏ็”จ่‡ชๅŠจๆททๅˆ็ฒพๅบฆ่ฎญ็ปƒใ€‚ | +| `--no-validate` | **ไธๅปบ่ฎฎไฝฟ็”จ**ใ€‚ๅœจ่ฎญ็ปƒๆœŸ้—ด็ฆ็”จๆƒ้‡่ฏ„ไผฐใ€‚ | +| `--auto-scale-lr` | ๆ นๆฎๅฎž้™…ๆ‰นๆฌกๅคงๅฐๅ’ŒๅŽŸๅง‹ๆ‰นๆฌกๅคงๅฐ่‡ชๅŠจ็ผฉๆ”พๅญฆไน ็އใ€‚ | +| `--seed` | ้šๆœบ็งๅญใ€‚ | +| `--diff-rank-seed` | ๆ˜ฏๅฆไธบไธๅŒ็š„ rank ่ฎพ็ฝฎไธๅŒ็š„็งๅญใ€‚ | +| `--deterministic` | ๆ˜ฏๅฆไธบ CUDNN ๅŽ็ซฏ่ฎพ็ฝฎ็กฎๅฎšๆ€ง้€‰้กนใ€‚ | +| `--cfg-options CFG_OPTIONS` | ่ฆ†็›–ไฝฟ็”จ็š„้…็ฝฎไธญ็š„ๆŸไบ›่ฎพ็ฝฎ๏ผŒxxx=yyy ๆ ผๅผ็š„้”ฎๅ€ผๅฏนๅฐ†ๅˆๅนถๅˆฐ้…็ฝฎๆ–‡ไปถไธญใ€‚ๅฆ‚ๆžœ่ฆ่ฆ†็›–็š„ๅ€ผๆ˜ฏไธ€ไธชๅˆ—่กจ๏ผŒๅˆ™ๅบ”้‡‡็”จ `key="[a,b]"` ๆˆ– `key=a,b` ็š„ๅฝขๅผใ€‚่ฏฅๅ‚ๆ•ฐ่ฟ˜ๅ…่ฎธๅตŒๅฅ—็š„ๅˆ—่กจ/ๅ…ƒ็ป„ๅ€ผ๏ผŒไพ‹ๅฆ‚ `key="[(a,b),(c,d)]"`ใ€‚่ฏทๆณจๆ„๏ผŒๅผ•ๅทๆ˜ฏๅฟ…้œ€็š„๏ผŒไธ”ไธๅ…่ฎธๆœ‰็ฉบๆ ผใ€‚ | +| `--launcher {none,pytorch,slurm,mpi}` | ไฝœไธšๅฏๅŠจๅ™จ็š„้€‰้กนใ€‚้ป˜่ฎคไธบ `none`ใ€‚ | + +### ไฝฟ็”จๅคšไธช GPU ่ฟ›่กŒ่ฎญ็ปƒ + +ๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไธช shell ่„šๆœฌไฝฟ็”จ `torch.distributed.launch` ๆฅๅฏๅŠจๅคšไธช GPU ็š„่ฎญ็ปƒไปปๅŠกใ€‚ + +```shell +bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +| ๅ‚ๆ•ฐ | ๆ่ฟฐ | +| ---------- | ----------------------------------------------------------------------- | +| `CONFIG` | ้…็ฝฎๆ–‡ไปถ็š„่ทฏๅพ„ใ€‚ | +| `GPUS` | ่ฆไฝฟ็”จ็š„ GPU ๆ•ฐ้‡ใ€‚ | +| `[PYARGS]` | `tools/train.py` 
็š„ๅ…ถไป–ๅฏ้€‰ๅ‚ๆ•ฐ๏ผŒ่ฏทๅ‚่ง[่ฟ™้‡Œ](#ไฝฟ็”จๅ•ไธช-gpu-่ฟ›่กŒ่ฎญ็ปƒ)ใ€‚ | + +ๆ‚จ่ฟ˜ๅฏไปฅ้€š่ฟ‡็Žฏๅขƒๅ˜้‡ๆฅๆŒ‡ๅฎšๅฏๅŠจๅ™จ็š„ๅ…ถไป–ๅ‚ๆ•ฐใ€‚ไพ‹ๅฆ‚๏ผŒไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๅฐ†ๅฏๅŠจๅ™จ็š„้€šไฟก็ซฏๅฃๆ›ดๆ”นไธบ 29666๏ผš + +```shell +PORT=29666 bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +ๅฆ‚ๆžœๆ‚จๆƒณๅฏๅŠจๅคšไธช่ฎญ็ปƒไฝœไธšๅนถไฝฟ็”จไธๅŒ็š„ GPU๏ผŒๅฏไปฅ้€š่ฟ‡ๆŒ‡ๅฎšไธๅŒ็š„็ซฏๅฃๅ’Œๅฏ่ง่ฎพๅค‡ๆฅๅฏๅŠจๅฎƒไปฌใ€‚ + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +``` + +### ไฝฟ็”จๅคšๅฐๆœบๅ™จ่ฟ›่กŒ่ฎญ็ปƒ + +#### ๅŒไธ€็ฝ‘็ปœไธญ็š„ๅคšๅฐๆœบๅ™จ + +ๅฆ‚ๆžœๆ‚จไฝฟ็”จไปฅๅคช็ฝ‘่ฟžๆŽฅ็š„ๅคšๅฐๆœบๅ™จๅฏๅŠจ่ฎญ็ปƒไฝœไธš๏ผŒๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค๏ผš + +ๅœจ็ฌฌไธ€ๅฐๆœบๅ™จไธŠ๏ผš + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +ๅœจ็ฌฌไบŒๅฐๆœบๅ™จไธŠ๏ผš + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +้œ€่ฆๆŒ‡ๅฎšไปฅไธ‹้ขๅค–็š„็Žฏๅขƒๅ˜้‡ๆฅ่ฎญ็ปƒๆˆ–ๆต‹่ฏ•ๅคšๅฐๆœบๅ™จไธŠ็š„ๆจกๅž‹๏ผš + +| ENV_VARS | ๆ่ฟฐ | +| ------------- | ---------------------------------------------------------------- | +| `NNODES` | ๆœบๅ™จ็š„ๆ€ปๆ•ฐใ€‚้ป˜่ฎคไธบ 1ใ€‚ | +| `NODE_RANK` | ๆœฌๅœฐๆœบๅ™จ็š„็ดขๅผ•ใ€‚้ป˜่ฎคไธบ 0ใ€‚ | +| `PORT` | ้€šไฟก็ซฏๅฃ๏ผŒๅœจๆ‰€ๆœ‰ๆœบๅ™จไธŠๅบ”่ฏฅไฟๆŒไธ€่‡ดใ€‚้ป˜่ฎคไธบ 29500ใ€‚ | +| `MASTER_ADDR` | ไธปๆœบๅ™จ็š„ IP ๅœฐๅ€๏ผŒๅœจๆ‰€ๆœ‰ๆœบๅ™จไธŠๅบ”่ฏฅไฟๆŒไธ€่‡ดใ€‚้ป˜่ฎคไธบ `127.0.0.1`ใ€‚ | + +้€šๅธธ๏ผŒๅฆ‚ๆžœๆ‚จๆฒกๆœ‰้ซ˜้€Ÿ็ฝ‘็ปœ๏ผˆๅฆ‚ InfiniBand๏ผ‰๏ผŒๅˆ™้€Ÿๅบฆไผšๆฏ”่พƒๆ…ขใ€‚ + +#### ไฝฟ็”จ slurm ็ฎก็†็š„ๅคšๅฐๆœบๅ™จ + +ๅฆ‚ๆžœๆ‚จๅœจไฝฟ็”จ [slurm](https://slurm.schedmd.com/) ็ฎก็†็š„้›†็พคไธŠ่ฟ่กŒ MMAction2๏ผŒๅฏไปฅไฝฟ็”จ่„šๆœฌ `slurm_train.sh`ใ€‚ + +```shell +[ENV_VARS] bash tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG} [PY_ARGS] +``` + +ไธ‹้ขๆ˜ฏ่ฏฅ่„šๆœฌ็š„ๅ‚ๆ•ฐๆ่ฟฐใ€‚ + +| ๅ‚ๆ•ฐ | ๆ่ฟฐ | +| ----------- | ----------------------------------------------------------------------- | +| `PARTITION` | ้›†็พคไธญ่ฆไฝฟ็”จ็š„ๅˆ†ๅŒบใ€‚ | +| `JOB_NAME` | ไฝœไธš็š„ๅ็งฐ๏ผŒๆ‚จๅฏไปฅ่‡ชๅฎšไน‰ใ€‚ | +| `CONFIG` | ้…็ฝฎๆ–‡ไปถ็š„่ทฏๅพ„ใ€‚ | +| `[PYARGS]` | `tools/train.py` ็š„ๅ…ถไป–ๅฏ้€‰ๅ‚ๆ•ฐ๏ผŒ่ฏทๅ‚่ง[่ฟ™้‡Œ](#ไฝฟ็”จๅ•ไธช-gpu-่ฟ›่กŒ่ฎญ็ปƒ)ใ€‚ | + +ไธ‹้ขๅˆ—ๅ‡บไบ†ๅฏ็”จไบŽ้…็ฝฎ slurm ไฝœไธš็š„็Žฏๅขƒๅ˜้‡ใ€‚ + +| ENV_VARS | ๆ่ฟฐ | +| --------------- | -------------------------------------------------------------------------------- | +| `GPUS` | ่ฆไฝฟ็”จ็š„ GPU ๆ•ฐ้‡ใ€‚้ป˜่ฎคไธบ 8ใ€‚ | +| `GPUS_PER_NODE` | ๆฏไธช่Š‚็‚น่ฆๅˆ†้…็š„ GPU ๆ•ฐ้‡ใ€‚้ป˜่ฎคไธบ 8ใ€‚ | +| `CPUS_PER_TASK` | ๆฏไธชไปปๅŠก่ฆๅˆ†้…็š„ CPU ๆ•ฐ้‡๏ผˆ้€šๅธธไธ€ไธช GPU ๅฏนๅบ”ไธ€ไธชไปปๅŠก๏ผ‰ใ€‚้ป˜่ฎคไธบ 5ใ€‚ | +| `SRUN_ARGS` | `srun` ็š„ๅ…ถไป–ๅ‚ๆ•ฐใ€‚ๅฏ็”จ้€‰้กนๅฏๅœจ[่ฟ™้‡Œ](https://slurm.schedmd.com/srun.html)ๆ‰พๅˆฐใ€‚ | + +## ๆต‹่ฏ• + +### ไฝฟ็”จๅ•ไธช GPU ่ฟ›่กŒๆต‹่ฏ• + +ๆ‚จๅฏไปฅไฝฟ็”จ `tools/test.py` ๅœจไธ€ๅฐๅธฆๆœ‰ CPU ๅ’Œๅฏ้€‰ GPU ็š„ๅ•ๆœบไธŠๆต‹่ฏ•ๆจกๅž‹ใ€‚ + +ไธ‹้ขๆ˜ฏ่„šๆœฌ็š„ๅฎŒๆ•ด็”จๆณ•๏ผš + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` + +````{note} +้ป˜่ฎคๆƒ…ๅ†ตไธ‹๏ผŒMMAction2 ๆ›ดๅ€พๅ‘ไบŽไฝฟ็”จ GPU ่€Œไธๆ˜ฏ CPU ่ฟ›่กŒๆต‹่ฏ•ใ€‚ๅฆ‚ๆžœๆ‚จๆƒณๅœจ CPU ไธŠๆต‹่ฏ•ๆจกๅž‹๏ผŒ่ฏทๆธ…็ฉบ `CUDA_VISIBLE_DEVICES` ๆˆ–ๅฐ†ๅ…ถ่ฎพ็ฝฎไธบ -1 ไปฅไฝฟ GPU ๅฏน็จ‹ๅบไธๅฏ่งใ€‚ + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` +```` + +| ๅ‚ๆ•ฐ | ๆ่ฟฐ 
| +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | ้…็ฝฎๆ–‡ไปถ็š„่ทฏๅพ„ใ€‚ | +| `CHECKPOINT_FILE` | ๆƒ้‡ๆ–‡ไปถ็š„่ทฏๅพ„๏ผˆๅฏไปฅๆ˜ฏ HTTP ้“พๆŽฅ๏ผ‰ใ€‚ | +| `--work-dir WORK_DIR` | ไฟๅญ˜ๅŒ…ๅซ่ฏ„ไผฐๆŒ‡ๆ ‡็š„ๆ–‡ไปถ็š„็›ฎๅฝ•ใ€‚้ป˜่ฎคไธบไธŽ้…็ฝฎๆ–‡ไปถ็›ธๅŒๅ็งฐ็š„ๆ–‡ไปถๅคน๏ผŒไฝไบŽ `./work_dirs` ไธ‹ใ€‚ | +| `--dump DUMP` | ๅญ˜ๅ‚จๆจกๅž‹็š„ๆ‰€ๆœ‰่พ“ๅ‡บไปฅ่ฟ›่กŒ็ฆป็บฟ่ฏ„ไผฐ็š„่ทฏๅพ„ใ€‚ | +| `--cfg-options CFG_OPTIONS` | ่ฆ†็›–ไฝฟ็”จ็š„้…็ฝฎไธญ็š„ๆŸไบ›่ฎพ็ฝฎ๏ผŒxxx=yyy ๆ ผๅผ็š„้”ฎๅ€ผๅฏนๅฐ†ๅˆๅนถๅˆฐ้…็ฝฎๆ–‡ไปถไธญใ€‚ๅฆ‚ๆžœ่ฆ่ฆ†็›–็š„ๅ€ผๆ˜ฏไธ€ไธชๅˆ—่กจ๏ผŒๅˆ™ๅบ”้‡‡็”จ `key="[a,b]"` ๆˆ– `key=a,b` ็š„ๅฝขๅผใ€‚่ฏฅๅ‚ๆ•ฐ่ฟ˜ๅ…่ฎธๅตŒๅฅ—็š„ๅˆ—่กจ/ๅ…ƒ็ป„ๅ€ผ๏ผŒไพ‹ๅฆ‚ `key="[(a,b),(c,d)]"`ใ€‚่ฏทๆณจๆ„๏ผŒๅผ•ๅทๆ˜ฏๅฟ…้œ€็š„๏ผŒไธ”ไธๅ…่ฎธๆœ‰็ฉบๆ ผใ€‚ | +| `--show-dir SHOW_DIR` | ไฟๅญ˜็ป“ๆžœๅฏ่ง†ๅŒ–ๅ›พ็‰‡็š„็›ฎๅฝ•ใ€‚ | +| `--show` | ๅœจ็ช—ๅฃไธญๅฏ่ง†ๅŒ–้ข„ๆต‹็ป“ๆžœใ€‚ | +| `--interval INTERVAL` | ๅฏ่ง†ๅŒ–็š„ๆ ทๆœฌ้—ด้š”ใ€‚้ป˜่ฎคไธบ 1ใ€‚ | +| `--wait-time WAIT_TIME` | ๆฏไธช็ช—ๅฃ็š„ๆ˜พ็คบๆ—ถ้—ด๏ผˆๅ•ไฝ๏ผš็ง’๏ผ‰ใ€‚้ป˜่ฎคไธบ 2ใ€‚ | +| `--launcher {none,pytorch,slurm,mpi}` | ไฝœไธšๅฏๅŠจๅ™จ็š„้€‰้กนใ€‚้ป˜่ฎคไธบ `none`ใ€‚ | + +### ไฝฟ็”จๅคšไธช GPU ่ฟ›่กŒๆต‹่ฏ• + +ๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไธช shell ่„šๆœฌไฝฟ็”จ `torch.distributed.launch` ๆฅๅฏๅŠจๅคšไธช GPU ็š„ๆต‹่ฏ•ไปปๅŠกใ€‚ + +```shell +bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +| ๅ‚ๆ•ฐ | ๆ่ฟฐ | +| ------------ | ---------------------------------------------------------------------- | +| `CONFIG` | ้…็ฝฎๆ–‡ไปถ็š„่ทฏๅพ„ใ€‚ | +| `CHECKPOINT` | ๆƒ้‡ๆ–‡ไปถ็š„่ทฏๅพ„๏ผˆๅฏไปฅๆ˜ฏ HTTP ้“พๆŽฅ๏ผ‰ใ€‚ | +| `GPUS` | ่ฆไฝฟ็”จ็š„ GPU ๆ•ฐ้‡ใ€‚ | +| `[PYARGS]` | `tools/test.py` ็š„ๅ…ถไป–ๅฏ้€‰ๅ‚ๆ•ฐ๏ผŒ่ฏทๅ‚่ง[่ฟ™้‡Œ](#ไฝฟ็”จๅ•ไธช-gpu-่ฟ›่กŒๆต‹่ฏ•)ใ€‚ | + +ๆ‚จ่ฟ˜ๅฏไปฅ้€š่ฟ‡็Žฏๅขƒๅ˜้‡ๆฅๆŒ‡ๅฎšๅฏๅŠจๅ™จ็š„ๅ…ถไป–ๅ‚ๆ•ฐใ€‚ไพ‹ๅฆ‚๏ผŒไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๅฐ†ๅฏๅŠจๅ™จ็š„้€šไฟก็ซฏๅฃๆ›ดๆ”นไธบ 29666๏ผš + +```shell +PORT=29666 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +ๅฆ‚ๆžœๆ‚จๆƒณๅฏๅŠจๅคšไธชๆต‹่ฏ•ไฝœไธšๅนถไฝฟ็”จไธๅŒ็š„ GPU๏ผŒๅฏไปฅ้€š่ฟ‡ๆŒ‡ๅฎšไธๅŒ็š„็ซฏๅฃๅ’Œๅฏ่ง่ฎพๅค‡ๆฅๅฏๅŠจๅฎƒไปฌใ€‚ + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +``` + +### ไฝฟ็”จๅคšๅฐๆœบๅ™จ่ฟ›่กŒๆต‹่ฏ• + +#### ๅŒไธ€็ฝ‘็ปœไธญ็š„ๅคšๅฐๆœบๅ™จ + +ๅฆ‚ๆžœๆ‚จไฝฟ็”จไปฅๅคช็ฝ‘่ฟžๆŽฅ็š„ๅคšๅฐๆœบๅ™จ่ฟ›่กŒๆต‹่ฏ•ไฝœไธš๏ผŒๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค๏ผš + +ๅœจ็ฌฌไธ€ๅฐๆœบๅ™จไธŠ๏ผš + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +ๅœจ็ฌฌไบŒๅฐๆœบๅ™จไธŠ๏ผš + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +ไธŽๅ•ๅฐๆœบๅ™จไธŠ็š„ๅคšไธช GPU ็›ธๆฏ”๏ผŒๆ‚จ้œ€่ฆๆŒ‡ๅฎšไธ€ไบ›้ขๅค–็š„็Žฏๅขƒๅ˜้‡๏ผš + +| ENV_VARS | ๆ่ฟฐ | +| ------------- | ---------------------------------------------------------------- | +| `NNODES` | ๆœบๅ™จ็š„ๆ€ปๆ•ฐใ€‚้ป˜่ฎคไธบ 1ใ€‚ | +| `NODE_RANK` | ๆœฌๅœฐๆœบๅ™จ็š„็ดขๅผ•ใ€‚้ป˜่ฎคไธบ 0ใ€‚ | +| `PORT` | ้€šไฟก็ซฏๅฃ๏ผŒๅœจๆ‰€ๆœ‰ๆœบๅ™จไธŠๅบ”่ฏฅไฟๆŒไธ€่‡ดใ€‚้ป˜่ฎคไธบ 29500ใ€‚ | +| `MASTER_ADDR` | ไธปๆœบๅ™จ็š„ IP ๅœฐๅ€๏ผŒๅœจๆ‰€ๆœ‰ๆœบๅ™จไธŠๅบ”่ฏฅไฟๆŒไธ€่‡ดใ€‚้ป˜่ฎคไธบ `127.0.0.1`ใ€‚ | + +้€šๅธธ๏ผŒๅฆ‚ๆžœๆ‚จๆฒกๆœ‰้ซ˜้€Ÿ็ฝ‘็ปœ๏ผˆๅฆ‚ 
InfiniBand๏ผ‰๏ผŒๅˆ™้€Ÿๅบฆไผšๆฏ”่พƒๆ…ขใ€‚ + +#### ไฝฟ็”จ slurm ็ฎก็†็š„ๅคšๅฐๆœบๅ™จ + +ๅฆ‚ๆžœๆ‚จๅœจไฝฟ็”จ [slurm](https://slurm.schedmd.com/) ็ฎก็†็š„้›†็พคไธŠ่ฟ่กŒ MMAction2๏ผŒๅฏไปฅไฝฟ็”จ่„šๆœฌ `slurm_test.sh`ใ€‚ + +```shell +[ENV_VARS] bash tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG} ${CHECKPOINT} [PY_ARGS] +``` + +ไธ‹้ขๆ˜ฏ่ฏฅ่„šๆœฌ็š„ๅ‚ๆ•ฐๆ่ฟฐใ€‚ + +| ๅ‚ๆ•ฐ | ๆ่ฟฐ | +| ------------ | ---------------------------------------------------------------------- | +| `PARTITION` | ้›†็พคไธญ่ฆไฝฟ็”จ็š„ๅˆ†ๅŒบใ€‚ | +| `JOB_NAME` | ไฝœไธš็š„ๅ็งฐ๏ผŒๆ‚จๅฏไปฅ่‡ชๅฎšไน‰ใ€‚ | +| `CONFIG` | ้…็ฝฎๆ–‡ไปถ็š„่ทฏๅพ„ใ€‚ | +| `CHECKPOINT` | ๆƒ้‡ๆ–‡ไปถ็š„่ทฏๅพ„๏ผˆๅฏไปฅๆ˜ฏ HTTP ้“พๆŽฅ๏ผ‰ใ€‚ | +| `[PYARGS]` | `tools/test.py` ็š„ๅ…ถไป–ๅฏ้€‰ๅ‚ๆ•ฐ๏ผŒ่ฏทๅ‚่ง[่ฟ™้‡Œ](#ไฝฟ็”จๅ•ไธช-gpu-่ฟ›่กŒๆต‹่ฏ•)ใ€‚ | + +ไธ‹้ขๅˆ—ๅ‡บไบ†ๅฏ็”จไบŽ้…็ฝฎ slurm ไฝœไธš็š„็Žฏๅขƒๅ˜้‡ใ€‚ + +| ENV_VARS | ๆ่ฟฐ | +| --------------- | -------------------------------------------------------------------------------- | +| `GPUS` | ่ฆไฝฟ็”จ็š„ GPU ๆ•ฐ้‡ใ€‚้ป˜่ฎคไธบ 8ใ€‚ | +| `GPUS_PER_NODE` | ๆฏไธช่Š‚็‚น่ฆๅˆ†้…็š„ GPU ๆ•ฐ้‡ใ€‚้ป˜่ฎคไธบ 8ใ€‚ | +| `CPUS_PER_TASK` | ๆฏไธชไปปๅŠก่ฆๅˆ†้…็š„ CPU ๆ•ฐ้‡๏ผˆ้€šๅธธไธ€ไธช GPU ๅฏนๅบ”ไธ€ไธชไปปๅŠก๏ผ‰ใ€‚้ป˜่ฎคไธบ 5ใ€‚ | +| `SRUN_ARGS` | `srun` ็š„ๅ…ถไป–ๅ‚ๆ•ฐใ€‚ๅฏ็”จ้€‰้กนๅฏๅœจ[่ฟ™้‡Œ](https://slurm.schedmd.com/srun.html)ๆ‰พๅˆฐใ€‚ | diff --git a/docs/zh_cn/utils.py b/docs/zh_cn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..444e4c147d19d3f20686c81233d7ffc6e0821c19 --- /dev/null +++ b/docs/zh_cn/utils.py @@ -0,0 +1,28 @@ +import re +from pathlib import Path + + +def replace_link(pattern, template, content, file_path): + MMACT_ROOT = Path(__file__).absolute().parents[2] + GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' + + def replace_core(matchobj): + name = matchobj.group(1) + link = matchobj.group(2) + if link.startswith('http') or link.startswith('#'): + return template.format(name, link) + # For link relative to project folder, such as '/configs/*/*.py' + elif Path(link).is_absolute(): + link = link.lstrip('/') + folder = MMACT_ROOT + # For link relative to current file, such as './config/*.py' + else: + folder = file_path.parent + file_link = link.split('#')[0] + assert (folder / file_link).exists(), \ + f'Link not found:\n{file_path}: {folder / link}' + rel_link = (folder / link).resolve().relative_to(MMACT_ROOT) + link = GITHUB_PREFIX + str(rel_link) + return template.format(name, link) + + return re.sub(pattern, replace_core, content) diff --git a/mmaction/__init__.py b/mmaction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f701013f44991753496c110a3387d2f5120fc12 --- /dev/null +++ b/mmaction/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmengine +from mmengine.utils import digit_version + +from .version import __version__ + +mmcv_minimum_version = '2.0.0rc4' +mmcv_maximum_version = '2.2.0' +mmcv_version = digit_version(mmcv.__version__) + +mmengine_minimum_version = '0.7.1' +mmengine_maximum_version = '1.0.0' +mmengine_version = digit_version(mmengine.__version__) + +assert (digit_version(mmcv_minimum_version) <= mmcv_version + < digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.' 
+ +assert (digit_version(mmengine_minimum_version) <= mmengine_version + < digit_version(mmengine_maximum_version)), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_minimum_version}, ' \ + f'<{mmengine_maximum_version}.' + +__all__ = ['__version__'] diff --git a/mmaction/__pycache__/__init__.cpython-310.pyc b/mmaction/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37662e4137a6b3039e7f5d80b16ff32775ccd234 Binary files /dev/null and b/mmaction/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/__pycache__/registry.cpython-310.pyc b/mmaction/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73f96855f5771850f0c07424b79ed8f89ec70585 Binary files /dev/null and b/mmaction/__pycache__/registry.cpython-310.pyc differ diff --git a/mmaction/__pycache__/version.cpython-310.pyc b/mmaction/__pycache__/version.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e3733c5a560cb7643df63639201edf55789d884 Binary files /dev/null and b/mmaction/__pycache__/version.cpython-310.pyc differ diff --git a/mmaction/apis/__init__.py b/mmaction/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7ab8a6c762275b531f179bb6a61adc0d02fac839 --- /dev/null +++ b/mmaction/apis/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .inference import (detection_inference, inference_recognizer, + inference_skeleton, init_recognizer, pose_inference) +from .inferencers import * # NOQA + +__all__ = [ + 'init_recognizer', 'inference_recognizer', 'inference_skeleton', + 'detection_inference', 'pose_inference' +] diff --git a/mmaction/apis/__pycache__/__init__.cpython-310.pyc b/mmaction/apis/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d166d7452b0fff9257dfcdf8e3b42ef1330889f4 Binary files /dev/null and b/mmaction/apis/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/apis/__pycache__/inference.cpython-310.pyc b/mmaction/apis/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b5b260f371b4cb40d347c7be9a9db361d2e0315 Binary files /dev/null and b/mmaction/apis/__pycache__/inference.cpython-310.pyc differ diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..692173f5aebf0233f8539edadfe6c20340b78baa --- /dev/null +++ b/mmaction/apis/inference.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import mmengine +import numpy as np +import torch +import torch.nn as nn +from mmengine.dataset import Compose, pseudo_collate +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData +from mmengine.utils import track_iter_progress + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample + + +def init_recognizer(config: Union[str, Path, mmengine.Config], + checkpoint: Optional[str] = None, + device: Union[str, torch.device] = 'cuda:0') -> nn.Module: + """Initialize a recognizer from config file. + + Args: + config (str or :obj:`Path` or :obj:`mmengine.Config`): Config file + path, :obj:`Path` or the config object. 
+ checkpoint (str, optional): Checkpoint path/url. If set to None, + the model will not load any weights. Defaults to None. + device (str | torch.device): The desired device of returned + tensor. Defaults to ``'cuda:0'``. + + Returns: + nn.Module: The constructed recognizer. + """ + if isinstance(config, (str, Path)): + config = mmengine.Config.fromfile(config) + elif not isinstance(config, mmengine.Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + + init_default_scope(config.get('default_scope', 'mmaction')) + + if hasattr(config.model, 'backbone') and config.model.backbone.get( + 'pretrained', None): + config.model.backbone.pretrained = None + model = MODELS.build(config.model) + + if checkpoint is not None: + load_checkpoint(model, checkpoint, map_location='cpu') + model.cfg = config + model.to(device) + model.eval() + return model + + +def inference_recognizer(model: nn.Module, + video: Union[str, dict], + test_pipeline: Optional[Compose] = None + ) -> ActionDataSample: + """Inference a video with the recognizer. + + Args: + model (nn.Module): The loaded recognizer. + video (Union[str, dict]): The video file path or the results + dictionary (the input of pipeline). + test_pipeline (:obj:`Compose`, optional): The test pipeline. + If not specified, the test pipeline in the config will be + used. Defaults to None. + + Returns: + :obj:`ActionDataSample`: The inference results. Specifically, the + predicted scores are saved at ``result.pred_score``. + """ + + if test_pipeline is None: + cfg = model.cfg + init_default_scope(cfg.get('default_scope', 'mmaction')) + test_pipeline_cfg = cfg.test_pipeline + test_pipeline = Compose(test_pipeline_cfg) + + input_flag = None + if isinstance(video, dict): + input_flag = 'dict' + elif isinstance(video, str) and osp.exists(video): + if video.endswith('.npy'): + input_flag = 'audio' + else: + input_flag = 'video' + else: + raise RuntimeError(f'The type of argument `video` is not supported: ' + f'{type(video)}') + + if input_flag == 'dict': + data = video + if input_flag == 'video': + data = dict(filename=video, label=-1, start_index=0, modality='RGB') + if input_flag == 'audio': + data = dict( + audio_path=video, + total_frames=len(np.load(video)), + start_index=0, + label=-1) + + data = test_pipeline(data) + data = pseudo_collate([data]) + + # Forward the model + with torch.no_grad(): + result = model.test_step(data)[0] + + return result + + +def inference_skeleton(model: nn.Module, + pose_results: List[dict], + img_shape: Tuple[int], + test_pipeline: Optional[Compose] = None + ) -> ActionDataSample: + """Inference a pose results with the skeleton recognizer. + + Args: + model (nn.Module): The loaded recognizer. + pose_results (List[dict]): The pose estimation results dictionary + (the results of `pose_inference`) + img_shape (Tuple[int]): The original image shape used for inference + skeleton recognizer. + test_pipeline (:obj:`Compose`, optional): The test pipeline. + If not specified, the test pipeline in the config will be + used. Defaults to None. + + Returns: + :obj:`ActionDataSample`: The inference results. Specifically, the + predicted scores are saved at ``result.pred_score``. 
+ """ + if test_pipeline is None: + cfg = model.cfg + init_default_scope(cfg.get('default_scope', 'mmaction')) + test_pipeline_cfg = cfg.test_pipeline + test_pipeline = Compose(test_pipeline_cfg) + + h, w = img_shape + num_keypoint = pose_results[0]['keypoints'].shape[1] + num_frame = len(pose_results) + num_person = max([len(x['keypoints']) for x in pose_results]) + fake_anno = dict( + frame_dict='', + label=-1, + img_shape=(h, w), + origin_shape=(h, w), + start_index=0, + modality='Pose', + total_frames=num_frame) + + keypoint = np.zeros((num_frame, num_person, num_keypoint, 2), + dtype=np.float16) + keypoint_score = np.zeros((num_frame, num_person, num_keypoint), + dtype=np.float16) + + for f_idx, frm_pose in enumerate(pose_results): + frm_num_persons = frm_pose['keypoints'].shape[0] + for p_idx in range(frm_num_persons): + keypoint[f_idx, p_idx] = frm_pose['keypoints'][p_idx] + keypoint_score[f_idx, p_idx] = frm_pose['keypoint_scores'][p_idx] + + fake_anno['keypoint'] = keypoint.transpose((1, 0, 2, 3)) + fake_anno['keypoint_score'] = keypoint_score.transpose((1, 0, 2)) + return inference_recognizer(model, fake_anno, test_pipeline) + + +def detection_inference(det_config: Union[str, Path, mmengine.Config, + nn.Module], + det_checkpoint: str, + frame_paths: List[str], + det_score_thr: float = 0.9, + det_cat_id: int = 0, + device: Union[str, torch.device] = 'cuda:0', + with_score: bool = False) -> tuple: + """Detect human boxes given frame paths. + + Args: + det_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`, + :obj:`torch.nn.Module`]): + Det config file path or Detection model object. It can be + a :obj:`Path`, a config object, or a module object. + det_checkpoint: Checkpoint path/url. + frame_paths (List[str]): The paths of frames to do detection inference. + det_score_thr (float): The threshold of human detection score. + Defaults to 0.9. + det_cat_id (int): The category id for human detection. Defaults to 0. + device (Union[str, torch.device]): The desired device of returned + tensor. Defaults to ``'cuda:0'``. + with_score (bool): Whether to append detection score after box. + Defaults to None. + + Returns: + List[np.ndarray]: List of detected human boxes. + List[:obj:`DetDataSample`]: List of data samples, generally used + to visualize data. + """ + try: + from mmdet.apis import inference_detector, init_detector + from mmdet.structures import DetDataSample + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `inference_detector` and ' + '`init_detector` from `mmdet.apis`. These apis are ' + 'required in this inference api! ') + if isinstance(det_config, nn.Module): + model = det_config + else: + model = init_detector( + config=det_config, checkpoint=det_checkpoint, device=device) + + results = [] + data_samples = [] + print('Performing Human Detection for each frame') + for frame_path in track_iter_progress(frame_paths): + det_data_sample: DetDataSample = inference_detector(model, frame_path) + pred_instance = det_data_sample.pred_instances.cpu().numpy() + bboxes = pred_instance.bboxes + scores = pred_instance.scores + # We only keep human detection bboxs with score larger + # than `det_score_thr` and category id equal to `det_cat_id`. 
+ valid_idx = np.logical_and(pred_instance.labels == det_cat_id, + pred_instance.scores > det_score_thr) + bboxes = bboxes[valid_idx] + scores = scores[valid_idx] + + if with_score: + bboxes = np.concatenate((bboxes, scores[:, None]), axis=-1) + results.append(bboxes) + data_samples.append(det_data_sample) + + return results, data_samples + + +def pose_inference(pose_config: Union[str, Path, mmengine.Config, nn.Module], + pose_checkpoint: str, + frame_paths: List[str], + det_results: List[np.ndarray], + device: Union[str, torch.device] = 'cuda:0') -> tuple: + """Perform Top-Down pose estimation. + + Args: + pose_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`, + :obj:`torch.nn.Module`]): Pose config file path or + pose model object. It can be a :obj:`Path`, a config object, + or a module object. + pose_checkpoint: Checkpoint path/url. + frame_paths (List[str]): The paths of frames to do pose inference. + det_results (List[np.ndarray]): List of detected human boxes. + device (Union[str, torch.device]): The desired device of returned + tensor. Defaults to ``'cuda:0'``. + + Returns: + List[List[Dict[str, np.ndarray]]]: List of pose estimation results. + List[:obj:`PoseDataSample`]: List of data samples, generally used + to visualize data. + """ + try: + from mmpose.apis import inference_topdown, init_model + from mmpose.structures import PoseDataSample, merge_data_samples + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `inference_topdown` and ' + '`init_model` from `mmpose.apis`. These apis ' + 'are required in this inference api! ') + if isinstance(pose_config, nn.Module): + model = pose_config + else: + model = init_model(pose_config, pose_checkpoint, device) + + results = [] + data_samples = [] + print('Performing Human Pose Estimation for each frame') + for f, d in track_iter_progress(list(zip(frame_paths, det_results))): + pose_data_samples: List[PoseDataSample] \ + = inference_topdown(model, f, d[..., :4], bbox_format='xyxy') + pose_data_sample = merge_data_samples(pose_data_samples) + pose_data_sample.dataset_meta = model.dataset_meta + # make fake pred_instances + if not hasattr(pose_data_sample, 'pred_instances'): + num_keypoints = model.dataset_meta['num_keypoints'] + pred_instances_data = dict( + keypoints=np.empty(shape=(0, num_keypoints, 2)), + keypoints_scores=np.empty(shape=(0, 17), dtype=np.float32), + bboxes=np.empty(shape=(0, 4), dtype=np.float32), + bbox_scores=np.empty(shape=(0), dtype=np.float32)) + pose_data_sample.pred_instances = InstanceData( + **pred_instances_data) + + poses = pose_data_sample.pred_instances.to_dict() + results.append(poses) + data_samples.append(pose_data_sample) + + return results, data_samples diff --git a/mmaction/apis/inferencers/__init__.py b/mmaction/apis/inferencers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..237fbcb4e661c8c65a0a454f0285ccadec9a0532 --- /dev/null +++ b/mmaction/apis/inferencers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .actionrecog_inferencer import ActionRecogInferencer +from .mmaction2_inferencer import MMAction2Inferencer + +__all__ = ['ActionRecogInferencer', 'MMAction2Inferencer'] diff --git a/mmaction/apis/inferencers/__pycache__/__init__.cpython-310.pyc b/mmaction/apis/inferencers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e748ba85d869c3a9ea3cd88133251ed78b49b073 Binary files /dev/null and b/mmaction/apis/inferencers/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/apis/inferencers/__pycache__/actionrecog_inferencer.cpython-310.pyc b/mmaction/apis/inferencers/__pycache__/actionrecog_inferencer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a65757c5ae993ff564503e639da372aca82cd47 Binary files /dev/null and b/mmaction/apis/inferencers/__pycache__/actionrecog_inferencer.cpython-310.pyc differ diff --git a/mmaction/apis/inferencers/__pycache__/mmaction2_inferencer.cpython-310.pyc b/mmaction/apis/inferencers/__pycache__/mmaction2_inferencer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab770c8e881be6d2544f3451160d55606361d156 Binary files /dev/null and b/mmaction/apis/inferencers/__pycache__/mmaction2_inferencer.cpython-310.pyc differ diff --git a/mmaction/apis/inferencers/actionrecog_inferencer.py b/mmaction/apis/inferencers/actionrecog_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..1e7b32174746a046e524c2b972f2615d04686ca2 --- /dev/null +++ b/mmaction/apis/inferencers/actionrecog_inferencer.py @@ -0,0 +1,361 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import numpy as np +from mmengine.dataset import Compose +from mmengine.fileio import list_from_file +from mmengine.infer.infer import BaseInferencer, ModelType +from mmengine.registry import init_default_scope +from mmengine.structures import InstanceData + +from mmaction.registry import INFERENCERS +from mmaction.structures import ActionDataSample +from mmaction.utils import ConfigType, get_str_type + +InstanceList = List[InstanceData] +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = Union[InstanceData, InstanceList] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] +ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]] + + +@INFERENCERS.register_module(name='action-recognition') +@INFERENCERS.register_module() +class ActionRecogInferencer(BaseInferencer): + """The inferencer for action recognition. + + Args: + model (str, optional): Path to the config file or the model name + defined in metafile. For example, it could be + "slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb" or + "configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py". + weights (str, optional): Path to the checkpoint. If it is not specified + and model is a model name of metafile, the weights will be loaded + from metafile. Defaults to None. + device (str, optional): Device to run inference. If None, the available + device will be automatically used. Defaults to None. + label_file (str, optional): label file for dataset. + input_format (str): Input video format, Choices are 'video', + 'rawframes', 'array'. 'video' means input data is a video file, + 'rawframes' means input data is a video frame folder, and 'array' + means input data is a np.ndarray. Defaults to 'video'. 
+ pack_cfg (dict, optional): Config for `InferencerPackInput` to load + input. Defaults to empty dict. + scope (str, optional): The scope of the model. Defaults to "mmaction". + """ + + preprocess_kwargs: set = set() + forward_kwargs: set = set() + visualize_kwargs: set = { + 'return_vis', 'show', 'wait_time', 'vid_out_dir', 'draw_pred', 'fps', + 'out_type', 'target_resolution' + } + postprocess_kwargs: set = { + 'print_result', 'pred_out_file', 'return_datasample' + } + + def __init__(self, + model: Union[ModelType, str], + weights: Optional[str] = None, + device: Optional[str] = None, + label_file: Optional[str] = None, + input_format: str = 'video', + pack_cfg: dict = {}, + scope: Optional[str] = 'mmaction') -> None: + # A global counter tracking the number of videos processed, for + # naming of the output videos + self.num_visualized_vids = 0 + self.input_format = input_format + self.pack_cfg = pack_cfg.copy() + init_default_scope(scope) + super().__init__( + model=model, weights=weights, device=device, scope=scope) + + if label_file is not None: + self.visualizer.dataset_meta = dict( + classes=list_from_file(label_file)) + + def __call__(self, + inputs: InputsType, + return_datasamples: bool = False, + batch_size: int = 1, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + draw_pred: bool = True, + vid_out_dir: str = '', + out_type: str = 'video', + print_result: bool = False, + pred_out_file: str = '', + target_resolution: Optional[Tuple[int]] = None, + **kwargs) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. + return_datasamples (bool): Whether to return results as + :obj:`BaseDataElement`. Defaults to False. + batch_size (int): Inference batch size. Defaults to 1. + show (bool): Whether to display the visualization results in a + popup window. Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + vid_out_dir (str): Output directory of visualization results. + If left as empty, no file will be saved. Defaults to ''. + out_type (str): Output type of visualization results. + Defaults to 'video'. + print_result (bool): Whether to print the inference result w/o + visualization to the console. Defaults to False. + pred_out_file: File to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + **kwargs: Other keyword arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. + """ + return super().__call__( + inputs, + return_datasamples, + batch_size, + return_vis=return_vis, + show=show, + wait_time=wait_time, + draw_pred=draw_pred, + vid_out_dir=vid_out_dir, + print_result=print_result, + pred_out_file=pred_out_file, + out_type=out_type, + target_resolution=target_resolution, + **kwargs) + + def _inputs_to_list(self, inputs: InputsType) -> list: + """Preprocess the inputs to a list. The main difference from mmengine + version is that we don't list a directory cause input could be a frame + folder. + + Preprocess inputs to a list according to its type: + + - list or tuple: return inputs + - str: return a list containing the string. 
The string + could be a path to file, a url or other types of string according + to the task. + + Args: + inputs (InputsType): Inputs for the inferencer. + + Returns: + list: List of input for the :meth:`preprocess`. + """ + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) + + def _init_pipeline(self, cfg: ConfigType) -> Compose: + """Initialize the test pipeline.""" + test_pipeline = cfg.test_dataloader.dataset.pipeline + # Alter data pipelines for decode + if self.input_format == 'array': + for i in range(len(test_pipeline)): + if 'Decode' in get_str_type(test_pipeline[i]['type']): + test_pipeline[i] = dict(type='ArrayDecode') + test_pipeline = [ + x for x in test_pipeline if 'Init' not in x['type'] + ] + elif self.input_format == 'video': + if 'Init' not in get_str_type(test_pipeline[0]['type']): + test_pipeline = [dict(type='DecordInit')] + test_pipeline + else: + test_pipeline[0] = dict(type='DecordInit') + for i in range(len(test_pipeline)): + if 'Decode' in get_str_type(test_pipeline[i]['type']): + test_pipeline[i] = dict(type='DecordDecode') + elif self.input_format == 'rawframes': + if 'Init' in get_str_type(test_pipeline[0]['type']): + test_pipeline = test_pipeline[1:] + for i in range(len(test_pipeline)): + if 'Decode' in get_str_type(test_pipeline[i]['type']): + test_pipeline[i] = dict(type='RawFrameDecode') + # Alter data pipelines to close TTA, avoid OOM + # Use center crop instead of multiple crop + for i in range(len(test_pipeline)): + if get_str_type( + test_pipeline[i]['type']) in ['ThreeCrop', 'TenCrop']: + test_pipeline[i]['type'] = 'CenterCrop' + # Use single clip for `Recognizer3D` + if cfg.model.type == 'Recognizer3D': + for i in range(len(test_pipeline)): + if get_str_type(test_pipeline[i]['type']) == 'SampleFrames': + test_pipeline[i]['num_clips'] = 1 + # Pack multiple types of input format + test_pipeline.insert( + 0, + dict( + type='InferencerPackInput', + input_format=self.input_format, + **self.pack_cfg)) + + return Compose(test_pipeline) + + def visualize( + self, + inputs: InputsType, + preds: PredType, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + draw_pred: bool = True, + fps: int = 30, + out_type: str = 'video', + target_resolution: Optional[Tuple[int]] = None, + vid_out_dir: str = '', + ) -> Union[List[np.ndarray], None]: + """Visualize predictions. + + Args: + inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer. + preds (List[Dict]): Predictions of the model. + return_vis (bool): Whether to return the visualization result. + Defaults to False. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + draw_pred (bool): Whether to draw prediction labels. + Defaults to True. + fps (int): Frames per second for saving video. Defaults to 4. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. + vid_out_dir (str): Output directory of visualization results. + If left as empty, no file will be saved. Defaults to ''. + + Returns: + List[np.ndarray] or None: Returns visualization results only if + applicable. 
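+
+        Examples:
+            A rough sketch; the visualization arguments are forwarded to
+            this method from :meth:`__call__`. The model alias is the one
+            quoted in the class docstring, while the video path and output
+            directory are placeholders:
+
+            >>> inferencer = ActionRecogInferencer(
+            ...     'slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb')
+            >>> results = inferencer(
+            ...     'demo.mp4', vid_out_dir='vis_out/', return_vis=True)
+            >>> vis_frames = results['visualization']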
+ """ + if self.visualizer is None or (not show and vid_out_dir == '' + and not return_vis): + return None + + if getattr(self, 'visualizer') is None: + raise ValueError('Visualization needs the "visualizer" term' + 'defined in the config, but got None.') + + results = [] + + for single_input, pred in zip(inputs, preds): + if isinstance(single_input, str): + frames = single_input + video_name = osp.basename(single_input) + elif isinstance(single_input, np.ndarray): + frames = single_input.copy() + video_num = str(self.num_visualized_vids).zfill(8) + video_name = f'{video_num}.mp4' + else: + raise ValueError('Unsupported input type: ' + f'{type(single_input)}') + + out_path = osp.join(vid_out_dir, video_name) if vid_out_dir != '' \ + else None + + visualization = self.visualizer.add_datasample( + video_name, + frames, + pred, + show_frames=show, + wait_time=wait_time, + draw_gt=False, + draw_pred=draw_pred, + fps=fps, + out_type=out_type, + out_path=out_path, + target_resolution=target_resolution, + ) + results.append(visualization) + self.num_visualized_vids += 1 + + return results + + def postprocess( + self, + preds: PredType, + visualization: Optional[List[np.ndarray]] = None, + return_datasample: bool = False, + print_result: bool = False, + pred_out_file: str = '', + ) -> Union[ResType, Tuple[ResType, np.ndarray]]: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. + + This method should be responsible for the following tasks: + + 1. Convert datasamples into a json-serializable dict if needed. + 2. Pack the predictions and visualization results and return them. + 3. Dump or log the predictions. + + Args: + preds (List[Dict]): Predictions of the model. + visualization (Optional[np.ndarray]): Visualized predictions. + return_datasample (bool): Whether to use Datasample to store + inference results. If False, dict will be used. + print_result (bool): Whether to print the inference result w/o + visualization to the console. Defaults to False. + pred_out_file: File to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization``. + + - ``visualization`` (Any): Returned by :meth:`visualize`. + - ``predictions`` (dict or DataSample): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasample=False``, it usually should be a + json-serializable dict containing only basic data elements such + as strings and numbers. + """ + result_dict = {} + results = preds + if not return_datasample: + results = [] + for pred in preds: + result = self.pred2dict(pred) + results.append(result) + # Add video to the results after printing and dumping + result_dict['predictions'] = results + if print_result: + print(result_dict) + if pred_out_file != '': + mmengine.dump(result_dict, pred_out_file) + result_dict['visualization'] = visualization + return result_dict + + def pred2dict(self, data_sample: ActionDataSample) -> Dict: + """Extract elements necessary to represent a prediction into a + dictionary. It's better to contain only basic data elements such as + strings and numbers in order to guarantee it's json-serializable. + + Args: + data_sample (ActionDataSample): The data sample to be converted. + + Returns: + dict: The output dictionary. 
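+
+        Example of the returned structure (values are illustrative only)::
+
+            {'pred_labels': [6], 'pred_scores': [0.01, 0.95, ...]}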
+        """
+        result = {}
+        result['pred_labels'] = data_sample.pred_label.tolist()
+        result['pred_scores'] = data_sample.pred_score.tolist()
+        return result
diff --git a/mmaction/apis/inferencers/mmaction2_inferencer.py b/mmaction/apis/inferencers/mmaction2_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a37a9bb9a3531a31fdf67683a161b82dc26b816
--- /dev/null
+++ b/mmaction/apis/inferencers/mmaction2_inferencer.py
@@ -0,0 +1,232 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import mmengine
+import numpy as np
+from mmengine.infer import BaseInferencer
+from mmengine.structures import InstanceData
+
+from mmaction.utils import ConfigType
+from .actionrecog_inferencer import ActionRecogInferencer
+
+InstanceList = List[InstanceData]
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = Union[InstanceData, InstanceList]
+ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
+
+
+class MMAction2Inferencer(BaseInferencer):
+    """MMAction2 Inferencer. It is a unified inferencer interface for video
+    analysis tasks, currently including ActionRecog, and can be used to
+    perform end-to-end action recognition inference.
+
+    Args:
+        rec (str, optional): Pretrained action recognition algorithm.
+            It's the path to the config file or the model name defined in
+            metafile. For example, it could be:
+
+            - model alias, e.g. ``'slowfast'``,
+            - config name, e.g. ``'slowfast_r50_8xb8-8x8x1-256e_kinetics400
+              -rgb'``,
+            - config path
+
+            Defaults to ``None``.
+        rec_weights (str, optional): Path to the custom checkpoint file of
+            the selected rec model. If it is not specified and "rec" is a
+            model name of metafile, the weights will be loaded from metafile.
+            Defaults to None.
+        device (str, optional): Device to run inference. For example,
+            it could be 'cuda' or 'cpu'. If None, the available
+            device will be automatically used. Defaults to None.
+        label_file (str, optional): Label file for the dataset.
+        input_format (str): Input video format. Choices are 'video',
+            'rawframes', 'array'. 'video' means input data is a video file,
+            'rawframes' means input data is a video frame folder, and 'array'
+            means input data is a np.ndarray. Defaults to 'video'.
+    """
+
+    preprocess_kwargs: set = set()
+    forward_kwargs: set = set()
+    visualize_kwargs: set = {
+        'return_vis', 'show', 'wait_time', 'vid_out_dir', 'draw_pred', 'fps',
+        'out_type', 'target_resolution'
+    }
+    postprocess_kwargs: set = {
+        'print_result', 'pred_out_file', 'return_datasample'
+    }
+
+    def __init__(self,
+                 rec: Optional[str] = None,
+                 rec_weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 label_file: Optional[str] = None,
+                 input_format: str = 'video') -> None:
+
+        if rec is None:
+            raise ValueError('rec algorithm should be provided.')
+
+        self.visualizer = None
+        self.num_visualized_imgs = 0
+
+        if rec is not None:
+            self.actionrecog_inferencer = ActionRecogInferencer(
+                rec, rec_weights, device, label_file, input_format)
+            self.mode = 'rec'
+
+    def _init_pipeline(self, cfg: ConfigType) -> None:
+        pass
+
+    def forward(self, inputs: InputType, batch_size: int,
+                **forward_kwargs) -> PredType:
+        """Forward the inputs to the model.
+
+        Args:
+            inputs (InputsType): The inputs to be forwarded.
+            batch_size (int): Batch size. Defaults to 1.
+
+        Returns:
+            Dict: The prediction results. Possibly with keys "rec".
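+                For a single video input, the returned value takes roughly
+                the form ``{'rec': [[ActionDataSample]]}``, i.e. one inner
+                list per input.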
+ """ + result = {} + if self.mode == 'rec': + predictions = self.actionrecog_inferencer( + inputs, + return_datasamples=True, + batch_size=batch_size, + **forward_kwargs)['predictions'] + result['rec'] = [[p] for p in predictions] + + return result + + def visualize(self, inputs: InputsType, preds: PredType, + **kwargs) -> List[np.ndarray]: + """Visualize predictions. + + Args: + inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer. + preds (List[Dict]): Predictions of the model. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + fps (int): Frames per second for saving video. Defaults to 4. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. + vid_out_dir (str): Output directory of visualization results. + If left as empty, no file will be saved. Defaults to ''. + """ + + if 'rec' in self.mode: + return self.actionrecog_inferencer.visualize( + inputs, preds['rec'][0], **kwargs) + + def __call__( + self, + inputs: InputsType, + batch_size: int = 1, + **kwargs, + ) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. It can be a path + to image / image directory, or an array, or a list of these. + return_datasamples (bool): Whether to return results as + :obj:`BaseDataElement`. Defaults to False. + batch_size (int): Batch size. Defaults to 1. + **kwargs: Key words arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. + """ + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + ori_inputs = self._inputs_to_list(inputs) + + preds = self.forward(ori_inputs, batch_size, **forward_kwargs) + + visualization = self.visualize( + ori_inputs, preds, + **visualize_kwargs) # type: ignore # noqa: E501 + results = self.postprocess(preds, visualization, **postprocess_kwargs) + return results + + def _inputs_to_list(self, inputs: InputsType) -> list: + """Preprocess the inputs to a list. The main difference from mmengine + version is that we don't list a directory cause input could be a frame + folder. + + Preprocess inputs to a list according to its type: + + - list or tuple: return inputs + - str: return a list containing the string. The string + could be a path to file, a url or other types of string according + to the task. + + Args: + inputs (InputsType): Inputs for the inferencer. + + Returns: + list: List of input for the :meth:`preprocess`. + """ + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) + + def postprocess(self, + preds: PredType, + visualization: Optional[List[np.ndarray]] = None, + print_result: bool = False, + pred_out_file: str = '' + ) -> Union[ResType, Tuple[ResType, np.ndarray]]: + """Postprocess predictions. + + Args: + preds (Dict): Predictions of the model. + visualization (Optional[np.ndarray]): Visualized predictions. 
+            print_result (bool): Whether to print the result.
+                Defaults to False.
+            pred_out_file (str): Output file name to store predictions
+                without images. Supported file formats are "json", "yaml/yml"
+                and "pickle/pkl". Defaults to ''.
+
+        Returns:
+            Dict or List[Dict]: Each dict contains the inference result of
+            each input video. Possible keys are "rec_labels", "rec_scores".
+        """
+
+        result_dict = {}
+        pred_results = [{} for _ in range(len(next(iter(preds.values()))))]
+        if 'rec' in self.mode:
+            for i, rec_pred in enumerate(preds['rec']):
+                result = dict(rec_labels=[], rec_scores=[])
+                for rec_pred_instance in rec_pred:
+                    rec_dict_res = self.actionrecog_inferencer.pred2dict(
+                        rec_pred_instance)
+                    result['rec_labels'].append(rec_dict_res['pred_labels'])
+                    result['rec_scores'].append(rec_dict_res['pred_scores'])
+                pred_results[i].update(result)
+
+        result_dict['predictions'] = pred_results
+        if print_result:
+            print(result_dict)
+        if pred_out_file != '':
+            mmengine.dump(result_dict, pred_out_file)
+        result_dict['visualization'] = visualization
+        return result_dict
diff --git a/mmaction/configs/_base_/__init__.py b/mmaction/configs/_base_/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d
--- /dev/null
+++ b/mmaction/configs/_base_/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/_base_/default_runtime.py b/mmaction/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..f927f1859264091be607c0e1dac7ce4c6426ad7b
--- /dev/null
+++ b/mmaction/configs/_base_/default_runtime.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
+                            LoggerHook, ParamSchedulerHook, RuntimeInfoHook,
+                            SyncBuffersHook)
+from mmengine.runner import LogProcessor
+
+from mmaction.visualization import ActionVisualizer, LocalVisBackend
+
+default_scope = 'mmaction'
+
+default_hooks = dict(
+    runtime_info=dict(type=RuntimeInfoHook),
+    timer=dict(type=IterTimerHook),
+    logger=dict(type=LoggerHook, interval=20, ignore_last=False),
+    param_scheduler=dict(type=ParamSchedulerHook),
+    checkpoint=dict(type=CheckpointHook, interval=1, save_best='auto'),
+    sampler_seed=dict(type=DistSamplerSeedHook),
+    sync_buffers=dict(type=SyncBuffersHook))
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'))
+
+log_processor = dict(type=LogProcessor, window_size=20, by_epoch=True)
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(type=ActionVisualizer, vis_backends=vis_backends)
+
+log_level = 'INFO'
+load_from = None
+resume = False
diff --git a/mmaction/configs/_base_/models/__init__.py b/mmaction/configs/_base_/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d
--- /dev/null
+++ b/mmaction/configs/_base_/models/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/_base_/models/slowfast_r50.py b/mmaction/configs/_base_/models/slowfast_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..e01ce9268af64454a77729a3a531dd55dda9a1e7
--- /dev/null
+++ b/mmaction/configs/_base_/models/slowfast_r50.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
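+# This base model config builds SlowFast-R50 from imported classes rather
+# than registry strings, following the new-style pure-Python configs used
+# throughout mmaction/configs. A rough sketch of how a downstream config
+# reuses it (the relative import depth depends on the consumer's location):
+#
+#     from mmengine.config import read_base
+#     with read_base():
+#         from ..._base_.models.slowfast_r50 import *
+#     model.update(dict(cls_head=dict(num_classes=700)))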
+from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + ResNet3dSlowFast, SlowFastHead) + +# model settings +model = dict( + type=Recognizer3D, + backbone=dict( + type=ResNet3dSlowFast, + pretrained=None, + resample_rate=8, # tau + speed_ratio=8, # alpha + channel_ratio=8, # beta_inv + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_eval=False)), + cls_head=dict( + type=SlowFastHead, + in_channels=2304, # 2048+256 + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/_base_/models/slowonly_r50.py b/mmaction/configs/_base_/models/slowonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..b295ac30a48e2064cf3b40d818b5f37284ecdff9 --- /dev/null +++ b/mmaction/configs/_base_/models/slowonly_r50.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmaction.models import (ActionDataPreprocessor, I3DHead, Recognizer3D, + ResNet3dSlowOnly) + +model = dict( + type=Recognizer3D, + backbone=dict( + type=ResNet3dSlowOnly, + depth=50, + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + lateral=False, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type=I3DHead, + in_channels=2048, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/_base_/models/swin_tiny.py b/mmaction/configs/_base_/models/swin_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..147a67e1fef329ab8e156ba4ce27483d3aeaa196 --- /dev/null +++ b/mmaction/configs/_base_/models/swin_tiny.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmaction.models import (ActionDataPreprocessor, I3DHead, Recognizer3D, + SwinTransformer3D) + +model = dict( + type=Recognizer3D, + backbone=dict( + type=SwinTransformer3D, + arch='tiny', + pretrained=None, + pretrained2d=True, + patch_size=(2, 4, 4), + window_size=(8, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + patch_norm=True), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + cls_head=dict( + type=I3DHead, + in_channels=768, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob')) diff --git a/mmaction/configs/recognition/slowfast/__init__.py b/mmaction/configs/recognition/slowfast/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/recognition/slowfast/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/mmaction/configs/recognition/slowfast/slowfast_r50_8xb8_4x16x1_256e_kinetics400_rgb.py b/mmaction/configs/recognition/slowfast/slowfast_r50_8xb8_4x16x1_256e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ba7c51d51f0979d2cbf7076d6201e2d7794445cd --- /dev/null +++ b/mmaction/configs/recognition/slowfast/slowfast_r50_8xb8_4x16x1_256e_kinetics400_rgb.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.models.slowfast_r50 import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.evaluation import AccMetric + +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=ThreeCrop, crop_size=256), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=256, val_begin=1, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) 
+ +optim_wrapper = dict( + optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=34, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=256, + eta_min=0, + by_epoch=True, + begin=0, + end=256) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=4, max_keep_ckpts=3), + logger=dict(interval=100))) diff --git a/mmaction/configs/recognition/slowonly/__init__.py b/mmaction/configs/recognition/slowonly/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/recognition/slowonly/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py b/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c73e8331f2ae22a00ec19bb704c2141fd35a60d5 --- /dev/null +++ b/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.models.slowonly_r50 import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.evaluation import AccMetric + +# model settings +model.update(dict(backbone=dict(pretrained=None))) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=4, frame_interval=16, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=4, + frame_interval=16, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=4, + frame_interval=16, + num_clips=10, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=ThreeCrop, crop_size=256), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + 
dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=256, val_begin=1, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# learning policy +param_scheduler = [ + dict(type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=34), + dict( + type=CosineAnnealingLR, + T_max=222, + eta_min=0, + by_epoch=True, + begin=34, + end=256) +] + +optim_wrapper = dict( + optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +# runtime settings +default_hooks.update(dict(checkpoint=dict(interval=4, max_keep_ckpts=3))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/mmaction/configs/recognition/swin/__init__.py b/mmaction/configs/recognition/swin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/recognition/swin/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c450525a93e548a68377fc059626b2868fcf8b87 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
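+# Video Swin-Base (patch 2x4x4, window 8x7x7), ImageNet-1k pretrained,
+# trained with AMP for 30 epochs on Kinetics-400 with 32x2x1 frame sampling
+# and 8 GPUs x 8 videos per GPU, as encoded in the file name. Assuming the
+# usual OpenMMLab entry point under tools/, training would look roughly like:
+#
+#     python tools/train.py \
+#         mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py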
+from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + arch='base', + drop_path_rate=0.3, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_base_patch4_window7_224.pth' # noqa: E501 + ), + cls_head=dict(in_channels=1024))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, 
betas=(0.9, 0.999), weight_decay=0.05), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_16xb8_amp_32x2x1_30e_kinetics700_rgb.py b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_16xb8_amp_32x2x1_30e_kinetics700_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..021ddcbc188958689e838de4f5388b9e4e4765cb --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_16xb8_amp_32x2x1_30e_kinetics700_rgb.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb import * # noqa: E501 + +model.update(dict(cls_head=dict(num_classes=700))) + +# dataset +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +dataset_type = VideoDataset +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +optim_wrapper.update(dict(optimizer=dict(lr=2e-3))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (8 samples per GPU). 
+auto_scale_lr.update(dict(enable=False, base_batch_size=128)) diff --git a/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2909b782f9842a73bccda3fdfa691698b9b2fa --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + arch='large', + drop_path_rate=0.4, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_large_patch4_window7_224_22k.pth' # noqa: E501 + ), + cls_head=dict(in_channels=1536))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) 
+test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_32xb4_amp_32x2x1_30e_kinetics710_rgb.py b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_32xb4_amp_32x2x1_30e_kinetics710_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a27940cf4133c6e82f396749fc67645065847952 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_32xb4_amp_32x2x1_30e_kinetics710_rgb.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
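+# Video Swin-Small on the merged Kinetics-710 label space: this config
+# inherits the Kinetics-400 Swin-Small recipe, switches the head to 710
+# classes and concatenates the K400/K600/K700 annotation lists with
+# ConcatDataset (32 GPUs x 4 videos per GPU, hence the smaller per-GPU
+# batch size).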
+from mmengine.config import read_base + +with read_base(): + from .swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb import * # noqa: E501 + +from mmengine.dataset import DefaultSampler +from torch.utils.data import ConcatDataset + +model.update(dict(cls_head=dict(num_classes=710))) + +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type=VideoDataset, + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type=VideoDataset, + ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type=VideoDataset, + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type=VideoDataset, + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type=VideoDataset, + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type=VideoDataset, + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type=ConcatDataset, + datasets=[k400_trainset, k600_trainset, k700_trainset], + _delete_=True) +k710_valset = dict( + type=ConcatDataset, + datasets=[k400_valset, k600_valset, k700_valset], + _delete_=True) +k710_testset = dict( + type=ConcatDataset, + datasets=[k400_testset, k600_testset, k700_testset], + _delete_=True, +) + +train_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_testset) + +optim_wrapper.update(dict(optimizer=dict(lr=2e-3))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (8 samples per GPU). 
+auto_scale_lr.update(dict(enable=False, base_batch_size=128)) diff --git a/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..769bc9b249f58510d6f969a133aea7b0969b1ba4 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + arch='small', + drop_path_rate=0.2, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_small_patch4_window7_224.pth' # noqa: E501 + ))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + 
num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/swin/swin_tiny_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_tiny_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7afd26a65ac83d98fa18b125852fb55b588eca58 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_tiny_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
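Every config above ends with an `auto_scale_lr` block. When `enable=True`, the runner rescales the optimizer learning rate linearly with the ratio of the effective batch size to `base_batch_size`; here it is left disabled, so the configured LR is used as-is. A small sketch of that arithmetic (the linear rule is the standard convention; the exact hook behaviour depends on the MMEngine version):

.. code-block:: python

    # Back-of-the-envelope check of the linear LR scaling that
    # `auto_scale_lr` would apply if it were enabled.
    base_lr = 1e-3          # optimizer lr in the Kinetics-400 config above
    base_batch_size = 64    # 8 GPUs x 8 samples per GPU

    def scaled_lr(num_gpus: int, samples_per_gpu: int) -> float:
        """Linear scaling rule: LR grows with the effective batch size."""
        return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

    print(scaled_lr(8, 8))    # 0.001  -> matches the reference setup
    print(scaled_lr(16, 8))   # 0.002  -> double the GPUs, double the LR
    print(scaled_lr(4, 8))    # 0.0005 -> half the effective batch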
+from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_tiny_patch4_window7_224.pth' # noqa: E501 + ))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02), + 
constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/uniformerv2/__init__.py b/mmaction/configs/recognition/uniformerv2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..1e6b335d2affd4b7abbbbe7aa69582b48e735832 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 
'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting 
for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/skeleton/posec3d/__init__.py b/mmaction/configs/skeleton/posec3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/skeleton/posec3d/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/configs/skeleton/posec3d/slowonly_r50_8xb16_u48_240e_ntu60_xsub_limb.py b/mmaction/configs/skeleton/posec3d/slowonly_r50_8xb16_u48_240e_ntu60_xsub_limb.py new file mode 100644 index 0000000000000000000000000000000000000000..17e86dd5c91388133f4e89babf3b48a3d707457f --- /dev/null +++ b/mmaction/configs/skeleton/posec3d/slowonly_r50_8xb16_u48_240e_ntu60_xsub_limb.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler, RepeatDataset +from mmengine.optim import CosineAnnealingLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmaction.datasets import (CenterCrop, Flip, FormatShape, + GeneratePoseTarget, PackActionInputs, + PoseCompact, PoseDataset, PoseDecode, + RandomResizedCrop, Resize, UniformSampleFrames) +from mmaction.evaluation import AccMetric +from mmaction.models import I3DHead, Recognizer3D, ResNet3dSlowOnly + +model = dict( + type=Recognizer3D, + backbone=dict( + type=ResNet3dSlowOnly, + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)), + cls_head=dict( + type=I3DHead, + in_channels=512, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], + [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], + [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] +train_pipeline = [ + dict(type=UniformSampleFrames, clip_len=48), + dict(type=PoseDecode), + dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True), + dict(type=Resize, scale=(-1, 64)), + dict(type=RandomResizedCrop, area_range=(0.56, 1.0)), + dict(type=Resize, scale=(56, 56), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type=GeneratePoseTarget, + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons), + dict(type=FormatShape, input_format='NCTHW_Heatmap'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=UniformSampleFrames, clip_len=48, num_clips=1, test_mode=True), + dict(type=PoseDecode), + dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True), + dict(type=Resize, scale=(-1, 64)), + dict(type=CenterCrop, crop_size=64), + dict( + type=GeneratePoseTarget, + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons), + dict(type=FormatShape, input_format='NCTHW_Heatmap'), + dict(type=PackActionInputs) 
+] +test_pipeline = [ + dict(type=UniformSampleFrames, clip_len=48, num_clips=10, test_mode=True), + dict(type=PoseDecode), + dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True), + dict(type=Resize, scale=(-1, 64)), + dict(type=CenterCrop, crop_size=64), + dict( + type=GeneratePoseTarget, + sigma=0.6, + use_score=True, + with_kp=False, + with_limb=True, + skeletons=skeletons, + double=True, + left_limb=left_limb, + right_limb=right_limb), + dict(type=FormatShape, input_format='NCTHW_Heatmap'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=RepeatDataset, + times=10, + dataset=dict( + type=PoseDataset, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=PoseDataset, + ann_file=ann_file, + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=PoseDataset, + ann_file=ann_file, + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type=AccMetric)] +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=CosineAnnealingLR, + eta_min=0, + T_max=24, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/mmaction/datasets/__init__.py b/mmaction/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2823fe8dc7d1f0d2f86e6cb7c885ea6e29ded93e --- /dev/null +++ b/mmaction/datasets/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .activitynet_dataset import ActivityNetDataset +from .audio_dataset import AudioDataset +from .ava_dataset import AVADataset, AVAKineticsDataset +from .base import BaseActionDataset +from .charades_sta_dataset import CharadesSTADataset +from .msrvtt_datasets import MSRVTTVQA, MSRVTTVQAMC, MSRVTTRetrieval +from .pose_dataset import PoseDataset +from .rawframe_dataset import RawframeDataset +from .repeat_aug_dataset import RepeatAugDataset, repeat_pseudo_collate +from .transforms import * # noqa: F401, F403 +from .video_dataset import VideoDataset +from .video_text_dataset import VideoTextDataset + +__all__ = [ + 'AVADataset', 'AVAKineticsDataset', 'ActivityNetDataset', 'AudioDataset', + 'BaseActionDataset', 'PoseDataset', 'RawframeDataset', 'RepeatAugDataset', + 'VideoDataset', 'repeat_pseudo_collate', 'VideoTextDataset', + 'MSRVTTRetrieval', 'MSRVTTVQA', 'MSRVTTVQAMC', 'CharadesSTADataset' +] diff --git a/mmaction/datasets/activitynet_dataset.py b/mmaction/datasets/activitynet_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..f5f7a4dde26c46a8ee2029a035eb2852a63d85f4 --- /dev/null +++ b/mmaction/datasets/activitynet_dataset.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
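All dataset classes exported in the package `__init__` above are registered in `DATASETS`, so a config only needs a `type` plus the constructor arguments to build one. A minimal sketch, assuming mmaction2 is installed, the `register_all_modules` helper is available in `mmaction.utils`, and the Kinetics-400 annotation list and videos exist at the paths used throughout these configs:

.. code-block:: python

    from mmaction.registry import DATASETS
    from mmaction.utils import register_all_modules

    register_all_modules()  # populate the registries with mmaction modules

    dataset = DATASETS.build(
        dict(
            type='VideoDataset',
            ann_file='data/kinetics400/kinetics400_val_list_videos.txt',
            data_prefix=dict(video='data/kinetics400/videos_val'),
            pipeline=[],   # transforms would normally go here
            test_mode=True))

    # With an empty pipeline, indexing returns the raw per-sample info dict.
    print(len(dataset), sorted(dataset[0].keys()))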
+from typing import Callable, List, Optional, Union + +import mmengine +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + + +@DATASETS.register_module() +class ActivityNetDataset(BaseActionDataset): + """ActivityNet dataset for temporal action localization. The dataset loads + raw features and apply specified transforms to return a dict containing the + frame tensors and other information. The ann_file is a json file with + multiple objects, and each object has a key of the name of a video, and + value of total frames of the video, total seconds of the video, annotations + of a video, feature frames (frames covered by features) of the video, fps + and rfps. Example of a annotation file: + + .. code-block:: JSON + { + "v_--1DO2V4K74": { + "duration_second": 211.53, + "duration_frame": 6337, + "annotations": [ + { + "segment": [ + 30.025882995319815, + 205.2318595943838 + ], + "label": "Rock climbing" + } + ], + "feature_frame": 6336, + "fps": 30.0, + "rfps": 29.9579255898 + }, + "v_--6bJUbfpnQ": { + "duration_second": 26.75, + "duration_frame": 647, + "annotations": [ + { + "segment": [ + 2.578755070202808, + 24.914101404056165 + ], + "label": "Drinking beer" + } + ], + "feature_frame": 624, + "fps": 24.0, + "rfps": 24.1869158879 + }, + ... + } + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. + data_prefix (dict or ConfigDict): Path to a directory where videos are + held. Defaults to ``dict(video='')``. + test_mode (bool): Store True when building test or validation dataset. + Default: False. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + data_prefix: Optional[ConfigType] = dict(video=''), + test_mode: bool = False, + **kwargs): + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + anno_database = mmengine.load(self.ann_file) + for video_name in anno_database: + video_info = anno_database[video_name] + feature_path = video_name + '.csv' + feature_path = '%s/%s' % (self.data_prefix['video'], feature_path) + video_info['feature_path'] = feature_path + video_info['video_name'] = video_name + data_list.append(video_info) + return data_list diff --git a/mmaction/datasets/audio_dataset.py b/mmaction/datasets/audio_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..30a30f08e645c90a27fc602f14d467a747533cae --- /dev/null +++ b/mmaction/datasets/audio_dataset.py @@ -0,0 +1,83 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, Dict, List, Optional, Union + +from mmengine.utils import check_file_exist + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class AudioDataset(BaseActionDataset): + """Audio dataset for action recognition. + + The ann_file is a text file with multiple lines, and each line indicates + a sample audio or extracted audio feature with the filepath, total frames + of the raw video and label, which are split with a whitespace. + Example of a annotation file: + + .. 
code-block:: txt + some/directory-1.npy 163 1 + some/directory-2.npy 122 1 + some/directory-3.npy 258 2 + some/directory-4.npy 234 2 + some/directory-5.npy 295 3 + some/directory-6.npy 121 3 + + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. + data_prefix (dict): Path to a directory where + audios are held. Defaults to ``dict(audio='')``. + multi_class (bool): Determines whether it is a multi-class + recognition dataset. Defaults to False. + num_classes (int, optional): Number of classes in the dataset. + Defaults to None. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[Dict, Callable]], + data_prefix: Dict = dict(audio=''), + multi_class: bool = False, + num_classes: Optional[int] = None, + **kwargs) -> None: + super().__init__( + ann_file, + pipeline, + data_prefix=data_prefix, + multi_class=multi_class, + num_classes=num_classes, + modality='Audio', + **kwargs) + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get audio information.""" + check_file_exist(self.ann_file) + data_list = [] + with open(self.ann_file, 'r') as fin: + for line in fin: + line_split = line.strip().split() + video_info = {} + idx = 0 + filename = line_split[idx] + if self.data_prefix['audio'] is not None: + filename = osp.join(self.data_prefix['audio'], filename) + video_info['audio_path'] = filename + idx += 1 + # idx for total_frames + video_info['total_frames'] = int(line_split[idx]) + idx += 1 + # idx for label + label = [int(x) for x in line_split[idx:]] + assert label, f'missing label in line: {line}' + if self.multi_class: + assert self.num_classes is not None + video_info['label'] = label + else: + assert len(label) == 1 + video_info['label'] = label[0] + data_list.append(video_info) + + return data_list diff --git a/mmaction/datasets/ava_dataset.py b/mmaction/datasets/ava_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..89657b84256d17594a85ee159f0109e8565bbac4 --- /dev/null +++ b/mmaction/datasets/ava_dataset.py @@ -0,0 +1,651 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import defaultdict +from typing import Callable, List, Optional, Union + +import numpy as np +from mmengine.fileio import exists, list_from_file, load +from mmengine.logging import MMLogger + +from mmaction.evaluation import read_labelmap +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + + +@DATASETS.register_module() +class AVADataset(BaseActionDataset): + """STAD dataset for spatial temporal action detection. + + The dataset loads raw frames/video files, bounding boxes, + proposals and applies specified transformations to return + a dict containing the frame tensors and other information. + + This datasets can load information from the following files: + + .. code-block:: txt + + ann_file -> ava_{train, val}_{v2.1, v2.2}.csv + exclude_file -> ava_{train, val}_excluded_timestamps_{v2.1, v2.2}.csv + label_file -> ava_action_list_{v2.1, v2.2}.pbtxt / + ava_action_list_{v2.1, v2.2}_for_activitynet_2019.pbtxt + proposal_file -> ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl + + Particularly, the proposal_file is a pickle file which contains + ``img_key`` (in format of ``{video_id},{timestamp}``). Example of a pickle + file: + + .. code-block:: JSON + + { + ... 
+ '0f39OWEqJ24,0902': + array([[0.011 , 0.157 , 0.655 , 0.983 , 0.998163]]), + '0f39OWEqJ24,0912': + array([[0.054 , 0.088 , 0.91 , 0.998 , 0.068273], + [0.016 , 0.161 , 0.519 , 0.974 , 0.984025], + [0.493 , 0.283 , 0.981 , 0.984 , 0.983621]]), + ... + } + + Args: + ann_file (str): Path to the annotation file like + ``ava_{train, val}_{v2.1, v2.2}.csv``. + exclude_file (str): Path to the excluded timestamp file like + ``ava_{train, val}_excluded_timestamps_{v2.1, v2.2}.csv``. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + label_file (str): Path to the label file like + ``ava_action_list_{v2.1, v2.2}.pbtxt`` or + ``ava_action_list_{v2.1, v2.2}_for_activitynet_2019.pbtxt``. + Defaults to None. + filename_tmpl (str): Template for each filename. + Defaults to 'img_{:05}.jpg'. + start_index (int): Specify a start index for frames in consideration of + different filename format. It should be set to 1 for AVA, since + frame index start from 1 in AVA dataset. Defaults to 1. + proposal_file (str): Path to the proposal file like + ``ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl``. + Defaults to None. + person_det_score_thr (float): The threshold of person detection scores, + bboxes with scores above the threshold will be used. + Note that 0 <= person_det_score_thr <= 1. If no proposal has + detection score larger than the threshold, the one with the largest + detection score will be used. Default: 0.9. + num_classes (int): The number of classes of the dataset. Default: 81. + (AVA has 80 action classes, another 1-dim is added for potential + usage) + custom_classes (List[int], optional): A subset of class ids from origin + dataset. Please note that 0 should NOT be selected, and + ``num_classes`` should be equal to ``len(custom_classes) + 1``. + data_prefix (dict or ConfigDict): Path to a directory where video + frames are held. Defaults to ``dict(img='')``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. + num_max_proposals (int): Max proposals number to store. + Defaults to 1000. + timestamp_start (int): The start point of included timestamps. The + default value is referred from the official website. + Defaults to 902. + timestamp_end (int): The end point of included timestamps. The default + value is referred from the official website. Defaults to 1798. + use_frames (bool): Whether to use rawframes as input. + Defaults to True. + fps (int): Overrides the default FPS for the dataset. If set to 1, + means counting timestamp by frame, e.g. MultiSports dataset. + Otherwise by second. Defaults to 30. + multilabel (bool): Determines whether it is a multilabel recognition + task. Defaults to True. 
+ """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[ConfigType, Callable]], + exclude_file: Optional[str] = None, + label_file: Optional[str] = None, + filename_tmpl: str = 'img_{:05}.jpg', + start_index: int = 1, + proposal_file: str = None, + person_det_score_thr: float = 0.9, + num_classes: int = 81, + custom_classes: Optional[List[int]] = None, + data_prefix: ConfigType = dict(img=''), + modality: str = 'RGB', + test_mode: bool = False, + num_max_proposals: int = 1000, + timestamp_start: int = 900, + timestamp_end: int = 1800, + use_frames: bool = True, + fps: int = 30, + multilabel: bool = True, + **kwargs) -> None: + self._FPS = fps # Keep this as standard + self.custom_classes = custom_classes + if custom_classes is not None: + assert num_classes == len(custom_classes) + 1 + assert 0 not in custom_classes + _, class_whitelist = read_labelmap(open(label_file)) + assert set(custom_classes).issubset(class_whitelist) + + self.custom_classes = list([0] + custom_classes) + self.exclude_file = exclude_file + self.label_file = label_file + self.proposal_file = proposal_file + assert 0 <= person_det_score_thr <= 1, ( + 'The value of ' + 'person_det_score_thr should in [0, 1]. ') + self.person_det_score_thr = person_det_score_thr + self.timestamp_start = timestamp_start + self.timestamp_end = timestamp_end + self.num_max_proposals = num_max_proposals + self.filename_tmpl = filename_tmpl + self.use_frames = use_frames + self.multilabel = multilabel + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + num_classes=num_classes, + start_index=start_index, + modality=modality, + **kwargs) + + if self.proposal_file is not None: + self.proposals = load(self.proposal_file) + else: + self.proposals = None + + def parse_img_record(self, img_records: List[dict]) -> tuple: + """Merge image records of the same entity at the same time. + + Args: + img_records (List[dict]): List of img_records (lines in AVA + annotations). + + Returns: + Tuple(list): A tuple consists of lists of bboxes, action labels and + entity_ids. + """ + bboxes, labels, entity_ids = [], [], [] + while len(img_records) > 0: + img_record = img_records[0] + num_img_records = len(img_records) + + selected_records = [ + x for x in img_records + if np.array_equal(x['entity_box'], img_record['entity_box']) + ] + + num_selected_records = len(selected_records) + img_records = [ + x for x in img_records if + not np.array_equal(x['entity_box'], img_record['entity_box']) + ] + + assert len(img_records) + num_selected_records == num_img_records + + bboxes.append(img_record['entity_box']) + valid_labels = np.array([ + selected_record['label'] + for selected_record in selected_records + ]) + + # The format can be directly used by BCELossWithLogits + if self.multilabel: + label = np.zeros(self.num_classes, dtype=np.float32) + label[valid_labels] = 1. 
+ else: + label = valid_labels + + labels.append(label) + entity_ids.append(img_record['entity_id']) + bboxes = np.stack(bboxes) + labels = np.stack(labels) + entity_ids = np.stack(entity_ids) + return bboxes, labels, entity_ids + + def load_data_list(self) -> List[dict]: + """Load AVA annotations.""" + exists(self.ann_file) + data_list = [] + records_dict_by_img = defaultdict(list) + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split(',') + + label = int(line_split[6]) + if self.custom_classes is not None: + if label not in self.custom_classes: + continue + label = self.custom_classes.index(label) + + video_id = line_split[0] + timestamp = int(line_split[1]) # count by second or frame. + img_key = f'{video_id},{timestamp:04d}' + + entity_box = np.array(list(map(float, line_split[2:6]))) + entity_id = int(line_split[7]) + if self.use_frames: + shot_info = (0, (self.timestamp_end - self.timestamp_start) * + self._FPS) + # for video data, automatically get shot info when decoding + else: + shot_info = None + + video_info = dict( + video_id=video_id, + timestamp=timestamp, + entity_box=entity_box, + label=label, + entity_id=entity_id, + shot_info=shot_info) + records_dict_by_img[img_key].append(video_info) + + for img_key in records_dict_by_img: + video_id, timestamp = img_key.split(',') + bboxes, labels, entity_ids = self.parse_img_record( + records_dict_by_img[img_key]) + ann = dict( + gt_bboxes=bboxes, gt_labels=labels, entity_ids=entity_ids) + frame_dir = video_id + if self.data_prefix['img'] is not None: + frame_dir = osp.join(self.data_prefix['img'], frame_dir) + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + if not self.use_frames: + video_info['filename'] = video_info.pop('frame_dir') + data_list.append(video_info) + + return data_list + + def filter_data(self) -> List[dict]: + """Filter out records in the exclude_file.""" + valid_indexes = [] + if self.exclude_file is None: + valid_indexes = list(range(len(self.data_list))) + else: + exclude_video_infos = [ + x.strip().split(',') for x in open(self.exclude_file) + ] + for i, data_info in enumerate(self.data_list): + valid_indexes.append(i) + for video_id, timestamp in exclude_video_infos: + if (data_info['video_id'] == video_id + and data_info['timestamp'] == int(timestamp)): + valid_indexes.pop() + break + + logger = MMLogger.get_current_instance() + logger.info(f'{len(valid_indexes)} out of {len(self.data_list)}' + f' frames are valid.') + data_list = [self.data_list[i] for i in valid_indexes] + + return data_list + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + img_key = data_info['img_key'] + + data_info['filename_tmpl'] = self.filename_tmpl + data_info['timestamp_start'] = self.timestamp_start + data_info['timestamp_end'] = self.timestamp_end + + if self.proposals is not None: + if img_key not in self.proposals: + data_info['proposals'] = np.array([[0, 0, 1, 1]]) + data_info['scores'] = np.array([1]) + else: + proposals = self.proposals[img_key] + assert proposals.shape[-1] in [4, 5] + if proposals.shape[-1] == 5: + thr = min(self.person_det_score_thr, max(proposals[:, 4])) + positive_inds = (proposals[:, 4] >= thr) + proposals = proposals[positive_inds] + proposals = proposals[:self.num_max_proposals] + data_info['proposals'] = proposals[:, :4] + data_info['scores'] = proposals[:, 4] + else: + proposals 
= proposals[:self.num_max_proposals] + data_info['proposals'] = proposals + + assert data_info['proposals'].max() <= 1 and \ + data_info['proposals'].min() >= 0, \ + (f'relative proposals invalid: max value ' + f'{data_info["proposals"].max()}, min value ' + f'{data_info["proposals"].min()}') + + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + + return data_info + + +@DATASETS.register_module() +class AVAKineticsDataset(BaseActionDataset): + """AVA-Kinetics dataset for spatial temporal detection. + + Based on official AVA annotation files, the dataset loads raw frames, + bounding boxes, proposals and applies specified transformations to return + a dict containing the frame tensors and other information. + + This datasets can load information from the following files: + + .. code-block:: txt + + ann_file -> ava_{train, val}_{v2.1, v2.2}.csv + exclude_file -> ava_{train, val}_excluded_timestamps_{v2.1, v2.2}.csv + label_file -> ava_action_list_{v2.1, v2.2}.pbtxt / + ava_action_list_{v2.1, v2.2}_for_activitynet_2019.pbtxt + proposal_file -> ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl + + Particularly, the proposal_file is a pickle file which contains + ``img_key`` (in format of ``{video_id},{timestamp}``). Example of a pickle + file: + + .. code-block:: JSON + + { + ... + '0f39OWEqJ24,0902': + array([[0.011 , 0.157 , 0.655 , 0.983 , 0.998163]]), + '0f39OWEqJ24,0912': + array([[0.054 , 0.088 , 0.91 , 0.998 , 0.068273], + [0.016 , 0.161 , 0.519 , 0.974 , 0.984025], + [0.493 , 0.283 , 0.981 , 0.984 , 0.983621]]), + ... + } + + Args: + ann_file (str): Path to the annotation file like + ``ava_{train, val}_{v2.1, v2.2}.csv``. + exclude_file (str): Path to the excluded timestamp file like + ``ava_{train, val}_excluded_timestamps_{v2.1, v2.2}.csv``. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + label_file (str): Path to the label file like + ``ava_action_list_{v2.1, v2.2}.pbtxt`` or + ``ava_action_list_{v2.1, v2.2}_for_activitynet_2019.pbtxt``. + Defaults to None. + filename_tmpl (str): Template for each filename. + Defaults to 'img_{:05}.jpg'. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking frames as input, + it should be set to 0, since frames from 0. Defaults to 0. + proposal_file (str): Path to the proposal file like + ``ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl``. + Defaults to None. + person_det_score_thr (float): The threshold of person detection scores, + bboxes with scores above the threshold will be used. + Note that 0 <= person_det_score_thr <= 1. If no proposal has + detection score larger than the threshold, the one with the largest + detection score will be used. Default: 0.9. + num_classes (int): The number of classes of the dataset. Default: 81. + (AVA has 80 action classes, another 1-dim is added for potential + usage) + custom_classes (List[int], optional): A subset of class ids from origin + dataset. Please note that 0 should NOT be selected, and + ``num_classes`` should be equal to ``len(custom_classes) + 1``. + data_prefix (dict or ConfigDict): Path to a directory where video + frames are held. Defaults to ``dict(img='')``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. 
+ num_max_proposals (int): Max proposals number to store. + Defaults to 1000. + timestamp_start (int): The start point of included timestamps. The + default value is referred from the official website. + Defaults to 902. + timestamp_end (int): The end point of included timestamps. The default + value is referred from the official website. Defaults to 1798. + fps (int): Overrides the default FPS for the dataset. Defaults to 30. + """ + + def __init__(self, + ann_file: str, + exclude_file: str, + pipeline: List[Union[ConfigType, Callable]], + label_file: str, + filename_tmpl: str = 'img_{:05}.jpg', + start_index: int = 0, + proposal_file: str = None, + person_det_score_thr: float = 0.9, + num_classes: int = 81, + custom_classes: Optional[List[int]] = None, + data_prefix: ConfigType = dict(img=''), + modality: str = 'RGB', + test_mode: bool = False, + num_max_proposals: int = 1000, + timestamp_start: int = 900, + timestamp_end: int = 1800, + fps: int = 30, + **kwargs) -> None: + self._FPS = fps # Keep this as standard + self.custom_classes = custom_classes + if custom_classes is not None: + assert num_classes == len(custom_classes) + 1 + assert 0 not in custom_classes + _, class_whitelist = read_labelmap(open(label_file)) + assert set(custom_classes).issubset(class_whitelist) + + self.custom_classes = list([0] + custom_classes) + self.exclude_file = exclude_file + self.label_file = label_file + self.proposal_file = proposal_file + assert 0 <= person_det_score_thr <= 1, ( + 'The value of ' + 'person_det_score_thr should in [0, 1]. ') + self.person_det_score_thr = person_det_score_thr + self.timestamp_start = timestamp_start + self.timestamp_end = timestamp_end + self.num_max_proposals = num_max_proposals + self.filename_tmpl = filename_tmpl + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + num_classes=num_classes, + start_index=start_index, + modality=modality, + **kwargs) + + if self.proposal_file is not None: + self.proposals = load(self.proposal_file) + else: + self.proposals = None + + def parse_img_record(self, img_records: List[dict]) -> tuple: + """Merge image records of the same entity at the same time. + + Args: + img_records (List[dict]): List of img_records (lines in AVA + annotations). + + Returns: + Tuple(list): A tuple consists of lists of bboxes, action labels and + entity_ids. + """ + bboxes, labels, entity_ids = [], [], [] + while len(img_records) > 0: + img_record = img_records[0] + num_img_records = len(img_records) + + selected_records = [ + x for x in img_records + if np.array_equal(x['entity_box'], img_record['entity_box']) + ] + + num_selected_records = len(selected_records) + img_records = [ + x for x in img_records if + not np.array_equal(x['entity_box'], img_record['entity_box']) + ] + + assert len(img_records) + num_selected_records == num_img_records + + bboxes.append(img_record['entity_box']) + valid_labels = np.array([ + selected_record['label'] + for selected_record in selected_records + ]) + + # The format can be directly used by BCELossWithLogits + label = np.zeros(self.num_classes, dtype=np.float32) + label[valid_labels] = 1. 
+ + labels.append(label) + entity_ids.append(img_record['entity_id']) + + bboxes = np.stack(bboxes) + labels = np.stack(labels) + entity_ids = np.stack(entity_ids) + return bboxes, labels, entity_ids + + def filter_data(self) -> List[dict]: + """Filter out records in the exclude_file.""" + valid_indexes = [] + if self.exclude_file is None: + valid_indexes = list(range(len(self.data_list))) + else: + exclude_video_infos = [ + x.strip().split(',') for x in open(self.exclude_file) + ] + for i, data_info in enumerate(self.data_list): + valid_indexes.append(i) + for video_id, timestamp in exclude_video_infos: + if (data_info['video_id'] == video_id + and data_info['timestamp'] == int(timestamp)): + valid_indexes.pop() + break + + logger = MMLogger.get_current_instance() + logger.info(f'{len(valid_indexes)} out of {len(self.data_list)}' + f' frames are valid.') + data_list = [self.data_list[i] for i in valid_indexes] + + return data_list + + def get_timestamp(self, video_id): + if len(video_id) == 11: + return self.timestamp_start, self.timestamp_end + video_id = video_id.split('_') + if len(video_id) >= 3: + start = int(video_id[-2]) + end = int(video_id[-1]) + video_id = '_'.join(video_id[:-2]) + return start, end + return self.timestamp_start, self.timestamp_end + + def load_data_list(self) -> List[dict]: + """Load AVA annotations.""" + exists(self.ann_file) + data_list = [] + records_dict_by_img = defaultdict(list) + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split(',') + + label = int(line_split[6]) + if self.custom_classes is not None: + if label not in self.custom_classes: + continue + label = self.custom_classes.index(label) + + video_id = line_split[0] + timestamp = int(line_split[1]) + img_key = f'{video_id},{timestamp:04d}' + + entity_box = np.array(list(map(float, line_split[2:6]))) + entity_id = int(line_split[7]) + start, end = self.get_timestamp(video_id) + shot_info = (1, (end - start) * self._FPS + 1) + + video_info = dict( + video_id=video_id, + timestamp=timestamp, + entity_box=entity_box, + label=label, + entity_id=entity_id, + shot_info=shot_info) + records_dict_by_img[img_key].append(video_info) + + for img_key in records_dict_by_img: + video_id, timestamp = img_key.split(',') + start, end = self.get_timestamp(video_id) + bboxes, labels, entity_ids = self.parse_img_record( + records_dict_by_img[img_key]) + ann = dict( + gt_bboxes=bboxes, gt_labels=labels, entity_ids=entity_ids) + frame_dir = video_id + if self.data_prefix['img'] is not None: + frame_dir = osp.join(self.data_prefix['img'], frame_dir) + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + timestamp_start=start, + timestamp_end=end, + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + + return data_list + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + img_key = data_info['img_key'] + data_info['filename_tmpl'] = self.filename_tmpl + if 'timestamp_start' not in data_info: + data_info['timestamp_start'] = self.timestamp_start + data_info['timestamp_end'] = self.timestamp_end + + if self.proposals is not None: + if len(img_key) == 16: + proposal_key = img_key + else: + video_id, timestamp = img_key.split(',') + vid = '_'.join(video_id.split('_')[:-2]) + timestamp = int(timestamp) + proposal_key = f'{vid},{timestamp:04d}' + + if proposal_key not in self.proposals: + data_info['proposals'] = np.array([[0, 0, 1, 
1]]) + data_info['scores'] = np.array([1]) + else: + proposals = self.proposals[proposal_key] + assert proposals.shape[-1] in [4, 5] + if proposals.shape[-1] == 5: + thr = min(self.person_det_score_thr, max(proposals[:, 4])) + positive_inds = (proposals[:, 4] >= thr) + proposals = proposals[positive_inds] + proposals = proposals[:self.num_max_proposals] + data_info['proposals'] = proposals[:, :4] + data_info['scores'] = proposals[:, 4] + else: + proposals = proposals[:self.num_max_proposals] + data_info['proposals'] = proposals + + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + + return data_info diff --git a/mmaction/datasets/base.py b/mmaction/datasets/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ed485142aee2c2172ed9cf917f9f672cfcb8c19a --- /dev/null +++ b/mmaction/datasets/base.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta +from typing import Callable, List, Optional, Union + +import torch +from mmengine.dataset import BaseDataset + +from mmaction.utils import ConfigType + + +class BaseActionDataset(BaseDataset, metaclass=ABCMeta): + """Base class for datasets. + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict, optional): Path to a directory where + videos are held. Defaults to None. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + multi_class (bool): Determines whether the dataset is a multi-class + dataset. Defaults to False. + num_classes (int, optional): Number of classes of the dataset, used in + multi-class datasets. Defaults to None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking videos as input, + it should be set to 0, since frames loaded from videos count + from 0. Defaults to 0. + modality (str): Modality of data. Support ``RGB``, ``Flow``, ``Pose``, + ``Audio``. Defaults to ``RGB``. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[ConfigType, Callable]], + data_prefix: Optional[ConfigType] = dict(prefix=''), + test_mode: bool = False, + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 0, + modality: str = 'RGB', + **kwargs) -> None: + self.multi_class = multi_class + self.num_classes = num_classes + self.start_index = start_index + self.modality = modality + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + **kwargs) + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + data_info['start_index'] = self.start_index + + if self.multi_class: + onehot = torch.zeros(self.num_classes) + onehot[data_info['label']] = 1. + data_info['label'] = onehot + + return data_info diff --git a/mmaction/datasets/charades_sta_dataset.py b/mmaction/datasets/charades_sta_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e27f986fde31f6b09ee75db61fc0972af54b2d84 --- /dev/null +++ b/mmaction/datasets/charades_sta_dataset.py @@ -0,0 +1,124 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
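Both `AVADataset.load_data_list` and `AVAKineticsDataset.load_data_list` above read the same fixed CSV column layout: video id, timestamp, four relative box coordinates, action label, entity id. A standalone sketch of that parsing (the example row is invented for illustration; only the column order mirrors the code above):

.. code-block:: python

    import numpy as np

    # Hypothetical AVA-style CSV row; real rows come from
    # ava_{train,val}_{v2.1,v2.2}.csv.
    line = '0f39OWEqJ24,0902,0.011,0.157,0.655,0.983,12,0'
    line_split = line.strip().split(',')

    video_id = line_split[0]                                   # '0f39OWEqJ24'
    timestamp = int(line_split[1])                             # 902
    img_key = f'{video_id},{timestamp:04d}'                    # '0f39OWEqJ24,0902'
    entity_box = np.array(list(map(float, line_split[2:6])))   # [x1, y1, x2, y2], relative coords
    label = int(line_split[6])                                 # action class id
    entity_id = int(line_split[7])                             # same person across rows

    print(img_key, entity_box, label, entity_id)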
+import os +from typing import Callable, List, Optional, Union + +import mmengine +import numpy as np +import torch +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + +try: + import nltk + nltk_imported = True +except ImportError: + nltk_imported = False + + +@DATASETS.register_module() +class CharadesSTADataset(BaseActionDataset): + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + word2id_file: str, + fps_file: str, + duration_file: str, + num_frames_file: str, + window_size: int, + ft_overlap: float, + data_prefix: Optional[ConfigType] = dict(video=''), + test_mode: bool = False, + **kwargs): + if not nltk_imported: + raise ImportError('nltk is required for CharadesSTADataset') + + self.fps_info = mmengine.load(fps_file) + self.duration_info = mmengine.load(duration_file) + self.num_frames = mmengine.load(num_frames_file) + self.word2id = mmengine.load(word2id_file) + self.ft_interval = int(window_size * (1 - ft_overlap)) + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + with open(self.ann_file) as f: + anno_database = f.readlines() + + for item in anno_database: + first_part, query_sentence = item.strip().split('##') + query_sentence = query_sentence.replace('.', '') + query_words = nltk.word_tokenize(query_sentence) + query_tokens = [self.word2id[word] for word in query_words] + query_length = len(query_tokens) + query_tokens = torch.from_numpy(np.array(query_tokens)) + + vid_name, start_time, end_time = first_part.split() + duration = float(self.duration_info[vid_name]) + fps = float(self.fps_info[vid_name]) + + gt_start_time = float(start_time) + gt_end_time = float(end_time) + + gt_bbox = (gt_start_time / duration, min(gt_end_time / duration, + 1)) + + num_frames = int(self.num_frames[vid_name]) + proposal_frames = self.get_proposals(num_frames) + + proposals = proposal_frames / num_frames + proposals = torch.from_numpy(proposals) + proposal_indexes = proposal_frames / self.ft_interval + proposal_indexes = proposal_indexes.astype(np.int32) + + info = dict( + vid_name=vid_name, + fps=fps, + num_frames=num_frames, + duration=duration, + query_tokens=query_tokens, + query_length=query_length, + gt_start_time=gt_start_time, + gt_end_time=gt_end_time, + gt_bbox=gt_bbox, + proposals=proposals, + num_proposals=proposals.shape[0], + proposal_indexes=proposal_indexes) + data_list.append(info) + return data_list + + def get_proposals(self, num_frames): + proposals = (num_frames - 1) / 32 * np.arange(33) + proposals = proposals.astype(np.int32) + proposals = np.stack([proposals[:-1], proposals[1:]]).T + return proposals + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + vid_name = data_info['vid_name'] + feature_path = os.path.join(self.data_prefix['video'], + f'{vid_name}.pt') + vid_feature = torch.load(feature_path) + proposal_feats = [] + proposal_indexes = data_info['proposal_indexes'].clip( + max=vid_feature.shape[0] - 1) + for s, e in proposal_indexes: + prop_feature, _ = vid_feature[s:e + 1].max(dim=0) + proposal_feats.append(prop_feature) + + proposal_feats = torch.stack(proposal_feats) + + data_info['raw_feature'] = proposal_feats + return data_info diff --git 
a/mmaction/datasets/msrvtt_datasets.py b/mmaction/datasets/msrvtt_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..da249de2a4ceb011666a3edd36c612c96975df00 --- /dev/null +++ b/mmaction/datasets/msrvtt_datasets.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +import re +from collections import Counter +from typing import Dict, List + +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class MSRVTTVQA(BaseActionDataset): + """MSR-VTT Video Question Answering dataset.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + data_lines = json.load(f) + for data in data_lines: + answers = data['answer'] + if isinstance(answers, str): + answers = [answers] + count = Counter(answers) + answer_weight = [i / len(answers) for i in count.values()] + data_item = dict( + question_id=data['question_id'], + filename=osp.join(self.data_prefix['video'], + data['video']), + question=pre_text(data['question']), + gt_answer=list(count.keys()), + gt_answer_weight=answer_weight) + data_list.append(data_item) + + return data_list + + +@DATASETS.register_module() +class MSRVTTVQAMC(BaseActionDataset): + """MSR-VTT VQA multiple choices dataset.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + data_lines = json.load(f) + for data in data_lines: + data_item = dict( + filename=osp.join(self.data_prefix['video'], + data['video']), + label=data['answer'], + caption_options=[pre_text(c) for c in data['caption']]) + data_list.append(data_item) + + return data_list + + +@DATASETS.register_module() +class MSRVTTRetrieval(BaseActionDataset): + """MSR-VTT Retrieval dataset.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + data_lines = json.load(f) + video_idx = 0 + text_idx = 0 + for data in data_lines: + # don't consider multiple videos or multiple captions + video_path = osp.join(self.data_prefix['video'], data['video']) + data_item = dict( + filename=video_path, + text=[], + gt_video_id=[], + gt_text_id=[]) + if isinstance(data['caption'], str): + data['caption'] = [data['caption']] + + for text in data['caption']: + text = pre_text(text) + data_item['text'].append(text) + data_item['gt_video_id'].append(video_idx) + data_item['gt_text_id'].append(text_idx) + text_idx += 1 + + video_idx += 1 + data_list.append(data_item) + self.num_videos = video_idx + self.num_texts = text_idx + + return data_list + + +def pre_text(text, max_l=None): + text = re.sub(r"([,.'!?\"()*#:;~])", '', text.lower()) + text = text.replace('-', ' ').replace('/', + ' ').replace('', 'person') + + text = re.sub(r'\s{2,}', ' ', text) + text = text.rstrip('\n').strip(' ') + + if max_l: # truncate + words = text.split(' ') + if len(words) > max_l: + text = ' '.join(words[:max_l]) + return text diff --git a/mmaction/datasets/pose_dataset.py b/mmaction/datasets/pose_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4862ad12d4775c82693b8c13179c1795e334bbe1 --- /dev/null +++ b/mmaction/datasets/pose_dataset.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import os.path as osp +from typing import Callable, Dict, List, Optional, Union + +import mmengine +from mmengine.logging import MMLogger + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class PoseDataset(BaseActionDataset): + """Pose dataset for action recognition. + + The dataset loads pose and apply specified transforms to return a + dict containing pose information. + + The ann_file is a pickle file, the json file contains a list of + annotations, the fields of an annotation include frame_dir(video_id), + total_frames, label, kp, kpscore. + + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. + split (str, optional): The dataset split used. For UCF101 and + HMDB51, allowed choices are 'train1', 'test1', 'train2', + 'test2', 'train3', 'test3'. For NTURGB+D, allowed choices + are 'xsub_train', 'xsub_val', 'xview_train', 'xview_val'. + For NTURGB+D 120, allowed choices are 'xsub_train', + 'xsub_val', 'xset_train', 'xset_val'. For FineGYM, + allowed choices are 'train', 'val'. Defaults to None. + valid_ratio (float, optional): The valid_ratio for videos in + KineticsPose. For a video with n frames, it is a valid + training sample only if n * valid_ratio frames have human + pose. None means not applicable (only applicable to Kinetics + Pose).Defaults to None. + box_thr (float): The threshold for human proposals. Only boxes + with confidence score larger than `box_thr` is kept. None + means not applicable (only applicable to Kinetics). Allowed + choices are 0.5, 0.6, 0.7, 0.8, 0.9. Defaults to 0.5. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[Dict, Callable]], + split: Optional[str] = None, + valid_ratio: Optional[float] = None, + box_thr: float = 0.5, + **kwargs) -> None: + self.split = split + self.box_thr = box_thr + assert box_thr in [.5, .6, .7, .8, .9] + self.valid_ratio = valid_ratio + + super().__init__( + ann_file, pipeline=pipeline, modality='Pose', **kwargs) + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get skeleton information.""" + assert self.ann_file.endswith('.pkl') + mmengine.exists(self.ann_file) + data_list = mmengine.load(self.ann_file) + + if self.split is not None: + split, annos = data_list['split'], data_list['annotations'] + identifier = 'filename' if 'filename' in annos[0] else 'frame_dir' + split = set(split[self.split]) + data_list = [x for x in annos if x[identifier] in split] + + # Sometimes we may need to load video from the file + if 'video' in self.data_prefix: + for item in data_list: + if 'filename' in item: + item['filename'] = osp.join(self.data_prefix['video'], + item['filename']) + if 'frame_dir' in item: + item['frame_dir'] = osp.join(self.data_prefix['video'], + item['frame_dir']) + return data_list + + def filter_data(self) -> List[Dict]: + """Filter out invalid samples.""" + if self.valid_ratio is not None and isinstance( + self.valid_ratio, float) and self.valid_ratio > 0: + self.data_list = [ + x for x in self.data_list if x['valid'][self.box_thr] / + x['total_frames'] >= self.valid_ratio + ] + for item in self.data_list: + assert 'box_score' in item,\ + 'if valid_ratio is a positive number,' \ + 'item should have field `box_score`' + anno_inds = (item['box_score'] >= self.box_thr) + item['anno_inds'] = anno_inds + + logger = MMLogger.get_current_instance() + logger.info( + f'{len(self.data_list)} videos remain after valid thresholding') + + 
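+        # The filtering above only applies when ``valid_ratio`` is a positive
+        # float: a clip is kept if at least ``valid_ratio`` of its frames
+        # contain a person box scored above ``box_thr``, and the per-box
+        # validity mask is recorded in ``anno_inds``.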
return self.data_list + + def get_data_info(self, idx: int) -> Dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + + # Sometimes we may need to load skeleton from the file + if 'skeleton' in self.data_prefix: + identifier = 'filename' if 'filename' in data_info \ + else 'frame_dir' + ske_name = data_info[identifier] + ske_path = osp.join(self.data_prefix['skeleton'], + ske_name + '.pkl') + ske = mmengine.load(ske_path) + for k in ske: + data_info[k] = ske[k] + + return data_info diff --git a/mmaction/datasets/rawframe_dataset.py b/mmaction/datasets/rawframe_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c333446dfad882041e70149eb9efa5d936837104 --- /dev/null +++ b/mmaction/datasets/rawframe_dataset.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, List, Optional, Union + +from mmengine.fileio import exists, list_from_file + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + + +@DATASETS.register_module() +class RawframeDataset(BaseActionDataset): + """Rawframe dataset for action recognition. + + The dataset loads raw frames and apply specified transforms to return a + dict containing the frame tensors and other information. + + The ann_file is a text file with multiple lines, and each line indicates + the directory to frames of a video, total frames of the video and + the label of a video, which are split with a whitespace. + Example of a annotation file: + + .. code-block:: txt + + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + + Example of a multi-class annotation file: + + + .. code-block:: txt + + some/directory-1 163 1 3 5 + some/directory-2 122 1 2 + some/directory-3 258 2 + some/directory-4 234 2 4 6 8 + some/directory-5 295 3 + some/directory-6 121 3 + + Example of a with_offset annotation file (clips from long videos), each + line indicates the directory to frames of a video, the index of the start + frame, total frames of the video clip and the label of a video clip, which + are split with a whitespace. + + + .. code-block:: txt + + some/directory-1 12 163 3 + some/directory-2 213 122 4 + some/directory-3 100 258 5 + some/directory-4 98 234 2 + some/directory-5 0 295 3 + some/directory-6 50 121 3 + + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict): Path to a directory where video + frames are held. Defaults to ``dict(img='')``. + filename_tmpl (str): Template for each filename. + Defaults to ``img_{:05}.jpg``. + with_offset (bool): Determines whether the offset information is in + ann_file. Defaults to False. + multi_class (bool): Determines whether it is a multi-class + recognition dataset. Defaults to False. + num_classes (int, optional): Number of classes in the dataset. + Defaults to None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking frames as input, + it should be set to 1, since raw frames count from 1. + Defaults to 1. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. 
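+    Examples:
+        A minimal, illustrative config for this dataset (the paths, the
+        annotation file and ``train_pipeline`` below are placeholders):
+
+        .. code-block:: python
+
+            dataset = dict(
+                type='RawframeDataset',
+                ann_file='some/annotation_file.txt',
+                data_prefix=dict(img='some/rawframes_root'),
+                filename_tmpl='img_{:05}.jpg',
+                pipeline=train_pipeline)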
+ """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[ConfigType, Callable]], + data_prefix: ConfigType = dict(img=''), + filename_tmpl: str = 'img_{:05}.jpg', + with_offset: bool = False, + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 1, + modality: str = 'RGB', + test_mode: bool = False, + **kwargs) -> None: + self.filename_tmpl = filename_tmpl + self.with_offset = with_offset + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + multi_class=multi_class, + num_classes=num_classes, + start_index=start_index, + modality=modality, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + video_info = {} + idx = 0 + # idx for frame_dir + frame_dir = line_split[idx] + if self.data_prefix['img'] is not None: + frame_dir = osp.join(self.data_prefix['img'], frame_dir) + video_info['frame_dir'] = frame_dir + idx += 1 + if self.with_offset: + # idx for offset and total_frames + video_info['offset'] = int(line_split[idx]) + video_info['total_frames'] = int(line_split[idx + 1]) + idx += 2 + else: + # idx for total_frames + video_info['total_frames'] = int(line_split[idx]) + idx += 1 + # idx for label[s] + label = [int(x) for x in line_split[idx:]] + # add fake label for inference datalist without label + if not label: + label = [-1] + if self.multi_class: + assert self.num_classes is not None + video_info['label'] = label + else: + assert len(label) == 1 + video_info['label'] = label[0] + data_list.append(video_info) + + return data_list + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + data_info['filename_tmpl'] = self.filename_tmpl + return data_info diff --git a/mmaction/datasets/repeat_aug_dataset.py b/mmaction/datasets/repeat_aug_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..cd1e3a4c0876171a4b859356fabd7358e04c9973 --- /dev/null +++ b/mmaction/datasets/repeat_aug_dataset.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from typing import Any, Callable, List, Optional, Sequence, Union + +import numpy as np +from mmengine.dataset import COLLATE_FUNCTIONS, pseudo_collate + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .video_dataset import VideoDataset + + +def get_type(transform: Union[dict, Callable]) -> str: + """get the type of the transform.""" + if isinstance(transform, dict) and 'type' in transform: + return transform['type'] + elif callable(transform): + return transform.__repr__().split('(')[0] + else: + raise TypeError + + +@DATASETS.register_module() +class RepeatAugDataset(VideoDataset): + """Video dataset for action recognition use repeat augment. + https://arxiv.org/pdf/1901.09335.pdf. + + The dataset loads raw videos and apply specified transforms to return a + dict containing the frame tensors and other information. + + The ann_file is a text file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a + whitespace. Example of a annotation file: + + .. 
code-block:: txt + + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict): Path to a directory where videos + are held. Defaults to ``dict(video='')``. + num_repeats (int): Number of repeat time of one video in a batch. + Defaults to 4. + sample_once (bool): Determines whether use same frame index for + repeat samples. Defaults to False. + multi_class (bool): Determines whether the dataset is a multi-class + dataset. Defaults to False. + num_classes (int, optional): Number of classes of the dataset, used in + multi-class datasets. Defaults to None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking videos as input, + it should be set to 0, since frames loaded from videos count + from 0. Defaults to 0. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + data_prefix: ConfigType = dict(video=''), + num_repeats: int = 4, + sample_once: bool = False, + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 0, + modality: str = 'RGB', + **kwargs) -> None: + + use_decord = get_type(pipeline[0]) == 'DecordInit' and \ + get_type(pipeline[2]) == 'DecordDecode' + + assert use_decord, ( + 'RepeatAugDataset requires decord as the video ' + 'loading backend, will support more backends in the ' + 'future') + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + multi_class=multi_class, + num_classes=num_classes, + start_index=start_index, + modality=modality, + test_mode=False, + **kwargs) + self.num_repeats = num_repeats + self.sample_once = sample_once + + def prepare_data(self, idx) -> List[dict]: + """Get data processed by ``self.pipeline``. + + Reduce the video loading and decompressing. + Args: + idx (int): The index of ``data_info``. + Returns: + List[dict]: A list of length num_repeats. 
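+        Example of a pipeline layout that satisfies the check in
+        ``__init__`` (the transform arguments are illustrative):
+
+        .. code-block:: python
+
+            pipeline = [
+                dict(type='DecordInit'),
+                dict(type='SampleFrames', clip_len=16, frame_interval=4,
+                     num_clips=1),
+                dict(type='DecordDecode'),
+                dict(type='Resize', scale=(224, 224), keep_ratio=False),
+                dict(type='FormatShape', input_format='NCTHW'),
+                dict(type='PackActionInputs')
+            ]
+
+        With ``num_repeats=4``, the video is opened and decoded only once,
+        while frame indices are sampled 4 times (independently unless
+        ``sample_once=True``), so the returned list holds 4 augmented views
+        of the same video.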
+ """ + transforms = self.pipeline.transforms + + data_info = self.get_data_info(idx) + data_info = transforms[0](data_info) # DecordInit + + frame_inds_list, frame_inds_length = [], [0] + + fake_data_info = dict( + total_frames=data_info['total_frames'], + start_index=data_info['start_index']) + + if not self.sample_once: + for repeat in range(self.num_repeats): + data_info_ = transforms[1](fake_data_info) # SampleFrames + frame_inds = data_info_['frame_inds'] + frame_inds_list.append(frame_inds.reshape(-1)) + frame_inds_length.append(frame_inds.size + + frame_inds_length[-1]) + else: + data_info_ = transforms[1](fake_data_info) # SampleFrames + frame_inds = data_info_['frame_inds'] + for repeat in range(self.num_repeats): + frame_inds_list.append(frame_inds.reshape(-1)) + frame_inds_length.append(frame_inds.size + + frame_inds_length[-1]) + + for key in data_info_: + data_info[key] = data_info_[key] + + data_info['frame_inds'] = np.concatenate(frame_inds_list) + + data_info = transforms[2](data_info) # DecordDecode + imgs = data_info.pop('imgs') + + data_info_list = [] + for repeat in range(self.num_repeats): + data_info_ = deepcopy(data_info) + start = frame_inds_length[repeat] + end = frame_inds_length[repeat + 1] + data_info_['imgs'] = imgs[start:end] + for transform in transforms[3:]: + data_info_ = transform(data_info_) + data_info_list.append(data_info_) + del imgs + return data_info_list + + +@COLLATE_FUNCTIONS.register_module() +def repeat_pseudo_collate(data_batch: Sequence) -> Any: + data_batch = [i for j in data_batch for i in j] + return pseudo_collate(data_batch) diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3205ca7425d5ff6d41c7776a3b6f3308e353580b --- /dev/null +++ b/mmaction/datasets/transforms/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .formatting import (FormatAudioShape, FormatGCNInput, FormatShape, + PackActionInputs, PackLocalizationInputs, Transpose) +from .loading import (ArrayDecode, AudioFeatureSelector, BuildPseudoClip, + DecordDecode, DecordInit, DenseSampleFrames, + GenerateLocalizationLabels, ImageDecode, + LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature, + LoadProposals, LoadRGBFromFile, OpenCVDecode, OpenCVInit, + PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector, + PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, + UniformSample, UntrimmedSampleFrames) +from .pose_transforms import (DecompressPose, GeneratePoseTarget, GenSkeFeat, + JointToBone, MergeSkeFeat, MMCompact, MMDecode, + MMUniformSampleFrames, PadTo, PoseCompact, + PoseDecode, PreNormalize2D, PreNormalize3D, + ToMotion, UniformSampleFrames) +from .processing import (CenterCrop, ColorJitter, Flip, Fuse, MultiScaleCrop, + RandomCrop, RandomRescale, RandomResizedCrop, Resize, + TenCrop, ThreeCrop) +from .text_transforms import CLIPTokenize +from .wrappers import ImgAug, PytorchVideoWrapper, TorchVisionWrapper + +__all__ = [ + 'ArrayDecode', 'AudioFeatureSelector', 'BuildPseudoClip', 'CenterCrop', + 'ColorJitter', 'DecordDecode', 'DecordInit', 'DecordInit', + 'DenseSampleFrames', 'Flip', 'FormatAudioShape', 'FormatGCNInput', + 'FormatShape', 'Fuse', 'GenSkeFeat', 'GenerateLocalizationLabels', + 'GeneratePoseTarget', 'ImageDecode', 'ImgAug', 'JointToBone', + 'LoadAudioFeature', 'LoadHVULabel', 'DecompressPose', + 'LoadLocalizationFeature', 'LoadProposals', 'LoadRGBFromFile', + 'MergeSkeFeat', 'MultiScaleCrop', 'OpenCVDecode', 'OpenCVInit', + 'OpenCVInit', 'PIMSDecode', 'PIMSInit', 'PackActionInputs', + 'PackLocalizationInputs', 'PadTo', 'PoseCompact', 'PoseDecode', + 'PreNormalize2D', 'PreNormalize3D', 'PyAVDecode', 'PyAVDecodeMotionVector', + 'PyAVInit', 'PyAVInit', 'PytorchVideoWrapper', 'RandomCrop', + 'RandomRescale', 'RandomResizedCrop', 'RawFrameDecode', 'Resize', + 'SampleAVAFrames', 'SampleFrames', 'TenCrop', 'ThreeCrop', 'ToMotion', + 'TorchVisionWrapper', 'Transpose', 'UniformSample', 'UniformSampleFrames', + 'UntrimmedSampleFrames', 'MMUniformSampleFrames', 'MMDecode', 'MMCompact', + 'CLIPTokenize' +] diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..7616defe89bb7d6d84cc4a4b6e4631043c8d4335 --- /dev/null +++ b/mmaction/datasets/transforms/formatting.py @@ -0,0 +1,451 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence, Tuple + +import numpy as np +import torch +from mmcv.transforms import BaseTransform, to_tensor +from mmengine.structures import InstanceData + +from mmaction.registry import TRANSFORMS +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class PackActionInputs(BaseTransform): + """Pack the inputs data. + + Args: + collect_keys (tuple[str], optional): The keys to be collected + to ``packed_results['inputs']``. Defaults to `` + meta_keys (Sequence[str]): The meta keys to saved in the + `metainfo` of the `data_sample`. + Defaults to ``('img_shape', 'img_key', 'video_id', 'timestamp')``. + algorithm_keys (Sequence[str]): The keys of custom elements to be used + in the algorithm. Defaults to an empty tuple. 
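+    Examples:
+        A rough sketch of the packed output for a recognition sample
+        (the array shapes and keys below are illustrative):
+
+        .. code-block:: python
+
+            results = dict(
+                imgs=np.random.rand(8, 224, 224, 3),
+                label=[1],
+                img_shape=(224, 224))
+            packed = PackActionInputs()(results)
+            # packed['inputs']:       tensor of shape (8, 224, 224, 3)
+            # packed['data_samples']: ActionDataSample carrying the label,
+            #                         with 'img_shape' in its metainfo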
+ """ + + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_labels': 'labels', + } + + def __init__( + self, + collect_keys: Optional[Tuple[str]] = None, + meta_keys: Sequence[str] = ('img_shape', 'img_key', 'video_id', + 'timestamp'), + algorithm_keys: Sequence[str] = (), + ) -> None: + self.collect_keys = collect_keys + self.meta_keys = meta_keys + self.algorithm_keys = algorithm_keys + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PackActionInputs`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + packed_results = dict() + if self.collect_keys is not None: + packed_results['inputs'] = dict() + for key in self.collect_keys: + packed_results['inputs'][key] = to_tensor(results[key]) + else: + if 'imgs' in results: + imgs = results['imgs'] + packed_results['inputs'] = to_tensor(imgs) + elif 'heatmap_imgs' in results: + heatmap_imgs = results['heatmap_imgs'] + packed_results['inputs'] = to_tensor(heatmap_imgs) + elif 'keypoint' in results: + keypoint = results['keypoint'] + packed_results['inputs'] = to_tensor(keypoint) + elif 'audios' in results: + audios = results['audios'] + packed_results['inputs'] = to_tensor(audios) + elif 'text' in results: + text = results['text'] + packed_results['inputs'] = to_tensor(text) + else: + raise ValueError( + 'Cannot get `imgs`, `keypoint`, `heatmap_imgs`, ' + '`audios` or `text` in the input dict of ' + '`PackActionInputs`.') + + data_sample = ActionDataSample() + + if 'gt_bboxes' in results: + instance_data = InstanceData() + for key in self.mapping_table.keys(): + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + + if 'proposals' in results: + data_sample.proposals = InstanceData( + bboxes=to_tensor(results['proposals'])) + + if 'label' in results: + data_sample.set_gt_label(results['label']) + + # Set custom algorithm keys + for key in self.algorithm_keys: + if key in results: + data_sample.set_field(results[key], key) + + # Set meta keys + img_meta = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(collect_keys={self.collect_keys}, ' + repr_str += f'meta_keys={self.meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class PackLocalizationInputs(BaseTransform): + + def __init__(self, keys=(), meta_keys=('video_name', )): + self.keys = keys + self.meta_keys = meta_keys + + def transform(self, results): + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_samples' (obj:`DetDataSample`): The annotation info of the + sample. + """ + packed_results = dict() + if 'raw_feature' in results: + raw_feature = results['raw_feature'] + packed_results['inputs'] = to_tensor(raw_feature) + elif 'bsp_feature' in results: + packed_results['inputs'] = torch.tensor(0.) 
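+                # A scalar placeholder is packed as ``inputs`` for BSP
+                # features; the actual ``bsp_feature`` tensor is expected to
+                # be routed into the data sample through ``self.keys`` below.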
+ else: + raise ValueError( + 'Cannot get "raw_feature" or "bsp_feature" in the input ' + 'dict of `PackActionInputs`.') + + data_sample = ActionDataSample() + for key in self.keys: + if key not in results: + continue + elif key == 'proposals': + instance_data = InstanceData() + instance_data[key] = to_tensor(results[key]) + data_sample.proposals = instance_data + else: + if hasattr(data_sample, 'gt_instances'): + data_sample.gt_instances[key] = to_tensor(results[key]) + else: + instance_data = InstanceData() + instance_data[key] = to_tensor(results[key]) + data_sample.gt_instances = instance_data + + img_meta = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class Transpose(BaseTransform): + """Transpose image channels to a given order. + + Args: + keys (Sequence[str]): Required keys to be converted. + order (Sequence[int]): Image channel order. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def transform(self, results): + """Performs the Transpose formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, order={self.order})') + + +@TRANSFORMS.register_module() +class FormatShape(BaseTransform): + """Format final imgs shape to the given input_format. + + Required keys: + + - imgs (optional) + - heatmap_imgs (optional) + - modality (optional) + - num_clips + - clip_len + + Modified Keys: + + - imgs + + Added Keys: + + - input_shape + - heatmap_input_shape (optional) + + Args: + input_format (str): Define the final data format. + collapse (bool): To collapse input_format N... to ... (NCTHW to CTHW, + etc.) if N is 1. Should be set as True when training and testing + detectors. Defaults to False. + """ + + def __init__(self, input_format: str, collapse: bool = False) -> None: + self.input_format = input_format + self.collapse = collapse + if self.input_format not in [ + 'NCTHW', 'NCHW', 'NCTHW_Heatmap', 'NPTCHW' + ]: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def transform(self, results: Dict) -> Dict: + """Performs the FormatShape formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
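+        Example (illustrative, ``input_format='NCTHW'``): with 2 spatial
+        crops, ``num_clips=3`` and ``clip_len=8``, ``imgs`` arrives as
+        ``(2 * 3 * 8, H, W, C)`` and is reshaped to ``(6, C, 8, H, W)``,
+        i.e. one ``C x T x H x W`` volume per crop/clip pair.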
+ """ + if not isinstance(results['imgs'], np.ndarray): + results['imgs'] = np.array(results['imgs']) + + # [M x H x W x C] + # M = 1 * N_crops * N_clips * T + if self.collapse: + assert results['num_clips'] == 1 + + if self.input_format == 'NCTHW': + if 'imgs' in results: + imgs = results['imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + if isinstance(clip_len, dict): + clip_len = clip_len['RGB'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x H x W x C + imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + if 'heatmap_imgs' in results: + imgs = results['heatmap_imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + # clip_len must be a dict + clip_len = clip_len['Pose'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['heatmap_imgs'] = imgs + results['heatmap_input_shape'] = imgs.shape + + elif self.input_format == 'NCTHW_Heatmap': + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + elif self.input_format == 'NCHW': + imgs = results['imgs'] + imgs = np.transpose(imgs, (0, 3, 1, 2)) + if 'modality' in results and results['modality'] == 'Flow': + clip_len = results['clip_len'] + imgs = imgs.reshape((-1, clip_len * imgs.shape[1]) + + imgs.shape[2:]) + # M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + elif self.input_format == 'NPTCHW': + num_proposals = results['num_proposals'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + imgs = imgs.reshape((num_proposals, num_clips * clip_len) + + imgs.shape[1:]) + # P x M x H x W x C + # M = N_clips x T + imgs = np.transpose(imgs, (0, 1, 4, 2, 3)) + # P x M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + if self.collapse: + assert results['imgs'].shape[0] == 1 + results['imgs'] = results['imgs'].squeeze(0) + results['input_shape'] = results['imgs'].shape + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f"(input_format='{self.input_format}')" + return repr_str + + +@TRANSFORMS.register_module() +class FormatAudioShape(BaseTransform): + """Format final audio shape to the given input_format. + + Required Keys: + + - audios + + Modified Keys: + + - audios + + Added Keys: + + - input_shape + + Args: + input_format (str): Define the final imgs format. + """ + + def __init__(self, input_format: str) -> None: + self.input_format = input_format + if self.input_format not in ['NCTF']: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def transform(self, results: Dict) -> Dict: + """Performs the FormatShape formatting. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + audios = results['audios'] + # clip x sample x freq -> clip x channel x sample x freq + clip, sample, freq = audios.shape + audios = audios.reshape(clip, 1, sample, freq) + results['audios'] = audios + results['input_shape'] = audios.shape + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f"(input_format='{self.input_format}')" + return repr_str + + +@TRANSFORMS.register_module() +class FormatGCNInput(BaseTransform): + """Format final skeleton shape. + + Required Keys: + + - keypoint + - keypoint_score (optional) + - num_clips (optional) + + Modified Key: + + - keypoint + + Args: + num_person (int): The maximum number of people. Defaults to 2. + mode (str): The padding mode. Defaults to ``'zero'``. + """ + + def __init__(self, num_person: int = 2, mode: str = 'zero') -> None: + self.num_person = num_person + assert mode in ['zero', 'loop'] + self.mode = mode + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`FormatGCNInput`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + keypoint = results['keypoint'] + if 'keypoint_score' in results: + keypoint = np.concatenate( + (keypoint, results['keypoint_score'][..., None]), axis=-1) + + cur_num_person = keypoint.shape[0] + if cur_num_person < self.num_person: + pad_dim = self.num_person - cur_num_person + pad = np.zeros( + (pad_dim, ) + keypoint.shape[1:], dtype=keypoint.dtype) + keypoint = np.concatenate((keypoint, pad), axis=0) + if self.mode == 'loop' and cur_num_person == 1: + for i in range(1, self.num_person): + keypoint[i] = keypoint[0] + + elif cur_num_person > self.num_person: + keypoint = keypoint[:self.num_person] + + M, T, V, C = keypoint.shape + nc = results.get('num_clips', 1) + assert T % nc == 0 + keypoint = keypoint.reshape( + (M, nc, T // nc, V, C)).transpose(1, 0, 2, 3, 4) + + results['keypoint'] = np.ascontiguousarray(keypoint) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'num_person={self.num_person}, ' + f'mode={self.mode})') + return repr_str diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..2bf349bdf5f22f6a99224e05253d6ee855d41ee6 --- /dev/null +++ b/mmaction/datasets/transforms/loading.py @@ -0,0 +1,1929 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +import io +import os +import os.path as osp +import shutil +from typing import Dict, List, Optional, Union + +import mmcv +import numpy as np +import torch +from mmcv.transforms import BaseTransform +from mmengine.fileio import FileClient + +from mmaction.registry import TRANSFORMS +from mmaction.utils import get_random_string, get_shm_dir, get_thread_id + + +@TRANSFORMS.register_module() +class LoadRGBFromFile(BaseTransform): + """Load a RGB image from file. + + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:``mmcv.imfrombytes``. + Defaults to 'color'. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. 
+ See :func:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + io_backend (str): io backend where frames are store. + Default: 'disk'. + ignore_empty (bool): Whether to allow loading empty image or file path + not existent. Defaults to False. + kwargs (dict): Args for file client. + """ + + def __init__(self, + to_float32: bool = False, + color_type: str = 'color', + imdecode_backend: str = 'cv2', + io_backend: str = 'disk', + ignore_empty: bool = False, + **kwargs) -> None: + self.ignore_empty = ignore_empty + self.to_float32 = to_float32 + self.color_type = color_type + self.imdecode_backend = imdecode_backend + self.file_client = FileClient(io_backend, **kwargs) + self.io_backend = io_backend + + def transform(self, results: dict) -> dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + try: + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes( + img_bytes, + flag=self.color_type, + channel_order='rgb', + backend=self.imdecode_backend) + except Exception as e: + if self.ignore_empty: + return None + else: + raise e + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'ignore_empty={self.ignore_empty}, ' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f"imdecode_backend='{self.imdecode_backend}', " + f"io_backend='{self.io_backend}')") + return repr_str + + +@TRANSFORMS.register_module() +class LoadHVULabel(BaseTransform): + """Convert the HVU label from dictionaries to torch tensors. + + Required keys are "label", "categories", "category_nums", added or modified + keys are "label", "mask" and "category_mask". + """ + + def __init__(self, **kwargs): + self.hvu_initialized = False + self.kwargs = kwargs + + def init_hvu_info(self, categories, category_nums): + """Initialize hvu information.""" + assert len(categories) == len(category_nums) + self.categories = categories + self.category_nums = category_nums + self.num_categories = len(self.categories) + self.num_tags = sum(self.category_nums) + self.category2num = dict(zip(categories, category_nums)) + self.start_idx = [0] + for i in range(self.num_categories - 1): + self.start_idx.append(self.start_idx[-1] + self.category_nums[i]) + self.category2startidx = dict(zip(categories, self.start_idx)) + self.hvu_initialized = True + + def transform(self, results): + """Convert the label dictionary to 3 tensors: "label", "mask" and + "category_mask". + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + if not self.hvu_initialized: + self.init_hvu_info(results['categories'], results['category_nums']) + + onehot = torch.zeros(self.num_tags) + onehot_mask = torch.zeros(self.num_tags) + category_mask = torch.zeros(self.num_categories) + + for category, tags in results['label'].items(): + # skip if not training on this category + if category not in self.categories: + continue + category_mask[self.categories.index(category)] = 1. + start_idx = self.category2startidx[category] + category_num = self.category2num[category] + tags = [idx + start_idx for idx in tags] + onehot[tags] = 1. + onehot_mask[start_idx:category_num + start_idx] = 1. 
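+        # Each category owns a contiguous block of tag indices: ``onehot``
+        # marks the positive tags, ``onehot_mask`` marks every tag of the
+        # categories annotated for this sample, and ``category_mask`` marks
+        # those categories themselves.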
+ + results['label'] = onehot + results['mask'] = onehot_mask + results['category_mask'] = category_mask + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'hvu_initialized={self.hvu_initialized})') + return repr_str + + +@TRANSFORMS.register_module() +class SampleFrames(BaseTransform): + """Sample frames from the video. + + Required Keys: + + - total_frames + - start_index + + Added Keys: + + - frame_inds + - frame_interval + - num_clips + + Args: + clip_len (int): Frames of each sampled output clip. + frame_interval (int): Temporal interval of adjacent sampled frames. + Defaults to 1. + num_clips (int): Number of clips to be sampled. Default: 1. + temporal_jitter (bool): Whether to apply temporal jittering. + Defaults to False. + twice_sample (bool): Whether to use twice sample when testing. + If set to True, it will sample frames with and without fixed shift, + which is commonly used for testing in TSM model. Defaults to False. + out_of_bound_opt (str): The way to deal with out of bounds frame + indexes. Available options are 'loop', 'repeat_last'. + Defaults to 'loop'. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + keep_tail_frames (bool): Whether to keep tail frames when sampling. + Defaults to False. + target_fps (optional, int): Convert input videos with arbitrary frame + rates to the unified target FPS before sampling frames. If + ``None``, the frame rate will not be adjusted. Defaults to + ``None``. + """ + + def __init__(self, + clip_len: int, + frame_interval: int = 1, + num_clips: int = 1, + temporal_jitter: bool = False, + twice_sample: bool = False, + out_of_bound_opt: str = 'loop', + test_mode: bool = False, + keep_tail_frames: bool = False, + target_fps: Optional[int] = None, + **kwargs) -> None: + + self.clip_len = clip_len + self.frame_interval = frame_interval + self.num_clips = num_clips + self.temporal_jitter = temporal_jitter + self.twice_sample = twice_sample + self.out_of_bound_opt = out_of_bound_opt + self.test_mode = test_mode + self.keep_tail_frames = keep_tail_frames + self.target_fps = target_fps + assert self.out_of_bound_opt in ['loop', 'repeat_last'] + + def _get_train_clips(self, num_frames: int, + ori_clip_len: float) -> np.array: + """Get clip offsets in train mode. + + It will calculate the average interval for selected frames, + and randomly shift them within offsets between [0, avg_interval]. + If the total number of frames is smaller than clips num or origin + frames length, it will return all zero indices. + + Args: + num_frames (int): Total number of frame in the video. + ori_clip_len (float): length of original sample clip. + + Returns: + np.ndarray: Sampled frame indices in train mode. 
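+        Example (illustrative, with the default ``keep_tail_frames=False``):
+        for ``num_frames=60``, ``clip_len=8``, ``frame_interval=2`` and
+        ``num_clips=4``, ``ori_clip_len`` is 16, the average interval is
+        ``(60 - 16 + 1) // 4 = 11``, the base offsets are
+        ``[0, 11, 22, 33]``, and each offset is shifted by a random value
+        in ``[0, 11)``.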
+ """ + + if self.keep_tail_frames: + avg_interval = (num_frames - ori_clip_len + 1) / float( + self.num_clips) + if num_frames > ori_clip_len - 1: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = (base_offsets + np.random.uniform( + 0, avg_interval, self.num_clips)).astype(np.int32) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int32) + else: + avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips + + if avg_interval > 0: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + np.random.randint( + avg_interval, size=self.num_clips) + elif num_frames > max(self.num_clips, ori_clip_len): + clip_offsets = np.sort( + np.random.randint( + num_frames - ori_clip_len + 1, size=self.num_clips)) + elif avg_interval == 0: + ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips + clip_offsets = np.around(np.arange(self.num_clips) * ratio) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int32) + + return clip_offsets + + def _get_test_clips(self, num_frames: int, + ori_clip_len: float) -> np.array: + """Get clip offsets in test mode. + + If the total number of frames is + not enough, it will return all zero indices. + + Args: + num_frames (int): Total number of frame in the video. + ori_clip_len (float): length of original sample clip. + + Returns: + np.ndarray: Sampled frame indices in test mode. + """ + if self.clip_len == 1: # 2D recognizer + # assert self.frame_interval == 1 + avg_interval = num_frames / float(self.num_clips) + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + avg_interval / 2.0 + if self.twice_sample: + clip_offsets = np.concatenate([clip_offsets, base_offsets]) + else: # 3D recognizer + max_offset = max(num_frames - ori_clip_len, 0) + if self.twice_sample: + num_clips = self.num_clips * 2 + else: + num_clips = self.num_clips + if num_clips > 1: + num_segments = self.num_clips - 1 + # align test sample strategy with `PySlowFast` repo + if self.target_fps is not None: + offset_between = np.floor(max_offset / float(num_segments)) + clip_offsets = np.arange(num_clips) * offset_between + else: + offset_between = max_offset / float(num_segments) + clip_offsets = np.arange(num_clips) * offset_between + clip_offsets = np.round(clip_offsets) + else: + clip_offsets = np.array([max_offset // 2]) + return clip_offsets + + def _sample_clips(self, num_frames: int, ori_clip_len: float) -> np.array: + """Choose clip offsets for the video in a given mode. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices. + """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames, ori_clip_len) + else: + clip_offsets = self._get_train_clips(num_frames, ori_clip_len) + + return clip_offsets + + def _get_ori_clip_len(self, fps_scale_ratio: float) -> float: + """calculate length of clip segment for different strategy. + + Args: + fps_scale_ratio (float): Scale ratio to adjust fps. + """ + if self.target_fps is not None: + # align test sample strategy with `PySlowFast` repo + ori_clip_len = self.clip_len * self.frame_interval + ori_clip_len = np.maximum(1, ori_clip_len * fps_scale_ratio) + elif self.test_mode: + ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 + else: + ori_clip_len = self.clip_len * self.frame_interval + + return ori_clip_len + + def transform(self, results: dict) -> dict: + """Perform the SampleFrames loading. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + total_frames = results['total_frames'] + # if can't get fps, same value of `fps` and `target_fps` + # will perform nothing + fps = results.get('avg_fps') + if self.target_fps is None or not fps: + fps_scale_ratio = 1.0 + else: + fps_scale_ratio = fps / self.target_fps + ori_clip_len = self._get_ori_clip_len(fps_scale_ratio) + clip_offsets = self._sample_clips(total_frames, ori_clip_len) + + if self.target_fps: + frame_inds = clip_offsets[:, None] + np.linspace( + 0, ori_clip_len - 1, self.clip_len).astype(np.int32) + else: + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'twice_sample={self.twice_sample}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@TRANSFORMS.register_module() +class UniformSample(BaseTransform): + """Uniformly sample frames from the video. + + Modified from https://github.com/facebookresearch/SlowFast/blob/64a + bcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. + + To sample an n-frame clip from the video. UniformSample basically + divides the video into n segments of equal length and randomly samples one + frame from each segment. + + Required keys: + + - total_frames + - start_index + + Added keys: + + - frame_inds + - clip_len + - frame_interval + - num_clips + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Defaults to 1. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + """ + + def __init__(self, + clip_len: int, + num_clips: int = 1, + test_mode: bool = False) -> None: + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def _get_sample_clips(self, num_frames: int) -> np.ndarray: + """To sample an n-frame clip from the video. UniformSample basically + divides the video into n segments of equal length and randomly samples + one frame from each segment. When the duration of video frames is + shorter than the desired length of the target clip, this approach will + duplicate the sampled frame instead of looping the sample in "loop" + mode. 
In the test mode, when we need to sample multiple clips, + specifically 'n' clips, this method will further divide the segments + based on the number of clips to be sampled. The 'i-th' clip will. + + sample the frame located at the position 'i * len(segment) / n' + within the segment. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + seq (np.ndarray): the indexes of frames of sampled from the video. + """ + seg_size = float(num_frames - 1) / self.clip_len + inds = [] + if not self.test_mode: + for i in range(self.clip_len): + start = int(np.round(seg_size * i)) + end = int(np.round(seg_size * (i + 1))) + inds.append(np.random.randint(start, end + 1)) + else: + duration = seg_size / (self.num_clips + 1) + for k in range(self.num_clips): + for i in range(self.clip_len): + start = int(np.round(seg_size * i)) + frame_index = start + int(duration * (k + 1)) + inds.append(frame_index) + + return np.array(inds) + + def transform(self, results: Dict) -> Dict: + """Perform the Uniform Sampling. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + num_frames = results['total_frames'] + + inds = self._get_sample_clips(num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}') + return repr_str + + +@TRANSFORMS.register_module() +class UntrimmedSampleFrames(BaseTransform): + """Sample frames from the untrimmed video. + + Required keys are "filename", "total_frames", added or modified keys are + "frame_inds", "clip_interval" and "num_clips". + + Args: + clip_len (int): The length of sampled clips. Defaults to 1. + clip_interval (int): Clip interval of adjacent center of sampled + clips. Defaults to 16. + frame_interval (int): Temporal interval of adjacent sampled frames. + Defaults to 1. + """ + + def __init__(self, clip_len=1, clip_interval=16, frame_interval=1): + self.clip_len = clip_len + self.clip_interval = clip_interval + self.frame_interval = frame_interval + + def transform(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
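+        Example (illustrative): with ``total_frames=100``, ``clip_len=1``,
+        ``clip_interval=16`` and ``frame_interval=1``, the clip centers are
+        ``[8, 24, 40, 56, 72, 88]``, so ``num_clips`` is 6 and each clip
+        samples the single frame at its center (shifted by ``start_index``).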
+ """ + total_frames = results['total_frames'] + start_index = results['start_index'] + + clip_centers = np.arange(self.clip_interval // 2, total_frames, + self.clip_interval) + num_clips = clip_centers.shape[0] + frame_inds = clip_centers[:, None] + np.arange( + -(self.clip_len // 2 * self.frame_interval), + self.frame_interval * + (self.clip_len - + (self.clip_len // 2)), self.frame_interval)[None, :] + # clip frame_inds to legal range + frame_inds = np.clip(frame_inds, 0, total_frames - 1) + + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['clip_interval'] = self.clip_interval + results['frame_interval'] = self.frame_interval + results['num_clips'] = num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'clip_interval={self.clip_interval}, ' + f'frame_interval={self.frame_interval})') + return repr_str + + +@TRANSFORMS.register_module() +class DenseSampleFrames(SampleFrames): + """Select frames from the video by dense sample strategy. + + Required keys: + + - total_frames + - start_index + + Added keys: + + - frame_inds + - clip_len + - frame_interval + - num_clips + + Args: + clip_len (int): Frames of each sampled output clip. + frame_interval (int): Temporal interval of adjacent sampled frames. + Defaults to 1. + num_clips (int): Number of clips to be sampled. Defaults to 1. + sample_range (int): Total sample range for dense sample. + Defaults to 64. + num_sample_positions (int): Number of sample start positions, Which is + only used in test mode. Defaults to 10. That is to say, by default, + there are at least 10 clips for one input sample in test mode. + temporal_jitter (bool): Whether to apply temporal jittering. + Defaults to False. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + """ + + def __init__(self, + *args, + sample_range: int = 64, + num_sample_positions: int = 10, + **kwargs): + super().__init__(*args, **kwargs) + self.sample_range = sample_range + self.num_sample_positions = num_sample_positions + + def _get_train_clips(self, num_frames: int) -> np.array: + """Get clip offsets by dense sample strategy in train mode. + + It will calculate a sample position and sample interval and set + start index 0 when sample_pos == 1 or randomly choose from + [0, sample_pos - 1]. Then it will shift the start index by each + base offset. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices in train mode. + """ + sample_position = max(1, 1 + num_frames - self.sample_range) + interval = self.sample_range // self.num_clips + start_idx = 0 if sample_position == 1 else np.random.randint( + 0, sample_position - 1) + base_offsets = np.arange(self.num_clips) * interval + clip_offsets = (base_offsets + start_idx) % num_frames + return clip_offsets + + def _get_test_clips(self, num_frames: int) -> np.array: + """Get clip offsets by dense sample strategy in test mode. + + It will calculate a sample position and sample interval and evenly + sample several start indexes as start positions between + [0, sample_position-1]. Then it will shift each start index by the + base offsets. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices in train mode. 
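+        Example (illustrative): with ``num_frames=127``, ``sample_range=64``,
+        ``num_clips=8`` and ``num_sample_positions=10``, the sample position
+        is ``max(1, 1 + 127 - 64) = 64`` and the interval is ``64 // 8 = 8``;
+        the ten start indexes ``[0, 7, 14, ..., 63]`` each contribute 8
+        offsets, giving 80 clip offsets in total.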
+ """ + sample_position = max(1, 1 + num_frames - self.sample_range) + interval = self.sample_range // self.num_clips + start_list = np.linspace( + 0, sample_position - 1, num=self.num_sample_positions, dtype=int) + base_offsets = np.arange(self.num_clips) * interval + clip_offsets = list() + for start_idx in start_list: + clip_offsets.extend((base_offsets + start_idx) % num_frames) + clip_offsets = np.array(clip_offsets) + return clip_offsets + + def _sample_clips(self, num_frames: int) -> np.array: + """Choose clip offsets for the video in a given mode. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices. + """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames) + else: + clip_offsets = self._get_train_clips(num_frames) + + return clip_offsets + + def transform(self, results: dict) -> dict: + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + total_frames = results['total_frames'] + + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'sample_range={self.sample_range}, ' + f'num_sample_positions={self.num_sample_positions}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@TRANSFORMS.register_module() +class SampleAVAFrames(SampleFrames): + + def __init__(self, clip_len, frame_interval=2, test_mode=False): + + super().__init__(clip_len, frame_interval, test_mode=test_mode) + + def _get_clips(self, center_index, skip_offsets, shot_info): + """Get clip offsets.""" + start = center_index - (self.clip_len // 2) * self.frame_interval + end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval + frame_inds = list(range(start, end, self.frame_interval)) + if not self.test_mode: + frame_inds = frame_inds + skip_offsets + frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1) + return frame_inds + + def transform(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
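+        Example (illustrative): with ``fps=30``, ``timestamp=905``,
+        ``timestamp_start=900``, ``clip_len=32`` and ``frame_interval=2``,
+        the center index is ``30 * (905 - 900) = 150`` and the clip covers
+        ``range(150 - 32, 150 + 32, 2)``, i.e. 32 frames around the
+        keyframe; random per-frame ``skip_offsets`` are added during
+        training and indices are clipped to the shot boundaries.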
+ """ + fps = results['fps'] + timestamp = results['timestamp'] + timestamp_start = results['timestamp_start'] + start_index = results.get('start_index', 0) + if results.get('total_frames') is not None: + shot_info = (0, results['total_frames']) + else: + shot_info = results['shot_info'] + + center_index = fps * (timestamp - timestamp_start) + start_index + + skip_offsets = np.random.randint( + -self.frame_interval // 2, (self.frame_interval + 1) // 2, + size=self.clip_len) + frame_inds = self._get_clips(center_index, skip_offsets, shot_info) + + frame_inds = np.array(frame_inds, dtype=np.int32) + start_index + results['frame_inds'] = frame_inds + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = 1 + results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@TRANSFORMS.register_module() +class PyAVInit(BaseTransform): + """Using pyav to initialize the video. + + PyAV: https://github.com/mikeboers/PyAV + + Required keys are "filename", + added or modified keys are "video_reader", and "total_frames". + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + kwargs (dict): Args for file client. + """ + + def __init__(self, io_backend='disk', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results): + """Perform the PyAV initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import av + except ImportError: + raise ImportError('Please run "conda install av -c conda-forge" ' + 'or "pip install av" to install PyAV first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + container = av.open(file_obj) + + results['video_reader'] = container + results['total_frames'] = container.streams.video[0].frames + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(io_backend={self.io_backend})' + return repr_str + + +@TRANSFORMS.register_module() +class PyAVDecode(BaseTransform): + """Using PyAV to decode the video. + + PyAV: https://github.com/mikeboers/PyAV + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + + Args: + multi_thread (bool): If set to True, it will apply multi + thread processing. Default: False. + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will decode videos into accurate frames. + If set to 'efficient', it will adopt fast seeking but only return + the nearest key frames, which may be duplicated and inaccurate, + and more suitable for large scene-based video datasets. + Default: 'accurate'. + """ + + def __init__(self, multi_thread=False, mode='accurate'): + self.multi_thread = multi_thread + self.mode = mode + assert mode in ['accurate', 'efficient'] + + @staticmethod + def frame_generator(container, stream): + """Frame generator for PyAV.""" + for packet in container.demux(stream): + for frame in packet.decode(): + if frame: + return frame.to_rgb().to_ndarray() + + def transform(self, results): + """Perform the PyAV decoding. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + if self.mode == 'accurate': + # set max indice to make early stop + max_inds = max(results['frame_inds']) + i = 0 + for frame in container.decode(video=0): + if i > max_inds + 1: + break + imgs.append(frame.to_rgb().to_ndarray()) + i += 1 + + # the available frame in pyav may be less than its length, + # which may raise error + results['imgs'] = [ + imgs[i % len(imgs)] for i in results['frame_inds'] + ] + elif self.mode == 'efficient': + for frame in container.decode(video=0): + backup_frame = frame + break + stream = container.streams.video[0] + for idx in results['frame_inds']: + pts_scale = stream.average_rate * stream.time_base + frame_pts = int(idx / pts_scale) + container.seek( + frame_pts, any_frame=False, backward=True, stream=stream) + frame = self.frame_generator(container, stream) + if frame is not None: + imgs.append(frame) + backup_frame = frame + else: + imgs.append(backup_frame) + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + results['video_reader'] = None + del container + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(multi_thread={self.multi_thread}, mode={self.mode})' + return repr_str + + +@TRANSFORMS.register_module() +class PIMSInit(BaseTransform): + """Use PIMS to initialize the video. + + PIMS: https://github.com/soft-matter/pims + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will always use ``pims.PyAVReaderIndexed`` + to decode videos into accurate frames. If set to 'efficient', it + will adopt fast seeking by using ``pims.PyAVReaderTimed``. + Both will return the accurate frames in most cases. + Default: 'accurate'. + kwargs (dict): Args for file client. + """ + + def __init__(self, io_backend='disk', mode='accurate', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + self.mode = mode + assert mode in ['accurate', 'efficient'] + + def transform(self, results): + """Perform the PIMS initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import pims + except ImportError: + raise ImportError('Please run "conda install pims -c conda-forge" ' + 'or "pip install pims" to install pims first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + if self.mode == 'accurate': + container = pims.PyAVReaderIndexed(file_obj) + else: + container = pims.PyAVReaderTimed(file_obj) + + results['video_reader'] = container + results['total_frames'] = len(container) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(io_backend={self.io_backend}, ' + f'mode={self.mode})') + return repr_str + + +@TRANSFORMS.register_module() +class PIMSDecode(BaseTransform): + """Using PIMS to decode the videos. 
+ + PIMS: https://github.com/soft-matter/pims + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + """ + + def transform(self, results): + """Perform the PIMS decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = [container[idx] for idx in frame_inds] + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + +@TRANSFORMS.register_module() +class PyAVDecodeMotionVector(PyAVDecode): + """Using pyav to decode the motion vectors from video. + + Reference: https://github.com/PyAV-Org/PyAV/ + blob/main/tests/test_decode.py + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "motion_vectors", "frame_inds". + """ + + @staticmethod + def _parse_vectors(mv, vectors, height, width): + """Parse the returned vectors.""" + (w, h, src_x, src_y, dst_x, + dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], + vectors['src_y'], vectors['dst_x'], vectors['dst_y']) + val_x = dst_x - src_x + val_y = dst_y - src_y + start_x = dst_x - w // 2 + start_y = dst_y - h // 2 + end_x = start_x + w + end_y = start_y + h + for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y, + val_x, val_y): + if (sx >= 0 and ex < width and sy >= 0 and ey < height): + mv[sy:ey, sx:ex] = (vx, vy) + + return mv + + def transform(self, results): + """Perform the PyAV motion vector decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + # set max index to make early stop + max_idx = max(results['frame_inds']) + i = 0 + stream = container.streams.video[0] + codec_context = stream.codec_context + codec_context.options = {'flags2': '+export_mvs'} + for packet in container.demux(stream): + for frame in packet.decode(): + if i > max_idx + 1: + break + i += 1 + height = frame.height + width = frame.width + mv = np.zeros((height, width, 2), dtype=np.int8) + vectors = frame.side_data.get('MOTION_VECTORS') + if frame.key_frame: + # Key frame don't have motion vectors + assert vectors is None + if vectors is not None and len(vectors) > 0: + mv = self._parse_vectors(mv, vectors.to_ndarray(), height, + width) + imgs.append(mv) + + results['video_reader'] = None + del container + + # the available frame in pyav may be less than its length, + # which may raise error + results['motion_vectors'] = np.array( + [imgs[i % len(imgs)] for i in results['frame_inds']]) + return results + + +@TRANSFORMS.register_module() +class DecordInit(BaseTransform): + """Using decord to initialize the video_reader. + + Decord: https://github.com/dmlc/decord + + Required Keys: + + - filename + + Added Keys: + + - video_reader + - total_frames + - fps + + Args: + io_backend (str): io backend where frames are store. + Defaults to ``'disk'``. + num_threads (int): Number of thread to decode the video. Defaults to 1. + kwargs (dict): Args for file client. 
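+
+    Examples:
+        An illustrative sketch of a typical init-then-decode chain;
+        ``demo.mp4`` is a placeholder path that must point to an existing
+        video, and the frame indices are picked by hand instead of by a
+        frame sampler::
+
+            >>> import numpy as np
+            >>> from mmaction.datasets.transforms import DecordDecode, DecordInit
+            >>> results = dict(filename='demo.mp4')
+            >>> results = DecordInit()(results)
+            >>> results['frame_inds'] = np.arange(0, results['total_frames'], 16)
+            >>> results = DecordDecode()(results)
+            >>> len(results['imgs']) == len(results['frame_inds'])
+            True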
+ """ + + def __init__(self, + io_backend: str = 'disk', + num_threads: int = 1, + **kwargs) -> None: + self.io_backend = io_backend + self.num_threads = num_threads + self.kwargs = kwargs + self.file_client = None + + def _get_video_reader(self, filename: str) -> object: + if osp.splitext(filename)[0] == filename: + filename = filename + '.mp4' + try: + import decord + except ImportError: + raise ImportError( + 'Please run "pip install decord" to install Decord first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + file_obj = io.BytesIO(self.file_client.get(filename)) + container = decord.VideoReader(file_obj, num_threads=self.num_threads) + return container + + def transform(self, results: Dict) -> Dict: + """Perform the Decord initialization. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = self._get_video_reader(results['filename']) + results['total_frames'] = len(container) + + results['video_reader'] = container + results['avg_fps'] = container.get_avg_fps() + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'num_threads={self.num_threads})') + return repr_str + + +@TRANSFORMS.register_module() +class DecordDecode(BaseTransform): + """Using decord to decode the video. + + Decord: https://github.com/dmlc/decord + + Required Keys: + + - video_reader + - frame_inds + + Added Keys: + + - imgs + - original_shape + - img_shape + + Args: + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will decode videos into accurate frames. + If set to 'efficient', it will adopt fast seeking but only return + key frames, which may be duplicated and inaccurate, and more + suitable for large scene-based video datasets. + Defaults to ``'accurate'``. + """ + + def __init__(self, mode: str = 'accurate') -> None: + self.mode = mode + assert mode in ['accurate', 'efficient'] + + def _decord_load_frames(self, container: object, + frame_inds: np.ndarray) -> List[np.ndarray]: + if self.mode == 'accurate': + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + elif self.mode == 'efficient': + # This mode is faster, however it always returns I-FRAME + container.seek(0) + imgs = list() + for idx in frame_inds: + container.seek(idx) + frame = container.next() + imgs.append(frame.asnumpy()) + return imgs + + def transform(self, results: Dict) -> Dict: + """Perform the Decord decoding. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = self._decord_load_frames(container, frame_inds) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + if 'gt_bboxes' in results: + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + gt_bboxes = results['gt_bboxes'] + gt_bboxes = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + results['proposals'] = proposals + + return results + + def __repr__(self) -> str: + repr_str = f'{self.__class__.__name__}(mode={self.mode})' + return repr_str + + +@TRANSFORMS.register_module() +class OpenCVInit(BaseTransform): + """Using OpenCV to initialize the video_reader. + + Required keys are ``'filename'``, added or modified keys are ` + `'new_path'``, ``'video_reader'`` and ``'total_frames'``. + + Args: + io_backend (str): io backend where frames are store. + Defaults to ``'disk'``. + """ + + def __init__(self, io_backend: str = 'disk', **kwargs) -> None: + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + self.tmp_folder = None + if self.io_backend != 'disk': + random_string = get_random_string() + thread_id = get_thread_id() + self.tmp_folder = osp.join(get_shm_dir(), + f'{random_string}_{thread_id}') + os.mkdir(self.tmp_folder) + + def transform(self, results: dict) -> dict: + """Perform the OpenCV initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + if self.io_backend == 'disk': + new_path = results['filename'] + else: + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + thread_id = get_thread_id() + # save the file of same thread at the same place + new_path = osp.join(self.tmp_folder, f'tmp_{thread_id}.mp4') + with open(new_path, 'wb') as f: + f.write(self.file_client.get(results['filename'])) + + container = mmcv.VideoReader(new_path) + results['new_path'] = new_path + results['video_reader'] = container + results['total_frames'] = len(container) + + return results + + def __del__(self): + if self.tmp_folder and osp.exists(self.tmp_folder): + shutil.rmtree(self.tmp_folder) + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class OpenCVDecode(BaseTransform): + """Using OpenCV to decode the video. + + Required keys are ``'video_reader'``, ``'filename'`` and ``'frame_inds'``, + added or modified keys are ``'imgs'``, ``'img_shape'`` and + ``'original_shape'``. + """ + + def transform(self, results: dict) -> dict: + """Perform the OpenCV decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + container = results['video_reader'] + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + for frame_ind in results['frame_inds']: + cur_frame = container[frame_ind] + # last frame may be None in OpenCV + while isinstance(cur_frame, type(None)): + frame_ind -= 1 + cur_frame = container[frame_ind] + imgs.append(cur_frame) + + results['video_reader'] = None + del container + + imgs = np.array(imgs) + # The default channel order of OpenCV is BGR, thus we change it to RGB + imgs = imgs[:, :, :, ::-1] + results['imgs'] = list(imgs) + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + +@TRANSFORMS.register_module() +class RawFrameDecode(BaseTransform): + """Load and decode frames with given indices. + + Required Keys: + + - frame_dir + - filename_tmpl + - frame_inds + - modality + - offset (optional) + + Added Keys: + + - img + - img_shape + - original_shape + + Args: + io_backend (str): IO backend where frames are stored. + Defaults to ``'disk'``. + decoding_backend (str): Backend used for image decoding. + Defaults to ``'cv2'``. + """ + + def __init__(self, + io_backend: str = 'disk', + decoding_backend: str = 'cv2', + **kwargs) -> None: + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results: dict) -> dict: + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + mmcv.use_backend(self.decoding_backend) + + directory = results['frame_dir'] + filename_tmpl = results['filename_tmpl'] + modality = results['modality'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + cache = {} + for i, frame_idx in enumerate(results['frame_inds']): + # Avoid loading duplicated frames + if frame_idx in cache: + imgs.append(cp.deepcopy(imgs[cache[frame_idx]])) + continue + else: + cache[frame_idx] = i + + frame_idx += offset + if modality == 'RGB': + filepath = osp.join(directory, filename_tmpl.format(frame_idx)) + img_bytes = self.file_client.get(filepath) + # Get frame with channel order RGB directly. 
+ cur_frame = mmcv.imfrombytes(img_bytes, channel_order='rgb') + imgs.append(cur_frame) + elif modality == 'Flow': + x_filepath = osp.join(directory, + filename_tmpl.format('x', frame_idx)) + y_filepath = osp.join(directory, + filename_tmpl.format('y', frame_idx)) + x_img_bytes = self.file_client.get(x_filepath) + x_frame = mmcv.imfrombytes(x_img_bytes, flag='grayscale') + y_img_bytes = self.file_client.get(y_filepath) + y_frame = mmcv.imfrombytes(y_img_bytes, flag='grayscale') + imgs.append(np.stack([x_frame, y_frame], axis=-1)) + else: + raise NotImplementedError + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + if 'gt_bboxes' in results: + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + gt_bboxes = results['gt_bboxes'] + gt_bboxes = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + results['proposals'] = proposals + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'decoding_backend={self.decoding_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class InferencerPackInput(BaseTransform): + + def __init__(self, + input_format='video', + filename_tmpl='img_{:05}.jpg', + modality='RGB', + start_index=1) -> None: + self.input_format = input_format + self.filename_tmpl = filename_tmpl + self.modality = modality + self.start_index = start_index + + def transform(self, video: Union[str, np.ndarray, dict]) -> dict: + if self.input_format == 'dict': + results = video + elif self.input_format == 'video': + results = dict( + filename=video, label=-1, start_index=0, modality='RGB') + elif self.input_format == 'rawframes': + import re + + # count the number of frames that match the format of + # `filename_tmpl` + # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$ + # Flow patteren example: {}_{:05d}.jpg -> ^x_\d+.jpg$ + pattern = f'^{self.filename_tmpl}$' + if self.modality == 'Flow': + pattern = pattern.replace('{}', 'x') + pattern = pattern.replace( + pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+') + total_frames = len( + list( + filter(lambda x: re.match(pattern, x) is not None, + os.listdir(video)))) + results = dict( + frame_dir=video, + total_frames=total_frames, + label=-1, + start_index=self.start_index, + filename_tmpl=self.filename_tmpl, + modality=self.modality) + elif self.input_format == 'array': + modality_map = {2: 'Flow', 3: 'RGB'} + modality = modality_map.get(video.shape[-1]) + results = dict( + total_frames=video.shape[0], + label=-1, + start_index=0, + array=video, + modality=modality) + + return results + + +@TRANSFORMS.register_module() +class ArrayDecode(BaseTransform): + """Load and decode frames with given indices from a 4D array. + + Required keys are "array and "frame_inds", added or modified keys are + "imgs", "img_shape" and "original_shape". + """ + + def transform(self, results): + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
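+
+        Examples:
+            A self-contained sketch using a random dummy array; the shapes
+            and indices are chosen only for illustration::
+
+                >>> import numpy as np
+                >>> from mmaction.datasets.transforms import ArrayDecode
+                >>> results = dict(
+                ...     array=np.random.randint(
+                ...         0, 256, (10, 32, 32, 3), dtype=np.uint8),
+                ...     frame_inds=np.array([0, 2, 4]),
+                ...     modality='RGB')
+                >>> results = ArrayDecode()(results)
+                >>> len(results['imgs']), results['img_shape']
+                (3, (32, 32))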
+ """ + + modality = results['modality'] + array = results['array'] + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + for i, frame_idx in enumerate(results['frame_inds']): + + frame_idx += offset + if modality == 'RGB': + imgs.append(array[frame_idx]) + elif modality == 'Flow': + imgs.extend( + [array[frame_idx, ..., 0], array[frame_idx, ..., 1]]) + else: + raise NotImplementedError + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + def __repr__(self): + return f'{self.__class__.__name__}()' + + +@TRANSFORMS.register_module() +class ImageDecode(BaseTransform): + """Load and decode images. + + Required key is "filename", added or modified keys are "imgs", "img_shape" + and "original_shape". + + Args: + io_backend (str): IO backend where frames are stored. Default: 'disk'. + decoding_backend (str): Backend used for image decoding. + Default: 'cv2'. + kwargs (dict, optional): Arguments for FileClient. + """ + + def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs): + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results): + """Perform the ``ImageDecode`` to load image given the file path. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + mmcv.use_backend(self.decoding_backend) + + filename = results['filename'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + img_bytes = self.file_client.get(filename) + + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + imgs.append(img) + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class LoadAudioFeature(BaseTransform): + """Load offline extracted audio features. + + Required Keys: + + - audio_path + + Added Keys: + + - length + - audios + + Args: + pad_method (str): Padding method. Defaults to ``'zero'``. + """ + + def __init__(self, pad_method: str = 'zero') -> None: + if pad_method not in ['zero', 'random']: + raise NotImplementedError + self.pad_method = pad_method + + @staticmethod + def _zero_pad(shape: int) -> np.ndarray: + """Zero padding method.""" + return np.zeros(shape, dtype=np.float32) + + @staticmethod + def _random_pad(shape: int) -> np.ndarray: + """Random padding method.""" + # spectrogram is normalized into a distribution of 0~1 + return np.random.rand(shape).astype(np.float32) + + def transform(self, results: Dict) -> Dict: + """Perform the numpy loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + if osp.exists(results['audio_path']): + feature_map = np.load(results['audio_path']) + else: + # Generate a random dummy 10s input + # Some videos do not have audio stream + pad_func = getattr(self, f'_{self.pad_method}_pad') + feature_map = pad_func((640, 80)) + + results['length'] = feature_map.shape[0] + results['audios'] = feature_map + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'pad_method={self.pad_method})') + return repr_str + + +@TRANSFORMS.register_module() +class BuildPseudoClip(BaseTransform): + """Build pseudo clips with one single image by repeating it n times. + + Required key is "imgs", added or modified key is "imgs", "num_clips", + "clip_len". + + Args: + clip_len (int): Frames of the generated pseudo clips. + """ + + def __init__(self, clip_len): + self.clip_len = clip_len + + def transform(self, results): + """Perform the building of pseudo clips. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + # the input should be one single image + assert len(results['imgs']) == 1 + im = results['imgs'][0] + for _ in range(1, self.clip_len): + results['imgs'].append(np.copy(im)) + results['clip_len'] = self.clip_len + results['num_clips'] = 1 + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'fix_length={self.fixed_length})') + return repr_str + + +@TRANSFORMS.register_module() +class AudioFeatureSelector(BaseTransform): + """Sample the audio feature w.r.t. the frames selected. + + Required Keys: + + - audios + - frame_inds + - num_clips + - length + - total_frames + + Modified Keys: + + - audios + + Added Keys: + + - audios_shape + + Args: + fixed_length (int): As the features selected by frames sampled may + not be exactly the same, `fixed_length` will truncate or pad them + into the same size. Defaults to 128. + """ + + def __init__(self, fixed_length: int = 128) -> None: + self.fixed_length = fixed_length + + def transform(self, results: Dict) -> Dict: + """Perform the ``AudioFeatureSelector`` to pick audio feature clips. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + audio = results['audios'] + frame_inds = results['frame_inds'] + num_clips = results['num_clips'] + resampled_clips = list() + + frame_inds = frame_inds.reshape(num_clips, -1) + for clip_idx in range(num_clips): + clip_frame_inds = frame_inds[clip_idx] + start_idx = max( + 0, + int( + round((clip_frame_inds[0] + 1) / results['total_frames'] * + results['length']))) + end_idx = min( + results['length'], + int( + round((clip_frame_inds[-1] + 1) / results['total_frames'] * + results['length']))) + cropped_audio = audio[start_idx:end_idx, :] + if cropped_audio.shape[0] >= self.fixed_length: + truncated_audio = cropped_audio[:self.fixed_length, :] + else: + truncated_audio = np.pad( + cropped_audio, + ((0, self.fixed_length - cropped_audio.shape[0]), (0, 0)), + mode='constant') + + resampled_clips.append(truncated_audio) + results['audios'] = np.array(resampled_clips) + results['audios_shape'] = results['audios'].shape + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'fix_length={self.fixed_length})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadLocalizationFeature(BaseTransform): + """Load Video features for localizer with given video_name list. 
+ + The required key is "feature_path", added or modified keys + are "raw_feature". + + Args: + raw_feature_ext (str): Raw feature file extension. Default: '.csv'. + """ + + def transform(self, results): + """Perform the LoadLocalizationFeature loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + data_path = results['feature_path'] + raw_feature = np.loadtxt( + data_path, dtype=np.float32, delimiter=',', skiprows=1) + + results['raw_feature'] = np.transpose(raw_feature, (1, 0)) + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}' + return repr_str + + +@TRANSFORMS.register_module() +class GenerateLocalizationLabels(BaseTransform): + """Load video label for localizer with given video_name list. + + Required keys are "duration_frame", "duration_second", "feature_frame", + "annotations", added or modified keys are "gt_bbox". + """ + + def transform(self, results): + """Perform the GenerateLocalizationLabels loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + video_frame = results['duration_frame'] + video_second = results['duration_second'] + feature_frame = results['feature_frame'] + corrected_second = float(feature_frame) / video_frame * video_second + annotations = results['annotations'] + + gt_bbox = [] + + for annotation in annotations: + current_start = max( + min(1, annotation['segment'][0] / corrected_second), 0) + current_end = max( + min(1, annotation['segment'][1] / corrected_second), 0) + gt_bbox.append([current_start, current_end]) + + gt_bbox = np.array(gt_bbox) + results['gt_bbox'] = gt_bbox + return results + + +@TRANSFORMS.register_module() +class LoadProposals(BaseTransform): + """Loading proposals with given proposal results. + + Required keys are "video_name", added or modified keys are 'bsp_feature', + 'tmin', 'tmax', 'tmin_score', 'tmax_score' and 'reference_temporal_iou'. + + Args: + top_k (int): The top k proposals to be loaded. + pgm_proposals_dir (str): Directory to load proposals. + pgm_features_dir (str): Directory to load proposal features. + proposal_ext (str): Proposal file extension. Default: '.csv'. + feature_ext (str): Feature file extension. Default: '.npy'. + """ + + def __init__(self, + top_k, + pgm_proposals_dir, + pgm_features_dir, + proposal_ext='.csv', + feature_ext='.npy'): + self.top_k = top_k + self.pgm_proposals_dir = pgm_proposals_dir + self.pgm_features_dir = pgm_features_dir + valid_proposal_ext = ('.csv', ) + if proposal_ext not in valid_proposal_ext: + raise NotImplementedError + self.proposal_ext = proposal_ext + valid_feature_ext = ('.npy', ) + if feature_ext not in valid_feature_ext: + raise NotImplementedError + self.feature_ext = feature_ext + + def transform(self, results): + """Perform the LoadProposals loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + video_name = results['video_name'] + proposal_path = osp.join(self.pgm_proposals_dir, + video_name + self.proposal_ext) + if self.proposal_ext == '.csv': + pgm_proposals = np.loadtxt( + proposal_path, dtype=np.float32, delimiter=',', skiprows=1) + + pgm_proposals = np.array(pgm_proposals[:self.top_k]) + tmin = pgm_proposals[:, 0] + tmax = pgm_proposals[:, 1] + tmin_score = pgm_proposals[:, 2] + tmax_score = pgm_proposals[:, 3] + reference_temporal_iou = pgm_proposals[:, 5] + + feature_path = osp.join(self.pgm_features_dir, + video_name + self.feature_ext) + if self.feature_ext == '.npy': + bsp_feature = np.load(feature_path).astype(np.float32) + + bsp_feature = bsp_feature[:self.top_k, :] + results['bsp_feature'] = bsp_feature + results['tmin'] = tmin + results['tmax'] = tmax + results['tmin_score'] = tmin_score + results['tmax_score'] = tmax_score + results['reference_temporal_iou'] = reference_temporal_iou + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'top_k={self.top_k}, ' + f'pgm_proposals_dir={self.pgm_proposals_dir}, ' + f'pgm_features_dir={self.pgm_features_dir}, ' + f'proposal_ext={self.proposal_ext}, ' + f'feature_ext={self.feature_ext})') + return repr_str diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..25c4ed636fdbe64f638f0096bc648774990b5e6e --- /dev/null +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -0,0 +1,1523 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import scipy +from mmcv.transforms import BaseTransform, KeyMapper +from mmengine.dataset import Compose +from packaging import version as pv +from scipy.stats import mode +from torch.nn.modules.utils import _pair + +from mmaction.registry import TRANSFORMS +from .loading import DecordDecode, DecordInit +from .processing import _combine_quadruple + +if pv.parse(scipy.__version__) < pv.parse('1.11.0'): + get_mode = mode +else: + from functools import partial + get_mode = partial(mode, keepdims=True) + + +@TRANSFORMS.register_module() +class DecompressPose(BaseTransform): + """Load Compressed Pose. + + Required Keys: + + - frame_inds + - total_frames + - keypoint + - anno_inds (optional) + + Modified Keys: + + - keypoint + - frame_inds + + Added Keys: + + - keypoint_score + - num_person + + Args: + squeeze (bool): Whether to remove frames with no human pose. + Defaults to True. + max_person (int): The max number of persons in a frame. Defaults to 10. + """ + + def __init__(self, squeeze: bool = True, max_person: int = 10) -> None: + self.squeeze = squeeze + self.max_person = max_person + + def transform(self, results: Dict) -> Dict: + """Perform the pose decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + required_keys = ['total_frames', 'frame_inds', 'keypoint'] + for k in required_keys: + assert k in results + + total_frames = results['total_frames'] + frame_inds = results.pop('frame_inds') + keypoint = results['keypoint'] + + if 'anno_inds' in results: + frame_inds = frame_inds[results['anno_inds']] + keypoint = keypoint[results['anno_inds']] + + assert np.all(np.diff(frame_inds) >= 0), \ + 'frame_inds should be monotonical increasing' + + def mapinds(inds): + uni = np.unique(inds) + map_ = {x: i for i, x in enumerate(uni)} + inds = [map_[x] for x in inds] + return np.array(inds, dtype=np.int16) + + if self.squeeze: + frame_inds = mapinds(frame_inds) + total_frames = np.max(frame_inds) + 1 + + results['total_frames'] = total_frames + + num_joints = keypoint.shape[1] + num_person = get_mode(frame_inds)[-1][0] + + new_kp = np.zeros([num_person, total_frames, num_joints, 2], + dtype=np.float16) + new_kpscore = np.zeros([num_person, total_frames, num_joints], + dtype=np.float16) + nperson_per_frame = np.zeros([total_frames], dtype=np.int16) + + for frame_ind, kp in zip(frame_inds, keypoint): + person_ind = nperson_per_frame[frame_ind] + new_kp[person_ind, frame_ind] = kp[:, :2] + new_kpscore[person_ind, frame_ind] = kp[:, 2] + nperson_per_frame[frame_ind] += 1 + + if num_person > self.max_person: + for i in range(total_frames): + nperson = nperson_per_frame[i] + val = new_kpscore[:nperson, i] + score_sum = val.sum(-1) + + inds = sorted(range(nperson), key=lambda x: -score_sum[x]) + new_kpscore[:nperson, i] = new_kpscore[inds, i] + new_kp[:nperson, i] = new_kp[inds, i] + num_person = self.max_person + results['num_person'] = num_person + + results['keypoint'] = new_kp[:num_person] + results['keypoint_score'] = new_kpscore[:num_person] + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'squeeze={self.squeeze}, ' + f'max_person={self.max_person})') + return repr_str + + +@TRANSFORMS.register_module() +class GeneratePoseTarget(BaseTransform): + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Required Keys: + + - keypoint + - keypoint_score (optional) + - img_shape + + Added Keys: + + - imgs (optional) + - heatmap_imgs (optional) + + Args: + sigma (float): The sigma of the generated gaussian map. + Defaults to 0.6. + use_score (bool): Use the confidence score of keypoints as the maximum + of the gaussian maps. Defaults to True. + with_kp (bool): Generate pseudo heatmaps for keypoints. + Defaults to True. + with_limb (bool): Generate pseudo heatmaps for limbs. At least one of + 'with_kp' and 'with_limb' should be True. Defaults to False. + skeletons (tuple[tuple]): The definition of human skeletons. + Defaults to ``((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12))``, + which is the definition of COCO-17p skeletons. + double (bool): Output both original heatmaps and flipped heatmaps. + Defaults to False. + left_kp (tuple[int]): Indexes of left keypoints, which is used when + flipping heatmaps. Defaults to (1, 3, 5, 7, 9, 11, 13, 15), + which is left keypoints in COCO-17p. + right_kp (tuple[int]): Indexes of right keypoints, which is used when + flipping heatmaps. Defaults to (2, 4, 6, 8, 10, 12, 14, 16), + which is right keypoints in COCO-17p. + left_limb (tuple[int]): Indexes of left limbs, which is used when + flipping heatmaps. 
Defaults to (0, 2, 4, 5, 6, 10, 11, 12), + which is left limbs of skeletons we defined for COCO-17p. + right_limb (tuple[int]): Indexes of right limbs, which is used when + flipping heatmaps. Defaults to (1, 3, 7, 8, 9, 13, 14, 15), + which is right limbs of skeletons we defined for COCO-17p. + scaling (float): The ratio to scale the heatmaps. Defaults to 1. + """ + + def __init__(self, + sigma: float = 0.6, + use_score: bool = True, + with_kp: bool = True, + with_limb: bool = False, + skeletons: Tuple[Tuple[int]] = ((0, 1), (0, 2), (1, 3), + (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), + (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), + (14, 16), (11, 12)), + double: bool = False, + left_kp: Tuple[int] = (1, 3, 5, 7, 9, 11, 13, 15), + right_kp: Tuple[int] = (2, 4, 6, 8, 10, 12, 14, 16), + left_limb: Tuple[int] = (0, 2, 4, 5, 6, 10, 11, 12), + right_limb: Tuple[int] = (1, 3, 7, 8, 9, 13, 14, 15), + scaling: float = 1.) -> None: + + self.sigma = sigma + self.use_score = use_score + self.with_kp = with_kp + self.with_limb = with_limb + self.double = double + + # an auxiliary const + self.eps = 1e-4 + + assert self.with_kp or self.with_limb, ( + 'At least one of "with_limb" ' + 'and "with_kp" should be set as True.') + self.left_kp = left_kp + self.right_kp = right_kp + self.skeletons = skeletons + self.left_limb = left_limb + self.right_limb = right_limb + self.scaling = scaling + + def generate_a_heatmap(self, arr: np.ndarray, centers: np.ndarray, + max_values: np.ndarray) -> None: + """Generate pseudo heatmap for one keypoint in one frame. + + Args: + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. + centers (np.ndarray): The coordinates of corresponding keypoints + (of multiple persons). Shape: M * 2. + max_values (np.ndarray): The max values of each keypoint. Shape: M. + """ + + sigma = self.sigma + img_h, img_w = arr.shape + + for center, max_value in zip(centers, max_values): + if max_value < self.eps: + continue + + mu_x, mu_y = center[0], center[1] + st_x = max(int(mu_x - 3 * sigma), 0) + ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) + st_y = max(int(mu_y - 3 * sigma), 0) + ed_y = min(int(mu_y + 3 * sigma) + 1, img_h) + x = np.arange(st_x, ed_x, 1, np.float32) + y = np.arange(st_y, ed_y, 1, np.float32) + + # if the keypoint not in the heatmap coordinate system + if not (len(x) and len(y)): + continue + y = y[:, None] + + patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) + patch = patch * max_value + arr[st_y:ed_y, st_x:ed_x] = \ + np.maximum(arr[st_y:ed_y, st_x:ed_x], patch) + + def generate_a_limb_heatmap(self, arr: np.ndarray, starts: np.ndarray, + ends: np.ndarray, start_values: np.ndarray, + end_values: np.ndarray) -> None: + """Generate pseudo heatmap for one limb in one frame. + + Args: + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. + starts (np.ndarray): The coordinates of one keypoint in the + corresponding limbs. Shape: M * 2. + ends (np.ndarray): The coordinates of the other keypoint in the + corresponding limbs. Shape: M * 2. + start_values (np.ndarray): The max values of one keypoint in the + corresponding limbs. Shape: M. + end_values (np.ndarray): The max values of the other keypoint + in the corresponding limbs. Shape: M. 
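+
+        Note:
+            For each pixel ``p`` inside the padded bounding box of the limb,
+            the generated value is
+            ``min(start_value, end_value) * exp(-d(p, seg)**2 / (2 * sigma**2))``,
+            where ``d(p, seg)`` is the distance from ``p`` to the segment
+            between ``starts`` and ``ends``; existing heatmap values are kept
+            if they are larger (element-wise maximum).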
+ """ + + sigma = self.sigma + img_h, img_w = arr.shape + + for start, end, start_value, end_value in zip(starts, ends, + start_values, + end_values): + value_coeff = min(start_value, end_value) + if value_coeff < self.eps: + continue + + min_x, max_x = min(start[0], end[0]), max(start[0], end[0]) + min_y, max_y = min(start[1], end[1]), max(start[1], end[1]) + + min_x = max(int(min_x - 3 * sigma), 0) + max_x = min(int(max_x + 3 * sigma) + 1, img_w) + min_y = max(int(min_y - 3 * sigma), 0) + max_y = min(int(max_y + 3 * sigma) + 1, img_h) + + x = np.arange(min_x, max_x, 1, np.float32) + y = np.arange(min_y, max_y, 1, np.float32) + + if not (len(x) and len(y)): + continue + + y = y[:, None] + x_0 = np.zeros_like(x) + y_0 = np.zeros_like(y) + + # distance to start keypoints + d2_start = ((x - start[0])**2 + (y - start[1])**2) + + # distance to end keypoints + d2_end = ((x - end[0])**2 + (y - end[1])**2) + + # the distance between start and end keypoints. + d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2) + + if d2_ab < 1: + self.generate_a_heatmap(arr, start[None], start_value[None]) + continue + + coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab + + a_dominate = coeff <= 0 + b_dominate = coeff >= 1 + seg_dominate = 1 - a_dominate - b_dominate + + position = np.stack([x + y_0, y + x_0], axis=-1) + projection = start + np.stack([coeff, coeff], axis=-1) * ( + end - start) + d2_line = position - projection + d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2 + d2_seg = ( + a_dominate * d2_start + b_dominate * d2_end + + seg_dominate * d2_line) + + patch = np.exp(-d2_seg / 2. / sigma**2) + patch = patch * value_coeff + + arr[min_y:max_y, min_x:max_x] = \ + np.maximum(arr[min_y:max_y, min_x:max_x], patch) + + def generate_heatmap(self, arr: np.ndarray, kps: np.ndarray, + max_values: np.ndarray) -> None: + """Generate pseudo heatmap for all keypoints and limbs in one frame (if + needed). + + Args: + arr (np.ndarray): The array to store the generated heatmaps. + Shape: V * img_h * img_w. + kps (np.ndarray): The coordinates of keypoints in this frame. + Shape: M * V * 2. + max_values (np.ndarray): The confidence score of each keypoint. + Shape: M * V. + """ + + if self.with_kp: + num_kp = kps.shape[1] + for i in range(num_kp): + self.generate_a_heatmap(arr[i], kps[:, i], max_values[:, i]) + + if self.with_limb: + for i, limb in enumerate(self.skeletons): + start_idx, end_idx = limb + starts = kps[:, start_idx] + ends = kps[:, end_idx] + + start_values = max_values[:, start_idx] + end_values = max_values[:, end_idx] + self.generate_a_limb_heatmap(arr[i], starts, ends, + start_values, end_values) + + def gen_an_aug(self, results: Dict) -> np.ndarray: + """Generate pseudo heatmaps for all frames. + + Args: + results (dict): The dictionary that contains all info of a sample. + + Returns: + np.ndarray: The generated pseudo heatmaps. 
+ """ + + all_kps = results['keypoint'].astype(np.float32) + kp_shape = all_kps.shape + + if 'keypoint_score' in results: + all_kpscores = results['keypoint_score'] + else: + all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) + + img_h, img_w = results['img_shape'] + + # scale img_h, img_w and kps + img_h = int(img_h * self.scaling + 0.5) + img_w = int(img_w * self.scaling + 0.5) + all_kps[..., :2] *= self.scaling + + num_frame = kp_shape[1] + num_c = 0 + if self.with_kp: + num_c += all_kps.shape[2] + if self.with_limb: + num_c += len(self.skeletons) + + ret = np.zeros([num_frame, num_c, img_h, img_w], dtype=np.float32) + + for i in range(num_frame): + # M, V, C + kps = all_kps[:, i] + # M, C + kpscores = all_kpscores[:, i] if self.use_score else \ + np.ones_like(all_kpscores[:, i]) + + self.generate_heatmap(ret[i], kps, kpscores) + return ret + + def transform(self, results: Dict) -> Dict: + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + heatmap = self.gen_an_aug(results) + key = 'heatmap_imgs' if 'imgs' in results else 'imgs' + + if self.double: + indices = np.arange(heatmap.shape[1], dtype=np.int64) + left, right = (self.left_kp, self.right_kp) if self.with_kp else ( + self.left_limb, self.right_limb) + for l, r in zip(left, right): # noqa: E741 + indices[l] = r + indices[r] = l + heatmap_flip = heatmap[..., ::-1][:, indices] + heatmap = np.concatenate([heatmap, heatmap_flip]) + results[key] = heatmap + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'sigma={self.sigma}, ' + f'use_score={self.use_score}, ' + f'with_kp={self.with_kp}, ' + f'with_limb={self.with_limb}, ' + f'skeletons={self.skeletons}, ' + f'double={self.double}, ' + f'left_kp={self.left_kp}, ' + f'right_kp={self.right_kp}, ' + f'left_limb={self.left_limb}, ' + f'right_limb={self.right_limb}, ' + f'scaling={self.scaling})') + return repr_str + + +@TRANSFORMS.register_module() +class PoseCompact(BaseTransform): + """Convert the coordinates of keypoints to make it more compact. + Specifically, it first find a tight bounding box that surrounds all joints + in each frame, then we expand the tight box by a given padding ratio. For + example, if 'padding == 0.25', then the expanded box has unchanged center, + and 1.25x width and height. + + Required Keys: + + - keypoint + - img_shape + + Modified Keys: + + - img_shape + - keypoint + + Added Keys: + + - crop_quadruple + + Args: + padding (float): The padding size. Defaults to 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Defaults to 10. + hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Defaults to None. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Defaults to True. 
+ """ + + def __init__(self, + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Optional[Union[float, Tuple[float]]] = None, + allow_imgpad: bool = True) -> None: + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + + self.hw_ratio = hw_ratio + + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def transform(self, results: Dict) -> Dict: + """Convert the coordinates of keypoints to make it more compact. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + img_shape = results['img_shape'] + h, w = img_shape + kp = results['keypoint'] + + # Make NaN zero + kp[np.isnan(kp)] = 0. + kp_x = kp[..., 0] + kp_y = kp[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return results + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + + # the order is x, y, w, h (in [0, 1]), a tuple + crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.)) + new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w, + (max_y - min_y) / h) + crop_quadruple = _combine_quadruple(crop_quadruple, new_crop_quadruple) + results['crop_quadruple'] = crop_quadruple + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str + + +@TRANSFORMS.register_module() +class PreNormalize3D(BaseTransform): + """PreNormalize for NTURGB+D 3D keypoints (x, y, z). + + PreNormalize3D first subtracts the coordinates of each joint + from the coordinates of the 'spine' (joint #1 in ntu) of the first person + in the first frame. Subsequently, it performs a 3D rotation to fix the Z + axis parallel to the 3D vector from the 'hip' (joint #0) and the 'spine' + (joint #1) and the X axis toward the 3D vector from the 'right shoulder' + (joint #8) and the 'left shoulder' (joint #4). Codes adapted from + https://github.com/lshiwjx/2s-AGCN. + + Required Keys: + + - keypoint + - total_frames (optional) + + Modified Keys: + + - keypoint + + Added Keys: + + - body_center + + Args: + zaxis (list[int]): The target Z axis for the 3D rotation. + Defaults to ``[0, 1]``. + xaxis (list[int]): The target X axis for the 3D rotation. + Defaults to ``[8, 4]``. + align_spine (bool): Whether to perform a 3D rotation to + align the spine. Defaults to True. 
+ align_shoulder (bool): Whether to perform a 3D rotation + to align the shoulder. Defaults to True. + align_center (bool): Whether to align the body center. + Defaults to True. + """ + + def __init__(self, + zaxis: List[int] = [0, 1], + xaxis: List[int] = [8, 4], + align_spine: bool = True, + align_shoulder: bool = True, + align_center: bool = True) -> None: + self.zaxis = zaxis + self.xaxis = xaxis + self.align_center = align_center + self.align_spine = align_spine + self.align_shoulder = align_shoulder + + def unit_vector(self, vector: np.ndarray) -> np.ndarray: + """Returns the unit vector of the vector.""" + return vector / np.linalg.norm(vector) + + def angle_between(self, v1: np.ndarray, v2: np.ndarray) -> float: + """Returns the angle in radians between vectors 'v1' and 'v2'.""" + if np.abs(v1).sum() < 1e-6 or np.abs(v2).sum() < 1e-6: + return 0 + v1_u = self.unit_vector(v1) + v2_u = self.unit_vector(v2) + return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)) + + def rotation_matrix(self, axis: np.ndarray, theta: float) -> np.ndarray: + """Returns the rotation matrix associated with counterclockwise + rotation about the given axis by theta radians.""" + if np.abs(axis).sum() < 1e-6 or np.abs(theta) < 1e-6: + return np.eye(3) + axis = np.asarray(axis) + axis = axis / np.sqrt(np.dot(axis, axis)) + a = np.cos(theta / 2.0) + b, c, d = -axis * np.sin(theta / 2.0) + aa, bb, cc, dd = a * a, b * b, c * c, d * d + bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d + return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)], + [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)], + [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]]) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PreNormalize3D`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
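+
+        Examples:
+            A rough sketch with random dummy NTU-style skeletons
+            (1 person, 10 frames, 25 joints, xyz coordinates)::
+
+                >>> import numpy as np
+                >>> from mmaction.datasets.transforms import PreNormalize3D
+                >>> results = dict(
+                ...     keypoint=np.random.rand(1, 10, 25, 3).astype(np.float32),
+                ...     total_frames=10)
+                >>> results = PreNormalize3D()(results)
+                >>> results['keypoint'].shape, 'body_center' in results
+                ((1, 10, 25, 3), True)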
+ """ + skeleton = results['keypoint'] + total_frames = results.get('total_frames', skeleton.shape[1]) + + M, T, V, C = skeleton.shape + assert T == total_frames + if skeleton.sum() == 0: + return results + + index0 = [ + i for i in range(T) if not np.all(np.isclose(skeleton[0, i], 0)) + ] + + assert M in [1, 2] + if M == 2: + index1 = [ + i for i in range(T) + if not np.all(np.isclose(skeleton[1, i], 0)) + ] + if len(index0) < len(index1): + skeleton = skeleton[:, np.array(index1)] + skeleton = skeleton[[1, 0]] + else: + skeleton = skeleton[:, np.array(index0)] + else: + skeleton = skeleton[:, np.array(index0)] + + T_new = skeleton.shape[1] + + if self.align_center: + if skeleton.shape[2] == 25: + main_body_center = skeleton[0, 0, 1].copy() + else: + main_body_center = skeleton[0, 0, -1].copy() + mask = ((skeleton != 0).sum(-1) > 0)[..., None] + skeleton = (skeleton - main_body_center) * mask + + if self.align_spine: + joint_bottom = skeleton[0, 0, self.zaxis[0]] + joint_top = skeleton[0, 0, self.zaxis[1]] + axis = np.cross(joint_top - joint_bottom, [0, 0, 1]) + angle = self.angle_between(joint_top - joint_bottom, [0, 0, 1]) + matrix_z = self.rotation_matrix(axis, angle) + skeleton = np.einsum('abcd,kd->abck', skeleton, matrix_z) + + if self.align_shoulder: + joint_rshoulder = skeleton[0, 0, self.xaxis[0]] + joint_lshoulder = skeleton[0, 0, self.xaxis[1]] + axis = np.cross(joint_rshoulder - joint_lshoulder, [1, 0, 0]) + angle = self.angle_between(joint_rshoulder - joint_lshoulder, + [1, 0, 0]) + matrix_x = self.rotation_matrix(axis, angle) + skeleton = np.einsum('abcd,kd->abck', skeleton, matrix_x) + + results['keypoint'] = skeleton + results['total_frames'] = T_new + results['body_center'] = main_body_center + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'zaxis={self.zaxis}, ' + f'xaxis={self.xaxis}, ' + f'align_center={self.align_center}, ' + f'align_spine={self.align_spine}, ' + f'align_shoulder={self.align_shoulder})') + return repr_str + + +@TRANSFORMS.register_module() +class PreNormalize2D(BaseTransform): + """Normalize the range of keypoint values. + + Required Keys: + + - keypoint + - img_shape (optional) + + Modified Keys: + + - keypoint + + Args: + img_shape (tuple[int, int]): The resolution of the original video. + Defaults to ``(1080, 1920)``. + """ + + def __init__(self, img_shape: Tuple[int, int] = (1080, 1920)) -> None: + self.img_shape = img_shape + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PreNormalize2D`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + h, w = results.get('img_shape', self.img_shape) + results['keypoint'][..., 0] = \ + (results['keypoint'][..., 0] - (w / 2)) / (w / 2) + results['keypoint'][..., 1] = \ + (results['keypoint'][..., 1] - (h / 2)) / (h / 2) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'img_shape={self.img_shape})') + return repr_str + + +@TRANSFORMS.register_module() +class JointToBone(BaseTransform): + """Convert the joint information to bone information. + + Required Keys: + + - keypoint + + Modified Keys: + + - keypoint + + Args: + dataset (str): Define the type of dataset: 'nturgb+d', 'openpose', + 'coco'. Defaults to ``'nturgb+d'``. + target (str): The target key for the bone information. + Defaults to ``'keypoint'``. 
+ """ + + def __init__(self, + dataset: str = 'nturgb+d', + target: str = 'keypoint') -> None: + self.dataset = dataset + self.target = target + if self.dataset not in ['nturgb+d', 'openpose', 'coco']: + raise ValueError( + f'The dataset type {self.dataset} is not supported') + if self.dataset == 'nturgb+d': + self.pairs = [(0, 1), (1, 20), (2, 20), (3, 2), (4, 20), (5, 4), + (6, 5), (7, 6), (8, 20), (9, 8), (10, 9), (11, 10), + (12, 0), (13, 12), (14, 13), (15, 14), (16, 0), + (17, 16), (18, 17), (19, 18), (21, 22), (20, 20), + (22, 7), (23, 24), (24, 11)] + elif self.dataset == 'openpose': + self.pairs = ((0, 0), (1, 0), (2, 1), (3, 2), (4, 3), (5, 1), + (6, 5), (7, 6), (8, 2), (9, 8), (10, 9), (11, 5), + (12, 11), (13, 12), (14, 0), (15, 0), (16, 14), (17, + 15)) + elif self.dataset == 'coco': + self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), + (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), + (12, 0), (13, 11), (14, 12), (15, 13), (16, 14)) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`JointToBone`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + keypoint = results['keypoint'] + M, T, V, C = keypoint.shape + bone = np.zeros((M, T, V, C), dtype=np.float32) + + assert C in [2, 3] + for v1, v2 in self.pairs: + bone[..., v1, :] = keypoint[..., v1, :] - keypoint[..., v2, :] + if C == 3 and self.dataset in ['openpose', 'coco']: + score = (keypoint[..., v1, 2] + keypoint[..., v2, 2]) / 2 + bone[..., v1, 2] = score + + results[self.target] = bone + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'dataset={self.dataset}, ' + f'target={self.target})') + return repr_str + + +@TRANSFORMS.register_module() +class ToMotion(BaseTransform): + """Convert the joint information or bone information to corresponding + motion information. + + Required Keys: + + - keypoint + + Added Keys: + + - motion + + Args: + dataset (str): Define the type of dataset: 'nturgb+d', 'openpose', + 'coco'. Defaults to ``'nturgb+d'``. + source (str): The source key for the joint or bone information. + Defaults to ``'keypoint'``. + target (str): The target key for the motion information. + Defaults to ``'motion'``. + """ + + def __init__(self, + dataset: str = 'nturgb+d', + source: str = 'keypoint', + target: str = 'motion') -> None: + self.dataset = dataset + self.source = source + self.target = target + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`ToMotion`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + data = results[self.source] + M, T, V, C = data.shape + motion = np.zeros_like(data) + + assert C in [2, 3] + motion[:, :T - 1] = np.diff(data, axis=1) + if C == 3 and self.dataset in ['openpose', 'coco']: + score = (data[:, :T - 1, :, 2] + data[:, 1:, :, 2]) / 2 + motion[:, :T - 1, :, 2] = score + + results[self.target] = motion + + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'dataset={self.dataset}, ' + f'source={self.source}, ' + f'target={self.target})') + return repr_str + + +@TRANSFORMS.register_module() +class MergeSkeFeat(BaseTransform): + """Merge multi-stream features. + + Args: + feat_list (list[str]): The list of the keys of features. + Defaults to ``['keypoint']``. + target (str): The target key for the merged multi-stream information. + Defaults to ``'keypoint'``. + axis (int): The axis along which the features will be joined. 
+ Defaults to -1. + """ + + def __init__(self, + feat_list: List[str] = ['keypoint'], + target: str = 'keypoint', + axis: int = -1) -> None: + self.feat_list = feat_list + self.target = target + self.axis = axis + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MergeSkeFeat`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + feats = [] + for name in self.feat_list: + feats.append(results.pop(name)) + feats = np.concatenate(feats, axis=self.axis) + results[self.target] = feats + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'feat_list={self.feat_list}, ' + f'target={self.target}, ' + f'axis={self.axis})') + return repr_str + + +@TRANSFORMS.register_module() +class GenSkeFeat(BaseTransform): + """Unified interface for generating multi-stream skeleton features. + + Required Keys: + + - keypoint + - keypoint_score (optional) + + Args: + dataset (str): Define the type of dataset: 'nturgb+d', 'openpose', + 'coco'. Defaults to ``'nturgb+d'``. + feats (list[str]): The list of the keys of features. + Defaults to ``['j']``. + axis (int): The axis along which the features will be joined. + Defaults to -1. + """ + + def __init__(self, + dataset: str = 'nturgb+d', + feats: List[str] = ['j'], + axis: int = -1) -> None: + self.dataset = dataset + self.feats = feats + self.axis = axis + ops = [] + if 'b' in feats or 'bm' in feats: + ops.append(JointToBone(dataset=dataset, target='b')) + ops.append(KeyMapper(remapping={'keypoint': 'j'})) + if 'jm' in feats: + ops.append(ToMotion(dataset=dataset, source='j', target='jm')) + if 'bm' in feats: + ops.append(ToMotion(dataset=dataset, source='b', target='bm')) + ops.append(MergeSkeFeat(feat_list=feats, axis=axis)) + self.ops = Compose(ops) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`GenSkeFeat`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if 'keypoint_score' in results and 'keypoint' in results: + assert self.dataset != 'nturgb+d' + assert results['keypoint'].shape[ + -1] == 2, 'Only 2D keypoints have keypoint_score. ' + keypoint = results.pop('keypoint') + keypoint_score = results.pop('keypoint_score') + results['keypoint'] = np.concatenate( + [keypoint, keypoint_score[..., None]], -1) + return self.ops(results) + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'dataset={self.dataset}, ' + f'feats={self.feats}, ' + f'axis={self.axis})') + return repr_str + + +@TRANSFORMS.register_module() +class UniformSampleFrames(BaseTransform): + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required Keys: + + - total_frames + - start_index (optional) + + Added Keys: + + - frame_inds + - frame_interval + - num_clips + - clip_len + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Defaults to 1. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + seed (int): The random seed used during test time. Defaults to 255. 
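+
+    Examples:
+        An illustrative sketch that samples a 48-frame clip from a 64-frame
+        sequence in test mode (deterministic thanks to the fixed seed)::
+
+            >>> from mmaction.datasets.transforms import UniformSampleFrames
+            >>> results = dict(total_frames=64, start_index=0)
+            >>> sampler = UniformSampleFrames(
+            ...     clip_len=48, num_clips=1, test_mode=True)
+            >>> results = sampler(results)
+            >>> results['frame_inds'].shape, results['num_clips']
+            ((48,), 1)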
+ """ + + def __init__(self, + clip_len: int, + num_clips: int = 1, + test_mode: bool = False, + seed: int = 255) -> None: + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + + def _get_train_clips(self, num_frames: int, clip_len: int) -> np.ndarray: + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + + Returns: + np.ndarray: The sampled indices for training clips. + """ + all_inds = [] + for clip_idx in range(self.num_clips): + if num_frames < clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + clip_len) + elif clip_len <= num_frames < 2 * clip_len: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + + all_inds.append(inds) + + return np.concatenate(all_inds) + + def _get_test_clips(self, num_frames: int, clip_len: int) -> np.ndarray: + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + + Returns: + np.ndarray: The sampled indices for testing clips. + """ + + np.random.seed(self.seed) + all_inds = [] + for i in range(self.num_clips): + if num_frames < clip_len: + start_ind = i if num_frames < self.num_clips \ + else i * num_frames // self.num_clips + inds = np.arange(start_ind, start_ind + clip_len) + elif clip_len <= num_frames < clip_len * 2: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + + all_inds.append(inds) + + return np.concatenate(all_inds) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`UniformSampleFrames`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + num_frames = results['total_frames'] + + if self.test_mode: + inds = self._get_test_clips(num_frames, self.clip_len) + else: + inds = self._get_train_clips(num_frames, self.clip_len) + + inds = np.mod(inds, num_frames) + start_index = results.get('start_index', 0) + inds = inds + start_index + + if 'keypoint' in results: + kp = results['keypoint'] + assert num_frames == kp.shape[1] + num_person = kp.shape[0] + num_persons = [num_person] * num_frames + for i in range(num_frames): + j = num_person - 1 + while j >= 0 and np.all(np.abs(kp[j, i]) < 1e-5): + j -= 1 + num_persons[i] = j + 1 + transitional = [False] * num_frames + for i in range(1, num_frames - 1): + if num_persons[i] != num_persons[i - 1]: + transitional[i] = transitional[i - 1] = True + if num_persons[i] != num_persons[i + 1]: + transitional[i] = transitional[i + 1] = True + inds_int = inds.astype(np.int64) + coeff = np.array([transitional[i] for i in inds_int]) + inds = (coeff * inds_int + (1 - coeff) * inds).astype(np.float32) + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + +@TRANSFORMS.register_module() +class PadTo(BaseTransform): + """Sample frames from the video. + + To sample an n-frame clip from the video, PadTo samples + the frames from zero index, and loop or zero pad the frames + if the length of video frames is less than the value of `length`. + + Required Keys: + + - keypoint + - total_frames + - start_index (optional) + + Modified Keys: + + - keypoint + - total_frames + + Args: + length (int): The maximum length of the sampled output clip. + mode (str): The padding mode. Defaults to ``'loop'``. + """ + + def __init__(self, length: int, mode: str = 'loop') -> None: + self.length = length + assert mode in ['loop', 'zero'] + self.mode = mode + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PadTo`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + total_frames = results['total_frames'] + assert total_frames <= self.length + start_index = results.get('start_index', 0) + inds = np.arange(start_index, start_index + self.length) + inds = np.mod(inds, total_frames) + + keypoint = results['keypoint'][:, inds].copy() + if self.mode == 'zero': + keypoint[:, total_frames:] = 0 + + results['keypoint'] = keypoint + results['total_frames'] = self.length + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'length={self.length}, ' + f'mode={self.mode})') + return repr_str + + +@TRANSFORMS.register_module() +class PoseDecode(BaseTransform): + """Load and decode pose with given indices. 
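+
+    If ``frame_inds`` is missing, all frames are kept (``np.arange`` over
+    ``total_frames``). The selected indices are shifted by ``offset``
+    (0 by default) before indexing ``keypoint`` and ``keypoint_score``.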
+ + Required Keys: + + - keypoint + - total_frames (optional) + - frame_inds (optional) + - offset (optional) + - keypoint_score (optional) + + Modified Keys: + + - keypoint + - keypoint_score (optional) + """ + + @staticmethod + def _load_kp(kp: np.ndarray, frame_inds: np.ndarray) -> np.ndarray: + """Load keypoints according to sampled indexes.""" + return kp[:, frame_inds].astype(np.float32) + + @staticmethod + def _load_kpscore(kpscore: np.ndarray, + frame_inds: np.ndarray) -> np.ndarray: + """Load keypoint scores according to sampled indexes.""" + return kpscore[:, frame_inds].astype(np.float32) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PoseDecode`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if 'total_frames' not in results: + results['total_frames'] = results['keypoint'].shape[1] + + if 'frame_inds' not in results: + results['frame_inds'] = np.arange(results['total_frames']) + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + frame_inds = results['frame_inds'] + offset + + if 'keypoint_score' in results: + results['keypoint_score'] = self._load_kpscore( + results['keypoint_score'], frame_inds) + + results['keypoint'] = self._load_kp(results['keypoint'], frame_inds) + + return results + + def __repr__(self) -> str: + repr_str = f'{self.__class__.__name__}()' + return repr_str + + +@TRANSFORMS.register_module() +class MMUniformSampleFrames(UniformSampleFrames): + """Uniformly sample frames from the multi-modal data.""" + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMUniformSampleFrames`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + num_frames = results['total_frames'] + modalities = [] + for modality, clip_len in self.clip_len.items(): + if self.test_mode: + inds = self._get_test_clips(num_frames, clip_len) + else: + inds = self._get_train_clips(num_frames, clip_len) + inds = np.mod(inds, num_frames) + results[f'{modality}_inds'] = inds.astype(np.int32) + modalities.append(modality) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + if not isinstance(results['modality'], list): + # should override + results['modality'] = modalities + return results + + +@TRANSFORMS.register_module() +class MMDecode(DecordInit, DecordDecode, PoseDecode): + """Decode RGB videos and skeletons.""" + + def __init__(self, io_backend: str = 'disk', **kwargs) -> None: + DecordInit.__init__(self, io_backend=io_backend, **kwargs) + DecordDecode.__init__(self) + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMDecode`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
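+
+        Note:
+            If the decoded RGB frames have a resolution different from
+            ``results['img_shape']``, the keypoint coordinates are rescaled
+            to the real frame size and ``img_shape`` / ``original_shape``
+            are updated accordingly.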
+ """ + for mod in results['modality']: + if results[f'{mod}_inds'].ndim != 1: + results[f'{mod}_inds'] = np.squeeze(results[f'{mod}_inds']) + frame_inds = results[f'{mod}_inds'] + if mod == 'RGB': + if 'filename' not in results: + results['filename'] = results['frame_dir'] + '.mp4' + video_reader = self._get_video_reader(results['filename']) + imgs = self._decord_load_frames(video_reader, frame_inds) + del video_reader + results['imgs'] = imgs + elif mod == 'Pose': + assert 'keypoint' in results + if 'keypoint_score' not in results: + keypoint_score = [ + np.ones(keypoint.shape[:-1], dtype=np.float32) + for keypoint in results['keypoint'] + ] + results['keypoint_score'] = np.stack(keypoint_score) + results['keypoint'] = self._load_kp(results['keypoint'], + frame_inds) + results['keypoint_score'] = self._load_kpscore( + results['keypoint_score'], frame_inds) + else: + raise NotImplementedError( + f'MMDecode: Modality {mod} not supported') + + # We need to scale human keypoints to the new image size + if 'imgs' in results and 'keypoint' in results: + real_img_shape = results['imgs'][0].shape[:2] + if real_img_shape != results['img_shape']: + oh, ow = results['img_shape'] + nh, nw = real_img_shape + + assert results['keypoint'].shape[-1] in [2, 3] + results['keypoint'][..., 0] *= (nw / ow) + results['keypoint'][..., 1] *= (nh / oh) + results['img_shape'] = real_img_shape + results['original_shape'] = real_img_shape + + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class MMCompact(BaseTransform): + """Convert the coordinates of keypoints and crop the images to make them + more compact. + + Required Keys: + + - imgs + - keypoint + - img_shape + + Modified Keys: + + - imgs + - keypoint + - img_shape + + Args: + padding (float): The padding size. Defaults to 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Defaults to 10. + hw_ratio (float | tuple[float]): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Defaults to 1. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Defaults to True. 
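+
+    Note:
+        A worked example of the box expansion (the numbers are only
+        illustrative): for a tight joint box of 100x50 pixels with
+        ``padding=0.25`` and ``hw_ratio=1``, the padded half sizes are
+        62.5 and 31.25, which are then equalized to 62.5, so the final crop
+        is a 125x125 square centered on the joints (before rounding and,
+        when ``allow_imgpad=False``, clipping to the image border).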
+ """ + + def __init__(self, + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Union[float, Tuple[float]] = 1, + allow_imgpad: bool = True) -> None: + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + self.hw_ratio = hw_ratio + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def _get_box(self, keypoint: np.ndarray, img_shape: Tuple[int]) -> Tuple: + """Calculate the bounding box surrounding all joints in the frames.""" + h, w = img_shape + + kp_x = keypoint[..., 0] + kp_y = keypoint[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return 0, 0, w, h + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + return min_x, min_y, max_x, max_y + + def _compact_images(self, imgs: List[np.ndarray], img_shape: Tuple[int], + box: Tuple[int]) -> List: + """Crop the images acoordding the bounding box.""" + h, w = img_shape + min_x, min_y, max_x, max_y = box + pad_l, pad_u, pad_r, pad_d = 0, 0, 0, 0 + if min_x < 0: + pad_l = -min_x + min_x, max_x = 0, max_x + pad_l + w += pad_l + if min_y < 0: + pad_u = -min_y + min_y, max_y = 0, max_y + pad_u + h += pad_u + if max_x > w: + pad_r = max_x - w + w = max_x + if max_y > h: + pad_d = max_y - h + h = max_y + + if pad_l > 0 or pad_r > 0 or pad_u > 0 or pad_d > 0: + imgs = [ + np.pad(img, ((pad_u, pad_d), (pad_l, pad_r), (0, 0))) + for img in imgs + ] + imgs = [img[min_y:max_y, min_x:max_x] for img in imgs] + return imgs + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMCompact`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + img_shape = results['img_shape'] + kp = results['keypoint'] + # Make NaN zero + kp[np.isnan(kp)] = 0. 
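+        # NaN joints (missing detections) would otherwise propagate through
+        # the min/max reductions in _get_box; zeroed joints are simply
+        # ignored there, since _get_box only looks at non-zero coordinates.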
+ min_x, min_y, max_x, max_y = self._get_box(kp, img_shape) + + kp_x, kp_y = kp[..., 0], kp[..., 1] + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + results['imgs'] = self._compact_images(results['imgs'], img_shape, + (min_x, min_y, max_x, max_y)) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str diff --git a/mmaction/datasets/transforms/processing.py b/mmaction/datasets/transforms/processing.py new file mode 100644 index 0000000000000000000000000000000000000000..0ba68fac23babeb8f2067ba36622ce3795f86857 --- /dev/null +++ b/mmaction/datasets/transforms/processing.py @@ -0,0 +1,1444 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +import warnings +from numbers import Number +from typing import Sequence + +import cv2 +import mmcv +import mmengine +import numpy as np +from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness +from torch.nn.modules.utils import _pair + +from mmaction.registry import TRANSFORMS + + +def _combine_quadruple(a, b): + return a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2], a[3] * b[3] + + +def _flip_quadruple(a): + return 1 - a[0] - a[2], a[1], a[2], a[3] + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +@TRANSFORMS.register_module() +class Fuse(BaseTransform): + """Fuse lazy operations. + + Fusion order: + crop -> resize -> flip + + Required keys are "imgs", "img_shape" and "lazy", added or modified keys + are "imgs", "lazy". + Required keys in "lazy" are "crop_bbox", "interpolation", "flip_direction". + """ + + def transform(self, results): + """Fuse lazy operations. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
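+
+        A hypothetical pipeline sketch showing where ``Fuse`` sits (the
+        transform arguments are illustrative, not taken from a config)::
+
+            train_pipeline = [
+                dict(type='RandomResizedCrop', lazy=True),
+                dict(type='Resize', scale=(224, 224), keep_ratio=False,
+                     lazy=True),
+                dict(type='Flip', flip_ratio=0.5, lazy=True),
+                dict(type='Fuse'),
+            ]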
+ """ + if 'lazy' not in results: + raise ValueError('No lazy operation detected') + lazyop = results['lazy'] + imgs = results['imgs'] + + # crop + left, top, right, bottom = lazyop['crop_bbox'].round().astype(int) + imgs = [img[top:bottom, left:right] for img in imgs] + + # resize + img_h, img_w = results['img_shape'] + if lazyop['interpolation'] is None: + interpolation = 'bilinear' + else: + interpolation = lazyop['interpolation'] + imgs = [ + mmcv.imresize(img, (img_w, img_h), interpolation=interpolation) + for img in imgs + ] + + # flip + if lazyop['flip']: + for img in imgs: + mmcv.imflip_(img, lazyop['flip_direction']) + + results['imgs'] = imgs + del results['lazy'] + + return results + + +@TRANSFORMS.register_module() +class RandomCrop(BaseTransform): + """Vanilla square random crop that specifics the output size. + + Required keys in results are "img_shape", "keypoint" (optional), "imgs" + (optional), added or modified keys are "keypoint", "imgs", "lazy"; Required + keys in "lazy" are "flip", "crop_bbox", added or modified key is + "crop_bbox". + + Args: + size (int): The output size of the images. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, size, lazy=False): + if not isinstance(size, int): + raise TypeError(f'Size must be an int, but got {type(size)}') + self.size = size + self.lazy = lazy + + @staticmethod + def _crop_kps(kps, crop_bbox): + """Static method for cropping keypoint.""" + return kps - crop_bbox[:2] + + @staticmethod + def _crop_imgs(imgs, crop_bbox): + """Static method for cropping images.""" + x1, y1, x2, y2 = crop_bbox + return [img[y1:y2, x1:x2] for img in imgs] + + @staticmethod + def _box_crop(box, crop_bbox): + """Crop the bounding boxes according to the crop_bbox. + + Args: + box (np.ndarray): The bounding boxes. + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + + x1, y1, x2, y2 = crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + box_ = box.copy() + box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1) + box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1) + return box_ + + def _all_box_crop(self, results, crop_bbox): + """Crop the gt_bboxes and proposals in results according to crop_bbox. + + Args: + results (dict): All information about the sample, which contain + 'gt_bboxes' and 'proposals' (optional). + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_crop(results['proposals'], + crop_bbox) + return results + + def transform(self, results): + """Performs the RandomCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + assert self.size <= img_h and self.size <= img_w + + y_offset = 0 + x_offset = 0 + if img_h > self.size: + y_offset = int(np.random.randint(0, img_h - self.size)) + if img_w > self.size: + x_offset = int(np.random.randint(0, img_w - self.size)) + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = x_offset / img_w, y_offset / img_h + w_ratio, h_ratio = self.size / img_w, self.size / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + new_h, new_w = self.size, self.size + + crop_bbox = np.array( + [x_offset, y_offset, x_offset + new_w, y_offset + new_h]) + results['crop_bbox'] = crop_bbox + + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = x_offset * (lazy_right - lazy_left) / img_w + right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w + top = y_offset * (lazy_bottom - lazy_top) / img_h + bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array([(lazy_left + left), + (lazy_top + top), + (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + # Process entity boxes + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(size={self.size}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class RandomResizedCrop(RandomCrop): + """Random crop that specifics the area and height-weight ratio range. + + Required keys in results are "img_shape", "crop_bbox", "imgs" (optional), + "keypoint" (optional), added or modified keys are "imgs", "keypoint", + "crop_bbox" and "lazy"; Required keys in "lazy" are "flip", "crop_bbox", + added or modified key is "crop_bbox". + + Args: + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of + output cropped images. Default: (3 / 4, 4 / 3). + lazy (bool): Determine whether to apply lazy operation. Default: False. 
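+
+    Note:
+        Candidate boxes are sampled by drawing a target area from
+        ``area_range`` and an aspect ratio log-uniformly from
+        ``aspect_ratio_range``. If none of the ``max_attempts`` candidates
+        fits inside the image, a centered square crop whose side equals the
+        shorter image edge is used as a fallback.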
+ """ + + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + lazy=False): + self.area_range = area_range + self.aspect_ratio_range = aspect_ratio_range + self.lazy = lazy + if not mmengine.is_tuple_of(self.area_range, float): + raise TypeError(f'Area_range must be a tuple of float, ' + f'but got {type(area_range)}') + if not mmengine.is_tuple_of(self.aspect_ratio_range, float): + raise TypeError(f'Aspect_ratio_range must be a tuple of float, ' + f'but got {type(aspect_ratio_range)}') + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + """Get a crop bbox given the area range and aspect ratio range. + + Args: + img_shape (Tuple[int]): Image shape + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect + ratio range of output cropped images. Default: (3 / 4, 4 / 3). + max_attempts (int): The maximum of attempts. Default: 10. + max_attempts (int): Max attempts times to generate random candidate + bounding box. If it doesn't qualified one, the center bounding + box will be used. + Returns: + (list[int]) A random crop bbox within the area range and aspect + ratio range. + """ + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform( + np.log(min_ar), np.log(max_ar), size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt(target_areas * + aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt(target_areas / + aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def transform(self, results): + """Performs the RandomResizeCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + new_h, new_w = bottom - top, right - left + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array([(lazy_left + left), + (lazy_top + top), + (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'area_range={self.area_range}, ' + f'aspect_ratio_range={self.aspect_ratio_range}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class MultiScaleCrop(RandomCrop): + """Crop images with a list of randomly selected scales. + + Randomly select the w and h scales from a list of scales. Scale of 1 means + the base size, which is the minimal of image width and height. The scale + level of w and h is controlled to be smaller than a certain value to + prevent too large or small aspect ratio. + + Required keys are "img_shape", "imgs" (optional), "keypoint" (optional), + added or modified keys are "imgs", "crop_bbox", "img_shape", "lazy" and + "scales". Required keys in "lazy" are "crop_bbox", added or modified key is + "crop_bbox". + + Args: + input_size (int | tuple[int]): (w, h) of network input. + scales (tuple[float]): width and height scales to be selected. + max_wh_scale_gap (int): Maximum gap of w and h scale levels. + Default: 1. + random_crop (bool): If set to True, the cropping bbox will be randomly + sampled, otherwise it will be sampler from fixed regions. + Default: False. + num_fixed_crops (int): If set to 5, the cropping bbox will keep 5 + basic fixed regions: "upper left", "upper right", "lower left", + "lower right", "center". 
If set to 13, the cropping bbox will + append another 8 fix regions: "center left", "center right", + "lower center", "upper center", "upper left quarter", + "upper right quarter", "lower left quarter", "lower right quarter". + Default: 5. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, + input_size, + scales=(1, ), + max_wh_scale_gap=1, + random_crop=False, + num_fixed_crops=5, + lazy=False): + self.input_size = _pair(input_size) + if not mmengine.is_tuple_of(self.input_size, int): + raise TypeError(f'Input_size must be int or tuple of int, ' + f'but got {type(input_size)}') + + if not isinstance(scales, tuple): + raise TypeError(f'Scales must be tuple, but got {type(scales)}') + + if num_fixed_crops not in [5, 13]: + raise ValueError(f'Num_fix_crops must be in {[5, 13]}, ' + f'but got {num_fixed_crops}') + + self.scales = scales + self.max_wh_scale_gap = max_wh_scale_gap + self.random_crop = random_crop + self.num_fixed_crops = num_fixed_crops + self.lazy = lazy + + def transform(self, results): + """Performs the MultiScaleCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + base_size = min(img_h, img_w) + crop_sizes = [int(base_size * s) for s in self.scales] + + candidate_sizes = [] + for i, h in enumerate(crop_sizes): + for j, w in enumerate(crop_sizes): + if abs(i - j) <= self.max_wh_scale_gap: + candidate_sizes.append([w, h]) + + crop_size = random.choice(candidate_sizes) + for i in range(2): + if abs(crop_size[i] - self.input_size[i]) < 3: + crop_size[i] = self.input_size[i] + + crop_w, crop_h = crop_size + + if self.random_crop: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + else: + w_step = (img_w - crop_w) // 4 + h_step = (img_h - crop_h) // 4 + candidate_offsets = [ + (0, 0), # upper left + (4 * w_step, 0), # upper right + (0, 4 * h_step), # lower left + (4 * w_step, 4 * h_step), # lower right + (2 * w_step, 2 * h_step), # center + ] + if self.num_fixed_crops == 13: + extra_candidate_offsets = [ + (0, 2 * h_step), # center left + (4 * w_step, 2 * h_step), # center right + (2 * w_step, 4 * h_step), # lower center + (2 * w_step, 0 * h_step), # upper center + (1 * w_step, 1 * h_step), # upper left quarter + (3 * w_step, 1 * h_step), # upper right quarter + (1 * w_step, 3 * h_step), # lower left quarter + (3 * w_step, 3 * h_step) # lower right quarter + ] + candidate_offsets.extend(extra_candidate_offsets) + x_offset, y_offset = random.choice(candidate_offsets) + + new_h, new_w = crop_h, crop_w + + crop_bbox = np.array( + [x_offset, y_offset, x_offset + new_w, y_offset + new_h]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + results['scales'] = self.scales + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = x_offset / img_w, y_offset / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + 
y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = x_offset * (lazy_right - lazy_left) / img_w + right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w + top = y_offset * (lazy_bottom - lazy_top) / img_h + bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array([(lazy_left + left), + (lazy_top + top), + (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'input_size={self.input_size}, scales={self.scales}, ' + f'max_wh_scale_gap={self.max_wh_scale_gap}, ' + f'random_crop={self.random_crop}, ' + f'num_fixed_crops={self.num_fixed_crops}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class Resize(BaseTransform): + """Resize images to a specific size. + + Required keys are "img_shape", "modality", "imgs" (optional), "keypoint" + (optional), added or modified keys are "imgs", "img_shape", "keep_ratio", + "scale_factor", "lazy", "resize_size". Required keys in "lazy" is None, + added or modified key is "interpolation". + + Args: + scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling + factor or maximum size: + If it is a float number, the image will be rescaled by this + factor, else if it is a tuple of 2 integers, the image will + be rescaled as large as possible within the scale. + Otherwise, it serves as (w, h) of output size. + keep_ratio (bool): If set to True, Images will be resized without + changing the aspect ratio. Otherwise, it will resize images to a + given size. Default: True. + interpolation (str): Algorithm used for interpolation, + accepted values are "nearest", "bilinear", "bicubic", "area", + "lanczos". Default: "bilinear". + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, + scale, + keep_ratio=True, + interpolation='bilinear', + lazy=False): + if isinstance(scale, float): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + # assign np.inf to long edge for rescaling short edge later. 
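+                # e.g. scale=(-1, 256) with keep_ratio=True resizes the
+                # short edge to 256 and scales the long edge to preserve
+                # the aspect ratio.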
+ scale = (np.inf, max_long_edge) + else: + raise TypeError( + f'Scale must be float or tuple of int, but got {type(scale)}') + self.scale = scale + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.lazy = lazy + + def _resize_imgs(self, imgs, new_w, new_h): + """Static method for resizing keypoint.""" + return [ + mmcv.imresize( + img, (new_w, new_h), interpolation=self.interpolation) + for img in imgs + ] + + @staticmethod + def _resize_kps(kps, scale_factor): + """Static method for resizing keypoint.""" + return kps * scale_factor + + @staticmethod + def _box_resize(box, scale_factor): + """Rescale the bounding boxes according to the scale_factor. + + Args: + box (np.ndarray): The bounding boxes. + scale_factor (np.ndarray): The scale factor used for rescaling. + """ + assert len(scale_factor) == 2 + scale_factor = np.concatenate([scale_factor, scale_factor]) + return box * scale_factor + + def transform(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + if 'scale_factor' not in results: + results['scale_factor'] = np.array([1, 1], dtype=np.float32) + img_h, img_w = results['img_shape'] + + if self.keep_ratio: + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale) + else: + new_w, new_h = self.scale + + self.scale_factor = np.array([new_w / img_w, new_h / img_h], + dtype=np.float32) + + results['img_shape'] = (new_h, new_w) + results['keep_ratio'] = self.keep_ratio + results['scale_factor'] = results['scale_factor'] * self.scale_factor + + if not self.lazy: + if 'imgs' in results: + results['imgs'] = self._resize_imgs(results['imgs'], new_w, + new_h) + if 'keypoint' in results: + results['keypoint'] = self._resize_kps(results['keypoint'], + self.scale_factor) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + lazyop['interpolation'] = self.interpolation + + if 'gt_bboxes' in results: + assert not self.lazy + results['gt_bboxes'] = self._box_resize(results['gt_bboxes'], + self.scale_factor) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_resize( + results['proposals'], self.scale_factor) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'scale={self.scale}, keep_ratio={self.keep_ratio}, ' + f'interpolation={self.interpolation}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class RandomRescale(BaseTransform): + """Randomly resize images so that the short_edge is resized to a specific + size in a given range. The scale ratio is unchanged after resizing. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size", + "short_edge". + + Args: + scale_range (tuple[int]): The range of short edge length. A closed + interval. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". 
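+
+    Note:
+        For example (the numbers are only illustrative),
+        ``scale_range=(256, 320)`` resizes the short edge of each clip to a
+        length drawn uniformly from [256, 320] while keeping the aspect
+        ratio.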
+ """ + + def __init__(self, scale_range, interpolation='bilinear'): + self.scale_range = scale_range + # make sure scale_range is legal, first make sure the type is OK + assert mmengine.is_tuple_of(scale_range, int) + assert len(scale_range) == 2 + assert scale_range[0] < scale_range[1] + assert np.all([x > 0 for x in scale_range]) + + self.keep_ratio = True + self.interpolation = interpolation + + def transform(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + short_edge = np.random.randint(self.scale_range[0], + self.scale_range[1] + 1) + resize = Resize((-1, short_edge), + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + + results['short_edge'] = short_edge + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@TRANSFORMS.register_module() +class Flip(BaseTransform): + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + + Required keys are "img_shape", "modality", "imgs" (optional), "keypoint" + (optional), added or modified keys are "imgs", "keypoint", "lazy" and + "flip_direction". Required keys in "lazy" is None, added or modified key + are "flip" and "flip_direction". The Flip augmentation should be placed + after any cropping / reshaping augmentations, to make sure crop_quadruple + is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". + flip_label_map (Dict[int, int] | None): Transform the label of the + flipped image with the specific label. Default: None. + left_kp (list[int]): Indexes of left keypoints, used to flip keypoints. + Default: None. + right_kp (list[ind]): Indexes of right keypoints, used to flip + keypoints. Default: None. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, + flip_ratio=0.5, + direction='horizontal', + flip_label_map=None, + left_kp=None, + right_kp=None, + lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. 
' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.flip_label_map = flip_label_map + self.left_kp = left_kp + self.right_kp = right_kp + self.lazy = lazy + + def _flip_imgs(self, imgs, modality): + """Utility function for flipping images.""" + _ = [mmcv.imflip_(img, self.direction) for img in imgs] + lt = len(imgs) + if modality == 'Flow': + # The 1st frame of each 2 frames is flow-x + for i in range(0, lt, 2): + imgs[i] = mmcv.iminvert(imgs[i]) + return imgs + + def _flip_kps(self, kps, kpscores, img_width): + """Utility function for flipping keypoint.""" + kp_x = kps[..., 0] + kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0] + new_order = list(range(kps.shape[2])) + if self.left_kp is not None and self.right_kp is not None: + for left, right in zip(self.left_kp, self.right_kp): + new_order[left] = right + new_order[right] = left + kps = kps[:, :, new_order] + if kpscores is not None: + kpscores = kpscores[:, :, new_order] + return kps, kpscores + + @staticmethod + def _box_flip(box, img_width): + """Flip the bounding boxes given the width of the image. + + Args: + box (np.ndarray): The bounding boxes. + img_width (int): The img width. + """ + box_ = box.copy() + box_[..., 0::4] = img_width - box[..., 2::4] + box_[..., 2::4] = img_width - box[..., 0::4] + return box_ + + def transform(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + assert self.direction == 'horizontal', ( + 'Only horizontal flips are' + 'supported for human keypoints') + + modality = results['modality'] + if modality == 'Flow': + assert self.direction == 'horizontal' + + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + img_width = results['img_shape'][1] + + if self.flip_label_map is not None and flip: + results['label'] = self.flip_label_map.get(results['label'], + results['label']) + + if not self.lazy: + if flip: + if 'imgs' in results: + results['imgs'] = self._flip_imgs(results['imgs'], + modality) + if 'keypoint' in results: + kp = results['keypoint'] + kpscore = results.get('keypoint_score', None) + kp, kpscore = self._flip_kps(kp, kpscore, img_width) + results['keypoint'] = kp + if 'keypoint_score' in results: + results['keypoint_score'] = kpscore + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + width = results['img_shape'][1] + results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_flip(results['proposals'], + width) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'flip_label_map={self.flip_label_map}, lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class ColorJitter(BaseTransform): + """Perform ColorJitter to each img. + + Required keys are "imgs", added or modified keys are "imgs". 
+ + Args: + brightness (float | tuple[float]): The jitter range for brightness, if + set as a float, the range will be (1 - brightness, 1 + brightness). + Default: 0.5. + contrast (float | tuple[float]): The jitter range for contrast, if set + as a float, the range will be (1 - contrast, 1 + contrast). + Default: 0.5. + saturation (float | tuple[float]): The jitter range for saturation, if + set as a float, the range will be (1 - saturation, 1 + saturation). + Default: 0.5. + hue (float | tuple[float]): The jitter range for hue, if set as a + float, the range will be (-hue, hue). Default: 0.1. + """ + + @staticmethod + def check_input(val, max, base): + if isinstance(val, tuple): + assert base - max <= val[0] <= val[1] <= base + max + return val + assert val <= max + return (base - val, base + val) + + @staticmethod + def rgb_to_grayscale(img): + return 0.2989 * img[..., 0] + 0.587 * img[..., 1] + 0.114 * img[..., 2] + + @staticmethod + def adjust_contrast(img, factor): + val = np.mean(ColorJitter.rgb_to_grayscale(img)) + return factor * img + (1 - factor) * val + + @staticmethod + def adjust_saturation(img, factor): + gray = np.stack([ColorJitter.rgb_to_grayscale(img)] * 3, axis=-1) + return factor * img + (1 - factor) * gray + + @staticmethod + def adjust_hue(img, factor): + img = np.clip(img, 0, 255).astype(np.uint8) + hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) + offset = int(factor * 255) + hsv[..., 0] = (hsv[..., 0] + offset) % 180 + img = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB) + return img.astype(np.float32) + + def __init__(self, brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1): + self.brightness = self.check_input(brightness, 1, 1) + self.contrast = self.check_input(contrast, 1, 1) + self.saturation = self.check_input(saturation, 1, 1) + self.hue = self.check_input(hue, 0.5, 0) + self.fn_idx = np.random.permutation(4) + + def transform(self, results): + """Perform ColorJitter. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + imgs = results['imgs'] + num_clips, clip_len = 1, len(imgs) + + new_imgs = [] + for i in range(num_clips): + b = np.random.uniform( + low=self.brightness[0], high=self.brightness[1]) + c = np.random.uniform(low=self.contrast[0], high=self.contrast[1]) + s = np.random.uniform( + low=self.saturation[0], high=self.saturation[1]) + h = np.random.uniform(low=self.hue[0], high=self.hue[1]) + start, end = i * clip_len, (i + 1) * clip_len + + for img in imgs[start:end]: + img = img.astype(np.float32) + for fn_id in self.fn_idx: + if fn_id == 0 and b != 1: + img *= b + if fn_id == 1 and c != 1: + img = self.adjust_contrast(img, c) + if fn_id == 2 and s != 1: + img = self.adjust_saturation(img, s) + if fn_id == 3 and h != 0: + img = self.adjust_hue(img, h) + img = np.clip(img, 0, 255).astype(np.uint8) + new_imgs.append(img) + results['imgs'] = new_imgs + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'brightness={self.brightness}, ' + f'contrast={self.contrast}, ' + f'saturation={self.saturation}, ' + f'hue={self.hue})') + return repr_str + + +@TRANSFORMS.register_module() +class CenterCrop(RandomCrop): + """Crop the center area from images. + + Required keys are "img_shape", "imgs" (optional), "keypoint" (optional), + added or modified keys are "imgs", "keypoint", "crop_bbox", "lazy" and + "img_shape". Required keys in "lazy" is "crop_bbox", added or modified key + is "crop_bbox". + + Args: + crop_size (int | tuple[int]): (w, h) of crop size. 
+ lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, crop_size, lazy=False): + self.crop_size = _pair(crop_size) + self.lazy = lazy + if not mmengine.is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def transform(self, results): + """Performs the CenterCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + crop_w, crop_h = self.crop_size + + left = (img_w - crop_w) // 2 + top = (img_h - crop_h) // 2 + right = left + crop_w + bottom = top + crop_h + new_h, new_w = bottom - top, right - left + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array([(lazy_left + left), + (lazy_top + top), + (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class ThreeCrop(BaseTransform): + """Crop images into three crops. + + Crop the images equally into three crops with equal intervals along the + shorter side. + Required keys are "imgs", "img_shape", added or modified keys are "imgs", + "crop_bbox" and "img_shape". + + Args: + crop_size(int | tuple[int]): (w, h) of crop size. + """ + + def __init__(self, crop_size): + self.crop_size = _pair(crop_size) + if not mmengine.is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def transform(self, results): + """Performs the ThreeCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, False) + if 'gt_bboxes' in results or 'proposals' in results: + warnings.warn('ThreeCrop cannot process bounding boxes') + + imgs = results['imgs'] + img_h, img_w = results['imgs'][0].shape[:2] + crop_w, crop_h = self.crop_size + assert crop_h == img_h or crop_w == img_w + + if crop_h == img_h: + w_step = (img_w - crop_w) // 2 + offsets = [ + (0, 0), # left + (2 * w_step, 0), # right + (w_step, 0), # middle + ] + elif crop_w == img_w: + h_step = (img_h - crop_h) // 2 + offsets = [ + (0, 0), # top + (0, 2 * h_step), # down + (0, h_step), # middle + ] + + cropped = [] + crop_bboxes = [] + for x_offset, y_offset in offsets: + bbox = [x_offset, y_offset, x_offset + crop_w, y_offset + crop_h] + crop = [ + img[y_offset:y_offset + crop_h, x_offset:x_offset + crop_w] + for img in imgs + ] + cropped.extend(crop) + crop_bboxes.extend([bbox for _ in range(len(imgs))]) + + crop_bboxes = np.array(crop_bboxes) + results['imgs'] = cropped + results['crop_bbox'] = crop_bboxes + results['img_shape'] = results['imgs'][0].shape[:2] + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(crop_size={self.crop_size})' + return repr_str + + +@TRANSFORMS.register_module() +class TenCrop(BaseTransform): + """Crop the images into 10 crops (corner + center + flip). + + Crop the four corners and the center part of the image with the same + given crop_size, and flip it horizontally. + Required keys are "imgs", "img_shape", added or modified keys are "imgs", + "crop_bbox" and "img_shape". + + Args: + crop_size(int | tuple[int]): (w, h) of crop size. + """ + + def __init__(self, crop_size): + self.crop_size = _pair(crop_size) + if not mmengine.is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def transform(self, results): + """Performs the TenCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, False) + + if 'gt_bboxes' in results or 'proposals' in results: + warnings.warn('TenCrop cannot process bounding boxes') + + imgs = results['imgs'] + + img_h, img_w = results['imgs'][0].shape[:2] + crop_w, crop_h = self.crop_size + + w_step = (img_w - crop_w) // 4 + h_step = (img_h - crop_h) // 4 + + offsets = [ + (0, 0), # upper left + (4 * w_step, 0), # upper right + (0, 4 * h_step), # lower left + (4 * w_step, 4 * h_step), # lower right + (2 * w_step, 2 * h_step), # center + ] + + img_crops = list() + crop_bboxes = list() + for x_offset, y_offsets in offsets: + crop = [ + img[y_offsets:y_offsets + crop_h, x_offset:x_offset + crop_w] + for img in imgs + ] + flip_crop = [np.flip(c, axis=1).copy() for c in crop] + bbox = [x_offset, y_offsets, x_offset + crop_w, y_offsets + crop_h] + img_crops.extend(crop) + img_crops.extend(flip_crop) + crop_bboxes.extend([bbox for _ in range(len(imgs) * 2)]) + + crop_bboxes = np.array(crop_bboxes) + results['imgs'] = img_crops + results['crop_bbox'] = crop_bboxes + results['img_shape'] = results['imgs'][0].shape[:2] + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(crop_size={self.crop_size})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomErasing(BaseTransform): + """Randomly selects a rectangle region in an image and erase pixels. + basically refer mmcls. 
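+
+    The implementation follows the ``RandomErasing`` transform in
+    MMClassification and erases the same patch (same location and size) in
+    every frame of the clip.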
+ + **Required Keys:** + + - img + + **Modified Keys:** + + - img + + Args: + erase_prob (float): Probability that image will be randomly erased. + Default: 0.5 + min_area_ratio (float): Minimum erased area / input image area + Default: 0.02 + max_area_ratio (float): Maximum erased area / input image area + Default: 1/3 + aspect_range (sequence | float): Aspect ratio range of erased area. + if float, it will be converted to (aspect_ratio, 1/aspect_ratio) + Default: (3/10, 10/3) + mode (str): Fill method in erased area, can be: + + - const (default): All pixels are assign with the same value. + - rand: each pixel is assigned with a random value in [0, 255] + + fill_color (sequence | Number): Base color filled in erased area. + Defaults to (128, 128, 128). + fill_std (sequence | Number, optional): If set and ``mode`` is 'rand', + fill erased area with random color from normal distribution + (mean=fill_color, std=fill_std); If not set, fill erased area with + random color from uniform distribution (0~255). Defaults to None. + + Note: + See `Random Erasing Data Augmentation + `_ + + This paper provided 4 modes: RE-R, RE-M, RE-0, RE-255, and use RE-M as + default. The config of these 4 modes are: + + - RE-R: RandomErasing(mode='rand') + - RE-M: RandomErasing(mode='const', fill_color=(123.67, 116.3, 103.5)) + - RE-0: RandomErasing(mode='const', fill_color=0) + - RE-255: RandomErasing(mode='const', fill_color=255) + """ + + def __init__(self, + erase_prob=0.5, + min_area_ratio=0.02, + max_area_ratio=1 / 3, + aspect_range=(3 / 10, 10 / 3), + mode='const', + fill_color=(128, 128, 128), + fill_std=None): + assert isinstance(erase_prob, float) and 0. <= erase_prob <= 1. + assert isinstance(min_area_ratio, float) and 0. <= min_area_ratio <= 1. + assert isinstance(max_area_ratio, float) and 0. <= max_area_ratio <= 1. + assert min_area_ratio <= max_area_ratio, \ + 'min_area_ratio should be smaller than max_area_ratio' + if isinstance(aspect_range, float): + aspect_range = min(aspect_range, 1 / aspect_range) + aspect_range = (aspect_range, 1 / aspect_range) + assert isinstance(aspect_range, Sequence) and len(aspect_range) == 2 \ + and all(isinstance(x, float) for x in aspect_range), \ + 'aspect_range should be a float or Sequence with two float.' + assert all(x > 0 for x in aspect_range), \ + 'aspect_range should be positive.' + assert aspect_range[0] <= aspect_range[1], \ + 'In aspect_range (min, max), min should be smaller than max.' + assert mode in ['const', 'rand'], \ + 'Please select `mode` from ["const", "rand"].' + if isinstance(fill_color, Number): + fill_color = [fill_color] * 3 + assert isinstance(fill_color, Sequence) and len(fill_color) == 3 \ + and all(isinstance(x, Number) for x in fill_color), \ + 'fill_color should be a float or Sequence with three int.' + if fill_std is not None: + if isinstance(fill_std, Number): + fill_std = [fill_std] * 3 + assert isinstance(fill_std, Sequence) and len(fill_std) == 3 \ + and all(isinstance(x, Number) for x in fill_std), \ + 'fill_std should be a float or Sequence with three int.' 
+ + self.erase_prob = erase_prob + self.min_area_ratio = min_area_ratio + self.max_area_ratio = max_area_ratio + self.aspect_range = aspect_range + self.mode = mode + self.fill_color = fill_color + self.fill_std = fill_std + + def _img_fill_pixels(self, img, top, left, h, w): + """Fill pixels to the patch of image.""" + if self.mode == 'const': + patch = np.empty((h, w, 3), dtype=np.uint8) + patch[:, :] = np.array(self.fill_color, dtype=np.uint8) + elif self.fill_std is None: + # Uniform distribution + patch = np.random.uniform(0, 256, (h, w, 3)).astype(np.uint8) + else: + # Normal distribution + patch = np.random.normal(self.fill_color, self.fill_std, (h, w, 3)) + patch = np.clip(patch.astype(np.int32), 0, 255).astype(np.uint8) + + img[top:top + h, left:left + w] = patch + return img + + def _fill_pixels(self, imgs, top, left, h, w): + """Fill pixels to the patch of each image in frame clip.""" + return [self._img_fill_pixels(img, top, left, h, w) for img in imgs] + + @cache_randomness + def random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.erase_prob + + @cache_randomness + def random_patch(self, img_h, img_w): + """Randomly generate patch the erase.""" + # convert the aspect ratio to log space to equally handle width and + # height. + log_aspect_range = np.log( + np.array(self.aspect_range, dtype=np.float32)) + aspect_ratio = np.exp(np.random.uniform(*log_aspect_range)) + area = img_h * img_w + area *= np.random.uniform(self.min_area_ratio, self.max_area_ratio) + + h = min(int(round(np.sqrt(area * aspect_ratio))), img_h) + w = min(int(round(np.sqrt(area / aspect_ratio))), img_w) + top = np.random.randint(0, img_h - h) if img_h > h else 0 + left = np.random.randint(0, img_w - w) if img_w > w else 0 + return top, left, h, w + + def transform(self, results): + """ + Args: + results (dict): Results dict from pipeline + + Returns: + dict: Results after the transformation. + """ + if self.random_disable(): + return results + + imgs = results['imgs'] + img_h, img_w = imgs[0].shape[:2] + + imgs = self._fill_pixels(imgs, *self.random_patch(img_h, img_w)) + + results['imgs'] = imgs + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(erase_prob={self.erase_prob}, ' + repr_str += f'min_area_ratio={self.min_area_ratio}, ' + repr_str += f'max_area_ratio={self.max_area_ratio}, ' + repr_str += f'aspect_range={self.aspect_range}, ' + repr_str += f'mode={self.mode}, ' + repr_str += f'fill_color={self.fill_color}, ' + repr_str += f'fill_std={self.fill_std})' + return repr_str diff --git a/mmaction/datasets/transforms/text_transforms.py b/mmaction/datasets/transforms/text_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..46b5d982ca17779cfc696432b6e6ffcd6a1d29cb --- /dev/null +++ b/mmaction/datasets/transforms/text_transforms.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +from mmcv.transforms import BaseTransform + +from mmaction.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class CLIPTokenize(BaseTransform): + """Tokenize text and convert to tensor.""" + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`CLIPTokenize`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + + try: + import clip + except ImportError: + raise ImportError('Please run `pip install ' + 'git+https://github.com/openai/CLIP.git` ' + 'to install clip first. 
') + + text = results['text'] + text_tokenized = clip.tokenize(text)[0] + results['text'] = text_tokenized + return results diff --git a/mmaction/datasets/transforms/wrappers.py b/mmaction/datasets/transforms/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..cf41be7f60f23e63624b3c527542f5614f519932 --- /dev/null +++ b/mmaction/datasets/transforms/wrappers.py @@ -0,0 +1,380 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random + +import mmengine +import numpy as np +from mmcv.transforms import BaseTransform, to_tensor +from mmengine.utils import digit_version + +from mmaction.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class TorchVisionWrapper(BaseTransform): + """Torchvision Augmentations, under torchvision.transforms. + + Args: + op (str): The name of the torchvision transformation. + """ + + def __init__(self, op, **kwargs): + try: + import torchvision + import torchvision.transforms as tv_trans + except ImportError: + raise RuntimeError('Install torchvision to use TorchvisionTrans') + if digit_version(torchvision.__version__) < digit_version('0.8.0'): + raise RuntimeError('The version of torchvision should be at least ' + '0.8.0') + + trans = getattr(tv_trans, op, None) + assert trans, f'Transform {op} not in torchvision' + self.trans = trans(**kwargs) + + def transform(self, results): + """Perform Torchvision augmentations. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert 'imgs' in results + + imgs = [x.transpose(2, 0, 1) for x in results['imgs']] + imgs = to_tensor(np.stack(imgs)) + + imgs = self.trans(imgs).data.numpy() + imgs[imgs > 255] = 255 + imgs[imgs < 0] = 0 + imgs = imgs.astype(np.uint8) + imgs = [x.transpose(1, 2, 0) for x in imgs] + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class PytorchVideoWrapper(BaseTransform): + """PytorchVideoTrans Augmentations, under pytorchvideo.transforms. + + Args: + op (str): The name of the pytorchvideo transformation. + """ + + def __init__(self, op, **kwargs): + try: + import pytorchvideo.transforms as ptv_trans + import torch + except ImportError: + raise RuntimeError('Install pytorchvideo to use PytorchVideoTrans') + if digit_version(torch.__version__) < digit_version('1.8.0'): + raise RuntimeError( + 'The version of PyTorch should be at least 1.8.0') + + trans = getattr(ptv_trans, op, None) + assert trans, f'Transform {op} not in pytorchvideo' + + supported_pytorchvideo_trans = ('AugMix', 'RandAugment', + 'RandomResizedCrop', 'ShortSideScale', + 'RandomShortSideScale') + assert op in supported_pytorchvideo_trans,\ + f'PytorchVideo Transform {op} is not supported in MMAction2' + + self.trans = trans(**kwargs) + self.op = op + + def transform(self, results): + """Perform PytorchVideoTrans augmentations. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert 'imgs' in results + + assert 'gt_bboxes' not in results,\ + f'PytorchVideo {self.op} doesn\'t support bboxes yet.' + assert 'proposals' not in results,\ + f'PytorchVideo {self.op} doesn\'t support bboxes yet.' 
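+
+        # The two branches below convert the frame list into the layout each
+        # pytorchvideo op expects: AugMix and RandAugment operate on a uint8
+        # tensor of shape (T, C, H, W), while the scale/crop ops operate on a
+        # float32 tensor of shape (C, T, H, W) rescaled to [0, 1].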
+ + if self.op in ('AugMix', 'RandAugment'): + # list[ndarray(h, w, 3)] -> torch.tensor(t, c, h, w) + imgs = [x.transpose(2, 0, 1) for x in results['imgs']] + imgs = to_tensor(np.stack(imgs)) + else: + # list[ndarray(h, w, 3)] -> torch.tensor(c, t, h, w) + # uint8 -> float32 + imgs = to_tensor((np.stack(results['imgs']).transpose(3, 0, 1, 2) / + 255.).astype(np.float32)) + + imgs = self.trans(imgs).data.numpy() + + if self.op in ('AugMix', 'RandAugment'): + imgs[imgs > 255] = 255 + imgs[imgs < 0] = 0 + imgs = imgs.astype(np.uint8) + + # torch.tensor(t, c, h, w) -> list[ndarray(h, w, 3)] + imgs = [x.transpose(1, 2, 0) for x in imgs] + else: + # float32 -> uint8 + imgs = imgs * 255 + imgs[imgs > 255] = 255 + imgs[imgs < 0] = 0 + imgs = imgs.astype(np.uint8) + + # torch.tensor(c, t, h, w) -> list[ndarray(h, w, 3)] + imgs = [x for x in imgs.transpose(1, 2, 3, 0)] + + results['imgs'] = imgs + + return results + + +@TRANSFORMS.register_module() +class ImgAug(BaseTransform): + """Imgaug augmentation. + + Adds custom transformations from imgaug library. + Please visit `https://imgaug.readthedocs.io/en/latest/index.html` + to get more information. Two demo configs could be found in tsn and i3d + config folder. + + It's better to use uint8 images as inputs since imgaug works best with + numpy dtype uint8 and isn't well tested with other dtypes. It should be + noted that not all of the augmenters have the same input and output dtype, + which may cause unexpected results. + + Required keys are "imgs", "img_shape"(if "gt_bboxes" is not None) and + "modality", added or modified keys are "imgs", "img_shape", "gt_bboxes" + and "proposals". + + It is worth mentioning that `Imgaug` will NOT create custom keys like + "interpolation", "crop_bbox", "flip_direction", etc. So when using + `Imgaug` along with other mmaction2 pipelines, we should pay more attention + to required keys. + + Two steps to use `Imgaug` pipeline: + 1. Create initialization parameter `transforms`. There are three ways + to create `transforms`. + 1) string: only support `default` for now. + e.g. `transforms='default'` + 2) list[dict]: create a list of augmenters by a list of dicts, each + dict corresponds to one augmenter. Every dict MUST contain a key + named `type`. `type` should be a string(iaa.Augmenter's name) or + an iaa.Augmenter subclass. + e.g. `transforms=[dict(type='Rotate', rotate=(-20, 20))]` + e.g. `transforms=[dict(type=iaa.Rotate, rotate=(-20, 20))]` + 3) iaa.Augmenter: create an imgaug.Augmenter object. + e.g. `transforms=iaa.Rotate(rotate=(-20, 20))` + 2. Add `Imgaug` in dataset pipeline. It is recommended to insert imgaug + pipeline before `Normalize`. A demo pipeline is listed as follows. 
+ ``` + pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + ), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Imgaug', transforms='default'), + # dict(type='Imgaug', transforms=[ + # dict(type='Rotate', rotate=(-20, 20)) + # ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) + ] + ``` + + Args: + transforms (str | list[dict] | :obj:`iaa.Augmenter`): Three different + ways to create imgaug augmenter. + """ + + def __init__(self, transforms): + # Hack to fix incompatibility of ImgAug and latest Numpy + if digit_version(np.__version__) >= digit_version('1.24.0'): + np.bool = bool + import imgaug.augmenters as iaa + + if transforms == 'default': + self.transforms = self.default_transforms() + elif isinstance(transforms, list): + assert all(isinstance(trans, dict) for trans in transforms) + self.transforms = transforms + elif isinstance(transforms, iaa.Augmenter): + self.aug = self.transforms = transforms + else: + raise ValueError('transforms must be `default` or a list of dicts' + ' or iaa.Augmenter object') + + if not isinstance(transforms, iaa.Augmenter): + self.aug = iaa.Sequential( + [self.imgaug_builder(t) for t in self.transforms]) + + @staticmethod + def default_transforms(): + """Default transforms for imgaug. + + Implement RandAugment by imgaug. + Please visit `https://arxiv.org/abs/1909.13719` for more information. + + Augmenters and hyper parameters are borrowed from the following repo: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py # noqa + + Miss one augmenter ``SolarizeAdd`` since imgaug doesn't support this. + + Returns: + dict: The constructed RandAugment transforms. + """ + # RandAugment hyper params + num_augmenters = 2 + cur_magnitude, max_magnitude = 9, 10 + cur_level = 1.0 * cur_magnitude / max_magnitude + + return [ + dict( + type='SomeOf', + n=num_augmenters, + children=[ + dict( + type='ShearX', + shear=17.19 * cur_level * random.choice([-1, 1])), + dict( + type='ShearY', + shear=17.19 * cur_level * random.choice([-1, 1])), + dict( + type='TranslateX', + percent=.2 * cur_level * random.choice([-1, 1])), + dict( + type='TranslateY', + percent=.2 * cur_level * random.choice([-1, 1])), + dict( + type='Rotate', + rotate=30 * cur_level * random.choice([-1, 1])), + dict(type='Posterize', nb_bits=max(1, int(4 * cur_level))), + dict(type='Solarize', threshold=256 * cur_level), + dict(type='EnhanceColor', factor=1.8 * cur_level + .1), + dict(type='EnhanceContrast', factor=1.8 * cur_level + .1), + dict( + type='EnhanceBrightness', factor=1.8 * cur_level + .1), + dict(type='EnhanceSharpness', factor=1.8 * cur_level + .1), + dict(type='Autocontrast', cutoff=0), + dict(type='Equalize'), + dict(type='Invert', p=1.), + dict( + type='Cutout', + nb_iterations=1, + size=0.2 * cur_level, + squared=True) + ]) + ] + + def imgaug_builder(self, cfg): + """Import a module from imgaug. + + It follows the logic of :func:`build_from_cfg`. Use a dict object to + create an iaa.Augmenter object. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". 
+ + Returns: + obj:`iaa.Augmenter`: The constructed imgaug augmenter. + """ + import imgaug.augmenters as iaa + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if mmengine.is_str(obj_type): + obj_cls = getattr(iaa, obj_type) if hasattr(iaa, obj_type) \ + else getattr(iaa.pillike, obj_type) + elif issubclass(obj_type, iaa.Augmenter): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + for aug_list_key in ['children', 'then_list', 'else_list']: + if aug_list_key in args: + args[aug_list_key] = [ + self.imgaug_builder(child) for child in args[aug_list_key] + ] + + return obj_cls(**args) + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.aug})' + return repr_str + + def transform(self, results): + """Perform Imgaug augmentations. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert results['modality'] == 'RGB', 'Imgaug only support RGB images.' + in_type = results['imgs'][0].dtype + + cur_aug = self.aug.to_deterministic() + + results['imgs'] = [ + cur_aug.augment_image(frame) for frame in results['imgs'] + ] + img_h, img_w, _ = results['imgs'][0].shape + + out_type = results['imgs'][0].dtype + assert in_type == out_type, \ + ('Imgaug input dtype and output dtype are not the same. ', + f'Convert from {in_type} to {out_type}') + + if 'gt_bboxes' in results: + from imgaug.augmentables import bbs + bbox_list = [ + bbs.BoundingBox( + x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3]) + for bbox in results['gt_bboxes'] + ] + bboxes = bbs.BoundingBoxesOnImage( + bbox_list, shape=results['img_shape']) + bbox_aug, *_ = cur_aug.augment_bounding_boxes([bboxes]) + results['gt_bboxes'] = [[ + max(bbox.x1, 0), + max(bbox.y1, 0), + min(bbox.x2, img_w), + min(bbox.y2, img_h) + ] for bbox in bbox_aug.items] + if 'proposals' in results: + bbox_list = [ + bbs.BoundingBox( + x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3]) + for bbox in results['proposals'] + ] + bboxes = bbs.BoundingBoxesOnImage( + bbox_list, shape=results['img_shape']) + bbox_aug, *_ = cur_aug.augment_bounding_boxes([bboxes]) + results['proposals'] = [[ + max(bbox.x1, 0), + max(bbox.y1, 0), + min(bbox.x2, img_w), + min(bbox.y2, img_h) + ] for bbox in bbox_aug.items] + + results['img_shape'] = (img_h, img_w) + + return results diff --git a/mmaction/datasets/video_dataset.py b/mmaction/datasets/video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ac76dd9ae426060c75e395c9477e11d0eca56de4 --- /dev/null +++ b/mmaction/datasets/video_dataset.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, List, Optional, Union + +from mmengine.fileio import exists, list_from_file + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + + +@DATASETS.register_module() +class VideoDataset(BaseActionDataset): + """Video dataset for action recognition. + + The dataset loads raw videos and apply specified transforms to return a + dict containing the frame tensors and other information. + + The ann_file is a text file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a + whitespace. Example of a annotation file: + + .. 
code-block:: txt + + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict): Path to a directory where videos + are held. Defaults to ``dict(video='')``. + multi_class (bool): Determines whether the dataset is a multi-class + dataset. Defaults to False. + num_classes (int, optional): Number of classes of the dataset, used in + multi-class datasets. Defaults to None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking videos as input, + it should be set to 0, since frames loaded from videos count + from 0. Defaults to 0. + modality (str): Modality of data. Support ``'RGB'``, ``'Flow'``. + Defaults to ``'RGB'``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + delimiter (str): Delimiter for the annotation file. + Defaults to ``' '`` (whitespace). + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + data_prefix: ConfigType = dict(video=''), + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 0, + modality: str = 'RGB', + test_mode: bool = False, + delimiter: str = ' ', + **kwargs) -> None: + self.delimiter = delimiter + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + multi_class=multi_class, + num_classes=num_classes, + start_index=start_index, + modality=modality, + test_mode=test_mode, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split(self.delimiter) + if self.multi_class: + assert self.num_classes is not None + filename, label = line_split[0], line_split[1:] + label = list(map(int, label)) + # add fake label for inference datalist without label + elif len(line_split) == 1: + filename, label = line_split[0], -1 + else: + filename, label = line_split + label = int(label) + if self.data_prefix['video'] is not None: + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list diff --git a/mmaction/datasets/video_text_dataset.py b/mmaction/datasets/video_text_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b9687bcb9f2ab56e46c4d238e89ea574db1b0a38 --- /dev/null +++ b/mmaction/datasets/video_text_dataset.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
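+# Illustrative annotation format for ``VideoTextDataset`` below, inferred from
+# ``load_data_list``: a JSON object that maps each video filename to a list of
+# caption strings (the filenames and captions here are made-up examples):
+#
+#     {
+#         "video_0.mp4": ["a man is cooking", "someone prepares a meal"],
+#         "video_1.mp4": ["a dog runs on the grass"]
+#     }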
+import json +import os.path as osp +from typing import Dict, List + +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class VideoTextDataset(BaseActionDataset): + """Video dataset for video-text task like video retrieval.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + video_dict = json.load(f) + for filename, texts in video_dict.items(): + filename = osp.join(self.data_prefix['video'], filename) + video_text_pairs = [] + for text in texts: + data_item = dict(filename=filename, text=text) + video_text_pairs.append(data_item) + data_list.extend(video_text_pairs) + + return data_list diff --git a/mmaction/engine/__init__.py b/mmaction/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a5d572e2dd39b3b65cf58406fcb651d9e31c11a0 --- /dev/null +++ b/mmaction/engine/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401, F403 +from .model import * # noqa: F401, F403 +from .optimizers import * # noqa: F401, F403 +from .runner import * # noqa: F401, F403 diff --git a/mmaction/engine/hooks/__init__.py b/mmaction/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1b2f6895a27ecc90efd91c3b87d365fe780bae32 --- /dev/null +++ b/mmaction/engine/hooks/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .output import OutputHook +from .visualization_hook import VisualizationHook + +__all__ = ['OutputHook', 'VisualizationHook'] diff --git a/mmaction/engine/hooks/output.py b/mmaction/engine/hooks/output.py new file mode 100644 index 0000000000000000000000000000000000000000..3744b5b2da6a3c2fc7c08b6afebd295ad10efe78 --- /dev/null +++ b/mmaction/engine/hooks/output.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings + +import torch + + +class OutputHook: + """Output feature map of some layers. + + Args: + module (nn.Module): The whole module to get layers. + outputs (tuple[str] | list[str]): Layer name to output. Default: None. + as_tensor (bool): Determine to return a tensor or a numpy array. + Default: False. 
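+
+    Example (illustrative sketch; ``model`` and ``inputs`` are placeholders
+    and the layer name depends on the wrapped model):
+        >>> with OutputHook(model, outputs=['backbone'], as_tensor=True) as h:
+        ...     _ = model(inputs)
+        >>> feats = h.layer_outputs['backbone']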
+ """ + + def __init__(self, module, outputs=None, as_tensor=False): + self.outputs = outputs + self.as_tensor = as_tensor + self.layer_outputs = {} + self.handles = [] + self.register(module) + + def register(self, module): + + def hook_wrapper(name): + + def hook(model, input, output): + if not isinstance(output, torch.Tensor): + warnings.warn(f'Directly return the output from {name}, ' + f'since it is not a tensor') + self.layer_outputs[name] = output + elif self.as_tensor: + self.layer_outputs[name] = output + else: + self.layer_outputs[name] = output.detach().cpu().numpy() + + return hook + + if isinstance(self.outputs, (list, tuple)): + for name in self.outputs: + try: + layer = rgetattr(module, name) + h = layer.register_forward_hook(hook_wrapper(name)) + except AttributeError: + raise AttributeError(f'Module {name} not found') + self.handles.append(h) + + def remove(self): + for h in self.handles: + h.remove() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.remove() + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects +def rgetattr(obj, attr, *args): + + def _getattr(obj, attr): + return getattr(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split('.')) diff --git a/mmaction/engine/hooks/visualization_hook.py b/mmaction/engine/hooks/visualization_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..423efeb19d61f84f66b71c50da2e2a641d1b5056 --- /dev/null +++ b/mmaction/engine/hooks/visualization_hook.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import os.path as osp +from typing import Optional, Sequence + +from mmengine import FileClient +from mmengine.hooks import Hook +from mmengine.runner import EpochBasedTrainLoop, Runner +from mmengine.visualization import Visualizer + +from mmaction.registry import HOOKS +from mmaction.structures import ActionDataSample + + +@HOOKS.register_module() +class VisualizationHook(Hook): + """Classification Visualization Hook. Used to visualize validation and + testing prediction results. + + - If ``out_dir`` is specified, all storage backends are ignored + and save the image to the ``out_dir``. + - If ``show`` is True, plot the result image in a window, please + confirm you are able to access the graphical interface. + Args: + enable (bool): Whether to enable this hook. Defaults to False. + interval (int): The interval of samples to visualize. Defaults to 5000. + show (bool): Whether to display the drawn image. Defaults to False. + out_dir (str, optional): directory where painted images will be saved + in the testing process. If None, handle with the backends of the + visualizer. Defaults to None. + **kwargs: other keyword arguments of + :meth:`mmcls.visualization.ClsVisualizer.add_datasample`. 
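+
+    Example (config sketch; the hook key, interval and output directory are
+    illustrative):
+        >>> default_hooks = dict(
+        ...     visualization=dict(
+        ...         type='VisualizationHook', enable=True, interval=1000,
+        ...         out_dir='./vis_results'))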
+ """ + + def __init__(self, + enable=False, + interval: int = 5000, + show: bool = False, + out_dir: Optional[str] = None, + **kwargs): + self._visualizer: Visualizer = Visualizer.get_current_instance() + + self.enable = enable + self.interval = interval + self.show = show + self.out_dir = out_dir + if out_dir is not None: + self.file_client = FileClient.infer_client(uri=out_dir) + else: + self.file_client = None + + self.draw_args = {**kwargs, 'show': show} + + def _draw_samples(self, + batch_idx: int, + data_batch: dict, + data_samples: Sequence[ActionDataSample], + step: int = 0) -> None: + """Visualize every ``self.interval`` samples from a data batch. + + Args: + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`ActionDataSample`]): Outputs from model. + step (int): Global step value to record. Defaults to 0. + """ + if self.enable is False: + return + + batch_size = len(data_samples) + videos = data_batch['inputs'] + start_idx = batch_size * batch_idx + end_idx = start_idx + batch_size + + # The first index divisible by the interval, after the start index + first_sample_id = math.ceil(start_idx / self.interval) * self.interval + + for sample_id in range(first_sample_id, end_idx, self.interval): + video = videos[sample_id - start_idx] + # move channel to the last + video = video.permute(1, 2, 3, 0).numpy().astype('uint8') + + data_sample = data_samples[sample_id - start_idx] + if 'filename' in data_sample: + # osp.basename works on different platforms even file clients. + sample_name = osp.basename(data_sample.get('filename')) + elif 'frame_dir' in data_sample: + sample_name = osp.basename(data_sample.get('frame_dir')) + else: + sample_name = str(sample_id) + + draw_args = self.draw_args + if self.out_dir is not None: + draw_args['out_path'] = self.file_client.join_path( + self.out_dir, f'{sample_name}_{step}') + + self._visualizer.add_datasample( + sample_name, + video=video, + data_sample=data_sample, + step=step, + **self.draw_args, + ) + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[ActionDataSample]) -> None: + """Visualize every ``self.interval`` samples during validation. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`ActionDataSample`]): Outputs from model. + """ + if isinstance(runner.train_loop, EpochBasedTrainLoop): + step = runner.epoch + else: + step = runner.iter + + self._draw_samples(batch_idx, data_batch, outputs, step=step) + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[ActionDataSample]) -> None: + """Visualize every ``self.interval`` samples during test. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]): Outputs from model. + """ + self._draw_samples(batch_idx, data_batch, outputs, step=0) diff --git a/mmaction/engine/model/__init__.py b/mmaction/engine/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52a25d0c72e735bb62283613ab4715e206005782 --- /dev/null +++ b/mmaction/engine/model/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .weight_init import ConvBranchInit + +__all__ = ['ConvBranchInit'] diff --git a/mmaction/engine/model/weight_init.py b/mmaction/engine/model/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..655771031b4f4e7f5a58569918e5c07aee62a775 --- /dev/null +++ b/mmaction/engine/model/weight_init.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +from mmengine.model import BaseInit, update_init_info + +from mmaction.registry import WEIGHT_INITIALIZERS + + +def conv_branch_init(conv: nn.Module, branches: int) -> None: + """Perform initialization for a conv branch. + + Args: + conv (nn.Module): The conv module of a branch. + branches (int): The number of branches. + """ + + weight = conv.weight + n = weight.size(0) + k1 = weight.size(1) + k2 = weight.size(2) + nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches))) + nn.init.constant_(conv.bias, 0) + + +@WEIGHT_INITIALIZERS.register_module('ConvBranch') +class ConvBranchInit(BaseInit): + """Initialize the module parameters of different branches. + + Args: + name (str): The name of the target module. + """ + + def __init__(self, name: str, **kwargs) -> None: + super(ConvBranchInit, self).__init__(**kwargs) + self.name = name + + def __call__(self, module) -> None: + assert hasattr(module, self.name) + + # Take a short cut to get the target module + module = getattr(module, self.name) + num_subset = len(module) + for conv in module: + conv_branch_init(conv, num_subset) + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self) -> str: + info = f'{self.__class__.__name__}' + return info diff --git a/mmaction/engine/optimizers/__init__.py b/mmaction/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..691ea56728c686e724c8e6a730fdb9abee3be09e --- /dev/null +++ b/mmaction/engine/optimizers/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .layer_decay_optim_wrapper_constructor import \ + LearningRateDecayOptimizerConstructor +from .swin_optim_wrapper_constructor import SwinOptimWrapperConstructor +from .tsm_optim_wrapper_constructor import TSMOptimWrapperConstructor + +__all__ = [ + 'TSMOptimWrapperConstructor', 'SwinOptimWrapperConstructor', + 'LearningRateDecayOptimizerConstructor' +] diff --git a/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..5996be7fa01590292f6a617992f448475e922e27 --- /dev/null +++ b/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +from typing import List + +import torch.nn as nn +from mmengine.dist import get_dist_info +from mmengine.logging import MMLogger +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +def get_layer_id_for_vit(var_name: str, max_layer_id: int) -> int: + """Get the layer id to set the different learning rates for ViT. + + Args: + var_name (str): The key of the model. + num_max_layer (int): Maximum number of backbone layers. + Returns: + int: Returns the layer id of the key. 
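+
+    Example (illustrative):
+        >>> get_layer_id_for_vit('backbone.patch_embed.proj.weight', 12)
+        0
+        >>> get_layer_id_for_vit('backbone.blocks.3.attn.qkv.weight', 12)
+        4
+        >>> get_layer_id_for_vit('cls_head.fc_cls.weight', 12)
+        13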
+ """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.blocks'): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return max_layer_id + 1 + + +def get_layer_id_for_mvit(var_name, max_layer_id): + """Get the layer id to set the different learning rates in ``layer_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_layer_id (int): Maximum layer id. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.blocks'): + layer_id = int(var_name.split('.')[2]) + 1 + return layer_id + else: + return max_layer_id + 1 + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor): + """ + Different learning rates are set for different layers of backbone. + Note: Currently, this optimizer constructor is built for MViT. + + Inspiration from `the implementation in PySlowFast + `_ and MMDetection + `_ + """ + + def add_params(self, params: List[dict], module: nn.Module, + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + """ + logger = MMLogger.get_current_instance() + + parameter_groups = {} + logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}') + num_layers = self.paramwise_cfg.get('num_layers') + decay_rate = self.paramwise_cfg.get('decay_rate') + decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise') + logger.info('Build LearningRateDecayOptimizerConstructor ' + f'{decay_type} {decay_rate} - {num_layers}') + weight_decay = self.base_wd + + for m in module.modules(): + assert not isinstance(m, nn.modules.batchnorm._NormBase + ), 'BN is not supported with layer decay' + + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias'): + group_name = 'no_decay' + this_weight_decay = 0. 
+ else: + group_name = 'decay' + this_weight_decay = weight_decay + if 'layer_wise' in decay_type: + if 'MViT' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_mvit( + name, self.paramwise_cfg.get('num_layers')) + logger.info(f'set param {name} as id {layer_id}') + elif 'VisionTransformer' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_vit(name, num_layers) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + else: + raise NotImplementedError(f'Only support layer wise decay,' + f'but got {decay_type}') + + group_name = f'layer_{layer_id}_{group_name}' + + if group_name not in parameter_groups: + scale = decay_rate**(num_layers - layer_id + 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + logger.info(f'Param groups = {json.dumps(to_display, indent=2)}') + params.extend(parameter_groups.values()) diff --git a/mmaction/engine/optimizers/swin_optim_wrapper_constructor.py b/mmaction/engine/optimizers/swin_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..8515c58466b2b30e9dc8c562a0c284ada2d6450e --- /dev/null +++ b/mmaction/engine/optimizers/swin_optim_wrapper_constructor.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import reduce +from operator import mul +from typing import List + +import torch.nn as nn +from mmengine.logging import print_log +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class SwinOptimWrapperConstructor(DefaultOptimWrapperConstructor): + + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = 'base', + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module. Defaults to ``'base'``. + """ + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if not param.requires_grad: + params.append(param_group) + continue + + param_group['lr'] = self.base_lr + if self.base_wd is not None: + param_group['weight_decay'] = self.base_wd + + processing_keys = [ + key for key in self.paramwise_cfg if key in f'{prefix}.{name}' + ] + if processing_keys: + param_group['lr'] *= \ + reduce(mul, [self.paramwise_cfg[key].get('lr_mult', 1.) + for key in processing_keys]) + if self.base_wd is not None: + param_group['weight_decay'] *= \ + reduce(mul, [self.paramwise_cfg[key]. + get('decay_mult', 1.) 
+ for key in processing_keys]) + + params.append(param_group) + + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- ' + f'{full_name}: {key} = {round(value, 8)}', + logger='current') + + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params(params, child_mod, prefix=child_prefix) diff --git a/mmaction/engine/optimizers/tsm_optim_wrapper_constructor.py b/mmaction/engine/optimizers/tsm_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..9f197d9941f7cd2009db3e52d45e3821e1c5f355 --- /dev/null +++ b/mmaction/engine/optimizers/tsm_optim_wrapper_constructor.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.optim import DefaultOptimWrapperConstructor +from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm_, + _BatchNorm, _ConvNd) + +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class TSMOptimWrapperConstructor(DefaultOptimWrapperConstructor): + """Optimizer constructor in TSM model. + + This constructor builds optimizer in different ways from the default one. + + 1. Parameters of the first conv layer have default lr and weight decay. + 2. Parameters of BN layers have default lr and zero weight decay. + 3. If the field "fc_lr5" in paramwise_cfg is set to True, the parameters + of the last fc layer in cls_head have 5x lr multiplier and 10x weight + decay multiplier. + 4. Weights of other layers have default lr and weight decay, and biases + have a 2x lr multiplier and zero weight decay. + """ + + def add_params(self, params, model, **kwargs): + """Add parameters and their corresponding lr and wd to the params. + + Args: + params (list): The list to be modified, containing all parameter + groups and their corresponding lr and wd configurations. + model (nn.Module): The model to be trained with the optimizer. + """ + # use fc_lr5 to determine whether to specify higher multi-factor + # for fc layer weights and bias. + fc_lr5 = self.paramwise_cfg['fc_lr5'] + first_conv_weight = [] + first_conv_bias = [] + normal_weight = [] + normal_bias = [] + lr5_weight = [] + lr10_bias = [] + bn = [] + + conv_cnt = 0 + + for m in model.modules(): + if isinstance(m, _ConvNd): + m_params = list(m.parameters()) + conv_cnt += 1 + if conv_cnt == 1: + first_conv_weight.append(m_params[0]) + if len(m_params) == 2: + first_conv_bias.append(m_params[1]) + else: + normal_weight.append(m_params[0]) + if len(m_params) == 2: + normal_bias.append(m_params[1]) + elif isinstance(m, torch.nn.Linear): + m_params = list(m.parameters()) + normal_weight.append(m_params[0]) + if len(m_params) == 2: + normal_bias.append(m_params[1]) + elif isinstance(m, + (_BatchNorm, SyncBatchNorm_, torch.nn.GroupNorm)): + for param in list(m.parameters()): + if param.requires_grad: + bn.append(param) + elif len(m._modules) == 0: + if len(list(m.parameters())) > 0: + raise ValueError(f'New atomic module type: {type(m)}. 
' + 'Need to give it a learning policy') + + # pop the cls_head fc layer params + last_fc_weight = normal_weight.pop() + last_fc_bias = normal_bias.pop() + if fc_lr5: + lr5_weight.append(last_fc_weight) + lr10_bias.append(last_fc_bias) + else: + normal_weight.append(last_fc_weight) + normal_bias.append(last_fc_bias) + + params.append({ + 'params': first_conv_weight, + 'lr': self.base_lr, + 'weight_decay': self.base_wd + }) + params.append({ + 'params': first_conv_bias, + 'lr': self.base_lr * 2, + 'weight_decay': 0 + }) + params.append({ + 'params': normal_weight, + 'lr': self.base_lr, + 'weight_decay': self.base_wd + }) + params.append({ + 'params': normal_bias, + 'lr': self.base_lr * 2, + 'weight_decay': 0 + }) + params.append({'params': bn, 'lr': self.base_lr, 'weight_decay': 0}) + params.append({ + 'params': lr5_weight, + 'lr': self.base_lr * 5, + 'weight_decay': self.base_wd + }) + params.append({ + 'params': lr10_bias, + 'lr': self.base_lr * 10, + 'weight_decay': 0 + }) diff --git a/mmaction/engine/runner/__init__.py b/mmaction/engine/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4bc7841840bb3b35848736cebf64372053e12e7b --- /dev/null +++ b/mmaction/engine/runner/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .multi_loop import MultiLoaderEpochBasedTrainLoop +from .retrieval_loop import RetrievalTestLoop, RetrievalValLoop + +__all__ = [ + 'MultiLoaderEpochBasedTrainLoop', 'RetrievalValLoop', 'RetrievalTestLoop' +] diff --git a/mmaction/engine/runner/multi_loop.py b/mmaction/engine/runner/multi_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..466d625c6f11cb7bb3c013f097568d3eaad62762 --- /dev/null +++ b/mmaction/engine/runner/multi_loop.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import gc +from typing import Dict, List, Union + +from mmengine.runner import EpochBasedTrainLoop +from torch.utils.data import DataLoader + +from mmaction.registry import LOOPS + + +class EpochMultiLoader: + """Multi loaders based on epoch.""" + + def __init__(self, dataloaders: List[DataLoader]): + self._dataloaders = dataloaders + self.iter_loaders = [iter(loader) for loader in self._dataloaders] + + @property + def num_loaders(self): + """The number of dataloaders.""" + return len(self._dataloaders) + + def __iter__(self): + """Return self when executing __iter__.""" + return self + + def __next__(self): + """Get the next iter's data of multiple loaders.""" + data = tuple([next(loader) for loader in self.iter_loaders]) + return data + + def __len__(self): + """Get the length of loader.""" + return min([len(loader) for loader in self._dataloaders]) + + +@LOOPS.register_module() +class MultiLoaderEpochBasedTrainLoop(EpochBasedTrainLoop): + """EpochBasedTrainLoop with multiple dataloaders. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or Dict): A dataloader object or a dict to + build a dataloader for training the model. + other_loaders (List of Dataloader or Dict): A list of other loaders. + Each item in the list is a dataloader object or a dict to build + a dataloader. + max_epochs (int): Total training epochs. + val_begin (int): The epoch that begins validating. Defaults to 1. + val_interval (int): Validation interval. Defaults to 1. 
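+
+    Example (config sketch; ``extra_loader`` stands for an additional
+    dataloader config and is not defined in this patch):
+        >>> train_cfg = dict(
+        ...     type='MultiLoaderEpochBasedTrainLoop',
+        ...     other_loaders=[extra_loader],
+        ...     max_epochs=50)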
+    """
+
+    def __init__(self,
+                 runner,
+                 dataloader: Union[Dict, DataLoader],
+                 other_loaders: List[Union[Dict, DataLoader]],
+                 max_epochs: int,
+                 val_begin: int = 1,
+                 val_interval: int = 1) -> None:
+        super().__init__(runner, dataloader, max_epochs, val_begin,
+                         val_interval)
+        multi_loaders = [self.dataloader]
+        for loader in other_loaders:
+            if isinstance(loader, dict):
+                loader = runner.build_dataloader(loader, seed=runner.seed)
+            multi_loaders.append(loader)
+
+        self.multi_loaders = multi_loaders
+
+    def run_epoch(self) -> None:
+        """Iterate one epoch."""
+        self.runner.call_hook('before_train_epoch')
+        self.runner.model.train()
+
+        gc.collect()
+        for loader in self.multi_loaders:
+            if hasattr(loader, 'sampler') and hasattr(loader.sampler,
+                                                      'set_epoch'):
+                loader.sampler.set_epoch(self._epoch)
+
+        for idx, data_batch in enumerate(EpochMultiLoader(self.multi_loaders)):
+            self.run_iter(idx, data_batch)
+
+        self.runner.call_hook('after_train_epoch')
+        self._epoch += 1
diff --git a/mmaction/engine/runner/retrieval_loop.py b/mmaction/engine/runner/retrieval_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..db829bcecef43c34a3dc2a16b7bdc5684fe8c375
--- /dev/null
+++ b/mmaction/engine/runner/retrieval_loop.py
@@ -0,0 +1,168 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+from mmengine.model import is_model_wrapper
+from mmengine.runner import TestLoop, ValLoop, autocast
+
+from mmaction.registry import LOOPS
+
+
+@LOOPS.register_module()
+class RetrievalValLoop(ValLoop):
+    """Loop for multimodal retrieval validation.
+
+    Args:
+        runner (Runner): A reference of runner.
+        dataloader (Dataloader or dict): A dataloader object or a dict to
+            build a dataloader.
+        evaluator (Evaluator or dict or list): Used for computing metrics.
+        fp16 (bool): Whether to enable fp16 validation. Defaults to
+            False.
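+
+    Example (config sketch):
+        >>> val_cfg = dict(type='RetrievalValLoop', fp16=True)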
+ """ + + def run(self) -> dict: + """Launch val.""" + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + self.runner.model.eval() + + feats_local = [] + data_samples_local = [] + + for idx, data_batch in enumerate(self.dataloader): + with torch.no_grad(): + self.runner.call_hook( + 'before_val_iter', batch_idx=idx, data_batch=data_batch) + # predictions should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + if is_model_wrapper(self.runner.model): + data_preprocessor = self.runner.model.module.data_preprocessor # noqa: E501 + else: + data_preprocessor = self.runner.model.data_preprocessor + + # get features for retrieval instead of data samples + data_batch = data_preprocessor(data_batch, False) + feats = self.runner.model._run_forward( + data_batch, mode='tensor') + feats_local.append(feats) + data_samples_local.extend(data_batch['data_samples']) + self.runner.call_hook( + 'after_val_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=feats) + + # concatenate different features + feats_local = { + k: torch.cat([dic[k] for dic in feats_local]) + for k in feats_local[0] + } + + # get predictions + if is_model_wrapper(self.runner.model): + predict_all_fn = self.runner.model.module.predict_all + else: + predict_all_fn = self.runner.model.predict_all + + num_videos = self.dataloader.dataset.num_videos + num_texts = self.dataloader.dataset.num_texts + with torch.no_grad(): + with autocast(enabled=self.fp16): + i2t_data_samples, t2i_data_samples = predict_all_fn( + feats_local, + data_samples_local, + num_images=num_videos, + num_texts=num_texts, + ) + # process in evaluator and compute metrics + self.evaluator.process(i2t_data_samples, None) + i2t_metrics = self.evaluator.evaluate(num_videos) + i2t_metrics = {f'i2t/{k}': v for k, v in i2t_metrics.items()} + self.evaluator.process(t2i_data_samples, None) + t2i_metrics = self.evaluator.evaluate(num_texts) + t2i_metrics = {f't2i/{k}': v for k, v in t2i_metrics.items()} + metrics = {**i2t_metrics, **t2i_metrics} + self.runner.call_hook('after_val_epoch', metrics=metrics) + self.runner.call_hook('after_val') + return metrics + + +@LOOPS.register_module() +class RetrievalTestLoop(TestLoop): + """Loop for multimodal retrieval test. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + evaluator (Evaluator or dict or list): Used for computing metrics. + fp16 (bool): Whether to enable fp16 testing. Defaults to + False. 
+ """ + + def run(self) -> dict: + """Launch test.""" + self.runner.call_hook('before_test') + self.runner.call_hook('before_test_epoch') + self.runner.model.eval() + + feats_local = [] + data_samples_local = [] + + for idx, data_batch in enumerate(self.dataloader): + with torch.no_grad(): + self.runner.call_hook( + 'before_test_iter', batch_idx=idx, data_batch=data_batch) + # predictions should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + if is_model_wrapper(self.runner.model): + data_preprocessor = self.runner.model.module.data_preprocessor # noqa: E501 + else: + data_preprocessor = self.runner.model.data_preprocessor + # get features for retrieval instead of data samples + data_batch = data_preprocessor(data_batch, False) + feats = self.runner.model._run_forward( + data_batch, mode='tensor') + feats_local.append(feats) + data_samples_local.extend(data_batch['data_samples']) + self.runner.call_hook( + 'after_test_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=feats) + + # concatenate different features + feats_local = { + k: torch.cat([dic[k] for dic in feats_local]) + for k in feats_local[0] + } + + # get predictions + if is_model_wrapper(self.runner.model): + predict_all_fn = self.runner.model.module.predict_all + else: + predict_all_fn = self.runner.model.predict_all + + num_videos = self.dataloader.dataset.num_videos + num_texts = self.dataloader.dataset.num_texts + with torch.no_grad(): + with autocast(enabled=self.fp16): + i2t_data_samples, t2i_data_samples = predict_all_fn( + feats_local, + data_samples_local, + num_images=num_videos, + num_texts=num_texts, + ) + + # process in evaluator and compute metrics + self.evaluator.process(i2t_data_samples, None) + i2t_metrics = self.evaluator.evaluate(num_videos) + i2t_metrics = {f'i2t/{k}': v for k, v in i2t_metrics.items()} + self.evaluator.process(t2i_data_samples, None) + t2i_metrics = self.evaluator.evaluate(num_texts) + t2i_metrics = {f't2i/{k}': v for k, v in t2i_metrics.items()} + metrics = {**i2t_metrics, **t2i_metrics} + + self.runner.call_hook('after_test_epoch', metrics=metrics) + self.runner.call_hook('after_test') + return metrics diff --git a/mmaction/evaluation/__init__.py b/mmaction/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf038e034f065673cab674735be2ab9102d3eba5 --- /dev/null +++ b/mmaction/evaluation/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .functional import * # noqa: F401,F403 +from .metrics import * # noqa: F401,F403 diff --git a/mmaction/evaluation/__pycache__/__init__.cpython-310.pyc b/mmaction/evaluation/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9330130a9fa8e9a47c918d7ea5963998b244f3f Binary files /dev/null and b/mmaction/evaluation/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/__init__.py b/mmaction/evaluation/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38e6d5756d466d2bf3669e994c6f5dfcc33a5b9f --- /dev/null +++ b/mmaction/evaluation/functional/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
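+# Typical usage of the metrics re-exported below (illustrative; ``scores`` is
+# a list of per-class score arrays and ``labels`` a list of label ids):
+#
+#     from mmaction.evaluation import top_k_accuracy
+#     top1, top5 = top_k_accuracy(scores, labels, topk=(1, 5))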
+from .accuracy import (average_precision_at_temporal_iou, + average_recall_at_avg_proposals, confusion_matrix, + get_weighted_score, interpolated_precision_recall, + mean_average_precision, mean_class_accuracy, + mmit_mean_average_precision, pairwise_temporal_iou, + softmax, top_k_accuracy, top_k_classes) +from .ava_utils import ava_eval, read_labelmap, results2csv +from .eval_detection import ActivityNetLocalization +from .multisports_utils import frameAP, link_tubes, videoAP, videoAP_all + +__all__ = [ + 'top_k_accuracy', 'mean_class_accuracy', 'confusion_matrix', + 'mean_average_precision', 'get_weighted_score', + 'average_recall_at_avg_proposals', 'pairwise_temporal_iou', + 'average_precision_at_temporal_iou', 'ActivityNetLocalization', 'softmax', + 'interpolated_precision_recall', 'mmit_mean_average_precision', + 'top_k_classes', 'read_labelmap', 'ava_eval', 'results2csv', 'frameAP', + 'videoAP', 'link_tubes', 'videoAP_all' +] diff --git a/mmaction/evaluation/functional/__pycache__/__init__.cpython-310.pyc b/mmaction/evaluation/functional/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..812658cf843d61f76198ced7ee4e05fe8c6a94c9 Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/accuracy.cpython-310.pyc b/mmaction/evaluation/functional/__pycache__/accuracy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b32d10f9119217826bad7ac153e3b0c893afb871 Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/accuracy.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/ava_utils.cpython-310.pyc b/mmaction/evaluation/functional/__pycache__/ava_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f317e3e9339123e0bfb3979a0580f90dfc79838 Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/ava_utils.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/eval_detection.cpython-310.pyc b/mmaction/evaluation/functional/__pycache__/eval_detection.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e31c6c0bcdd64d5653d41e96e2327237fd39f91 Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/eval_detection.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/multisports_utils.cpython-310.pyc b/mmaction/evaluation/functional/__pycache__/multisports_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d89deaaf46babffc0a9705a507765167ff6f86ce Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/multisports_utils.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/accuracy.py b/mmaction/evaluation/functional/accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..d033a1102af03c042de61e497215176409a0f1d0 --- /dev/null +++ b/mmaction/evaluation/functional/accuracy.py @@ -0,0 +1,568 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def confusion_matrix(y_pred, y_real, normalize=None): + """Compute confusion matrix. + + Args: + y_pred (list[int] | np.ndarray[int]): Prediction labels. + y_real (list[int] | np.ndarray[int]): Ground truth labels. + normalize (str | None): Normalizes confusion matrix over the true + (rows), predicted (columns) conditions or all the population. 
+ If None, confusion matrix will not be normalized. Options are + "true", "pred", "all", None. Default: None. + + Returns: + np.ndarray: Confusion matrix. + """ + if normalize not in ['true', 'pred', 'all', None]: + raise ValueError("normalize must be one of {'true', 'pred', " + "'all', None}") + + if isinstance(y_pred, list): + y_pred = np.array(y_pred) + if y_pred.dtype == np.int32: + y_pred = y_pred.astype(np.int64) + if not isinstance(y_pred, np.ndarray): + raise TypeError( + f'y_pred must be list or np.ndarray, but got {type(y_pred)}') + if not y_pred.dtype == np.int64: + raise TypeError( + f'y_pred dtype must be np.int64, but got {y_pred.dtype}') + + if isinstance(y_real, list): + y_real = np.array(y_real) + if y_real.dtype == np.int32: + y_real = y_real.astype(np.int64) + if not isinstance(y_real, np.ndarray): + raise TypeError( + f'y_real must be list or np.ndarray, but got {type(y_real)}') + if not y_real.dtype == np.int64: + raise TypeError( + f'y_real dtype must be np.int64, but got {y_real.dtype}') + + label_set = np.unique(np.concatenate((y_pred, y_real))) + num_labels = len(label_set) + max_label = label_set[-1] + label_map = np.zeros(max_label + 1, dtype=np.int64) + for i, label in enumerate(label_set): + label_map[label] = i + + y_pred_mapped = label_map[y_pred] + y_real_mapped = label_map[y_real] + + confusion_mat = np.bincount( + num_labels * y_real_mapped + y_pred_mapped, + minlength=num_labels**2).reshape(num_labels, num_labels) + + with np.errstate(all='ignore'): + if normalize == 'true': + confusion_mat = ( + confusion_mat / confusion_mat.sum(axis=1, keepdims=True)) + elif normalize == 'pred': + confusion_mat = ( + confusion_mat / confusion_mat.sum(axis=0, keepdims=True)) + elif normalize == 'all': + confusion_mat = (confusion_mat / confusion_mat.sum()) + confusion_mat = np.nan_to_num(confusion_mat) + + return confusion_mat + + +def mean_class_accuracy(scores, labels): + """Calculate mean class accuracy. + + Args: + scores (list[np.ndarray]): Prediction scores for each class. + labels (list[int]): Ground truth labels. + + Returns: + np.ndarray: Mean class accuracy. + """ + pred = np.argmax(scores, axis=1) + cf_mat = confusion_matrix(pred, labels).astype(float) + + cls_cnt = cf_mat.sum(axis=1) + cls_hit = np.diag(cf_mat) + + mean_class_acc = np.mean( + [hit / cnt if cnt else 0.0 for cnt, hit in zip(cls_cnt, cls_hit)]) + + return mean_class_acc + + +def top_k_classes(scores, labels, k=10, mode='accurate'): + """Calculate the most K accurate (inaccurate) classes. + + Given the prediction scores, ground truth label and top-k value, + compute the top K accurate (inaccurate) classes. + + Args: + scores (list[np.ndarray]): Prediction scores for each class. + labels (list[int] | np.ndarray): Ground truth labels. + k (int): Top-k values. Default: 10. + mode (str): Comparison mode for Top-k. Options are 'accurate' + and 'inaccurate'. Default: 'accurate'. + + Return: + list: List of sorted (from high accuracy to low accuracy for + 'accurate' mode, and from low accuracy to high accuracy for + inaccurate mode) top K classes in format of (label_id, + acc_ratio). 
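+
+    Example (illustrative):
+        With ``scores = [np.array([0.9, 0.1]), np.array([0.2, 0.8])]`` and
+        ``labels = [0, 0]``, class 0 is predicted correctly for one of its two
+        samples while class 1 never occurs as ground truth, so
+        ``top_k_classes(scores, labels, k=1, mode='inaccurate')`` yields
+        ``[(1, 0.0)]``.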
+ """ + assert mode in ['accurate', 'inaccurate'] + pred = np.argmax(scores, axis=1) + cf_mat = confusion_matrix(pred, labels).astype(float) + + cls_cnt = cf_mat.sum(axis=1) + cls_hit = np.diag(cf_mat) + hit_ratio = np.array( + [hit / cnt if cnt else 0.0 for cnt, hit in zip(cls_cnt, cls_hit)]) + + if mode == 'accurate': + max_index = np.argsort(hit_ratio)[-k:][::-1] + max_value = hit_ratio[max_index] + results = list(zip(max_index, max_value)) + else: + min_index = np.argsort(hit_ratio)[:k] + min_value = hit_ratio[min_index] + results = list(zip(min_index, min_value)) + return results + + +def top_k_accuracy(scores, labels, topk=(1, )): + """Calculate top k accuracy score. + + Args: + scores (list[np.ndarray]): Prediction scores for each class. + labels (list[int]): Ground truth labels. + topk (tuple[int]): K value for top_k_accuracy. Default: (1, ). + + Returns: + list[float]: Top k accuracy score for each k. + """ + res = [] + labels = np.array(labels)[:, np.newaxis] + for k in topk: + max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1] + match_array = np.logical_or.reduce(max_k_preds == labels, axis=1) + topk_acc_score = match_array.sum() / match_array.shape[0] + res.append(topk_acc_score) + + return res + + +def mmit_mean_average_precision(scores, labels): + """Mean average precision for multi-label recognition. Used for reporting + MMIT style mAP on Multi-Moments in Times. The difference is that this + method calculates average-precision for each sample and averages them among + samples. + + Args: + scores (list[np.ndarray]): Prediction scores of different classes for + each sample. + labels (list[np.ndarray]): Ground truth many-hot vector for each + sample. + + Returns: + np.float64: The MMIT style mean average precision. + """ + results = [] + for score, label in zip(scores, labels): + precision, recall, _ = binary_precision_recall_curve(score, label) + ap = -np.sum(np.diff(recall) * np.array(precision)[:-1]) + results.append(ap) + return np.mean(results) + + +def mean_average_precision(scores, labels): + """Mean average precision for multi-label recognition. + + Args: + scores (list[np.ndarray]): Prediction scores of different classes for + each sample. + labels (list[np.ndarray]): Ground truth many-hot vector for each + sample. + + Returns: + np.float64: The mean average precision. + """ + results = [] + scores = np.stack(scores).T + labels = np.stack(labels).T + + for score, label in zip(scores, labels): + precision, recall, _ = binary_precision_recall_curve(score, label) + ap = -np.sum(np.diff(recall) * np.array(precision)[:-1]) + results.append(ap) + results = [x for x in results if not np.isnan(x)] + if results == []: + return np.nan + return np.mean(results) + + +def binary_precision_recall_curve(y_score, y_true): + """Calculate the binary precision recall curve at step thresholds. + + Args: + y_score (np.ndarray): Prediction scores for each class. + Shape should be (num_classes, ). + y_true (np.ndarray): Ground truth many-hot vector. + Shape should be (num_classes, ). + + Returns: + precision (np.ndarray): The precision of different thresholds. + recall (np.ndarray): The recall of different thresholds. + thresholds (np.ndarray): Different thresholds at which precision and + recall are tested. 
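+
+    Example:
+        >>> import numpy as np
+        >>> # Illustrative scores of one sample over four classes and the
+        >>> # matching multi-label ground truth vector.
+        >>> y_score = np.array([0.9, 0.2, 0.7, 0.1])
+        >>> y_true = np.array([1, 0, 1, 0])
+        >>> precision, recall, thresholds = binary_precision_recall_curve(
+        ...     y_score, y_true)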
+ """ + assert isinstance(y_score, np.ndarray) + assert isinstance(y_true, np.ndarray) + assert y_score.shape == y_true.shape + + # make y_true a boolean vector + y_true = (y_true == 1) + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind='mergesort')[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + # There may be ties in values, therefore find the `distinct_value_inds` + distinct_value_inds = np.where(np.diff(y_score))[0] + threshold_inds = np.r_[distinct_value_inds, y_true.size - 1] + # accumulate the true positives with decreasing threshold + tps = np.cumsum(y_true)[threshold_inds] + fps = 1 + threshold_inds - tps + thresholds = y_score[threshold_inds] + + precision = tps / (tps + fps) + precision[np.isnan(precision)] = 0 + recall = tps / tps[-1] + # stop when full recall attained + # and reverse the outputs so recall is decreasing + last_ind = tps.searchsorted(tps[-1]) + sl = slice(last_ind, None, -1) + + return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] + + +def pairwise_temporal_iou(candidate_segments, + target_segments, + calculate_overlap_self=False): + """Compute intersection over union between segments. + + Args: + candidate_segments (np.ndarray): 1-dim/2-dim array in format + ``[init, end]/[m x 2:=[init, end]]``. + target_segments (np.ndarray): 2-dim array in format + ``[n x 2:=[init, end]]``. + calculate_overlap_self (bool): Whether to calculate overlap_self + (union / candidate_length) or not. Default: False. + + Returns: + t_iou (np.ndarray): 1-dim array [n] / + 2-dim array [n x m] with IoU ratio. + t_overlap_self (np.ndarray, optional): 1-dim array [n] / + 2-dim array [n x m] with overlap_self, returns when + calculate_overlap_self is True. + """ + candidate_segments_ndim = candidate_segments.ndim + if target_segments.ndim != 2 or candidate_segments_ndim not in [1, 2]: + raise ValueError('Dimension of arguments is incorrect') + + if candidate_segments_ndim == 1: + candidate_segments = candidate_segments[np.newaxis, :] + + n, m = target_segments.shape[0], candidate_segments.shape[0] + t_iou = np.empty((n, m), dtype=np.float32) + if calculate_overlap_self: + t_overlap_self = np.empty((n, m), dtype=np.float32) + + for i in range(m): + candidate_segment = candidate_segments[i, :] + tt1 = np.maximum(candidate_segment[0], target_segments[:, 0]) + tt2 = np.minimum(candidate_segment[1], target_segments[:, 1]) + # Intersection including Non-negative overlap score. + segments_intersection = (tt2 - tt1).clip(0) + # Segment union. + segments_union = ((target_segments[:, 1] - target_segments[:, 0]) + + (candidate_segment[1] - candidate_segment[0]) - + segments_intersection) + # Compute overlap as the ratio of the intersection + # over union of two segments. 
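+        # For instance, a candidate [2.0, 6.0] against a target [4.0, 8.0]
+        # gives an intersection of 2.0 and a union of 6.0, i.e. a temporal
+        # IoU of roughly 0.33 (the numbers are purely illustrative).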
+ t_iou[:, i] = (segments_intersection.astype(float) / segments_union) + if calculate_overlap_self: + candidate_length = candidate_segment[1] - candidate_segment[0] + t_overlap_self[:, i] = ( + segments_intersection.astype(float) / candidate_length) + + if candidate_segments_ndim == 1: + t_iou = np.squeeze(t_iou, axis=1) + if calculate_overlap_self: + if candidate_segments_ndim == 1: + t_overlap_self = np.squeeze(t_overlap_self, axis=1) + return t_iou, t_overlap_self + + return t_iou + + +def average_recall_at_avg_proposals(ground_truth, + proposals, + total_num_proposals, + max_avg_proposals=None, + temporal_iou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """Computes the average recall given an average number (percentile) of + proposals per video. + + Args: + ground_truth (dict): Dict containing the ground truth instances. + proposals (dict): Dict containing the proposal instances. + total_num_proposals (int): Total number of proposals in the + proposal dict. + max_avg_proposals (int | None): Max number of proposals for one video. + Default: None. + temporal_iou_thresholds (np.ndarray): 1D array with temporal_iou + thresholds. Default: ``np.linspace(0.5, 0.95, 10)``. + + Returns: + tuple([np.ndarray, np.ndarray, np.ndarray, float]): + (recall, average_recall, proposals_per_video, auc) + In recall, ``recall[i,j]`` is recall at i-th temporal_iou threshold + at the j-th average number (percentile) of average number of + proposals per video. The average_recall is recall averaged + over a list of temporal_iou threshold (1D array). This is + equivalent to ``recall.mean(axis=0)``. The ``proposals_per_video`` + is the average number of proposals per video. The auc is the area + under ``AR@AN`` curve. + """ + + total_num_videos = len(ground_truth) + + if not max_avg_proposals: + max_avg_proposals = float(total_num_proposals) / total_num_videos + + ratio = (max_avg_proposals * float(total_num_videos) / total_num_proposals) + + # For each video, compute temporal_iou scores among the retrieved proposals + score_list = [] + total_num_retrieved_proposals = 0 + for video_id in ground_truth: + # Get proposals for this video. + proposals_video_id = proposals[video_id] + this_video_proposals = proposals_video_id[:, :2] + # Sort proposals by score. + sort_idx = proposals_video_id[:, 2].argsort()[::-1] + this_video_proposals = this_video_proposals[sort_idx, :].astype( + np.float32) + + # Get ground-truth instances associated to this video. + ground_truth_video_id = ground_truth[video_id] + this_video_ground_truth = ground_truth_video_id[:, :2].astype( + np.float32) + if this_video_proposals.shape[0] == 0: + n = this_video_ground_truth.shape[0] + score_list.append(np.zeros((n, 1))) + continue + + if this_video_proposals.ndim != 2: + this_video_proposals = np.expand_dims(this_video_proposals, axis=0) + if this_video_ground_truth.ndim != 2: + this_video_ground_truth = np.expand_dims( + this_video_ground_truth, axis=0) + + num_retrieved_proposals = np.minimum( + int(this_video_proposals.shape[0] * ratio), + this_video_proposals.shape[0]) + total_num_retrieved_proposals += num_retrieved_proposals + this_video_proposals = this_video_proposals[: + num_retrieved_proposals, :] + + # Compute temporal_iou scores. + t_iou = pairwise_temporal_iou(this_video_proposals, + this_video_ground_truth) + score_list.append(t_iou) + + # Given that the length of the videos is really varied, we + # compute the number of proposals in terms of a ratio of the total + # proposals retrieved, i.e. 
average recall at a percentage of proposals + # retrieved per video. + + # Computes average recall. + pcn_list = np.arange(1, 101) / 100.0 * ( + max_avg_proposals * float(total_num_videos) / + total_num_retrieved_proposals) + matches = np.empty((total_num_videos, pcn_list.shape[0])) + positives = np.empty(total_num_videos) + recall = np.empty((temporal_iou_thresholds.shape[0], pcn_list.shape[0])) + # Iterates over each temporal_iou threshold. + for ridx, temporal_iou in enumerate(temporal_iou_thresholds): + # Inspect positives retrieved per video at different + # number of proposals (percentage of the total retrieved). + for i, score in enumerate(score_list): + # Total positives per video. + positives[i] = score.shape[0] + # Find proposals that satisfies minimum temporal_iou threshold. + true_positives_temporal_iou = score >= temporal_iou + # Get number of proposals as a percentage of total retrieved. + pcn_proposals = np.minimum( + (score.shape[1] * pcn_list).astype(np.int32), score.shape[1]) + + for j, num_retrieved_proposals in enumerate(pcn_proposals): + # Compute the number of matches + # for each percentage of the proposals + matches[i, j] = np.count_nonzero( + (true_positives_temporal_iou[:, :num_retrieved_proposals] + ).sum(axis=1)) + + # Computes recall given the set of matches per video. + recall[ridx, :] = matches.sum(axis=0) / positives.sum() + + # Recall is averaged. + avg_recall = recall.mean(axis=0) + + # Get the average number of proposals per video. + proposals_per_video = pcn_list * ( + float(total_num_retrieved_proposals) / total_num_videos) + # Get AUC + area_under_curve = np.trapz(avg_recall, proposals_per_video) + auc = 100. * float(area_under_curve) / proposals_per_video[-1] + return recall, avg_recall, proposals_per_video, auc + + +def get_weighted_score(score_list, coeff_list): + """Get weighted score with given scores and coefficients. + + Given n predictions by different classifier: [score_1, score_2, ..., + score_n] (score_list) and their coefficients: [coeff_1, coeff_2, ..., + coeff_n] (coeff_list), return weighted score: weighted_score = + score_1 * coeff_1 + score_2 * coeff_2 + ... + score_n * coeff_n + + Args: + score_list (list[list[np.ndarray]]): List of list of scores, with shape + n(number of predictions) X num_samples X num_classes + coeff_list (list[float]): List of coefficients, with shape n. + + Returns: + list[np.ndarray]: List of weighted scores. + """ + assert len(score_list) == len(coeff_list) + num_samples = len(score_list[0]) + for i in range(1, len(score_list)): + assert len(score_list[i]) == num_samples + + scores = np.array(score_list) # (num_coeff, num_samples, num_classes) + coeff = np.array(coeff_list) # (num_coeff, ) + weighted_scores = list(np.dot(scores.T, coeff).T) + return weighted_scores + + +def softmax(x, dim=1): + """Compute softmax values for each sets of scores in x.""" + e_x = np.exp(x - np.max(x, axis=dim, keepdims=True)) + return e_x / e_x.sum(axis=dim, keepdims=True) + + +def interpolated_precision_recall(precision, recall): + """Interpolated AP - VOCdevkit from VOC 2011. + + Args: + precision (np.ndarray): The precision of different thresholds. + recall (np.ndarray): The recall of different thresholds. + + Returns๏ผš + float: Average precision score. 
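+
+    Example:
+        >>> import numpy as np
+        >>> # Precision/recall values at three score thresholds; the numbers
+        >>> # are illustrative only.
+        >>> precision = np.array([1.0, 0.5, 2 / 3])
+        >>> recall = np.array([0.25, 0.25, 0.5])
+        >>> ap = interpolated_precision_recall(precision, recall)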
+ """ + mprecision = np.hstack([[0], precision, [0]]) + mrecall = np.hstack([[0], recall, [1]]) + for i in range(len(mprecision) - 1)[::-1]: + mprecision[i] = max(mprecision[i], mprecision[i + 1]) + idx = np.where(mrecall[1::] != mrecall[0:-1])[0] + 1 + ap = np.sum((mrecall[idx] - mrecall[idx - 1]) * mprecision[idx]) + return ap + + +def average_precision_at_temporal_iou(ground_truth, + prediction, + temporal_iou_thresholds=(np.linspace( + 0.5, 0.95, 10))): + """Compute average precision (in detection task) between ground truth and + predicted data frames. If multiple predictions match the same predicted + segment, only the one with highest score is matched as true positive. This + code is greatly inspired by Pascal VOC devkit. + + Args: + ground_truth (dict): Dict containing the ground truth instances. + Key: 'video_id' + Value (np.ndarray): 1D array of 't-start' and 't-end'. + prediction (np.ndarray): 2D array containing the information of + proposal instances, including 'video_id', 'class_id', 't-start', + 't-end' and 'score'. + temporal_iou_thresholds (np.ndarray): 1D array with temporal_iou + thresholds. Default: ``np.linspace(0.5, 0.95, 10)``. + + Returns: + np.ndarray: 1D array of average precision score. + """ + ap = np.zeros(len(temporal_iou_thresholds), dtype=np.float32) + if len(prediction) < 1: + return ap + + num_gts = 0. + lock_gt = dict() + for key in ground_truth: + lock_gt[key] = np.ones( + (len(temporal_iou_thresholds), len(ground_truth[key]))) * -1 + num_gts += len(ground_truth[key]) + + # Sort predictions by decreasing score order. + prediction = np.array(prediction) + scores = prediction[:, 4].astype(float) + sort_idx = np.argsort(scores)[::-1] + prediction = prediction[sort_idx] + + # Initialize true positive and false positive vectors. + tp = np.zeros((len(temporal_iou_thresholds), len(prediction)), + dtype=np.int32) + fp = np.zeros((len(temporal_iou_thresholds), len(prediction)), + dtype=np.int32) + + # Assigning true positive to truly grount truth instances. + for idx, this_pred in enumerate(prediction): + + # Check if there is at least one ground truth in the video. + if this_pred[0] in ground_truth: + this_gt = np.array(ground_truth[this_pred[0]], dtype=float) + else: + fp[:, idx] = 1 + continue + + t_iou = pairwise_temporal_iou(this_pred[2:4].astype(float), this_gt) + # We would like to retrieve the predictions with highest t_iou score. + t_iou_sorted_idx = t_iou.argsort()[::-1] + for t_idx, t_iou_threshold in enumerate(temporal_iou_thresholds): + for jdx in t_iou_sorted_idx: + if t_iou[jdx] < t_iou_threshold: + fp[t_idx, idx] = 1 + break + if lock_gt[this_pred[0]][t_idx, jdx] >= 0: + continue + # Assign as true positive after the filters above. 
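+                    # ``lock_gt`` records which prediction claimed each ground
+                    # truth segment at this threshold (-1 means unclaimed); a
+                    # claimed segment is skipped above, so a lower-scored
+                    # duplicate must match another segment or end up counted
+                    # as a false positive.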
+ tp[t_idx, idx] = 1 + lock_gt[this_pred[0]][t_idx, jdx] = idx + break + + if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: + fp[t_idx, idx] = 1 + + tp_cumsum = np.cumsum(tp, axis=1).astype(np.float32) + fp_cumsum = np.cumsum(fp, axis=1).astype(np.float32) + recall_cumsum = tp_cumsum / num_gts + + precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) + + for t_idx in range(len(temporal_iou_thresholds)): + ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], + recall_cumsum[t_idx, :]) + + return ap diff --git a/mmaction/evaluation/functional/ava_evaluation/README.md b/mmaction/evaluation/functional/ava_evaluation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3137daf47cd98c0eb97b995a1937ddbf78f630b6 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/README.md @@ -0,0 +1,2 @@ +The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). +Some unused codes are removed to minimize the length of codes added. diff --git a/mmaction/evaluation/functional/ava_evaluation/__init__.py b/mmaction/evaluation/functional/ava_evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/__init__.cpython-310.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5451b14d3a23b364917b08e48b2761e310777010 Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/metrics.cpython-310.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8625223d3296a642373f108ef47d6c8472eb72e6 Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/metrics.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc2b5084e81180e9192d54a77906121e4bd955fc Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11401222f17ef388b5a8b17923b7e925fe9cccc5 Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/metrics.py b/mmaction/evaluation/functional/ava_evaluation/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6e83182e2521a5ebe4766103af0db619703054 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/metrics.py @@ -0,0 +1,142 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Functions for computing metrics like precision, recall, CorLoc and etc.""" + +import numpy as np + + +def compute_precision_recall(scores, labels, num_gt): + """Compute precision and recall. + + Args: + scores: A float numpy array representing detection score + labels: A boolean numpy array representing true/false positive labels + num_gt: Number of ground truth instances + + Raises: + ValueError: if the input is not of the correct format + + Returns: + precision: Fraction of positive instances over detected ones. This + value is None if no ground truth labels are present. + recall: Fraction of detected positive instance over all positive + instances. This value is None if no ground truth labels are + present. + """ + if (not isinstance(labels, np.ndarray) or labels.dtype != bool + or len(labels.shape) != 1): + raise ValueError('labels must be single dimension bool numpy array') + + if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: + raise ValueError('scores must be single dimension numpy array') + + if num_gt < np.sum(labels): + raise ValueError( + 'Number of true positives must be smaller than num_gt.') + + if len(scores) != len(labels): + raise ValueError('scores and labels must be of the same size.') + + if num_gt == 0: + return None, None + + sorted_indices = np.argsort(scores) + sorted_indices = sorted_indices[::-1] + labels = labels.astype(int) + true_positive_labels = labels[sorted_indices] + false_positive_labels = 1 - true_positive_labels + cum_true_positives = np.cumsum(true_positive_labels) + cum_false_positives = np.cumsum(false_positive_labels) + precision = cum_true_positives.astype(float) / ( + cum_true_positives + cum_false_positives) + recall = cum_true_positives.astype(float) / num_gt + return precision, recall + + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + + Precision is modified to ensure that it does not decrease as recall + decrease. + + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + + Raises: + ValueError: if the input is not of the correct format + + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. 
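+
+    Example:
+        >>> import numpy as np
+        >>> # Illustrative values only; both inputs must be float64 arrays
+        >>> # and recall must be non-decreasing.
+        >>> precision = np.array([1.0, 0.5, 2 / 3], dtype=np.float64)
+        >>> recall = np.array([0.25, 0.25, 0.5], dtype=np.float64)
+        >>> ap = compute_average_precision(precision, recall)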
+ """ + if precision is None: + if recall is not None: + raise ValueError('If precision is None, recall must also be None') + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError('precision and recall must be numpy array') + if precision.dtype != np.float64 or recall.dtype != np.float64: + raise ValueError('input must be float numpy array.') + if len(precision) != len(recall): + raise ValueError('precision and recall must be of the same size.') + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError('Precision must be in the range of [0, 1].') + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError('recall must be in the range of [0, 1].') + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError('recall must be a non-decreasing array') + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def compute_cor_loc(num_gt_imgs_per_class, + num_images_correctly_detected_per_class): + """Compute CorLoc according to the definition in the following paper. + + https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf + + Returns nans if there are no ground truth images for a class. + + Args: + num_gt_imgs_per_class: 1D array, representing number of images + containing at least one object instance of a particular class + num_images_correctly_detected_per_class: 1D array, representing number + of images that are correctly detected at least one object instance + of a particular class + + Returns: + corloc_per_class: A float numpy array represents the corloc score of + each class + """ + # Divide by zero expected for classes with no gt examples. + with np.errstate(divide='ignore', invalid='ignore'): + return np.where( + num_gt_imgs_per_class == 0, np.nan, + num_images_correctly_detected_per_class / num_gt_imgs_per_class) diff --git a/mmaction/evaluation/functional/ava_evaluation/np_box_list.py b/mmaction/evaluation/functional/ava_evaluation/np_box_list.py new file mode 100644 index 0000000000000000000000000000000000000000..528cbb5539bca21a54b240ee284b59a200e33006 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/np_box_list.py @@ -0,0 +1,139 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Numpy BoxList classes and functions.""" + +import numpy as np + + +class BoxList: + """Box collection. 
+ + BoxList represents a list of bounding boxes as numpy array, where each + bounding box is represented as a row of 4 numbers, + [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within + a given list correspond to a single image. + + Optionally, users can add additional related fields (such as + objectness/classification scores). + """ + + def __init__(self, data): + """Constructs box collection. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + """ + if not isinstance(data, np.ndarray): + raise ValueError('data must be a numpy array.') + if len(data.shape) != 2 or data.shape[1] != 4: + raise ValueError('Invalid dimensions for box data.') + if data.dtype != np.float32 and data.dtype != np.float64: + raise ValueError( + 'Invalid data type for box data: float is required.') + if not self._is_valid_boxes(data): + raise ValueError('Invalid box data. data must be a numpy array of ' + 'N*[y_min, x_min, y_max, x_max]') + self.data = {'boxes': data} + + def num_boxes(self): + """Return number of boxes held in collections.""" + return self.data['boxes'].shape[0] + + def get_extra_fields(self): + """Return all non-box fields.""" + return [k for k in self.data if k != 'boxes'] + + def has_field(self, field): + return field in self.data + + def add_field(self, field, field_data): + """Add data to a specified field. + + Args: + field: a string parameter used to specify a related field to be + accessed. + field_data: a numpy array of [N, ...] representing the data + associated with the field. + Raises: + ValueError: if the field is already exist or the dimension of the + field data does not matches the number of boxes. + """ + if self.has_field(field): + raise ValueError('Field ' + field + 'already exists') + if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes( + ): + raise ValueError('Invalid dimensions for field data') + self.data[field] = field_data + + def get(self): + """Convenience function for accesssing box coordinates. + + Returns: + a numpy array of shape [N, 4] representing box corners + """ + return self.get_field('boxes') + + def get_field(self, field): + """Accesses data associated with the specified field in the box + collection. + + Args: + field: a string parameter used to specify a related field to be + accessed. + + Returns: + a numpy 1-d array representing data of an associated field + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + return self.data[field] + + def get_coordinates(self): + """Get corner coordinates of boxes. + + Returns: + a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] + """ + box_coordinates = self.get() + y_min = box_coordinates[:, 0] + x_min = box_coordinates[:, 1] + y_max = box_coordinates[:, 2] + x_max = box_coordinates[:, 3] + return [y_min, x_min, y_max, x_max] + + @staticmethod + def _is_valid_boxes(data): + """Check whether data fulfills the format of N*[ymin, xmin, ymax, + xmin]. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Returns: + a boolean indicating whether all ymax of boxes are equal or greater + than ymin, and all xmax of boxes are equal or greater than xmin. 
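+
+        Example:
+            >>> import numpy as np
+            >>> # Boxes are [y_min, x_min, y_max, x_max]; values illustrative.
+            >>> good = np.array([[0.1, 0.2, 0.5, 0.6]], dtype=np.float32)
+            >>> bad = np.array([[0.5, 0.2, 0.1, 0.6]], dtype=np.float32)
+            >>> ok = BoxList._is_valid_boxes(good)  # True
+            >>> flipped = BoxList._is_valid_boxes(bad)  # False, y_min > y_max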
+ """ + if len(data) != 0: + for v in data: + if v[0] > v[2] or v[1] > v[3]: + return False + return True diff --git a/mmaction/evaluation/functional/ava_evaluation/np_box_ops.py b/mmaction/evaluation/functional/ava_evaluation/np_box_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..b542383045d13f27bc8a869abd9cb31df603e788 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/np_box_ops.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for [N, 4] numpy arrays representing bounding boxes. + +Example box operations that are supported: + * Areas: compute bounding box areas + * IOU: pairwise intersection-over-union scores +""" + +import numpy as np + + +def area(boxes): + """Computes area of boxes. + + Args: + boxes: Numpy array with shape [N, 4] holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def intersection(boxes1, boxes2): + """Compute pairwise intersection areas between boxes. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes + boxes2: a numpy array with shape [M, 4] holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) + [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) + + all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) + all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) + intersect_heights = np.maximum( + np.zeros(all_pairs_max_ymin.shape), + all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) + all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) + intersect_widths = np.maximum( + np.zeros(all_pairs_max_xmin.shape), + all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def iou(boxes1, boxes2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + intersect = intersection(boxes1, boxes2) + area1 = area(boxes1) + area2 = area(boxes2) + union = ( + np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - + intersect) + return intersect / union + + +def ioa(boxes1, boxes2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. 
+ boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + intersect = intersection(boxes1, boxes2) + areas = np.expand_dims(area(boxes2), axis=0) + return intersect / areas diff --git a/mmaction/evaluation/functional/ava_utils.py b/mmaction/evaluation/functional/ava_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2e144779c5c7354a0767e4c0456d3386e5783530 --- /dev/null +++ b/mmaction/evaluation/functional/ava_utils.py @@ -0,0 +1,300 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This piece of code is directly adapted from ActivityNet official repo +# https://github.com/activitynet/ActivityNet/blob/master/ +# Evaluation/get_ava_performance.py. Some unused codes are removed. +import csv +import multiprocessing +import time +from collections import defaultdict + +import numpy as np + +from .ava_evaluation import metrics, np_box_list, np_box_ops + + +def det2csv(results, custom_classes): + """Convert detection results to csv file.""" + csv_results = [] + for idx in range(len(results)): + video_id = results[idx]['video_id'] + timestamp = results[idx]['timestamp'] + result = results[idx]['outputs'] + for label, _ in enumerate(result): + for bbox in result[label]: + bbox_ = tuple(bbox.tolist()) + if custom_classes is not None: + actual_label = custom_classes[label + 1] + else: + actual_label = label + 1 + csv_results.append(( + video_id, + timestamp, + ) + bbox_[:4] + (actual_label, ) + bbox_[4:]) + return csv_results + + +# results is organized by class +def results2csv(results, out_file, custom_classes=None): + """Convert detection results to csv file.""" + csv_results = det2csv(results, custom_classes) + + # save space for float + def to_str(item): + if isinstance(item, float): + return f'{item:.4f}' + return str(item) + + with open(out_file, 'w') as f: + for csv_result in csv_results: + f.write(','.join(map(to_str, csv_result))) + f.write('\n') + + +def print_time(message, start): + """Print processing time.""" + print('==> %g seconds to %s' % (time.time() - start, message), flush=True) + + +def make_image_key(video_id, timestamp): + """Returns a unique identifier for a video id & timestamp.""" + return f'{video_id},{int(timestamp):04d}' + + +def read_csv(csv_file, class_whitelist=None): + """Loads boxes and class labels from a CSV file in the AVA format. + + CSV file format described at https://research.google.com/ava/download.html. + + Args: + csv_file: A file object. + class_whitelist: If provided, boxes corresponding to (integer) class + labels not in this set are skipped. + + Returns: + boxes: A dictionary mapping each unique image key (string) to a list of + boxes, given as coordinates [y1, x1, y2, x2]. + labels: A dictionary mapping each unique image key (string) to a list + of integer class labels, matching the corresponding box in `boxes`. + scores: A dictionary mapping each unique image key (string) to a list + of score values labels, matching the corresponding label in `labels`. + If scores are not provided in the csv, then they will default to 1.0. 
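+
+    Example:
+        >>> # Two hypothetical rows in the AVA format (video_id, timestamp,
+        >>> # x1, y1, x2, y2, action_id and an optional score); any iterable
+        >>> # of csv lines, such as an open file, can be passed.
+        >>> rows = ['vid_001,0902,0.1,0.2,0.5,0.8,12,0.9',
+        ...         'vid_001,0902,0.3,0.1,0.7,0.6,17,0.4']
+        >>> boxes, labels, scores = read_csv(rows)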
+ """ + entries = defaultdict(list) + boxes = defaultdict(list) + labels = defaultdict(list) + scores = defaultdict(list) + reader = csv.reader(csv_file) + for row in reader: + assert len(row) in [7, 8], 'Wrong number of columns: ' + row + image_key = make_image_key(row[0], row[1]) + x1, y1, x2, y2 = [float(n) for n in row[2:6]] + action_id = int(row[6]) + if class_whitelist and action_id not in class_whitelist: + continue + + score = 1.0 + if len(row) == 8: + score = float(row[7]) + + entries[image_key].append((score, action_id, y1, x1, y2, x2)) + + for image_key in entries: + # Evaluation API assumes boxes with descending scores + entry = sorted(entries[image_key], key=lambda tup: -tup[0]) + boxes[image_key] = [x[2:] for x in entry] + labels[image_key] = [x[1] for x in entry] + scores[image_key] = [x[0] for x in entry] + + return boxes, labels, scores + + +def read_exclusions(exclusions_file): + """Reads a CSV file of excluded timestamps. + + Args: + exclusions_file: A file object containing a csv of video-id,timestamp. + + Returns: + A set of strings containing excluded image keys, e.g. + "aaaaaaaaaaa,0904", + or an empty set if exclusions file is None. + """ + excluded = set() + if exclusions_file: + reader = csv.reader(exclusions_file) + for row in reader: + assert len(row) == 2, f'Expected only 2 columns, got: {row}' + excluded.add(make_image_key(row[0], row[1])) + return excluded + + +def read_labelmap(labelmap_file): + """Reads a labelmap without the dependency on protocol buffers. + + Args: + labelmap_file: A file object containing a label map protocol buffer. + + Returns: + labelmap: The label map in the form used by the + object_detection_evaluation + module - a list of {"id": integer, "name": classname } dicts. + class_ids: A set containing all of the valid class id integers. 
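+
+    Example:
+        >>> # A minimal, hypothetical label map in the AVA pbtxt format; any
+        >>> # iterable of text lines, such as an open file, can be passed.
+        >>> lines = ['item {', '  name: "stand"', '  id: 1', '}']
+        >>> labelmap, class_ids = read_labelmap(lines)
+        >>> # labelmap == [{'id': 1, 'name': 'stand'}] and class_ids == {1}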
+ """ + labelmap = [] + class_ids = set() + name = '' + class_id = '' + for line in labelmap_file: + if line.startswith(' name:'): + name = line.split('"')[1] + elif line.startswith(' id:') or line.startswith(' label_id:'): + class_id = int(line.strip().split(' ')[-1]) + labelmap.append({'id': class_id, 'name': name}) + class_ids.add(class_id) + return labelmap, class_ids + + +def get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, + groundtruth_boxes): + + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes) + + iou = np_box_ops.iou(detected_boxlist.get(), gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, scores, num_boxes + + +def tpfp_single(tup, threshold=0.5): + gt_bboxes, gt_labels, bboxes, labels, scores = tup + ret_scores, ret_tp_fp_labels = dict(), dict() + all_labels = list(set(labels)) + for label in all_labels: + gt_bbox = np.array( + [x for x, y in zip(gt_bboxes, gt_labels) if y == label], + dtype=np.float32).reshape(-1, 4) + bbox = np.array([x for x, y in zip(bboxes, labels) if y == label], + dtype=np.float32).reshape(-1, 4) + score = np.array([x for x, y in zip(scores, labels) if y == label], + dtype=np.float32).reshape(-1) + iou, score, num_boxes = get_overlaps_and_scores_box_mode( + bbox, score, gt_bbox) + if gt_bbox.size == 0: + ret_scores[label] = score + ret_tp_fp_labels[label] = np.zeros(num_boxes, dtype=bool) + continue + tp_fp_labels = np.zeros(num_boxes, dtype=bool) + if iou.shape[1] > 0: + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= threshold: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + ret_scores[label], ret_tp_fp_labels[label] = score, tp_fp_labels + return ret_scores, ret_tp_fp_labels + + +# Seems there is at most 100 detections for each image +def ava_eval(result_file, + result_type, + label_file, + ann_file, + exclude_file, + verbose=True, + ignore_empty_frames=True, + custom_classes=None): + """Perform ava evaluation.""" + + assert result_type in ['mAP'] + start = time.time() + categories, class_whitelist = read_labelmap(open(label_file)) + if custom_classes is not None: + custom_classes = custom_classes[1:] + assert set(custom_classes).issubset(set(class_whitelist)) + class_whitelist = custom_classes + categories = [cat for cat in categories if cat['id'] in custom_classes] + + # loading gt, do not need gt score + gt_bboxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist) + if verbose: + print_time('Reading GT results', start) + + if exclude_file is not None: + excluded_keys = read_exclusions(open(exclude_file)) + else: + excluded_keys = list() + + start = time.time() + boxes, labels, scores = read_csv(open(result_file), class_whitelist) + if verbose: + print_time('Reading Detection results', start) + + start = time.time() + all_gt_labels = np.concatenate(list(gt_labels.values())) + gt_count = {k: np.sum(all_gt_labels == k) for k in class_whitelist} + + pool = multiprocessing.Pool(32) + if ignore_empty_frames: + tups = [(gt_bboxes[k], gt_labels[k], boxes[k], labels[k], scores[k]) + for k in gt_bboxes if k not in excluded_keys] + else: + tups = [(gt_bboxes.get(k, np.zeros((0, 4), dtype=np.float32)), + gt_labels.get(k, []), boxes[k], 
labels[k], scores[k]) + for k in boxes if k not in excluded_keys] + rets = pool.map(tpfp_single, tups) + + if verbose: + print_time('Calculating TP/FP', start) + + start = time.time() + scores, tpfps = defaultdict(list), defaultdict(list) + for score, tpfp in rets: + for k in score: + scores[k].append(score[k]) + tpfps[k].append(tpfp[k]) + + cls_AP = [] + for k in scores: + scores[k] = np.concatenate(scores[k]) + tpfps[k] = np.concatenate(tpfps[k]) + precision, recall = metrics.compute_precision_recall( + scores[k], tpfps[k], gt_count[k]) + ap = metrics.compute_average_precision(precision, recall) + class_name = [x['name'] for x in categories if x['id'] == k] + assert len(class_name) == 1 + class_name = class_name[0] + cls_AP.append((k, class_name, ap)) + if verbose: + print_time('Run Evaluator', start) + + print('Per-class results: ', flush=True) + for k, class_name, ap in cls_AP: + print(f'Index: {k}, Action: {class_name}: AP: {ap:.4f};', flush=True) + + overall = np.nanmean([x[2] for x in cls_AP]) + person_movement = np.nanmean([x[2] for x in cls_AP if x[0] <= 14]) + object_manipulation = np.nanmean([x[2] for x in cls_AP if 14 < x[0] < 64]) + person_interaction = np.nanmean([x[2] for x in cls_AP if 64 <= x[0]]) + + print('Overall Results: ', flush=True) + print(f'Overall mAP: {overall:.4f}', flush=True) + print(f'Person Movement mAP: {person_movement:.4f}', flush=True) + print(f'Object Manipulation mAP: {object_manipulation:.4f}', flush=True) + print(f'Person Interaction mAP: {person_interaction:.4f}', flush=True) + + results = {} + results['overall'] = overall + results['person_movement'] = person_movement + results['object_manipulation'] = object_manipulation + results['person_interaction'] = person_interaction + + if verbose: + for k, class_name, ap in cls_AP: + print(f'Class {class_name} AP: {ap:.4f}', flush=True) + + return results diff --git a/mmaction/evaluation/functional/eval_detection.py b/mmaction/evaluation/functional/eval_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..9a32a8d1c92631cdf649501f6859ce0e233f2fb3 --- /dev/null +++ b/mmaction/evaluation/functional/eval_detection.py @@ -0,0 +1,233 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +import numpy as np +from mmengine.logging import MMLogger, print_log + +from .accuracy import interpolated_precision_recall, pairwise_temporal_iou + + +class ActivityNetLocalization: + """Class to evaluate detection results on ActivityNet. + + Args: + ground_truth_filename (str | None): The filename of groundtruth. + Default: None. + prediction_filename (str | None): The filename of action detection + results. Default: None. + tiou_thresholds (np.ndarray): The thresholds of temporal iou to + evaluate. Default: ``np.linspace(0.5, 0.95, 10)``. + verbose (bool): Whether to print verbose logs. Default: False. + """ + + def __init__(self, + ground_truth_filename=None, + prediction_filename=None, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + verbose=False): + if not ground_truth_filename: + raise IOError('Please input a valid ground truth file.') + if not prediction_filename: + raise IOError('Please input a valid prediction file.') + self.ground_truth_filename = ground_truth_filename + self.prediction_filename = prediction_filename + self.tiou_thresholds = tiou_thresholds + self.verbose = verbose + self.ap = None + self.logger = MMLogger.get_current_instance() + # Import ground truth and predictions. 
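+        # The ground truth json maps each video id to a dict whose
+        # ``annotations`` list holds ``{'label': ..., 'segment': [start, end]}``
+        # entries, while the prediction json stores a ``results`` dict mapping
+        # video ids to lists of ``{'label': ..., 'segment': ..., 'score': ...}``.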
+ self.ground_truth, self.activity_index = self._import_ground_truth( + ground_truth_filename) + self.prediction = self._import_prediction(prediction_filename) + + if self.verbose: + log_msg = ( + '[INIT] Loaded ground_truth from ' + f'{self.ground_truth_filename}, prediction from ' + f'{self.prediction_filename}.\n' + f'Number of ground truth instances: {len(self.ground_truth)}\n' + f'Number of predictions: {len(self.prediction)}\n' + f'Fixed threshold for tiou score: {self.tiou_thresholds}') + print_log(log_msg, logger=self.logger) + + @staticmethod + def _import_ground_truth(ground_truth_filename): + """Read ground truth file and return the ground truth instances and the + activity classes. + + Args: + ground_truth_filename (str): Full path to the ground truth json + file. + + Returns: + tuple[list, dict]: (ground_truth, activity_index). + ground_truth contains the ground truth instances, which is in a + dict format. + activity_index contains classes index. + """ + with open(ground_truth_filename, 'r') as f: + data = json.load(f) + # Checking format + activity_index, class_idx = {}, 0 + ground_truth = [] + for video_id, video_info in data.items(): + for anno in video_info['annotations']: + if anno['label'] not in activity_index: + activity_index[anno['label']] = class_idx + class_idx += 1 + # old video_anno + ground_truth_item = {} + ground_truth_item['video-id'] = video_id[2:] + ground_truth_item['t-start'] = float(anno['segment'][0]) + ground_truth_item['t-end'] = float(anno['segment'][1]) + ground_truth_item['label'] = activity_index[anno['label']] + ground_truth.append(ground_truth_item) + + return ground_truth, activity_index + + def _import_prediction(self, prediction_filename): + """Read prediction file and return the prediction instances. + + Args: + prediction_filename (str): Full path to the prediction json file. + + Returns: + List: List containing the prediction instances (dictionaries). + """ + with open(prediction_filename, 'r') as f: + data = json.load(f) + # Read predictions. + prediction = [] + for video_id, video_info in data['results'].items(): + for result in video_info: + prediction_item = dict() + prediction_item['video-id'] = video_id + prediction_item['label'] = self.activity_index[result['label']] + prediction_item['t-start'] = float(result['segment'][0]) + prediction_item['t-end'] = float(result['segment'][1]) + prediction_item['score'] = result['score'] + prediction.append(prediction_item) + + return prediction + + def wrapper_compute_average_precision(self): + """Computes average precision for each class.""" + ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index))) + + # Adaptation to query faster + ground_truth_by_label = [] + prediction_by_label = [] + for i in range(len(self.activity_index)): + ground_truth_by_label.append([]) + prediction_by_label.append([]) + for gt in self.ground_truth: + ground_truth_by_label[gt['label']].append(gt) + for pred in self.prediction: + prediction_by_label[pred['label']].append(pred) + + for i in range(len(self.activity_index)): + ap_result = compute_average_precision_detection( + ground_truth_by_label[i], prediction_by_label[i], + self.tiou_thresholds) + ap[:, i] = ap_result + + return ap + + def evaluate(self): + """Evaluates a prediction file. + + For the detection task we measure the interpolated mean average + precision to measure the performance of a method. 
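+
+        Example:
+            >>> # Hypothetical file names; both files must follow the
+            >>> # ActivityNet annotation / result json formats.
+            >>> anet_localizer = ActivityNetLocalization(
+            ...     'activity_net_gt.json', 'localization_results.json')
+            >>> mAP, average_mAP = anet_localizer.evaluate()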
+ """ + self.ap = self.wrapper_compute_average_precision() + + self.mAP = self.ap.mean(axis=1) + self.average_mAP = self.mAP.mean() + + return self.mAP, self.average_mAP + + +def compute_average_precision_detection(ground_truth, + prediction, + tiou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """Compute average precision (detection task) between ground truth and + predictions data frames. If multiple predictions occurs for the same + predicted segment, only the one with highest score is matches as true + positive. This code is greatly inspired by Pascal VOC devkit. + + Args: + ground_truth (list[dict]): List containing the ground truth instances + (dictionaries). Required keys are 'video-id', 't-start' and + 't-end'. + prediction (list[dict]): List containing the prediction instances + (dictionaries). Required keys are: 'video-id', 't-start', 't-end' + and 'score'. + tiou_thresholds (np.ndarray): A 1darray indicates the temporal + intersection over union threshold, which is optional. + Default: ``np.linspace(0.5, 0.95, 10)``. + + Returns: + Float: ap, Average precision score. + """ + num_thresholds = len(tiou_thresholds) + num_gts = len(ground_truth) + num_preds = len(prediction) + ap = np.zeros(num_thresholds) + if len(prediction) == 0: + return ap + + num_positive = float(num_gts) + lock_gt = np.ones((num_thresholds, num_gts)) * -1 + # Sort predictions by decreasing score order. + prediction.sort(key=lambda x: -x['score']) + # Initialize true positive and false positive vectors. + tp = np.zeros((num_thresholds, num_preds)) + fp = np.zeros((num_thresholds, num_preds)) + + # Adaptation to query faster + ground_truth_by_videoid = {} + for i, item in enumerate(ground_truth): + item['index'] = i + ground_truth_by_videoid.setdefault(item['video-id'], []).append(item) + + # Assigning true positive to truly grount truth instances. + for idx, pred in enumerate(prediction): + if pred['video-id'] in ground_truth_by_videoid: + gts = ground_truth_by_videoid[pred['video-id']] + else: + fp[:, idx] = 1 + continue + + tiou_arr = pairwise_temporal_iou( + np.array([pred['t-start'], pred['t-end']]), + np.array([np.array([gt['t-start'], gt['t-end']]) for gt in gts])) + tiou_arr = tiou_arr.reshape(-1) + # We would like to retrieve the predictions with highest tiou score. + tiou_sorted_idx = tiou_arr.argsort()[::-1] + for t_idx, tiou_threshold in enumerate(tiou_thresholds): + for j_idx in tiou_sorted_idx: + if tiou_arr[j_idx] < tiou_threshold: + fp[t_idx, idx] = 1 + break + if lock_gt[t_idx, gts[j_idx]['index']] >= 0: + continue + # Assign as true positive after the filters above. 
+ tp[t_idx, idx] = 1 + lock_gt[t_idx, gts[j_idx]['index']] = idx + break + + if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: + fp[t_idx, idx] = 1 + + tp_cumsum = np.cumsum(tp, axis=1).astype(np.float64) + fp_cumsum = np.cumsum(fp, axis=1).astype(np.float64) + recall_cumsum = tp_cumsum / num_positive + + precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) + + for t_idx in range(len(tiou_thresholds)): + ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], + recall_cumsum[t_idx, :]) + + return ap diff --git a/mmaction/evaluation/functional/multisports_utils.py b/mmaction/evaluation/functional/multisports_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9d4d08e6eb3a73781cef34883c113c5cb3d6b451 --- /dev/null +++ b/mmaction/evaluation/functional/multisports_utils.py @@ -0,0 +1,685 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/MCG-NJU/MultiSports +# Original licence: Copyright (c) MCG-NJU, under the MIT License. +# ------------------------------------------------------------------------------ + +import math +from collections import defaultdict + +import numpy as np +from mmengine.logging import MMLogger +from rich.progress import track + + +def area2d_voc(b): + """Compute the areas for a set of 2D boxes.""" + return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) + + +def overlap2d_voc(b1, b2): + """Compute the overlaps between a set of boxes b1 and one box b2.""" + xmin = np.maximum(b1[:, 0], b2[:, 0]) + ymin = np.maximum(b1[:, 1], b2[:, 1]) + xmax = np.minimum(b1[:, 2], b2[:, 2]) + ymax = np.minimum(b1[:, 3], b2[:, 3]) + + width = np.maximum(0, xmax - xmin) + height = np.maximum(0, ymax - ymin) + + return width * height + + +def iou2d_voc(b1, b2): + """Compute the IoU between a set of boxes b1 and 1 box b2.""" + if b1.ndim == 1: + b1 = b1[None, :] + if b2.ndim == 1: + b2 = b2[None, :] + + assert b2.shape[0] == 1 + + ov = overlap2d_voc(b1, b2) + + return ov / (area2d_voc(b1) + area2d_voc(b2) - ov) + + +def iou3d_voc(b1, b2): + """Compute the IoU between two tubes with same temporal extent.""" + assert b1.shape[0] == b2.shape[0] + assert np.all(b1[:, 0] == b2[:, 0]) + + ov = overlap2d_voc(b1[:, 1:5], b2[:, 1:5]) + + return np.mean(ov / (area2d_voc(b1[:, 1:5]) + area2d_voc(b2[:, 1:5]) - ov)) + + +def iou3dt_voc(b1, b2, spatialonly=False, temporalonly=False): + """Compute the spatio-temporal IoU between two tubes.""" + tmin = max(b1[0, 0], b2[0, 0]) + tmax = min(b1[-1, 0], b2[-1, 0]) + + if tmax < tmin: + return 0.0 + + temporal_inter = tmax - tmin + temporal_union = max(b1[-1, 0], b2[-1, 0]) - min(b1[0, 0], b2[0, 0]) + + tube1 = b1[int(np.where( + b1[:, 0] == tmin)[0]):int(np.where(b1[:, 0] == tmax)[0]) + 1, :] + tube2 = b2[int(np.where( + b2[:, 0] == tmin)[0]):int(np.where(b2[:, 0] == tmax)[0]) + 1, :] + + if temporalonly: + return temporal_inter / temporal_union + return iou3d_voc(tube1, tube2) * (1. 
if spatialonly else temporal_inter / + temporal_union) + + +def pr_to_ap_voc(pr): + precision = pr[:, 0] + recall = pr[:, 1] + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def nms_tubelets(dets, overlapThresh=0.3, top_k=None): + """Compute the NMS for a set of scored tubelets scored tubelets are numpy + array with 4K+1 columns, last one being the score return the indices of the + tubelets to keep.""" + + # If there are no detections, return an empty list + if len(dets) == 0: + return dets + if top_k is None: + top_k = len(dets) + + K = int((dets.shape[1] - 1) / 4) + + # Coordinates of bounding boxes + x1 = [dets[:, 4 * k] for k in range(K)] + y1 = [dets[:, 4 * k + 1] for k in range(K)] + x2 = [dets[:, 4 * k + 2] for k in range(K)] + y2 = [dets[:, 4 * k + 3] for k in range(K)] + + # Compute the area of the bounding boxes and sort the bounding + # boxes by the bottom-right y-coordinate of the bounding box + # area = (x2 - x1 + 1) * (y2 - y1 + 1) + scores = dets[:, -1] + area = [(x2[k] - x1[k] + 1) * (y2[k] - y1[k] + 1) for k in range(K)] + order = np.argsort(scores)[::-1] + weight = np.zeros_like(scores) + 1 + counter = 0 + + while order.size > 0: + i = order[0] + counter += 1 + + # Compute overlap + xx1 = [np.maximum(x1[k][i], x1[k][order[1:]]) for k in range(K)] + yy1 = [np.maximum(y1[k][i], y1[k][order[1:]]) for k in range(K)] + xx2 = [np.minimum(x2[k][i], x2[k][order[1:]]) for k in range(K)] + yy2 = [np.minimum(y2[k][i], y2[k][order[1:]]) for k in range(K)] + + w = [np.maximum(0, xx2[k] - xx1[k] + 1) for k in range(K)] + h = [np.maximum(0, yy2[k] - yy1[k] + 1) for k in range(K)] + + inter_area = [w[k] * h[k] for k in range(K)] + ious = sum([ + inter_area[k] / (area[k][order[1:]] + area[k][i] - inter_area[k]) + for k in range(K) + ]) + index = np.where(ious > overlapThresh * K)[0] + weight[order[index + 1]] = 1 - ious[index] + + index2 = np.where(ious <= overlapThresh * K)[0] + order = order[index2 + 1] + + dets[:, -1] = dets[:, -1] * weight + + new_scores = dets[:, -1] + new_order = np.argsort(new_scores)[::-1] + dets = dets[new_order, :] + + return dets[:top_k, :] + + +class Dataset(): + + def __init__(self, anno, frm_alldets) -> None: + self.anno = anno + self.video_list = self.anno['test_videos'][0] + self.nframes = self.anno['nframes'] + self.labels = self.anno['labels'] + self.frm_alldets = frm_alldets + + def get_vid_dets(self): + self.vid_frm_det = defaultdict(list) + for frm_det in self.frm_alldets: + vid_idx = int(frm_det[0]) + vid_name = self.video_list[vid_idx] + self.vid_frm_det[vid_name].append(frm_det) + + self.vid_det = dict() + for vid_name, vid_frm_dets in self.vid_frm_det.items(): + self.vid_det[vid_name] = dict() + for frm_idx in range(1, self.nframes[vid_name] + 1): + self.vid_det[vid_name][frm_idx] = dict() + for label_idx in range(len(self.labels)): + self.vid_det[vid_name][frm_idx][label_idx] = np.empty( + shape=(0, 5)) + for frm_dets in vid_frm_dets: + frm_idx = int(frm_dets[1]) + label_idx = int(frm_dets[2]) + det = [*frm_dets[-4:], frm_det[3]] + det = np.array(det)[None, :] + + self.vid_det[vid_name][frm_idx][label_idx] = np.concatenate( + 
[self.vid_det[vid_name][frm_idx][label_idx], det]) + + return self.vid_det + + +def link_tubes(anno, frm_dets, K=1, len_thre=15): + + dataset = Dataset(anno, frm_dets) + vlist = dataset.video_list + total_VDets = dataset.get_vid_dets() + + total_video_tubes = {label: [] for label in range(len(dataset.labels))} + for v in track(vlist, description='linking tubes...'): + + RES = {} + if v not in total_VDets: + continue + VDets = total_VDets[v] + for ilabel in range(len(dataset.labels)): + FINISHED_TUBES = [] + CURRENT_TUBES = [] # tubes is a list of tuple (frame, lstubelets) + + # calculate average scores of tubelets in tubes + + def tubescore(tt): + return np.mean( + np.array([tt[i][1][-1] for i in range(len(tt))])) + + for frame in range(1, dataset.nframes[v] + 2 - K): + # load boxes of the new frame and do nms while keeping Nkeep highest scored # noqa: E501 + ltubelets = np.array( + VDets[frame][ilabel] + ) # [:,range(4*K) + [4*K + 1 + ilabel]] Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score # noqa: E501 + + ltubelets = nms_tubelets(ltubelets, 0.6, top_k=10) + + # just start new tubes + if frame == 1: + for i in range(ltubelets.shape[0]): + CURRENT_TUBES.append([(1, ltubelets[i, :])]) + continue + + # sort current tubes according to average score + avgscore = [tubescore(t) for t in CURRENT_TUBES] + argsort = np.argsort(-np.array(avgscore)) + CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort] + # loop over tubes + finished = [] + for it, t in enumerate(CURRENT_TUBES): + # compute ious between the last box of t and ltubelets + last_frame, last_tubelet = t[-1] + ious = [] + offset = frame - last_frame + if offset < K: + nov = K - offset + ious = sum([ + iou2d_voc( + ltubelets[:, 4 * iov:4 * iov + 4], + last_tubelet[4 * (iov + offset):4 * + (iov + offset + 1)]) + for iov in range(nov) + ]) / float(nov) + else: + ious = iou2d_voc(ltubelets[:, :4], + last_tubelet[4 * K - 4:4 * K]) + + valid = np.where(ious >= 0.5)[0] + + if valid.size > 0: + # take the one with maximum score + idx = valid[np.argmax(ltubelets[valid, -1])] + CURRENT_TUBES[it].append((frame, ltubelets[idx, :])) + ltubelets = np.delete(ltubelets, idx, axis=0) + else: + if offset >= K: + finished.append(it) + + # finished tubes that are done + for it in finished[:: + -1]: # process in reverse order to delete them with the right index why --++-- # noqa: E501 + FINISHED_TUBES.append(CURRENT_TUBES[it][:]) + del CURRENT_TUBES[it] + + # start new tubes + for i in range(ltubelets.shape[0]): + CURRENT_TUBES.append([(frame, ltubelets[i, :])]) + + # all tubes are not finished + FINISHED_TUBES += CURRENT_TUBES + + # build real tubes + output = [] + for t in FINISHED_TUBES: + score = tubescore(t) + + # just start new tubes + if score < 0.005: + continue + + beginframe = t[0][0] + endframe = t[-1][0] + K - 1 + length = endframe + 1 - beginframe + + # delete tubes with short duraton + if length < len_thre: + continue + + # build final tubes by average the tubelets + out = np.zeros((length, 6), dtype=np.float32) + out[:, 0] = np.arange(beginframe, endframe + 1) + n_per_frame = np.zeros((length, 1), dtype=np.int32) + for i in range(len(t)): + frame, box = t[i] + for k in range(K): + out[frame - beginframe + k, + 1:5] += box[4 * k:4 * k + 4] + out[frame - beginframe + k, + -1] += box[-1] # single frame confidence + n_per_frame[frame - beginframe + k, 0] += 1 + out[:, 1:] /= n_per_frame + output.append([out, score]) + # out: [num_frames, (frame idx, x1, y1, x2, y2, score)] + + RES[ilabel] = output + if output: + for tube, tube_score in output: + 
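+                    # Each linked tube is stored as (video_name, tube_score,
+                    # tube), where ``tube`` is a [num_frames, 6] array of
+                    # (frame index, x1, y1, x2, y2, per-frame score) rows and
+                    # ``tube_score`` is the averaged tubelet score.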
video_tube_res = tuple([v, tube_score, tube]) + total_video_tubes[ilabel].append(video_tube_res) + return total_video_tubes + + +def frameAP(GT, alldets, thr, print_info=True): + logger = MMLogger.get_current_instance() + vlist = GT['test_videos'][0] + + results = {} + for ilabel, label in enumerate(GT['labels']): + # detections of this class + if label in [ + 'aerobic kick jump', 'aerobic off axis jump', + 'aerobic butterfly jump', 'aerobic balance turn', + 'basketball save', 'basketball jump ball' + ]: + if print_info: + logger.info('do not evaluate {}'.format(label)) + continue + # det format: # noqa: E501 + detections = alldets[alldets[:, 2] == ilabel, :] + + # load ground-truth of this class + gt = {} + for iv, v in enumerate(vlist): + tubes = GT['gttubes'][v] + + if ilabel not in tubes: + continue + + for tube in tubes[ilabel]: + for i in range(tube.shape[0]): + k = (iv, int(tube[i, 0])) # k -> (video_idx, frame_idx) + if k not in gt: + gt[k] = [] + gt[k].append(tube[i, 1:5].tolist()) + + for k in gt: + gt[k] = np.array(gt[k]) + + # pr will be an array containing precision-recall values + pr = np.empty((detections.shape[0], 2), + dtype=np.float64) # precision,recall + gt_num = sum([g.shape[0] for g in gt.values()]) + if gt_num == 0: + if print_info: + logger.info('no such label', ilabel, label) + continue + fp = 0 # false positives + tp = 0 # true positives + + is_gt_box_detected = {} + for i, j in enumerate(np.argsort(-detections[:, 3])): + k = (int(detections[j, 0]), int(detections[j, 1])) + box = detections[j, 4:8] + ispositive = False + + if k in gt: + # match gt_box according to the iou + if k not in is_gt_box_detected: + is_gt_box_detected[k] = np.zeros( + gt[k].shape[0], dtype=bool) + ious = iou2d_voc(gt[k], box) + amax = np.argmax(ious) + + if ious[amax] >= thr: + if not is_gt_box_detected[k][amax]: + ispositive = True + is_gt_box_detected[k][amax] = True + + if ispositive: + tp += 1 + else: + fp += 1 + pr[i, 0] = float(tp) / float(tp + fp) + pr[i, 1] = float(tp) / float(gt_num) + + results[label] = pr + + # display results + ap = 100 * np.array([pr_to_ap_voc(results[label]) for label in results]) + class_result = {} + for label in results: + class_result[label] = pr_to_ap_voc(results[label]) * 100 + frameap_result = np.mean(ap) + if print_info: + logger.info('frameAP_{}\n'.format(thr)) + for label in class_result: + logger.info('{:20s} {:8.2f}'.format(label, class_result[label])) + logger.info('{:20s} {:8.2f}'.format('mAP', frameap_result)) + return frameap_result + + +def videoAP(GT, alldets, thr, print_info=True): + logger = MMLogger.get_current_instance() + vlist = GT['test_videos'][0] + + res = {} + for ilabel in range(len(GT['labels'])): + if GT['labels'][ilabel] in [ + 'aerobic kick jump', 'aerobic off axis jump', + 'aerobic butterfly jump', 'aerobic balance turn', + 'basketball save', 'basketball jump ball' + ]: + if print_info: + logger.info('do not evaluate{}'.format(GT['labels'][ilabel])) + continue + detections = alldets[ilabel] + # load ground-truth + gt = {} + for v in vlist: + tubes = GT['gttubes'][v] + + if ilabel not in tubes: + continue + + gt[v] = tubes[ilabel] + + if len(gt[v]) == 0: + del gt[v] + + # precision,recall + pr = np.empty((len(detections), 2), dtype=np.float64) + + gt_num = sum([len(g) for g in gt.values()]) # false negatives + fp = 0 # false positives + tp = 0 # true positives + if gt_num == 0: + if print_info: + logger.info('no such label', ilabel, GT['labels'][ilabel]) + continue + is_gt_box_detected = {} + for i, j in enumerate( + 
np.argsort(-np.array([dd[1] for dd in detections]))): + v, score, tube = detections[j] + ispositive = False + if v in gt: + if v not in is_gt_box_detected: + is_gt_box_detected[v] = np.zeros(len(gt[v]), dtype=bool) + ious = [iou3dt_voc(g, tube) for g in gt[v]] + amax = np.argmax(ious) + if ious[amax] >= thr: + if not is_gt_box_detected[v][amax]: + ispositive = True + is_gt_box_detected[v][amax] = True + + if ispositive: + tp += 1 + else: + fp += 1 + + pr[i, 0] = float(tp) / float(tp + fp) + pr[i, 1] = float(tp) / float(gt_num) + res[GT['labels'][ilabel]] = pr + + # display results + ap = 100 * np.array([pr_to_ap_voc(res[label]) for label in res]) + videoap_result = np.mean(ap) + class_result = {} + for label in res: + class_result[label] = pr_to_ap_voc(res[label]) * 100 + if print_info: + logger.info('VideoAP_{}\n'.format(thr)) + for label in class_result: + logger.info('{:20s} {:8.2f}'.format(label, class_result[label])) + logger.info('{:20s} {:8.2f}'.format('mAP', videoap_result)) + return videoap_result + + +def videoAP_all(groundtruth, detections): + high_ap = 0 + for i in range(10): + thr = 0.5 + 0.05 * i + high_ap += videoAP(groundtruth, detections, thr, print_info=False) + high_ap = high_ap / 10.0 + + low_ap = 0 + for i in range(9): + thr = 0.05 + 0.05 * i + low_ap += videoAP(groundtruth, detections, thr, print_info=False) + low_ap = low_ap / 9.0 + + all_ap = 0 + for i in range(9): + thr = 0.1 + 0.1 * i + all_ap += videoAP(groundtruth, detections, thr, print_info=False) + all_ap = all_ap / 9.0 + + map = { + 'v_map_0.05:0.45': round(low_ap, 4), + 'v_map_0.10:0.90': round(all_ap, 4), + 'v_map_0.50:0.95': round(high_ap, 4), + } + return map + + +def videoAP_error(GT, alldets, thr): + + vlist = GT['test_videos'][0] + + th_s = math.sqrt(thr) + th_t = math.sqrt(thr) + + print('th is', thr) + print('th_s is', th_s) + print('th_t is', th_t) + + res = {} + dupgt = {} + for v in vlist: + dupgt[v] = GT['gttubes'][v] + # compute video error for every class + for ilabel in range(len(GT['labels'])): + if GT['labels'][ilabel] in [ + 'aerobic kick jump', 'aerobic off axis jump', + 'aerobic butterfly jump', 'aerobic balance turn', + 'basketball save', 'basketball jump ball' + ]: + print('do not evaluate {}'.format(GT['labels'][ilabel])) + continue + detections = alldets[ilabel] + + pr = np.zeros((len(detections), 11), dtype=np.float32) + + gt_num = 0 + for v in dupgt: + if ilabel in dupgt[v]: + gt_num = gt_num + len(dupgt[v][ilabel]) + fp = 0 # false positives + tp = 0 # true positives + ER = 0 # repeat error repeat predict for the same instance + EN = 0 # extra error + EL = 0 # localization errors + EC = 0 # classification error + ET = 0 # timing error + ErrCT = 0 # cls + time + ECL = 0 # cls + loc + ETL = 0 # time + loc + ECTL = 0 # cls + time + loc + + is_gt_box_detected = {} + for i, j in enumerate( + np.argsort(-np.array([dd[1] for dd in detections]))): + v, score, tube = detections[j] + ispositive = False + end = False + if ilabel in dupgt[v]: + if v not in is_gt_box_detected: + is_gt_box_detected[v] = np.zeros( + len(dupgt[v][ilabel]), dtype=bool) + ious = [iou3dt_voc(g, tube) for g in dupgt[v][ilabel]] + amax = np.argmax(ious) + if ious[amax] >= thr: + if not is_gt_box_detected[v][amax]: + ispositive = True + is_gt_box_detected[v][amax] = True + else: + ER += 1 + end = True + if end is False: + ious = [] + for ll in dupgt[v]: + if ll == ilabel: + continue + for g in dupgt[v][ll]: + ious.append(iou3dt_voc(g, tube)) + if ious != []: + amax = np.argmax(ious) + if ious[amax] >= thr: + EC += 1 + 
end = True + if end is False: + all_gt = [] + ious = [] + for ll in dupgt[v]: + for g in dupgt[v][ll]: + all_gt.append((ll, g)) + ious.append(iou3dt_voc(g, tube)) + amax = np.argmax(ious) + assert (ious[amax] < thr) + if ious[amax] > 0: + t_iou = iou3dt_voc( + all_gt[amax][1], tube, temporalonly=True) + s_iou = iou3dt_voc(all_gt[amax][1], tube, spatialonly=True) + if all_gt[amax][0] == ilabel: + assert (t_iou < th_t or s_iou < th_s) + if t_iou >= th_t: + EL += 1 + end = True + elif s_iou >= th_s: + ET += 1 + end = True + else: + ETL += 1 + end = True + else: + assert (t_iou < th_t or s_iou < th_s) + if t_iou >= th_t: + ECL += 1 + end = True + elif s_iou >= th_s: + ErrCT += 1 + end = True + else: + ECTL += 1 + end = True + else: + EN += 1 + end = True + assert (end is True) + if ispositive: + tp += 1 + # fn -= 1 + else: + fp += 1 + assert (fp == (ER + EN + EL + EC + ET + ErrCT + ECL + ETL + ECTL)) + pr[i, 0] = max(float(tp) / float(tp + fp), 0.) + pr[i, 1] = max(float(tp) / float(gt_num), 0.) + pr[i, 2] = max(float(ER) / float(tp + fp), 0.) + pr[i, 3] = max(float(EN) / float(tp + fp), 0.) + pr[i, 4] = max(float(EL) / float(tp + fp), 0.) + pr[i, 5] = max(float(EC) / float(tp + fp), 0.) + pr[i, 6] = max(float(ET) / float(tp + fp), 0.) + pr[i, 7] = max(float(ErrCT) / float(tp + fp), 0.) + pr[i, 8] = max(float(ECL) / float(tp + fp), 0.) + pr[i, 9] = max(float(ETL) / float(tp + fp), 0.) + pr[i, 10] = max(float(ECTL) / float(tp + fp), 0.) + + res[GT['labels'][ilabel]] = pr + + # display results + AP = 100 * np.array([pr_to_ap_voc(res[label][:, [0, 1]]) for label in res]) + othersap = [ + 100 * np.array([pr_to_ap_voc(res[label][:, [j, 1]]) for label in res]) + for j in range(2, 11) + ] + + ER = othersap[0] + EN = othersap[1] + EL = othersap[2] + EC = othersap[3] + ET = othersap[4] + ErrCT = othersap[5] + ECL = othersap[6] + ETL = othersap[7] + ECTL = othersap[8] + # missed detections = 1-recalll + EM = [] + for label in res: + if res[label].shape[0] != 0: + EM.append(100 - 100 * res[label][-1, 1]) + else: + EM.append(100) + EM = np.array(EM) + + LIST = [AP, ER, EN, EL, EC, ET, ErrCT, ECL, ETL, ECTL, EM] + + print('Error Analysis') + + print('') + print( + '{:20s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s}' # noqa: E501 + .format('label', ' AP ', ' Repeat ', ' Extra ', ' Loc. ', ' Cls. ', + ' Time ', ' Cls.+Time ', ' Cls.+Loc. ', ' Time+Loc. ', + ' C+T+L ', ' missed ')) + print('') + for il, label in enumerate(res): + print('{:20s} '.format(label) + + ' '.join(['{:8.2f}'.format(L[il]) for L in LIST])) + print('') + print('{:20s} '.format('mean') + + ' '.join(['{:8.2f}'.format(np.mean(L)) for L in LIST])) + print('') diff --git a/mmaction/evaluation/metrics/__init__.py b/mmaction/evaluation/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f864f243029813aed0ff2b2659ecbbeace55162a --- /dev/null +++ b/mmaction/evaluation/metrics/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
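(Aside, not part of this patch: the per-class precision/recall pairs accumulated in frameAP and videoAP above are reduced to a single AP value by pr_to_ap_voc. Below is a minimal, self-contained sketch of that reduction on a toy curve; the helper name voc_ap and the numbers in toy_pr are invented for illustration only.)

import numpy as np


def voc_ap(pr):
    # Same interpolation as pr_to_ap_voc: column 0 is precision, column 1
    # is recall, with rows ordered by descending detection score.
    precision = np.concatenate([[0], pr[:, 0], [0]])
    recall = np.concatenate([[0], pr[:, 1], [1]])
    # Make the precision envelope non-increasing, then integrate it over
    # the recall steps.
    for i in range(len(precision) - 2, -1, -1):
        precision[i] = max(precision[i], precision[i + 1])
    idx = np.where(recall[1:] != recall[:-1])[0] + 1
    return np.sum((recall[idx] - recall[idx - 1]) * precision[idx])


# Two detections against two ground-truth boxes: the first is a true
# positive (precision 1.0, recall 0.5), the second a false positive
# (precision drops to 0.5, recall stays at 0.5).
toy_pr = np.array([[1.0, 0.5], [0.5, 0.5]])
print(voc_ap(toy_pr))  # 0.5: half the ground truth recovered at precision 1.0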
+from .acc_metric import AccMetric, ConfusionMatrix +from .anet_metric import ANetMetric +from .ava_metric import AVAMetric +from .multimodal_metric import VQAMCACC, ReportVQA, RetrievalRecall, VQAAcc +from .multisports_metric import MultiSportsMetric +from .retrieval_metric import RetrievalMetric +from .video_grounding_metric import RecallatTopK + +__all__ = [ + 'AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix', + 'MultiSportsMetric', 'RetrievalMetric', 'VQAAcc', 'ReportVQA', 'VQAMCACC', + 'RetrievalRecall', 'RecallatTopK' +] diff --git a/mmaction/evaluation/metrics/__pycache__/__init__.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..535cef4cfdb65aad40f241ecc27c170f26a5dcb5 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/acc_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/acc_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82ee5db5b6b0e8ff9e739f25ede0bb454569b22c Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/acc_metric.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/anet_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/anet_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..377424e93a1ef256785098a62c0ec500674241b2 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/anet_metric.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/ava_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/ava_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abfad5c3f4c40f4cd8791e8be9db736300243cd9 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/ava_metric.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/multimodal_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/multimodal_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37b389e3e8747a3f1ff0e6f60b04c35acc063ae3 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/multimodal_metric.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/multisports_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/multisports_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..574474d072c4a4e99930b0aabb29eee3b7a9a95d Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/multisports_metric.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/retrieval_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/retrieval_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e870c5c46a334c4a800d46e14e83863c98548b3f Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/retrieval_metric.cpython-310.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/video_grounding_metric.cpython-310.pyc b/mmaction/evaluation/metrics/__pycache__/video_grounding_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb7433584ec87ac46b486a31299e24478c7aca49 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/video_grounding_metric.cpython-310.pyc differ diff --git 
a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..91020fbcccd0e7a4824a29b296eb9129f5d10ec8 --- /dev/null +++ b/mmaction/evaluation/metrics/acc_metric.py @@ -0,0 +1,387 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import OrderedDict +from itertools import product +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import numpy as np +import torch +from mmengine.evaluator import BaseMetric + +from mmaction.evaluation import (get_weighted_score, mean_average_precision, + mean_class_accuracy, + mmit_mean_average_precision, top_k_accuracy) +from mmaction.registry import METRICS + + +def to_tensor(value): + """Convert value to torch.Tensor.""" + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.is_str(value): + value = torch.tensor(value) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'{type(value)} is not an available argument.') + return value + + +@METRICS.register_module() +class AccMetric(BaseMetric): + """Accuracy evaluation metric.""" + default_prefix: Optional[str] = 'acc' + + def __init__(self, + metric_list: Optional[Union[str, Tuple[str]]] = ( + 'top_k_accuracy', 'mean_class_accuracy'), + collect_device: str = 'cpu', + metric_options: Optional[Dict] = dict( + top_k_accuracy=dict(topk=(1, 5))), + prefix: Optional[str] = None) -> None: + + # TODO: fix the metric_list argument with a better one. + # `metrics` is not a safe argument here with mmengine. + # we have to replace it with `metric_list`. + super().__init__(collect_device=collect_device, prefix=prefix) + if not isinstance(metric_list, (str, tuple)): + raise TypeError('metric_list must be str or tuple of str, ' + f'but got {type(metric_list)}') + + if isinstance(metric_list, str): + metrics = (metric_list, ) + else: + metrics = metric_list + + # coco evaluation metrics + for metric in metrics: + assert metric in [ + 'top_k_accuracy', 'mean_class_accuracy', + 'mmit_mean_average_precision', 'mean_average_precision' + ] + + self.metrics = metrics + self.metric_options = metric_options + + def process(self, data_batch: Sequence[Tuple[Any, Dict]], + data_samples: Sequence[Dict]) -> None: + """Process one batch of data samples and data_samples. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[dict]): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_score'] + label = data_sample['gt_label'] + + # Ad-hoc for RGBPoseConv3D + if isinstance(pred, dict): + for item_name, score in pred.items(): + pred[item_name] = score.cpu().numpy() + else: + pred = pred.cpu().numpy() + + result['pred'] = pred + if label.size(0) == 1: + # single-label + result['label'] = label.item() + else: + # multi-label + result['label'] = label.cpu().numpy() + self.results.append(result) + + def compute_metrics(self, results: List) -> Dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + labels = [x['label'] for x in results] + + eval_results = dict() + # Ad-hoc for RGBPoseConv3D + if isinstance(results[0]['pred'], dict): + + for item_name in results[0]['pred'].keys(): + preds = [x['pred'][item_name] for x in results] + eval_result = self.calculate(preds, labels) + eval_results.update( + {f'{item_name}_{k}': v + for k, v in eval_result.items()}) + + if len(results[0]['pred']) == 2 and \ + 'rgb' in results[0]['pred'] and \ + 'pose' in results[0]['pred']: + + rgb = [x['pred']['rgb'] for x in results] + pose = [x['pred']['pose'] for x in results] + + preds = { + '1:1': get_weighted_score([rgb, pose], [1, 1]), + '2:1': get_weighted_score([rgb, pose], [2, 1]), + '1:2': get_weighted_score([rgb, pose], [1, 2]) + } + for k in preds: + eval_result = self.calculate(preds[k], labels) + eval_results.update({ + f'RGBPose_{k}_{key}': v + for key, v in eval_result.items() + }) + return eval_results + + # Simple Acc Calculation + else: + preds = [x['pred'] for x in results] + return self.calculate(preds, labels) + + def calculate(self, preds: List[np.ndarray], + labels: List[Union[int, np.ndarray]]) -> Dict: + """Compute the metrics from processed results. + + Args: + preds (list[np.ndarray]): List of the prediction scores. + labels (list[int | np.ndarray]): List of the labels. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + eval_results = OrderedDict() + metric_options = copy.deepcopy(self.metric_options) + for metric in self.metrics: + if metric == 'top_k_accuracy': + topk = metric_options.setdefault('top_k_accuracy', + {}).setdefault( + 'topk', (1, 5)) + + if not isinstance(topk, (int, tuple)): + raise TypeError('topk must be int or tuple of int, ' + f'but got {type(topk)}') + + if isinstance(topk, int): + topk = (topk, ) + + top_k_acc = top_k_accuracy(preds, labels, topk) + for k, acc in zip(topk, top_k_acc): + eval_results[f'top{k}'] = acc + + if metric == 'mean_class_accuracy': + mean1 = mean_class_accuracy(preds, labels) + eval_results['mean1'] = mean1 + + if metric in [ + 'mean_average_precision', + 'mmit_mean_average_precision', + ]: + if metric == 'mean_average_precision': + mAP = mean_average_precision(preds, labels) + eval_results['mean_average_precision'] = mAP + + elif metric == 'mmit_mean_average_precision': + mAP = mmit_mean_average_precision(preds, labels) + eval_results['mmit_mean_average_precision'] = mAP + + return eval_results + + +@METRICS.register_module() +class ConfusionMatrix(BaseMetric): + r"""A metric to calculate confusion matrix for single-label tasks. + + Args: + num_classes (int, optional): The number of classes. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + + Examples: + + 1. The basic usage. 
+ + >>> import torch + >>> from mmaction.evaluation import ConfusionMatrix + >>> y_pred = [0, 1, 1, 3] + >>> y_true = [0, 2, 1, 3] + >>> ConfusionMatrix.calculate(y_pred, y_true, num_classes=4) + tensor([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1]]) + >>> # plot the confusion matrix + >>> import matplotlib.pyplot as plt + >>> y_score = torch.rand((1000, 10)) + >>> y_true = torch.randint(10, (1000, )) + >>> matrix = ConfusionMatrix.calculate(y_score, y_true) + >>> ConfusionMatrix().plot(matrix) + >>> plt.show() + + 2. In the config file + + .. code:: python + + val_evaluator = dict(type='ConfusionMatrix') + test_evaluator = dict(type='ConfusionMatrix') + """ # noqa: E501 + default_prefix = 'confusion_matrix' + + def __init__(self, + num_classes: Optional[int] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + self.num_classes = num_classes + + def process(self, data_batch, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + pred_scores = data_sample.get('pred_score') + gt_label = data_sample['gt_label'] + if pred_scores is not None: + pred_label = pred_scores.argmax(dim=0, keepdim=True) + self.num_classes = pred_scores.size(0) + else: + pred_label = data_sample['pred_label'] + + self.results.append({ + 'pred_label': pred_label, + 'gt_label': gt_label + }) + + def compute_metrics(self, results: list) -> dict: + pred_labels = [] + gt_labels = [] + for result in results: + pred_labels.append(result['pred_label']) + gt_labels.append(result['gt_label']) + confusion_matrix = ConfusionMatrix.calculate( + torch.cat(pred_labels), + torch.cat(gt_labels), + num_classes=self.num_classes) + return {'result': confusion_matrix} + + @staticmethod + def calculate(pred, target, num_classes=None) -> dict: + """Calculate the confusion matrix for single-label task. + + Args: + pred (torch.Tensor | np.ndarray | Sequence): The prediction + results. It can be labels (N, ), or scores of every + class (N, C). + target (torch.Tensor | np.ndarray | Sequence): The target of + each prediction with shape (N, ). + num_classes (Optional, int): The number of classes. If the ``pred`` + is label instead of scores, this argument is required. + Defaults to None. + + Returns: + torch.Tensor: The confusion matrix. + """ + pred = to_tensor(pred) + target_label = to_tensor(target).int() + + assert pred.size(0) == target_label.size(0), \ + f"The size of pred ({pred.size(0)}) doesn't match "\ + f'the target ({target_label.size(0)}).' + assert target_label.ndim == 1 + + if pred.ndim == 1: + assert num_classes is not None, \ + 'Please specify the `num_classes` if the `pred` is labels ' \ + 'intead of scores.' + pred_label = pred + else: + num_classes = num_classes or pred.size(1) + pred_label = torch.argmax(pred, dim=1).flatten() + + with torch.no_grad(): + indices = num_classes * target_label + pred_label + matrix = torch.bincount(indices, minlength=num_classes**2) + matrix = matrix.reshape(num_classes, num_classes) + + return matrix + + @staticmethod + def plot(confusion_matrix: torch.Tensor, + include_values: bool = False, + cmap: str = 'viridis', + classes: Optional[List[str]] = None, + colorbar: bool = True, + show: bool = True): + """Draw a confusion matrix by matplotlib. + + Modified from `Scikit-Learn + `_ + + Args: + confusion_matrix (torch.Tensor): The confusion matrix to draw. + include_values (bool): Whether to draw the values in the figure. + Defaults to False. + cmap (str): The color map to use. 
Defaults to use "viridis". + classes (list[str], optional): The names of categories. + Defaults to None, which means to use index number. + colorbar (bool): Whether to show the colorbar. Defaults to True. + show (bool): Whether to show the figure immediately. + Defaults to True. + """ # noqa: E501 + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(10, 10)) + + num_classes = confusion_matrix.size(0) + + im_ = ax.imshow(confusion_matrix, interpolation='nearest', cmap=cmap) + text_ = None + cmap_min, cmap_max = im_.cmap(0), im_.cmap(1.0) + + if include_values: + text_ = np.empty_like(confusion_matrix, dtype=object) + + # print text with appropriate color depending on background + thresh = (confusion_matrix.max() + confusion_matrix.min()) / 2.0 + + for i, j in product(range(num_classes), range(num_classes)): + color = cmap_max if confusion_matrix[i, + j] < thresh else cmap_min + + text_cm = format(confusion_matrix[i, j], '.2g') + text_d = format(confusion_matrix[i, j], 'd') + if len(text_d) < len(text_cm): + text_cm = text_d + + text_[i, j] = ax.text( + j, i, text_cm, ha='center', va='center', color=color) + + display_labels = classes or np.arange(num_classes) + + if colorbar: + fig.colorbar(im_, ax=ax) + ax.set( + xticks=np.arange(num_classes), + yticks=np.arange(num_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel='True label', + xlabel='Predicted label', + ) + ax.invert_yaxis() + ax.xaxis.tick_top() + + ax.set_ylim((num_classes - 0.5, -0.5)) + # Automatically rotate the x labels. + fig.autofmt_xdate(ha='center') + + if show: + plt.show() + return fig diff --git a/mmaction/evaluation/metrics/anet_metric.py b/mmaction/evaluation/metrics/anet_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..c569eca15584e8cdd001b61abf86fbdda58183e5 --- /dev/null +++ b/mmaction/evaluation/metrics/anet_metric.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +from collections import OrderedDict +from typing import Any, Optional, Sequence, Tuple + +import mmcv +import mmengine +import numpy as np +from mmengine.evaluator import BaseMetric + +from mmaction.evaluation import average_recall_at_avg_proposals +from mmaction.registry import METRICS +from mmaction.utils import ConfigType + + +@METRICS.register_module() +class ANetMetric(BaseMetric): + """ActivityNet dataset evaluation metric.""" + + def __init__(self, + metric_type: str = 'TEM', + collect_device: str = 'cpu', + prefix: Optional[str] = None, + metric_options: dict = {}, + dump_config: ConfigType = dict(out='')): + super().__init__(collect_device=collect_device, prefix=prefix) + self.metric_type = metric_type + + assert 'out' in dump_config + self.output_format = dump_config.pop('output_format', 'csv') + self.out = dump_config['out'] + + self.metric_options = metric_options + if self.metric_type == 'AR@AN': + self.ground_truth = {} + + def process(self, data_batch: Sequence[Tuple[Any, dict]], + predictions: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[Tuple[Any, dict]]): A batch of data + from the dataloader. + predictions (Sequence[dict]): A batch of outputs from + the model. 
+ """ + for pred in predictions: + self.results.append(pred) + + if self.metric_type == 'AR@AN': + data_batch = data_batch['data_samples'] + for data_sample in data_batch: + video_info = data_sample.metainfo + video_id = video_info['video_name'][2:] + this_video_gt = [] + for ann in video_info['annotations']: + t_start, t_end = ann['segment'] + label = ann['label'] + this_video_gt.append([t_start, t_end, label]) + self.ground_truth[video_id] = np.array(this_video_gt) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + If `metric_type` is 'TEM', only dump middle results and do not compute + any metrics. + Args: + results (list): The processed results of each batch. + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + self.dump_results(results) + if self.metric_type == 'AR@AN': + return self.compute_ARAN(results) + return OrderedDict() + + def compute_ARAN(self, results: list) -> dict: + """AR@AN evaluation metric.""" + temporal_iou_thresholds = self.metric_options.setdefault( + 'AR@AN', {}).setdefault('temporal_iou_thresholds', + np.linspace(0.5, 0.95, 10)) + max_avg_proposals = self.metric_options.setdefault( + 'AR@AN', {}).setdefault('max_avg_proposals', 100) + if isinstance(temporal_iou_thresholds, list): + temporal_iou_thresholds = np.array(temporal_iou_thresholds) + + eval_results = OrderedDict() + proposal, num_proposals = self._import_proposals(results) + + recall, _, _, auc = average_recall_at_avg_proposals( + self.ground_truth, + proposal, + num_proposals, + max_avg_proposals=max_avg_proposals, + temporal_iou_thresholds=temporal_iou_thresholds) + eval_results['auc'] = auc + eval_results['AR@1'] = np.mean(recall[:, 0]) + eval_results['AR@5'] = np.mean(recall[:, 4]) + eval_results['AR@10'] = np.mean(recall[:, 9]) + eval_results['AR@100'] = np.mean(recall[:, 99]) + + return eval_results + + def dump_results(self, results, version='VERSION 1.3'): + """Save middle or final results to disk.""" + if self.output_format == 'json': + result_dict = self.proposals2json(results) + output_dict = { + 'version': version, + 'results': result_dict, + 'external_data': {} + } + mmengine.dump(output_dict, self.out) + elif self.output_format == 'csv': + os.makedirs(self.out, exist_ok=True) + header = 'action,start,end,tmin,tmax' + for result in results: + video_name, outputs = result + output_path = osp.join(self.out, video_name + '.csv') + np.savetxt( + output_path, + outputs, + header=header, + delimiter=',', + comments='') + else: + raise ValueError( + f'The output format {self.output_format} is not supported.') + + @staticmethod + def proposals2json(results, show_progress=False): + """Convert all proposals to a final dict(json) format. + Args: + results (list[dict]): All proposals. + show_progress (bool): Whether to show the progress bar. + Defaults: False. + Returns: + dict: The final result dict. E.g. + .. code-block:: Python + dict(video-1=[dict(segment=[1.1,2.0]. 
score=0.9), + dict(segment=[50.1, 129.3], score=0.6)]) + """ + result_dict = {} + print('Convert proposals to json format') + if show_progress: + prog_bar = mmcv.ProgressBar(len(results)) + for result in results: + video_name = result['video_name'] + result_dict[video_name[2:]] = result['proposal_list'] + if show_progress: + prog_bar.update() + return result_dict + + @staticmethod + def _import_proposals(results): + """Read predictions from results.""" + proposals = {} + num_proposals = 0 + for result in results: + video_id = result['video_name'][2:] + this_video_proposals = [] + for proposal in result['proposal_list']: + t_start, t_end = proposal['segment'] + score = proposal['score'] + this_video_proposals.append([t_start, t_end, score]) + num_proposals += 1 + proposals[video_id] = np.array(this_video_proposals) + return proposals, num_proposals diff --git a/mmaction/evaluation/metrics/ava_metric.py b/mmaction/evaluation/metrics/ava_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..470309697e8c9a8be08bcc653f76957ae06ef4df --- /dev/null +++ b/mmaction/evaluation/metrics/ava_metric.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from datetime import datetime +from typing import Any, List, Optional, Sequence, Tuple + +from mmengine.evaluator import BaseMetric + +from mmaction.evaluation import ava_eval, results2csv +from mmaction.registry import METRICS +from mmaction.structures import bbox2result + + +@METRICS.register_module() +class AVAMetric(BaseMetric): + """AVA evaluation metric.""" + default_prefix: Optional[str] = 'mAP' + + def __init__(self, + ann_file: str, + exclude_file: str, + label_file: str, + options: Tuple[str] = ('mAP', ), + action_thr: float = 0.002, + num_classes: int = 81, + custom_classes: Optional[List[int]] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + assert len(options) == 1 + self.ann_file = ann_file + self.exclude_file = exclude_file + self.label_file = label_file + self.num_classes = num_classes + self.options = options + self.action_thr = action_thr + self.custom_classes = custom_classes + if custom_classes is not None: + self.custom_classes = list([0] + custom_classes) + + def process(self, data_batch: Sequence[Tuple[Any, dict]], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[Tuple[Any, dict]]): A batch of data + from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from + the model. + """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['video_id'] = data_sample['video_id'] + result['timestamp'] = data_sample['timestamp'] + outputs = bbox2result( + pred['bboxes'], + pred['scores'], + num_classes=self.num_classes, + thr=self.action_thr) + result['outputs'] = outputs + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + time_now = datetime.now().strftime('%Y%m%d_%H%M%S') + temp_file = f'AVA_{time_now}_result.csv' + results2csv(results, temp_file, self.custom_classes) + + eval_results = ava_eval( + temp_file, + self.options[0], + self.label_file, + self.ann_file, + self.exclude_file, + ignore_empty_frames=True, + custom_classes=self.custom_classes) + + os.remove(temp_file) + + return eval_results diff --git a/mmaction/evaluation/metrics/multimodal_metric.py b/mmaction/evaluation/metrics/multimodal_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..433d65bcf93a2dd8972933adb05f3d4a03991a92 --- /dev/null +++ b/mmaction/evaluation/metrics/multimodal_metric.py @@ -0,0 +1,565 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copied from mmpretrain +# Partly adopted from https://github.com/GT-Vision-Lab/VQA +# Copyright (c) 2014, Aishwarya Agrawal +from typing import List, Optional, Sequence, Union + +import mmengine +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger +from mmengine.utils import is_seq_of + +from mmaction.registry import METRICS +from mmaction.structures.action_data_sample import format_label +from .acc_metric import to_tensor + + +def _process_punctuation(inText): + import re + outText = inText + punct = [ + ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', + '>', '<', '@', '`', ',', '?', '!' + ] + commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605 + periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605 + for p in punct: + if (p + ' ' in inText or ' ' + p in inText) or (re.search( + commaStrip, inText) is not None): + outText = outText.replace(p, '') + else: + outText = outText.replace(p, ' ') + outText = periodStrip.sub('', outText, re.UNICODE) + return outText + + +def _process_digit_article(inText): + outText = [] + tempText = inText.lower().split() + articles = ['a', 'an', 'the'] + manualMap = { + 'none': '0', + 'zero': '0', + 'one': '1', + 'two': '2', + 'three': '3', + 'four': '4', + 'five': '5', + 'six': '6', + 'seven': '7', + 'eight': '8', + 'nine': '9', + 'ten': '10', + } + contractions = { + 'aint': "ain't", + 'arent': "aren't", + 'cant': "can't", + 'couldve': "could've", + 'couldnt': "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + 'didnt': "didn't", + 'doesnt': "doesn't", + 'dont': "don't", + 'hadnt': "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + 'hasnt': "hasn't", + 'havent': "haven't", + 'hed': "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + 'hes': "he's", + 'howd': "how'd", + 'howll': "how'll", + 'hows': "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + 'Im': "I'm", + 'Ive': "I've", + 'isnt': "isn't", + 'itd': "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + 'itll': "it'll", + "let's": "let's", + 'maam': "ma'am", + 'mightnt': "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + 'mightve': "might've", + 'mustnt': "mustn't", + 'mustve': "must've", + 'neednt': "needn't", + 'notve': "not've", + 'oclock': "o'clock", + 'oughtnt': "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + 'shant': "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + 'shouldve': "should've", + 'shouldnt': "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": 'somebodyd', + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + 'somebodyll': 
"somebody'll", + 'somebodys': "somebody's", + 'someoned': "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + 'someonell': "someone'll", + 'someones': "someone's", + 'somethingd': "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + 'somethingll': "something'll", + 'thats': "that's", + 'thered': "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + 'therere': "there're", + 'theres': "there's", + 'theyd': "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + 'theyll': "they'll", + 'theyre': "they're", + 'theyve': "they've", + 'twas': "'twas", + 'wasnt': "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + 'weve': "we've", + 'werent': "weren't", + 'whatll': "what'll", + 'whatre': "what're", + 'whats': "what's", + 'whatve': "what've", + 'whens': "when's", + 'whered': "where'd", + 'wheres': "where's", + 'whereve': "where've", + 'whod': "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + 'wholl': "who'll", + 'whos': "who's", + 'whove': "who've", + 'whyll': "why'll", + 'whyre': "why're", + 'whys': "why's", + 'wont': "won't", + 'wouldve': "would've", + 'wouldnt': "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + 'yall': "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + 'youd': "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + 'youll': "you'll", + 'youre': "you're", + 'youve': "you've", + } + for word in tempText: + word = manualMap.setdefault(word, word) + if word not in articles: + outText.append(word) + for wordId, word in enumerate(outText): + if word in contractions: + outText[wordId] = contractions[word] + outText = ' '.join(outText) + return outText + + +@METRICS.register_module() +class VQAAcc(BaseMetric): + '''VQA Acc metric. + Args: + + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + ''' + default_prefix = 'VQA' + + def __init__(self, + full_score_weight: float = 0.3, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + self.full_score_weight = full_score_weight + + def process(self, data_batch, data_samples): + """Process one batch of data samples. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for sample in data_samples: + gt_answer = sample.get('gt_answer') + gt_answer_weight = sample.get('gt_answer_weight') + if isinstance(gt_answer, str): + gt_answer = [gt_answer] + if gt_answer_weight is None: + gt_answer_weight = [1. / (len(gt_answer))] * len(gt_answer) + + result = { + 'pred_answer': sample.get('pred_answer'), + 'gt_answer': gt_answer, + 'gt_answer_weight': gt_answer_weight, + } + + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. 
+ + Args: + results (dict): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + acc = [] + for result in results: + pred_answer = self._process_answer(result['pred_answer']) + gt_answer = [ + self._process_answer(answer) for answer in result['gt_answer'] + ] + answer_weight = result['gt_answer_weight'] + + weight_sum = 0 + for i, gt in enumerate(gt_answer): + if gt == pred_answer: + weight_sum += answer_weight[i] + vqa_acc = min(1.0, weight_sum / self.full_score_weight) + acc.append(vqa_acc) + + accuracy = sum(acc) / len(acc) * 100 + + metrics = {'acc': accuracy} + return metrics + + def _process_answer(self, answer): + answer = answer.replace('\n', ' ') + answer = answer.replace('\t', ' ') + answer = answer.strip() + answer = _process_punctuation(answer) + answer = _process_digit_article(answer) + return answer + + +@METRICS.register_module() +class ReportVQA(BaseMetric): + """Dump VQA result to the standard json format for VQA evaluation. + + Args: + file_path (str): The file path to save the result file. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + """ + default_prefix = 'VQA' + + def __init__(self, + file_path: str, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + if not file_path.endswith('.json'): + raise ValueError('The output file must be a json file.') + self.file_path = file_path + + def process(self, data_batch, data_samples) -> None: + """transfer tensors in predictions to CPU.""" + for sample in data_samples: + question_id = sample['question_id'] + pred_answer = sample['pred_answer'] + + result = { + 'question_id': int(question_id), + 'answer': pred_answer, + } + + self.results.append(result) + + def compute_metrics(self, results: List): + """Dump the result to json file.""" + mmengine.dump(results, self.file_path) + logger = MMLogger.get_current_instance() + logger.info(f'Results has been saved to {self.file_path}.') + return {} + + +@METRICS.register_module() +class VQAMCACC(BaseMetric): + '''VQA multiple choice Acc metric. + Args: + + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + ''' + default_prefix = 'VQAMC' + + def __init__(self, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + + def process(self, data_batch, data_samples): + """Process one batch of data samples. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. 
+ + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for sample in data_samples: + # gt_labels in datasample is a LabelData + label = sample['gt_label'].item() + result = { + 'pred_label': sample.get('pred_label'), + 'gt_label': label, + } + + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (dict): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + preds = np.array([x['pred_label'] for x in results]) + labels = np.array([x['gt_label'] for x in results]) + + accuracy = np.sum(preds == labels) / len(preds) * 100 + + metrics = {'acc': accuracy} + return metrics + + +@METRICS.register_module() +class RetrievalRecall(BaseMetric): + r"""Recall evaluation metric for image retrieval. + + Args: + topk (int | Sequence[int]): If the ground truth label matches one of + the best **k** predictions, the sample will be regard as a positive + prediction. If the parameter is a tuple, all of top-k recall will + be calculated and outputted together. Defaults to 1. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + + """ + default_prefix: Optional[str] = 'retrieval' + + def __init__(self, + topk: Union[int, Sequence[int]], + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + topk = (topk, ) if isinstance(topk, int) else topk + + for k in topk: + if k <= 0: + raise ValueError('`topk` must be a ingter larger than 0 ' + 'or seq of ingter larger than 0.') + + self.topk = topk + super().__init__(collect_device=collect_device, prefix=prefix) + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]): + """Process one batch of data and predictions. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch (Sequence[dict]): A batch of data from the dataloader. + predictions (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred_score = data_sample['pred_score'].cpu() + gt_label = format_label(data_sample['gt_label']) + + if 'gt_score' in data_sample: + target = data_sample.get('gt_score').clone() + else: + num_classes = pred_score.size()[-1] + target = F.one_hot(gt_label, num_classes) + + # Because the retrieval output logit vector will be much larger + # compared to the normal classification, to save resources, the + # evaluation results are computed each batch here and then reduce + # all results at the end. + result = RetrievalRecall.calculate( + pred_score.unsqueeze(0), target.unsqueeze(0), topk=self.topk) + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + result_metrics = dict() + for i, k in enumerate(self.topk): + recall_at_k = sum([r[i].item() for r in results]) / len(results) + result_metrics[f'Recall@{k}'] = recall_at_k + + return result_metrics + + @staticmethod + def calculate(pred: Union[np.ndarray, torch.Tensor], + target: Union[np.ndarray, torch.Tensor], + topk: Union[int, Sequence[int]], + pred_indices: (bool) = False, + target_indices: (bool) = False) -> float: + """Calculate the average recall. + + Args: + pred (torch.Tensor | np.ndarray | Sequence): The prediction + results. A :obj:`torch.Tensor` or :obj:`np.ndarray` with + shape ``(N, M)`` or a sequence of index/onehot + format labels. + target (torch.Tensor | np.ndarray | Sequence): The prediction + results. A :obj:`torch.Tensor` or :obj:`np.ndarray` with + shape ``(N, M)`` or a sequence of index/onehot + format labels. + topk (int, Sequence[int]): Predictions with the k-th highest + scores are considered as positive. + pred_indices (bool): Whether the ``pred`` is a sequence of + category index labels. Defaults to False. + target_indices (bool): Whether the ``target`` is a sequence of + category index labels. Defaults to False. + + Returns: + List[float]: the average recalls. + """ + topk = (topk, ) if isinstance(topk, int) else topk + for k in topk: + if k <= 0: + raise ValueError('`topk` must be a ingter larger than 0 ' + 'or seq of ingter larger than 0.') + + max_keep = max(topk) + pred = _format_pred(pred, max_keep, pred_indices) + target = _format_target(target, target_indices) + + assert len(pred) == len(target), ( + f'Length of `pred`({len(pred)}) and `target` ({len(target)}) ' + f'must be the same.') + + num_samples = len(pred) + results = [] + for k in topk: + recalls = torch.zeros(num_samples) + for i, (sample_pred, + sample_target) in enumerate(zip(pred, target)): + sample_pred = np.array(to_tensor(sample_pred).cpu()) + sample_target = np.array(to_tensor(sample_target).cpu()) + recalls[i] = int(np.in1d(sample_pred[:k], sample_target).max()) + results.append(recalls.mean() * 100) + return results + + +def _format_pred(label, topk=None, is_indices=False): + """format various label to List[indices].""" + if is_indices: + assert isinstance(label, Sequence), \ + '`pred` must be Sequence of indices when' \ + f' `pred_indices` set to True, but get {type(label)}' + for i, sample_pred in enumerate(label): + assert is_seq_of(sample_pred, int) or isinstance( + sample_pred, (np.ndarray, torch.Tensor)), \ + '`pred` should be Sequence of indices when `pred_indices`' \ + f'set to True. but pred[{i}] is {sample_pred}' + if topk: + label[i] = sample_pred[:min(topk, len(sample_pred))] + return label + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + elif not isinstance(label, torch.Tensor): + raise TypeError(f'The pred must be type of torch.tensor, ' + f'np.ndarray or Sequence but get {type(label)}.') + topk = topk if topk else label.size()[-1] + _, indices = label.topk(topk) + return indices + + +def _format_target(label, is_indices=False): + """format various label to List[indices].""" + if is_indices: + assert isinstance(label, Sequence), \ + '`target` must be Sequence of indices when' \ + f' `target_indices` set to True, but get {type(label)}' + for i, sample_gt in enumerate(label): + assert is_seq_of(sample_gt, int) or isinstance( + sample_gt, (np.ndarray, torch.Tensor)), \ + '`target` should be Sequence of indices when ' \ + f'`target_indices` set to True. 
but target[{i}] is {sample_gt}' + return label + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + elif isinstance(label, Sequence) and not mmengine.is_str(label): + label = torch.tensor(label) + elif not isinstance(label, torch.Tensor): + raise TypeError(f'The pred must be type of torch.tensor, ' + f'np.ndarray or Sequence but get {type(label)}.') + + indices = [sample_gt.nonzero().squeeze(-1) for sample_gt in label] + return indices diff --git a/mmaction/evaluation/metrics/multisports_metric.py b/mmaction/evaluation/metrics/multisports_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..6b806d7863d5c86877b51c3e3e74481ea64714a7 --- /dev/null +++ b/mmaction/evaluation/metrics/multisports_metric.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Optional, Sequence, Tuple + +import numpy as np +from mmengine import load +from mmengine.evaluator import BaseMetric + +from mmaction.evaluation import frameAP, link_tubes, videoAP, videoAP_all +from mmaction.registry import METRICS + + +@METRICS.register_module() +class MultiSportsMetric(BaseMetric): + """MAP Metric for MultiSports dataset.""" + default_prefix: Optional[str] = 'mAP' + + def __init__(self, + ann_file: str, + metric_options: Optional[dict] = dict( + F_mAP=dict(thr=(0.5)), + V_mAP=dict(thr=(0.2, 0.5), all=True, tube_thr=15)), + collect_device: str = 'cpu', + verbose: bool = True, + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + + self.metric_options = metric_options + self.annos = load(ann_file) + self.verbose = verbose + + def process(self, data_batch: Sequence[Tuple[Any, dict]], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[Tuple[Any, dict]]): A batch of data + from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from + the model. + """ + + for pred in data_samples: + video_key = pred['video_id'].split('.mp4')[0] + frm_num = pred['timestamp'] + bboxes = pred['pred_instances']['bboxes'].cpu().numpy() + cls_scores = pred['pred_instances']['scores'].cpu().numpy() + det_result = [video_key, frm_num, bboxes, cls_scores] + + self.results.append(det_result) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + test_videos = self.annos['test_videos'][0] + resolutions = self.annos['resolution'] + detections = [] + for result in results: + video_key, frm_num, bboxes, cls_scores = result + for bbox, cls_score in zip(bboxes, cls_scores): + video_idx = test_videos.index(video_key) + pred_label = np.argmax(cls_score) + score = cls_score[pred_label] + h, w = resolutions[video_key] + bbox *= np.array([w, h, w, h]) + instance_result = np.array( + [video_idx, frm_num, pred_label, score, *bbox]) + detections.append(instance_result) + + frm_detections = np.array(detections) + + metric_result = dict() + f_map = frameAP(self.annos, frm_detections, + self.metric_options['F_mAP']['thr'], self.verbose) + metric_result.update({'frameAP': round(f_map, 4)}) + video_tubes = link_tubes( + self.annos, + frm_detections, + len_thre=self.metric_options['V_mAP']['tube_thr']) + + v_map = {} + for thr in self.metric_options['V_mAP']['thr']: + map = videoAP( + self.annos, video_tubes, thr, print_info=self.verbose) + v_map.update({f'v_map@{thr}': round(map, 4)}) + metric_result.update(v_map) + if self.metric_options['V_mAP'].get('all'): + all_map = videoAP_all(self.annos, video_tubes) + metric_result.update(all_map) + return metric_result diff --git a/mmaction/evaluation/metrics/retrieval_metric.py b/mmaction/evaluation/metrics/retrieval_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf847d5f95ff17519fc216aab02379d586cd0f1 --- /dev/null +++ b/mmaction/evaluation/metrics/retrieval_metric.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +from mmengine.evaluator import BaseMetric + +from mmaction.registry import METRICS + + +@METRICS.register_module() +class RetrievalMetric(BaseMetric): + """Metric for video retrieval task. + + Args: + metric_list (str | tuple[str]): The list of the metrics to be + computed. Defaults to ``('R1', 'R5', 'R10', 'MdR', 'MnR')``. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + + default_prefix = 'retrieval' + + def __init__(self, + metric_list: Union[Tuple[str], + str] = ('R1', 'R5', 'R10', 'MdR', 'MnR'), + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + if isinstance(metric_list, str): + metric_list = (metric_list, ) + + for metric in metric_list: + if metric not in ['R1', 'R5', 'R10', 'MdR', 'MnR']: + raise ValueError(f'RetrievalMetric only supports ' + f"'R1', 'R5', 'R10', 'MdR', 'MnR', " + f"but got '{metric}. '") + + self.metric_list = metric_list + + def process(self, data_batch: Optional[Dict], + data_samples: Sequence[Dict]) -> None: + """Process one batch of data samples and data_samples. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict, optional): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. 
+ """ + data_samples = copy.deepcopy(data_samples) + + for data_sample in data_samples: + results = dict() + features = data_sample['features'] + video_feature = features['video_feature'].cpu().numpy() + text_feature = features['text_feature'].cpu().numpy() + results['video_feature'] = video_feature + results['text_feature'] = text_feature + self.results.append(results) + + def compute_metrics(self, results: List) -> Dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + + video_features = np.stack([res['video_feature'] for res in results]) + text_features = np.stack([res['text_feature'] for res in results]) + + video_features = video_features / np.linalg.norm( + video_features, axis=-1, keepdims=True) + text_features = text_features / np.linalg.norm( + text_features, axis=-1, keepdims=True) + + similarity = text_features @ video_features.T + + sx = np.sort(-similarity) + d = np.diag(-similarity) + ind = np.where((sx - d[:, None]) == 0)[1] + + metrics = OrderedDict() + for metric in self.metric_list: + if metric == 'R1': + metrics['R1'] = float(np.sum(ind == 0)) * 100 / len(ind) + elif metric == 'R5': + metrics['R5'] = float(np.sum(ind < 5)) * 100 / len(ind) + elif metric == 'R10': + metrics['R10'] = float(np.sum(ind < 10)) * 100 / len(ind) + elif metric == 'MdR': + metrics['MdR'] = np.median(ind) + 1 + elif metric == 'MnR': + metrics['MnR'] = np.mean(ind) + 1 + + return metrics diff --git a/mmaction/evaluation/metrics/video_grounding_metric.py b/mmaction/evaluation/metrics/video_grounding_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..ba9307fb145956766d57c8f5f7a79cfc204196aa --- /dev/null +++ b/mmaction/evaluation/metrics/video_grounding_metric.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Optional, Sequence, Tuple + +from mmengine.evaluator import BaseMetric + +from mmaction.registry import METRICS + + +@METRICS.register_module() +class RecallatTopK(BaseMetric): + """ActivityNet dataset evaluation metric.""" + + def __init__(self, + topK_list: Tuple[int] = (1, 5), + threshold: float = 0.5, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topK_list = topK_list + self.threshold = threshold + + def process(self, data_batch: Sequence[Tuple[Any, dict]], + predictions: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[Tuple[Any, dict]]): A batch of data + from the dataloader. + predictions (Sequence[dict]): A batch of outputs from + the model. + """ + for pred in predictions: + self.results.append(pred) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + eval_results = dict() + for topK in self.topK_list: + total = len(results) + correct = 0.0 + for result in results: + gt = result['gt'] + predictions = result['predictions'][:topK] + for prediction in predictions: + IoU = self.calculate_IoU(gt, prediction) + if IoU > self.threshold: + correct += 1 + break + acc = correct / total + eval_results[f'Recall@Top{topK}_IoU={self.threshold}'] = acc + return eval_results + + def calculate_IoU(self, i0, i1): + union = (min(i0[0], i1[0]), max(i0[1], i1[1])) + inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) + iou = (inter[1] - inter[0]) / (union[1] - union[0]) + return iou diff --git a/mmaction/models/__init__.py b/mmaction/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6964616c5219a88f78571b4737798d18ec6721a --- /dev/null +++ b/mmaction/models/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .backbones import * # noqa: F401,F403 +from .common import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .heads import * # noqa: F401,F403 +from .localizers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .multimodal import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .recognizers import * # noqa: F401,F403 +from .roi_heads import * # noqa: F401,F403 +from .similarity import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 +from .utils import * # noqa: F401,F403 diff --git a/mmaction/models/__pycache__/__init__.cpython-310.pyc b/mmaction/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51a0d9643361ea66145d95dddc124d362fbb0cae Binary files /dev/null and b/mmaction/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b1cd9e76a2fde3e541001c5189d0825c736fc7b --- /dev/null +++ b/mmaction/models/backbones/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .aagcn import AAGCN +from .c2d import C2D +from .c3d import C3D +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v2_tsm import MobileNetV2TSM +from .mvit import MViT +from .resnet import ResNet +from .resnet2plus1d import ResNet2Plus1d +from .resnet3d import ResNet3d, ResNet3dLayer +from .resnet3d_csn import ResNet3dCSN +from .resnet3d_slowfast import ResNet3dSlowFast +from .resnet3d_slowonly import ResNet3dSlowOnly +from .resnet_audio import ResNetAudio +from .resnet_omni import OmniResNet +from .resnet_tin import ResNetTIN +from .resnet_tsm import ResNetTSM +from .rgbposeconv3d import RGBPoseConv3D +from .stgcn import STGCN +from .swin import SwinTransformer3D +from .tanet import TANet +from .timesformer import TimeSformer +from .uniformer import UniFormer +from .uniformerv2 import UniFormerV2 +from .vit_mae import VisionTransformer +from .x3d import X3D + +__all__ = [ + 'AAGCN', 'C2D', 'C3D', 'MViT', 'MobileNetV2', 'MobileNetV2TSM', + 'OmniResNet', 'ResNet', 'ResNet2Plus1d', 'ResNet3d', 'ResNet3dCSN', + 'ResNet3dLayer', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNetAudio', + 'ResNetTIN', 'ResNetTSM', 'STGCN', 'SwinTransformer3D', 'TANet', + 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D', + 'RGBPoseConv3D' +] + +try: + from .mobileone_tsm import MobileOneTSM # noqa: F401 + __all__.append('MobileOneTSM') + +except (ImportError, ModuleNotFoundError): + pass diff --git a/mmaction/models/backbones/__pycache__/__init__.cpython-310.pyc b/mmaction/models/backbones/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15115d2b520557e29673196535f1b17bf78c5f69 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/aagcn.cpython-310.pyc b/mmaction/models/backbones/__pycache__/aagcn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed018770e34b92bfe08a996f77759c2c8923d513 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/aagcn.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/c2d.cpython-310.pyc b/mmaction/models/backbones/__pycache__/c2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d7e7ea5d96e60a9c46a99528c0159824f779246 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/c2d.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/c3d.cpython-310.pyc b/mmaction/models/backbones/__pycache__/c3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7231910cf99166fefb66beffd174f137f94fc1f3 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/c3d.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mobilenet_v2.cpython-310.pyc b/mmaction/models/backbones/__pycache__/mobilenet_v2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4bf3b9eb3f7126305cf6563c52e4f8a92f502f7 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mobilenet_v2.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mobilenet_v2_tsm.cpython-310.pyc b/mmaction/models/backbones/__pycache__/mobilenet_v2_tsm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c48a1d4c85b2d6a942727d3bc0d8ada37e80276 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mobilenet_v2_tsm.cpython-310.pyc differ diff --git 
a/mmaction/models/backbones/__pycache__/mobileone_tsm.cpython-310.pyc b/mmaction/models/backbones/__pycache__/mobileone_tsm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0d5ee5d02f8aca3a33b3ebe8c8352eebbaf79d2 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mobileone_tsm.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mvit.cpython-310.pyc b/mmaction/models/backbones/__pycache__/mvit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e47e04230d8d077020c9142598a8cda11566ae3 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mvit.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da7d0a148ecf1ef00350210978e7c1a5c9bc09d8 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet2plus1d.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet2plus1d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62e14f86d561915f068d9edad2a64a7db753a9a8 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet2plus1d.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76b630065368a8d8550185612a38dc5c41fc9ef2 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d_csn.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet3d_csn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcc9f95780153eb7f66e6c713a1eceeaa0c41d0f Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d_csn.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d_slowfast.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet3d_slowfast.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38f4d4a620f0a78794984c5414b520055b0a10eb Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d_slowfast.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6bd7a54d5cfad244b3ac4258e40ba9a07b60c4a Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_audio.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet_audio.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48b599210c4632e78b736c4773c4b1a5865da0df Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_audio.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_omni.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet_omni.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..980ba86ac0246dae52d80e850679acb27f389c1f Binary files /dev/null and 
b/mmaction/models/backbones/__pycache__/resnet_omni.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_tin.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet_tin.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16626e36e26221e9ff40a31b452d96728d5c956e Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_tin.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_tsm.cpython-310.pyc b/mmaction/models/backbones/__pycache__/resnet_tsm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..512784048a6d054cacd43000b60c83a2284897bc Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_tsm.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/rgbposeconv3d.cpython-310.pyc b/mmaction/models/backbones/__pycache__/rgbposeconv3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94666d66ad8a7d35131bc7fe451d3b5a2a919f90 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/rgbposeconv3d.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/stgcn.cpython-310.pyc b/mmaction/models/backbones/__pycache__/stgcn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a13ae6657e3c128873da593568a7813f83db01de Binary files /dev/null and b/mmaction/models/backbones/__pycache__/stgcn.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/swin.cpython-310.pyc b/mmaction/models/backbones/__pycache__/swin.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ad60be3b9961fa0101e52e198390c98a6eced88 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/swin.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/tanet.cpython-310.pyc b/mmaction/models/backbones/__pycache__/tanet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7e952b59f9967b8129d28bb838bb5773192874c Binary files /dev/null and b/mmaction/models/backbones/__pycache__/tanet.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/timesformer.cpython-310.pyc b/mmaction/models/backbones/__pycache__/timesformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11a3dc3030c702edadfd216fd633b127aa13421d Binary files /dev/null and b/mmaction/models/backbones/__pycache__/timesformer.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/uniformer.cpython-310.pyc b/mmaction/models/backbones/__pycache__/uniformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f07d3d758c1161bc82b46d5f4d6370c270fcab6e Binary files /dev/null and b/mmaction/models/backbones/__pycache__/uniformer.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/uniformerv2.cpython-310.pyc b/mmaction/models/backbones/__pycache__/uniformerv2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0ffbc8a5ea12f305c79b39bc7fb985eb431ea76 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/uniformerv2.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/vit_mae.cpython-310.pyc b/mmaction/models/backbones/__pycache__/vit_mae.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e56112de0bd7d8844c33d866cef88c63ff1c3c44 Binary files /dev/null and 
b/mmaction/models/backbones/__pycache__/vit_mae.cpython-310.pyc differ diff --git a/mmaction/models/backbones/__pycache__/x3d.cpython-310.pyc b/mmaction/models/backbones/__pycache__/x3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b3bc712288da947c8484f547ce901dd76cc3a88 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/x3d.cpython-310.pyc differ diff --git a/mmaction/models/backbones/aagcn.py b/mmaction/models/backbones/aagcn.py new file mode 100644 index 0000000000000000000000000000000000000000..42a085bed65cc7811375de9324d410d3cf6e8652 --- /dev/null +++ b/mmaction/models/backbones/aagcn.py @@ -0,0 +1,236 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.model import BaseModule, ModuleList + +from mmaction.registry import MODELS +from ..utils import Graph, unit_aagcn, unit_tcn + + +class AAGCNBlock(BaseModule): + """The basic block of AAGCN. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + A (torch.Tensor): The adjacency matrix defined in the graph + with shape of `(num_subsets, num_nodes, num_nodes)`. + stride (int): Stride of the temporal convolution. Defaults to 1. + residual (bool): Whether to use residual connection. Defaults to True. + init_cfg (dict or list[dict], optional): Config to control + the initialization. Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + A: torch.Tensor, + stride: int = 1, + residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + + gcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'gcn_'} + tcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'tcn_'} + kwargs = { + k: v + for k, v in kwargs.items() if k[:4] not in ['gcn_', 'tcn_'] + } + assert len(kwargs) == 0, f'Invalid arguments: {kwargs}' + + tcn_type = tcn_kwargs.pop('type', 'unit_tcn') + assert tcn_type in ['unit_tcn', 'mstcn'] + gcn_type = gcn_kwargs.pop('type', 'unit_aagcn') + assert gcn_type in ['unit_aagcn'] + + self.gcn = unit_aagcn(in_channels, out_channels, A, **gcn_kwargs) + + if tcn_type == 'unit_tcn': + self.tcn = unit_tcn( + out_channels, out_channels, 9, stride=stride, **tcn_kwargs) + + self.relu = nn.ReLU() + + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + return self.relu(self.tcn(self.gcn(x)) + self.residual(x)) + + +@MODELS.register_module() +class AAGCN(BaseModule): + """AAGCN backbone, the attention-enhanced version of 2s-AGCN. + + Skeleton-Based Action Recognition with Multi-Stream + Adaptive Graph Convolutional Networks. + More details can be found in the `paper + `__ . + + Two-Stream Adaptive Graph Convolutional Networks for + Skeleton-Based Action Recognition. + More details can be found in the `paper + `__ . + + Args: + graph_cfg (dict): Config for building the graph. + in_channels (int): Number of input channels. Defaults to 3. + base_channels (int): Number of base channels. Defaults to 64. + data_bn_type (str): Type of the data bn layer. Defaults to ``'MVC'``. + num_person (int): Maximum number of people. 
Only used when + data_bn_type == 'MVC'. Defaults to 2. + num_stages (int): Total number of stages. Defaults to 10. + inflate_stages (list[int]): Stages to inflate the number of channels. + Defaults to ``[5, 8]``. + down_stages (list[int]): Stages to perform downsampling in + the time dimension. Defaults to ``[5, 8]``. + init_cfg (dict or list[dict], optional): Config to control + the initialization. Defaults to None. + + Examples: + >>> import torch + >>> from mmaction.models import AAGCN + >>> from mmaction.utils import register_all_modules + >>> + >>> register_all_modules() + >>> mode = 'stgcn_spatial' + >>> batch_size, num_person, num_frames = 2, 2, 150 + >>> + >>> # openpose-18 layout + >>> num_joints = 18 + >>> model = AAGCN(graph_cfg=dict(layout='openpose', mode=mode)) + >>> model.init_weights() + >>> inputs = torch.randn(batch_size, num_person, + ... num_frames, num_joints, 3) + >>> output = model(inputs) + >>> print(output.shape) + >>> + >>> # nturgb+d layout + >>> num_joints = 25 + >>> model = AAGCN(graph_cfg=dict(layout='nturgb+d', mode=mode)) + >>> model.init_weights() + >>> inputs = torch.randn(batch_size, num_person, + ... num_frames, num_joints, 3) + >>> output = model(inputs) + >>> print(output.shape) + >>> + >>> # coco layout + >>> num_joints = 17 + >>> model = AAGCN(graph_cfg=dict(layout='coco', mode=mode)) + >>> model.init_weights() + >>> inputs = torch.randn(batch_size, num_person, + ... num_frames, num_joints, 3) + >>> output = model(inputs) + >>> print(output.shape) + >>> + >>> # custom settings + >>> # disable the attention module to degenerate AAGCN to AGCN + >>> model = AAGCN(graph_cfg=dict(layout='coco', mode=mode), + ... gcn_attention=False) + >>> model.init_weights() + >>> output = model(inputs) + >>> print(output.shape) + torch.Size([2, 2, 256, 38, 18]) + torch.Size([2, 2, 256, 38, 25]) + torch.Size([2, 2, 256, 38, 17]) + torch.Size([2, 2, 256, 38, 17]) + """ + + def __init__(self, + graph_cfg: Dict, + in_channels: int = 3, + base_channels: int = 64, + data_bn_type: str = 'MVC', + num_person: int = 2, + num_stages: int = 10, + inflate_stages: List[int] = [5, 8], + down_stages: List[int] = [5, 8], + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + + self.graph = Graph(**graph_cfg) + A = torch.tensor( + self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + assert data_bn_type in ['MVC', 'VC', None] + self.data_bn_type = data_bn_type + self.in_channels = in_channels + self.base_channels = base_channels + self.num_person = num_person + self.num_stages = num_stages + self.inflate_stages = inflate_stages + self.down_stages = down_stages + + if self.data_bn_type == 'MVC': + self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1)) + elif self.data_bn_type == 'VC': + self.data_bn = nn.BatchNorm1d(in_channels * A.size(1)) + else: + self.data_bn = nn.Identity() + + lw_kwargs = [cp.deepcopy(kwargs) for i in range(num_stages)] + for k, v in kwargs.items(): + if isinstance(v, tuple) and len(v) == num_stages: + for i in range(num_stages): + lw_kwargs[i][k] = v[i] + lw_kwargs[0].pop('tcn_dropout', None) + + modules = [] + if self.in_channels != self.base_channels: + modules = [ + AAGCNBlock( + in_channels, + base_channels, + A.clone(), + 1, + residual=False, + **lw_kwargs[0]) + ] + + for i in range(2, num_stages + 1): + in_channels = base_channels + out_channels = base_channels * (1 + (i in inflate_stages)) + stride = 1 + (i in down_stages) + modules.append( + 
AAGCNBlock( + base_channels, + out_channels, + A.clone(), + stride=stride, + **lw_kwargs[i - 1])) + base_channels = out_channels + + if self.in_channels == self.base_channels: + self.num_stages -= 1 + + self.gcn = ModuleList(modules) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous() + if self.data_bn_type == 'MVC': + x = self.data_bn(x.view(N, M * V * C, T)) + else: + x = self.data_bn(x.view(N * M, V * C, T)) + + x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, + 2).contiguous().view(N * M, C, T, V) + + for i in range(self.num_stages): + x = self.gcn[i](x) + + x = x.reshape((N, M) + x.shape[1:]) + return x diff --git a/mmaction/models/backbones/c2d.py b/mmaction/models/backbones/c2d.py new file mode 100644 index 0000000000000000000000000000000000000000..42ca5eb9e697f8f10267302e3729a8e8c7aacb55 --- /dev/null +++ b/mmaction/models/backbones/c2d.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmaction.models.backbones.resnet import ResNet +from mmaction.registry import MODELS + + +@MODELS.register_module() +class C2D(ResNet): + """C2D backbone. + + Compared to ResNet-50, a temporal-pool is added after the first + bottleneck. Detailed structure is kept same as "video-nonlocal-net" repo. + Please refer to https://github.com/facebookresearch/video-nonlocal-net/blob + /main/scripts/run_c2d_baseline_400k.sh. + Please note that there are some improvements compared to "Non-local Neural + Networks" paper (https://arxiv.org/abs/1711.07971). + Differences are noted at https://github.com/facebookresearch/video-nonlocal + -net#modifications-for-improving-speed. + """ + + def _make_stem_layer(self) -> None: + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + self.conv1 = ConvModule( + self.in_channels, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.maxpool3d_1 = nn.MaxPool3d( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0)) + self.maxpool3d_2 = nn.MaxPool3d( + kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) + + def forward(self, x: torch.Tensor) \ + -> Union[torch.Tensor, Tuple[torch.Tensor]]: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + Union[torch.Tensor or Tuple[torch.Tensor]]: The feature of the + input samples extracted by the backbone. 
+ """ + + batches = x.shape[0] + + def _convert_to_2d(x: torch.Tensor) -> torch.Tensor: + """(N, C, T, H, W) -> (N x T, C, H, W)""" + x = x.permute((0, 2, 1, 3, 4)) + x = x.reshape(-1, x.shape[2], x.shape[3], x.shape[4]) + return x + + def _convert_to_3d(x: torch.Tensor) -> torch.Tensor: + """(N x T, C, H, W) -> (N, C, T, H, W)""" + x = x.reshape(batches, -1, x.shape[1], x.shape[2], x.shape[3]) + x = x.permute((0, 2, 1, 3, 4)) + return x + + x = _convert_to_2d(x) + x = self.conv1(x) + x = _convert_to_3d(x) + x = self.maxpool3d_1(x) + x = _convert_to_2d(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i == 0: + x = _convert_to_3d(x) + x = self.maxpool3d_2(x) + x = _convert_to_2d(x) + if i in self.out_indices: + x = _convert_to_3d(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) diff --git a/mmaction/models/backbones/c3d.py b/mmaction/models/backbones/c3d.py new file mode 100644 index 0000000000000000000000000000000000000000..9feee6652397d42ba6e2f0600721273cc0b75a70 --- /dev/null +++ b/mmaction/models/backbones/c3d.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger +from mmengine.model.weight_init import constant_init, kaiming_init, normal_init +from mmengine.runner import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class C3D(nn.Module): + """C3D backbone. + + Args: + pretrained (str | None): Name of pretrained model. + style (str): ``pytorch`` or ``caffe``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: 'pytorch'. + conv_cfg (dict | None): Config dict for convolution layer. + If set to None, it uses ``dict(type='Conv3d')`` to construct + layers. Default: None. + norm_cfg (dict | None): Config for norm layers. required keys are + ``type``, Default: None. + act_cfg (dict | None): Config dict for activation layer. If set to + None, it uses ``dict(type='ReLU')`` to construct layers. + Default: None. + out_dim (int): The dimension of last layer feature (after flatten). + Depends on the input shape. Default: 8192. + dropout_ratio (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation of fc layers. Default: 0.01. 
+ """ + + def __init__(self, + pretrained=None, + style='pytorch', + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + out_dim=8192, + dropout_ratio=0.5, + init_std=0.005): + super().__init__() + if conv_cfg is None: + conv_cfg = dict(type='Conv3d') + if act_cfg is None: + act_cfg = dict(type='ReLU') + self.pretrained = pretrained + self.style = style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.dropout_ratio = dropout_ratio + self.init_std = init_std + + c3d_conv_param = dict( + kernel_size=(3, 3, 3), + padding=(1, 1, 1), + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv1a = ConvModule(3, 64, **c3d_conv_param) + self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) + + self.conv2a = ConvModule(64, 128, **c3d_conv_param) + self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv3a = ConvModule(128, 256, **c3d_conv_param) + self.conv3b = ConvModule(256, 256, **c3d_conv_param) + self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv4a = ConvModule(256, 512, **c3d_conv_param) + self.conv4b = ConvModule(512, 512, **c3d_conv_param) + self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv5a = ConvModule(512, 512, **c3d_conv_param) + self.conv5b = ConvModule(512, 512, **c3d_conv_param) + self.pool5 = nn.MaxPool3d( + kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) + + self.fc6 = nn.Linear(out_dim, 4096) + self.fc7 = nn.Linear(4096, 4096) + + self.relu = nn.ReLU() + self.dropout = nn.Dropout(p=self.dropout_ratio) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, nn.Linear): + normal_init(m, std=self.init_std) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + the size of x is (num_batches, 3, 16, 112, 112). + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. + """ + x = self.conv1a(x) + x = self.pool1(x) + + x = self.conv2a(x) + x = self.pool2(x) + + x = self.conv3a(x) + x = self.conv3b(x) + x = self.pool3(x) + + x = self.conv4a(x) + x = self.conv4b(x) + x = self.pool4(x) + + x = self.conv5a(x) + x = self.conv5b(x) + x = self.pool5(x) + + x = x.flatten(start_dim=1) + x = self.relu(self.fc6(x)) + x = self.dropout(x) + x = self.relu(self.fc7(x)) + + return x diff --git a/mmaction/models/backbones/mobilenet_v2.py b/mmaction/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..daa32e725f841fcb00a6e86807e0459f13e23211 --- /dev/null +++ b/mmaction/models/backbones/mobilenet_v2.py @@ -0,0 +1,324 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Optional, Union + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS + + +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number down to the nearest value that can + be divisible by the divisor. + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int, optional): The minimum value of the output channel. + Defaults to None, means that the minimum value equal to the + divisor. + min_ratio (float, optional): The minimum ratio of the rounded channel + number to the original channel number. Defaults to 0.9. + Returns: + int: The modified output channel number + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + Returns: + Tensor: The output tensor + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=1, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The output of the module. 
+ """ + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@MODELS.register_module() +class MobileNetV2(BaseModule): + """MobileNetV2 backbone. + + Args: + pretrained (str | None): Name of pretrained model. Defaults to None. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (None or Sequence[int]): Output from which stages. + Defaults to (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). Note that + the last stage in ``MobileNetV2`` is ``conv2``. Defaults to -1, + which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Kaiming', layer='Conv2d',), + dict(type='Constant', layer=['GroupNorm', '_BatchNorm'], val=1.) + ]``. + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + pretrained=None, + widen_factor=1., + out_indices=(7, ), + frozen_stages=-1, + conv_cfg=dict(type='Conv'), + norm_cfg=dict(type='BN2d', requires_grad=True), + act_cfg=dict(type='ReLU6', inplace=True), + norm_eval=False, + with_cp=False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['GroupNorm', '_BatchNorm'], + val=1.) + ]): + if pretrained is not None: + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.widen_factor = widen_factor + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 8): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 9): + raise ValueError('frozen_stages must be in range(-1, 9). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Defaults to 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Defaults to 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride, + expand_ratio=expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor or Tuple[Tensor]: The feature of the input samples extracted + by the backbone. 
+ """ + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def _freeze_stages(self): + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer_name = self.layers[i - 1] + layer = getattr(self, layer_name) + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Set the optimization status when training.""" + super(MobileNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/backbones/mobilenet_v2_tsm.py b/mmaction/models/backbones/mobilenet_v2_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..1a71284e7e50ab67e890a85734a20b365db9cc92 --- /dev/null +++ b/mmaction/models/backbones/mobilenet_v2_tsm.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint + +from mmaction.registry import MODELS +from .mobilenet_v2 import InvertedResidual, MobileNetV2 +from .resnet_tsm import TemporalShift + + +@MODELS.register_module() +class MobileNetV2TSM(MobileNetV2): + """MobileNetV2 backbone for TSM. + + Args: + num_segments (int): Number of frame segments. Defaults to 8. + is_shift (bool): Whether to make temporal shift in reset layers. + Defaults to True. + shift_div (int): Number of div for shift. Defaults to 8. + pretraind2d (bool): Whether to load pretrained 2D model. + Defaults to True. + **kwargs (keyword arguments, optional): Arguments for MobilNetV2. 
+ """ + + def __init__(self, + num_segments=8, + is_shift=True, + shift_div=8, + pretrained2d=True, + **kwargs): + super().__init__(**kwargs) + self.num_segments = num_segments + self.is_shift = is_shift + self.shift_div = shift_div + self.pretrained2d = pretrained2d + self.init_structure() + + def make_temporal_shift(self): + """Make temporal shift for some layers.""" + for m in self.modules(): + if isinstance(m, InvertedResidual) and \ + len(m.conv) == 3 and m.use_res_connect: + m.conv[0] = TemporalShift( + m.conv[0], + num_segments=self.num_segments, + shift_div=self.shift_div, + ) + + def init_structure(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.is_shift: + self.make_temporal_shift() + + def load_original_weights(self, logger): + original_state_dict = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in original_state_dict: + original_state_dict = original_state_dict['state_dict'] + + wrapped_layers_map = dict() + for name, module in self.named_modules(): + ori_name = name + for wrap_prefix in ['.net']: + if wrap_prefix in ori_name: + ori_name = ori_name.replace(wrap_prefix, '') + wrapped_layers_map[ori_name] = name + + # convert wrapped keys + for param_name in list(original_state_dict.keys()): + layer_name = '.'.join(param_name.split('.')[:-1]) + if layer_name in wrapped_layers_map: + wrapped_name = param_name.replace( + layer_name, wrapped_layers_map[layer_name]) + original_state_dict[wrapped_name] = original_state_dict.pop( + param_name) + + msg = self.load_state_dict(original_state_dict, strict=True) + logger.info(msg) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + self.load_original_weights(logger) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() diff --git a/mmaction/models/backbones/mobileone_tsm.py b/mmaction/models/backbones/mobileone_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..d12e84408831e90e69e7ca9cee2062de783fcd85 --- /dev/null +++ b/mmaction/models/backbones/mobileone_tsm.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch.nn as nn +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import (_load_checkpoint, + _load_checkpoint_with_prefix) +from mmpretrain.models import MobileOne + +from mmaction.registry import MODELS +from .resnet_tsm import TemporalShift + + +@MODELS.register_module() +class MobileOneTSM(MobileOne): + """MobileOne backbone for TSM. + + Args: + arch (str | dict): MobileOne architecture. If use string, choose + from 's0', 's1', 's2', 's3' and 's4'. If use dict, it should + have below keys: + + - num_blocks (Sequence[int]): Number of blocks in each stage. + - width_factor (Sequence[float]): Width factor in each stage. + - num_conv_branches (Sequence[int]): Number of conv branches + in each stage. + - num_se_blocks (Sequence[int]): Number of SE layers in each + stage, all the SE layers are placed in the subsequent order + in each stage. + + Defaults to 's0'. + num_segments (int): Number of frame segments. Defaults to 8. + is_shift (bool): Whether to make temporal shift in reset layers. + Defaults to True. + shift_div (int): Number of div for shift. Defaults to 8. + pretraind2d (bool): Whether to load pretrained 2D model. + Defaults to True. 
+ **kwargs (keyword arguments, optional): Arguments for MobileOne. + """ + + def __init__(self, + arch: str, + num_segments: int = 8, + is_shift: bool = True, + shift_div: int = 8, + pretrained2d: bool = True, + **kwargs): + super().__init__(arch, **kwargs) + self.num_segments = num_segments + self.is_shift = is_shift + self.shift_div = shift_div + self.pretrained2d = pretrained2d + self.init_structure() + + def make_temporal_shift(self): + """Make temporal shift for some layers. + + To make reparameterization work, we can only build the shift layer + before the 'block', instead of the 'blockres' + """ + + def make_block_temporal(stage, num_segments): + """Make temporal shift on some blocks. + + Args: + stage (nn.Module): Model layers to be shifted. + num_segments (int): Number of frame segments. + + Returns: + nn.Module: The shifted blocks. + """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + blocks[i] = TemporalShift( + b, num_segments=num_segments, shift_div=self.shift_div) + return nn.Sequential(*blocks) + + self.stage0 = make_block_temporal( + nn.Sequential(self.stage0), self.num_segments)[0] + for i in range(1, 5): + temporal_stage = make_block_temporal( + getattr(self, f'stage{i}'), self.num_segments) + setattr(self, f'stage{i}', temporal_stage) + + def init_structure(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.is_shift: + self.make_temporal_shift() + + def load_original_weights(self, logger): + assert self.init_cfg.get('type') == 'Pretrained', ( + 'Please specify ' + 'init_cfg to use pretrained 2d checkpoint') + self.pretrained = self.init_cfg.get('checkpoint') + prefix = self.init_cfg.get('prefix') + if prefix is not None: + original_state_dict = _load_checkpoint_with_prefix( + prefix, self.pretrained, map_location='cpu') + else: + original_state_dict = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in original_state_dict: + original_state_dict = original_state_dict['state_dict'] + + wrapped_layers_map = dict() + for name, module in self.named_modules(): + ori_name = name + for wrap_prefix in ['.net']: + if wrap_prefix in ori_name: + ori_name = ori_name.replace(wrap_prefix, '') + wrapped_layers_map[ori_name] = name + + # convert wrapped keys + for param_name in list(original_state_dict.keys()): + layer_name = '.'.join(param_name.split('.')[:-1]) + if layer_name in wrapped_layers_map: + wrapped_name = param_name.replace( + layer_name, wrapped_layers_map[layer_name]) + original_state_dict[wrapped_name] = original_state_dict.pop( + param_name) + + msg = self.load_state_dict(original_state_dict, strict=True) + logger.info(msg) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + self.load_original_weights(logger) + else: + super().init_weights() + + def forward(self, x): + """unpack tuple result.""" + x = super().forward(x) + if isinstance(x, tuple): + assert len(x) == 1 + x = x[0] + return x diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py new file mode 100644 index 0000000000000000000000000000000000000000..cb808a8c0230e04bd5f06c7c9275486cfc26cbf1 --- /dev/null +++ b/mmaction/models/backbones/mvit.py @@ -0,0 +1,909 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
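# A minimal sketch (hypothetical key names) of the '.net' remapping performed
# by load_original_weights in the TSM backbones above: TemporalShift stores the
# wrapped block under `.net`, so 2D checkpoint keys must gain that extra
# segment before load_state_dict(strict=True) can succeed.
state_dict = {'stage1.0.conv.weight': 'w'}                   # 2D-style key
wrapped_layers_map = {'stage1.0.conv': 'stage1.0.net.conv'}  # from named_modules()

for param_name in list(state_dict.keys()):
    layer_name = '.'.join(param_name.split('.')[:-1])        # 'stage1.0.conv'
    if layer_name in wrapped_layers_map:
        new_name = param_name.replace(layer_name, wrapped_layers_map[layer_name])
        state_dict[new_name] = state_dict.pop(param_name)

# state_dict is now {'stage1.0.net.conv.weight': 'w'}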
+from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix +from mmengine.utils import to_3tuple + +from mmaction.registry import MODELS +from mmaction.utils import get_str_type +from ..utils.embed import PatchEmbed3D + + +def resize_pos_embed(pos_embed: torch.Tensor, + src_shape: Tuple[int], + dst_shape: Tuple[int], + mode: str = 'trilinear', + num_extra_tokens: int = 1) -> torch.Tensor: + """Resize pos_embed weights. + + Args: + pos_embed (torch.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (T, H, W). + dst_shape (tuple): The resolution of downsampled new training + image, in format (T, H, W). + mode (str): Algorithm used for upsampling. Choose one from 'nearest', + 'linear', 'bilinear', 'bicubic' and 'trilinear'. + Defaults to 'trilinear'. + num_extra_tokens (int): The number of extra tokens, such as cls_token. + Defaults to 1. + + Returns: + torch.Tensor: The resized pos_embed of shape [1, L_new, C] + """ + if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1] \ + and src_shape[2] == dst_shape[2]: + return pos_embed + assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]' + _, L, C = pos_embed.shape + src_t, src_h, src_w = src_shape + assert L == src_t * src_h * src_w + num_extra_tokens, \ + f"The length of `pos_embed` ({L}) doesn't match the expected " \ + f'shape ({src_t}*{src_h}*{src_w}+{num_extra_tokens}).' \ + 'Please check the `img_size` argument.' + extra_tokens = pos_embed[:, :num_extra_tokens] + + src_weight = pos_embed[:, num_extra_tokens:] + src_weight = src_weight.reshape(1, src_t, src_h, src_w, + C).permute(0, 4, 1, 2, 3) + + dst_weight = F.interpolate( + src_weight, size=dst_shape, align_corners=False, mode=mode) + dst_weight = torch.flatten(dst_weight, 2).transpose(1, 2) + + return torch.cat((extra_tokens, dst_weight), dim=1) + + +def resize_decomposed_rel_pos(rel_pos: torch.Tensor, q_size: int, + k_size: int) -> torch.Tensor: + """Get relative positional embeddings according to the relative positions + of query and key sizes. + + Args: + rel_pos (Tensor): relative position embeddings (L, C). + q_size (int): size of query q. + k_size (int): size of key k. + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + resized = F.interpolate( + # (L, C) -> (1, C, L) + rel_pos.transpose(0, 1).unsqueeze(0), + size=max_rel_dist, + mode='linear', + ) + # (1, C, L) -> (L, C) + resized = resized.squeeze(0).transpose(0, 1) + else: + resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. 
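    # Worked illustration (sizes assumed): with q_size=2 and k_size=3, the
    # ratios below are q_h_ratio=1.5 and k_h_ratio=1.0, so relative_coords is
    # [[2.0, 1.0, 0.0], [3.5, 2.5, 1.5]]; after .long() these values index
    # rows 0..3 of the (2 * max(q_size, k_size) - 1, C) = (5, C) table
    # `resized`, giving a (q_size, k_size, C) block of embeddings.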
+ q_h_ratio = max(k_size / q_size, 1.0) + k_h_ratio = max(q_size / k_size, 1.0) + q_coords = torch.arange(q_size)[:, None] * q_h_ratio + k_coords = torch.arange(k_size)[None, :] * k_h_ratio + relative_coords = (q_coords - k_coords) + (k_size - 1) * k_h_ratio + + return resized[relative_coords.long()] + + +def add_decomposed_rel_pos(attn: torch.Tensor, + q: torch.Tensor, + q_shape: Sequence[int], + k_shape: Sequence[int], + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + rel_pos_t: torch.Tensor, + with_cls_token: bool = False) -> torch.Tensor: + """Spatiotemporal Relative Positional Embeddings.""" + sp_idx = 1 if with_cls_token else 0 + B, num_heads, _, C = q.shape + q_t, q_h, q_w = q_shape + k_t, k_h, k_w = k_shape + + Rt = resize_decomposed_rel_pos(rel_pos_t, q_t, k_t) + Rh = resize_decomposed_rel_pos(rel_pos_h, q_h, k_h) + Rw = resize_decomposed_rel_pos(rel_pos_w, q_w, k_w) + + r_q = q[:, :, sp_idx:].reshape(B, num_heads, q_t, q_h, q_w, C) + rel_t = torch.einsum('bythwc,tkc->bythwk', r_q, Rt) + rel_h = torch.einsum('bythwc,hkc->bythwk', r_q, Rh) + rel_w = torch.einsum('bythwc,wkc->bythwk', r_q, Rw) + rel_pos_embed = ( + rel_t[:, :, :, :, :, :, None, None] + + rel_h[:, :, :, :, :, None, :, None] + + rel_w[:, :, :, :, :, None, None, :]) + + attn_map = attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_t, q_h, q_w, k_t, + k_h, k_w) + attn_map += rel_pos_embed + attn[:, :, sp_idx:, sp_idx:] = attn_map.view(B, -1, q_t * q_h * q_w, + k_t * k_h * k_w) + + return attn + + +class MLP(BaseModule): + """Two-layer multilayer perceptron. + + Comparing with :class:`mmcv.cnn.bricks.transformer.FFN`, this class allows + different input and output channel numbers. + + Args: + in_channels (int): The number of input channels. + hidden_channels (int, optional): The number of hidden layer channels. + If None, same as the ``in_channels``. Defaults to None. + out_channels (int, optional): The number of output channels. If None, + same as the ``in_channels``. Defaults to None. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + hidden_channels: Optional[int] = None, + out_channels: Optional[int] = None, + act_cfg: Dict = dict(type='GELU'), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + out_channels = out_channels or in_channels + hidden_channels = hidden_channels or in_channels + self.fc1 = nn.Linear(in_channels, hidden_channels) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_channels, out_channels) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +def attention_pool(x: torch.Tensor, + pool: nn.Module, + in_size: Tuple[int], + with_cls_token: bool = False, + norm: Optional[nn.Module] = None) -> tuple: + """Pooling the feature tokens. + + Args: + x (torch.Tensor): The input tensor, should be with shape + ``(B, num_heads, L, C)`` or ``(B, L, C)``. + pool (nn.Module): The pooling module. + in_size (Tuple[int]): The shape of the input feature map. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + norm (nn.Module, optional): The normalization module. + Defaults to None. 
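    Returns:
        tuple: The pooled tokens and the resulting spatiotemporal size.

    A shape sketch with assumed sizes is given below; it presumes
    ``attention_pool`` from this module is in scope.

    Examples:
        >>> import torch
        >>> import torch.nn as nn
        >>> # illustrative sizes: B=1, num_heads=2, (T, H, W)=(2, 4, 4), C=8
        >>> tokens = torch.randn(1, 2, 2 * 4 * 4, 8)
        >>> pool = nn.MaxPool3d(kernel_size=2, stride=2)
        >>> out, out_size = attention_pool(tokens, pool, (2, 4, 4),
        ...                                with_cls_token=False)
        >>> out.shape, out_size
        (torch.Size([1, 2, 4, 8]), torch.Size([1, 2, 2]))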
+ """ + ndim = x.ndim + if ndim == 4: + B, num_heads, L, C = x.shape + elif ndim == 3: + num_heads = 1 + B, L, C = x.shape + x = x.unsqueeze(1) + else: + raise RuntimeError(f'Unsupported input dimension {x.shape}') + + T, H, W = in_size + assert L == T * H * W + with_cls_token + + if with_cls_token: + cls_tok, x = x[:, :, :1, :], x[:, :, 1:, :] + + # (B, num_heads, T*H*W, C) -> (B*num_heads, C, T, H, W) + x = x.reshape(B * num_heads, T, H, W, C).permute(0, 4, 1, 2, + 3).contiguous() + x = pool(x) + out_size = x.shape[2:] + + # (B*num_heads, C, T', H', W') -> (B, num_heads, T'*H'*W', C) + x = x.reshape(B, num_heads, C, -1).transpose(2, 3) + + if with_cls_token: + x = torch.cat((cls_tok, x), dim=2) + + if norm is not None: + x = norm(x) + + if ndim == 3: + x = x.squeeze(1) + + return x, out_size + + +class MultiScaleAttention(BaseModule): + """Multiscale Multi-head Attention block. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__(self, + in_dims: int, + out_dims: int, + num_heads: int, + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN'), + pool_kernel: Tuple[int] = (3, 3, 3), + stride_q: Tuple[int] = (1, 1, 1), + stride_kv: Tuple[int] = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + with_cls_token: bool = True, + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + + head_dim = out_dims // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(in_dims, out_dims * 3, bias=qkv_bias) + self.proj = nn.Linear(out_dims, out_dims) + + # qkv pooling + pool_padding = [k // 2 for k in pool_kernel] + pool_dims = out_dims // num_heads + + def build_pooling(stride): + pool = nn.Conv3d( + pool_dims, + pool_dims, + pool_kernel, + stride=stride, + padding=pool_padding, + groups=pool_dims, + bias=False, + ) + norm = build_norm_layer(norm_cfg, pool_dims)[1] + return pool, norm + + self.pool_q, self.norm_q = build_pooling(stride_q) + self.pool_k, self.norm_k = build_pooling(stride_kv) + self.pool_v, self.norm_v = build_pooling(stride_kv) + + self.residual_pooling = residual_pooling + + self.rel_pos_embed = rel_pos_embed + self.rel_pos_zero_init = rel_pos_zero_init + if self.rel_pos_embed: + # initialize relative positional embeddings + assert input_size[1] == input_size[2] + + size = input_size[1] + rel_dim = 2 * max(size // stride_q[1], size // stride_kv[1]) - 1 + self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_t = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + + def init_weights(self) -> None: + """Weight initialization.""" + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and get_str_type(self.init_cfg['type']) == 'Pretrained'): + # Suppress rel_pos_zero_init if use pretrained model. + return + + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + trunc_normal_(self.rel_pos_t, std=0.02) + + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: + """Forward the MultiScaleAttention.""" + B, N, _ = x.shape # (B, H*W, C) + + # qkv: (B, H*W, 3, num_heads, C) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1) + # q, k, v: (B, num_heads, H*W, C) + q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0) + + q, q_shape = attention_pool( + q, + self.pool_q, + in_size, + norm=self.norm_q, + with_cls_token=self.with_cls_token) + k, k_shape = attention_pool( + k, + self.pool_k, + in_size, + norm=self.norm_k, + with_cls_token=self.with_cls_token) + v, v_shape = attention_pool( + v, + self.pool_v, + in_size, + norm=self.norm_v, + with_cls_token=self.with_cls_token) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_embed: + attn = add_decomposed_rel_pos(attn, q, q_shape, k_shape, + self.rel_pos_h, self.rel_pos_w, + self.rel_pos_t, self.with_cls_token) + + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + if self.with_cls_token: + x[:, :, 1:, :] += q[:, :, 1:, :] + else: + x = x + q + + # (B, num_heads, H'*W', C'//num_heads) -> (B, H'*W', C') + x = x.transpose(1, 2).reshape(B, -1, self.out_dims) + x = self.proj(x) + + return x, q_shape + + +class MultiScaleBlock(BaseModule): + """Multiscale Transformer blocks. 
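+
+    Each block applies :class:`MultiScaleAttention` followed by an
+    :class:`MLP`, with residual connections around both. When ``stride_q``
+    downsamples the tokens, the attention residual is pooled by a
+    parameter-free max pooling so the shapes still match.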
+ + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + drop_path (float): Stochastic depth rate. Defaults to 0. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial relative + position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + in_dims: int, + out_dims: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + drop_path: float = 0.0, + norm_cfg: Dict = dict(type='LN'), + act_cfg: Dict = dict(type='GELU'), + qkv_pool_kernel: Tuple = (3, 3, 3), + stride_q: Tuple = (1, 1, 1), + stride_kv: Tuple = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + with_cls_token: bool = True, + dim_mul_in_attention: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + init_cfg: Optional[Dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + self.norm1 = build_norm_layer(norm_cfg, in_dims)[1] + self.dim_mul_in_attention = dim_mul_in_attention + + attn_dims = out_dims if dim_mul_in_attention else in_dims + self.attn = MultiScaleAttention( + in_dims, + attn_dims, + num_heads=num_heads, + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + pool_kernel=qkv_pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init, + with_cls_token=with_cls_token) + self.drop_path = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = build_norm_layer(norm_cfg, attn_dims)[1] + + self.mlp = MLP( + in_channels=attn_dims, + hidden_channels=int(attn_dims * mlp_ratio), + out_channels=out_dims, + act_cfg=act_cfg) + + if in_dims != out_dims: + self.proj = nn.Linear(in_dims, out_dims) + else: + self.proj = None + + if np.prod(stride_q) > 1: + kernel_skip = [s + 1 if s > 1 else s for s in stride_q] + padding_skip = [int(skip // 2) for skip in kernel_skip] + self.pool_skip = nn.MaxPool3d( + kernel_skip, stride_q, padding_skip, ceil_mode=False) + + if input_size is not None: + input_size = to_3tuple(input_size) + out_size = [size // s for 
size, s in zip(input_size, stride_q)] + self.init_out_size = out_size + else: + self.init_out_size = None + else: + self.pool_skip = None + self.init_out_size = input_size + + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: + x_norm = self.norm1(x) + x_attn, out_size = self.attn(x_norm, in_size) + + if self.dim_mul_in_attention and self.proj is not None: + skip = self.proj(x_norm) + else: + skip = x + + if self.pool_skip is not None: + skip, _ = attention_pool( + skip, + self.pool_skip, + in_size, + with_cls_token=self.with_cls_token) + + x = skip + self.drop_path(x_attn) + x_norm = self.norm2(x) + x_mlp = self.mlp(x_norm) + + if not self.dim_mul_in_attention and self.proj is not None: + skip = self.proj(x_norm) + else: + skip = x + + x = skip + self.drop_path(x_mlp) + + return x, out_size + + +@MODELS.register_module() +class MViT(BaseModule): + """Multi-scale ViT v2. + + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + + Inspiration from `the official implementation + `_ and `the mmclassification + implementation `_ + + Args: + arch (str | dict): MViT architecture. If use string, choose + from 'tiny', 'small', 'base' and 'large'. If use dict, it should + have below keys: + + - **embed_dims** (int): The dimensions of embedding. + - **num_layers** (int): The number of layers. + - **num_heads** (int): The number of heads in attention + modules of the initial layer. + - **downscale_indices** (List[int]): The layer indices to downscale + the feature map. + + Defaults to 'base'. + spatial_size (int): The expected input spatial_size shape. + Defaults to 224. + temporal_size (int): The expected input temporal_size shape. + Defaults to 224. + in_channels (int): The num of input channels. Defaults to 3. + pretrained (str, optional): Name of pretrained model. + Defaults to None. + pretrained_type (str, optional): Type of pretrained model. choose from + 'imagenet', 'maskfeat', None. Defaults to None, which means load + from same architecture. + out_scales (int | Sequence[int]): The output scale indices. + They should not exceed the length of ``downscale_indices``. + Defaults to -1, which means the last scale. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults to False. + interpolate_mode (str): Select the interpolate mode for absolute + position embedding vector resize. Defaults to "trilinear". + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + dim_mul (int): The magnification for ``embed_dims`` in the downscale + layers. Defaults to 2. + head_mul (int): The magnification for ``num_heads`` in the downscale + layers. Defaults to 2. + adaptive_kv_stride (int): The stride size for kv pooling in the initial + layer. Defaults to (1, 8, 8). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + output_cls_token (bool): Whether output the cls_token. If set True, + ``with_cls_token`` must be True. Defaults to True. 
+ rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): enable bias for qkv if True. Defaults to True. + norm_cfg (dict): Config dict for normalization layer for all output + features. Defaults to ``dict(type='LN', eps=1e-6)``. + patch_cfg (dict): Config dict for the patch embedding layer. + Defaults to + ``dict(kernel_size=(3, 7, 7), + stride=(2, 4, 4), + padding=(1, 3, 3))``. + init_cfg (dict, optional): The Config for initialization. Defaults to + ``[ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ]`` + + Examples: + >>> import torch + >>> from mmaction.registry import MODELS + >>> from mmaction.utils import register_all_modules + >>> register_all_modules() + >>> + >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3]) + >>> model = MODELS.build(cfg) + >>> model.init_weights() + >>> inputs = torch.rand(1, 3, 16, 224, 224) + >>> outputs = model(inputs) + >>> for i, output in enumerate(outputs): + >>> print(f'scale{i}: {output.shape}') + scale0: torch.Size([1, 96, 8, 56, 56]) + scale1: torch.Size([1, 192, 8, 28, 28]) + scale2: torch.Size([1, 384, 8, 14, 14]) + scale3: torch.Size([1, 768, 8, 7, 7]) + """ + arch_zoo = { + 'tiny': { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [1, 3, 8] + }, + 'small': { + 'embed_dims': 96, + 'num_layers': 16, + 'num_heads': 1, + 'downscale_indices': [1, 3, 14] + }, + 'base': { + 'embed_dims': 96, + 'num_layers': 24, + 'num_heads': 1, + 'downscale_indices': [2, 5, 21] + }, + 'large': { + 'embed_dims': 144, + 'num_layers': 48, + 'num_heads': 2, + 'downscale_indices': [2, 8, 44] + }, + } + num_extra_tokens = 1 + + def __init__( + self, + arch: str = 'base', + spatial_size: int = 224, + temporal_size: int = 16, + in_channels: int = 3, + pretrained: Optional[str] = None, + pretrained_type: Optional[str] = None, + out_scales: Union[int, Sequence[int]] = -1, + drop_path_rate: float = 0., + use_abs_pos_embed: bool = False, + interpolate_mode: str = 'trilinear', + pool_kernel: tuple = (3, 3, 3), + dim_mul: int = 2, + head_mul: int = 2, + adaptive_kv_stride: tuple = (1, 8, 8), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + dim_mul_in_attention: bool = True, + with_cls_token: bool = True, + output_cls_token: bool = True, + rel_pos_zero_init: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN', eps=1e-6), + patch_cfg: Dict = dict( + kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3)), + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.02), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ] + ) -> None: + if pretrained: + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg.copy()) + self.pretrained_type = pretrained_type + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'num_layers', 'num_heads', 'downscale_indices' + } + assert isinstance(arch, dict) and essential_keys <= set(arch), \ + 
f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.num_layers = self.arch_settings['num_layers'] + self.num_heads = self.arch_settings['num_heads'] + self.downscale_indices = self.arch_settings['downscale_indices'] + # Defaults take downscale_indices as downscale_indices + self.dim_mul_indices = self.arch_settings.get( + 'dim_mul_indices', self.downscale_indices.copy()) + self.num_scales = len(self.downscale_indices) + 1 + self.stage_indices = { + index - 1: i + for i, index in enumerate(self.downscale_indices) + } + self.stage_indices[self.num_layers - 1] = self.num_scales - 1 + self.use_abs_pos_embed = use_abs_pos_embed + self.interpolate_mode = interpolate_mode + + if isinstance(out_scales, int): + out_scales = [out_scales] + assert isinstance(out_scales, Sequence), \ + f'"out_scales" must by a sequence or int, ' \ + f'get {type(out_scales)} instead.' + for i, index in enumerate(out_scales): + if index < 0: + out_scales[i] = self.num_scales + index + assert 0 <= out_scales[i] <= self.num_scales, \ + f'Invalid out_scales {index}' + self.out_scales = sorted(list(out_scales)) + + # Set patch embedding + _patch_cfg = dict( + in_channels=in_channels, + input_size=(temporal_size, spatial_size, spatial_size), + embed_dims=self.embed_dims, + conv_type='Conv3d', + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed3D(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + + # Set cls token + if output_cls_token: + assert with_cls_token is True, f'with_cls_token must be True if' \ + f'set output_cls_token to True, but got {with_cls_token}' + self.with_cls_token = with_cls_token + self.output_cls_token = output_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims)) + + # Set absolute position embedding + if self.use_abs_pos_embed: + num_patches = np.prod(self.patch_resolution) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_extra_tokens, + self.embed_dims)) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, self.num_layers) + + self.blocks = ModuleList() + out_dims_list = [self.embed_dims] + num_heads = self.num_heads + stride_kv = adaptive_kv_stride + input_size = self.patch_resolution + for i in range(self.num_layers): + if i in self.downscale_indices or i in self.dim_mul_indices: + num_heads *= head_mul + + if i in self.downscale_indices: + stride_q = [1, 2, 2] + stride_kv = [max(s // 2, 1) for s in stride_kv] + else: + stride_q = [1, 1, 1] + + # Set output embed_dims + if dim_mul_in_attention and i in self.dim_mul_indices: + # multiply embed_dims in dim_mul_indices layers. + out_dims = out_dims_list[-1] * dim_mul + elif not dim_mul_in_attention and i + 1 in self.dim_mul_indices: + # multiply embed_dims before dim_mul_indices layers. 
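+                # i.e. the block preceding a downscale stage widens its
+                # MLP output to the next stage's embed_dims.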
+ out_dims = out_dims_list[-1] * dim_mul + else: + out_dims = out_dims_list[-1] + + attention_block = MultiScaleBlock( + in_dims=out_dims_list[-1], + out_dims=out_dims, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_path=dpr[i], + norm_cfg=norm_cfg, + qkv_pool_kernel=pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + with_cls_token=with_cls_token, + dim_mul_in_attention=dim_mul_in_attention, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init) + self.blocks.append(attention_block) + + input_size = attention_block.init_out_size + out_dims_list.append(out_dims) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + norm_layer = build_norm_layer(norm_cfg, out_dims)[1] + self.add_module(f'norm{stage_index}', norm_layer) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + # interpolate maskfeat relative position embedding + if self.pretrained_type == 'maskfeat': + logger = MMLogger.get_current_instance() + pretrained = self.init_cfg['checkpoint'] + logger.info(f'load pretrained model from {pretrained}') + state_dict = _load_checkpoint_with_prefix( + 'backbone.', pretrained, map_location='cpu') + attn_rel_pos_keys = [ + k for k in state_dict.keys() if 'attn.rel_pos' in k + ] + for k in attn_rel_pos_keys: + attn_rel_pos_pretrained = state_dict[k] + attn_rel_pos_current = self.state_dict()[k] + L1, dim1 = attn_rel_pos_pretrained.size() + L2, dim2 = attn_rel_pos_current.size() + if dim1 != dim2: + logger.warning(f'Dim mismatch in loading {k}, passing') + else: + if L1 != L2: + interp_param = torch.nn.functional.interpolate( + attn_rel_pos_pretrained.t().unsqueeze(0), + size=L2, + mode='linear') + interp_param = \ + interp_param.view(dim2, L2).permute(1, 0) + state_dict[k] = interp_param + logger.info( + f'{k} reshaped from {(L1, dim1)} to {L2, dim2}') + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + + elif self.pretrained_type is None: + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and get_str_type(self.init_cfg['type']) == 'Pretrained'): + # Suppress default init if use pretrained model. 
+ return + + if self.use_abs_pos_embed: + trunc_normal_(self.pos_embed, std=0.02) + + def forward(self, x: torch.Tensor) ->\ + Tuple[Union[torch.Tensor, List[torch.Tensor]]]: + """Forward the MViT.""" + B = x.shape[0] + x, patch_resolution = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + if self.use_abs_pos_embed: + x = x + resize_pos_embed( + self.pos_embed, + self.patch_resolution, + patch_resolution, + mode=self.interpolate_mode, + num_extra_tokens=self.num_extra_tokens) + + if not self.with_cls_token: + # Remove class token for transformer encoder input + x = x[:, 1:] + + outs = [] + for i, block in enumerate(self.blocks): + x, patch_resolution = block(x, patch_resolution) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + B, _, C = x.shape + x = getattr(self, f'norm{stage_index}')(x) + tokens = x.transpose(1, 2) + if self.with_cls_token: + patch_token = tokens[:, :, 1:].reshape( + B, C, *patch_resolution) + cls_token = tokens[:, :, 0] + else: + patch_token = tokens.reshape(B, C, *patch_resolution) + cls_token = None + if self.output_cls_token: + out = [patch_token, cls_token] + else: + out = patch_token + outs.append(out) + + return tuple(outs) diff --git a/mmaction/models/backbones/resnet.py b/mmaction/models/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..6c28cc3367f5c08a96338296c325d8604e13f80a --- /dev/null +++ b/mmaction/models/backbones/resnet.py @@ -0,0 +1,625 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger +from mmengine.model import BaseModule +from mmengine.runner.checkpoint import _load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm +from torch.utils import checkpoint as cp + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType + + +class BasicBlock(nn.Module): + """Basic block for ResNet. + + Args: + inplanes (int): Number of channels for the input in first conv2d layer. + planes (int): Number of channels produced by some norm/conv2d layers. + stride (int): Stride in the conv layer. Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module, optional): Downsample layer. Defaults to None. + style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + conv_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. 
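+
+    Note:
+        ``with_cp`` is not supported in ``BasicBlock`` and must be False;
+        in this backbone, checkpointing is only implemented for
+        ``Bottleneck``.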
+ """ + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + with_cp: bool = False) -> None: + super().__init__() + assert style in ['pytorch', 'caffe'] + self.conv1 = ConvModule( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = ConvModule( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.style = style + self.stride = stride + self.dilation = dilation + self.norm_cfg = norm_cfg + assert not with_cp + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + identity = x + + out = self.conv1(x) + out = self.conv2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """Bottleneck block for ResNet. + + Args: + inplanes (int): + Number of channels for the input feature in first conv layer. + planes (int): + Number of channels produced by some norm layes and conv layers. + stride (int): Spatial stride in the conv layer. Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module, optional): Downsample layer. Defaults to None. + style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + conv_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. 
+ """ + + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + with_cp: bool = False) -> None: + super().__init__() + assert style in ['pytorch', 'caffe'] + self.inplanes = inplanes + self.planes = planes + if style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + self.conv1 = ConvModule( + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv3 = ConvModule( + planes, + planes * self.expansion, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.norm_cfg = norm_cfg + self.with_cp = with_cp + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + conv_cfg: Optional[ConfigType] = None, + norm_cfg: Optional[ConfigType] = None, + act_cfg: Optional[ConfigType] = None, + with_cp: bool = False) -> nn.Module: + """Build residual layer for ResNet. + + Args: + block: (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature in each block. + planes (int): Number of channels for the output feature in each block. + blocks (int): Number of residual blocks. + stride (int): Stride in the conv layer. Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + conv_cfg (Union[dict, ConfigDict], optional): Config for norm layers. + Defaults to None. + norm_cfg (Union[dict, ConfigDict], optional): Config for norm layers. + Defaults to None. + act_cfg (Union[dict, ConfigDict], optional): Config for activate + layers. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + + Returns: + nn.Module: A residual layer for the given config. 
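+
+    Examples:
+        >>> # a minimal sketch; the BN/ReLU configs below are illustrative
+        >>> import torch
+        >>> layer = make_res_layer(
+        ...     BasicBlock, 64, 64, blocks=2,
+        ...     norm_cfg=dict(type='BN'),
+        ...     act_cfg=dict(type='ReLU', inplace=True))
+        >>> layer(torch.rand(1, 64, 56, 56)).shape
+        torch.Size([1, 64, 56, 56])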
+ """ + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = ConvModule( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + style=style, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + dilation, + style=style, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + +@MODELS.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from ``{18, 34, 50, 101, 152}``. + pretrained (str, optional): Name of pretrained model. Defaults to None. + torchvision_pretrain (bool): Whether to load pretrained model from + torchvision. Defaults to True. + in_channels (int): Channel num of input features. Defaults to 3. + num_stages (int): Resnet stages. Defaults to 4. + out_indices (Sequence[int]): Indices of output feature. + Defaults to (3, ). + strides (Sequence[int]): Strides of the first block of each stage. + Defaults to ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Defaults to ``(1, 1, 1, 1)``. + style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Defaults to -1. + conv_cfg (dict or ConfigDict): Config for norm layers. + Defaults ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Defaults to False. + partial_bn (bool): Whether to use partial bn. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Kaiming', layer='Conv2d',), + dict(type='Constant', layer='BatchNorm', val=1.) + ]``. 
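+
+    Examples:
+        >>> # a minimal sketch; the shape assumes the default out_indices=(3, )
+        >>> import torch
+        >>> model = ResNet(depth=50, pretrained=None)
+        >>> model.init_weights()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> model(inputs).shape
+        torch.Size([1, 2048, 7, 7])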
+ """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__( + self, + depth: int, + pretrained: Optional[str] = None, + torchvision_pretrain: bool = True, + in_channels: int = 3, + num_stages: int = 4, + out_indices: Sequence[int] = (3, ), + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + style: str = 'pytorch', + frozen_stages: int = -1, + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN2d', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + partial_bn: bool = False, + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='Kaiming', layer='Conv2d'), + dict(type='Constant', layer='BatchNorm2d', val=1.) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.in_channels = in_channels + self.pretrained = pretrained + self.torchvision_pretrain = torchvision_pretrain + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.style = style + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.partial_bn = partial_bn + self.with_cp = with_cp + + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = 64 + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = 64 * 2**i + res_layer = make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * 64 * 2**( + len(self.stage_blocks) - 1) + + def _make_stem_layer(self) -> None: + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + self.conv1 = ConvModule( + self.in_channels, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + @staticmethod + def _load_conv_params(conv: nn.Module, state_dict_tv: OrderedDict, + module_name_tv: str, + loaded_param_names: List[str]) -> None: + """Load the conv parameters of resnet from torchvision. + + Args: + conv (nn.Module): The destination conv module. + state_dict_tv (OrderedDict): The state dict of pretrained + torchvision model. + module_name_tv (str): The name of corresponding conv module in the + torchvision model. + loaded_param_names (list[str]): List of parameters that have been + loaded. 
+ """ + + weight_tv_name = module_name_tv + '.weight' + if conv.weight.data.shape == state_dict_tv[weight_tv_name].shape: + conv.weight.data.copy_(state_dict_tv[weight_tv_name]) + loaded_param_names.append(weight_tv_name) + + if getattr(conv, 'bias') is not None: + bias_tv_name = module_name_tv + '.bias' + if conv.bias.data.shape == state_dict_tv[bias_tv_name].shape: + conv.bias.data.copy_(state_dict_tv[bias_tv_name]) + loaded_param_names.append(bias_tv_name) + + @staticmethod + def _load_bn_params(bn: nn.Module, state_dict_tv: OrderedDict, + module_name_tv: str, + loaded_param_names: List[str]) -> None: + """Load the bn parameters of resnet from torchvision. + + Args: + bn (nn.Module): The destination bn module. + state_dict_tv (OrderedDict): The state dict of pretrained + torchvision model. + module_name_tv (str): The name of corresponding bn module in the + torchvision model. + loaded_param_names (list[str]): List of parameters that have been + loaded. + """ + + for param_name, param in bn.named_parameters(): + param_tv_name = f'{module_name_tv}.{param_name}' + param_tv = state_dict_tv[param_tv_name] + if param.data.shape == param_tv.shape: + param.data.copy_(param_tv) + loaded_param_names.append(param_tv_name) + + for param_name, param in bn.named_buffers(): + param_tv_name = f'{module_name_tv}.{param_name}' + # some buffers like num_batches_tracked may not exist + if param_tv_name in state_dict_tv: + param_tv = state_dict_tv[param_tv_name] + if param.data.shape == param_tv.shape: + param.data.copy_(param_tv) + loaded_param_names.append(param_tv_name) + + def _load_torchvision_checkpoint(self, + logger: mmengine.MMLogger = None) -> None: + """Initiate the parameters from torchvision pretrained checkpoint.""" + state_dict_torchvision = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_torchvision: + state_dict_torchvision = state_dict_torchvision['state_dict'] + + loaded_param_names = [] + for name, module in self.named_modules(): + if isinstance(module, ConvModule): + # we use a ConvModule to wrap conv+bn+relu layers, thus the + # name mapping is needed + if 'downsample' in name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + original_conv_name = name + '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + original_bn_name = name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + original_conv_name = name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + original_bn_name = name.replace('conv', 'bn') + self._load_conv_params(module.conv, state_dict_torchvision, + original_conv_name, loaded_param_names) + self._load_bn_params(module.bn, state_dict_torchvision, + original_bn_name, loaded_param_names) + + # check if any parameters in the 2d checkpoint are not loaded + remaining_names = set( + state_dict_torchvision.keys()) - set(loaded_param_names) + if remaining_names: + logger.info( + f'These parameters in pretrained checkpoint are not loaded' + f': {remaining_names}') + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + if self.torchvision_pretrain: + # torchvision's + self._load_torchvision_checkpoint(logger) + else: + # ours + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + elif self.pretrained is None: + super().init_weights() + else: + raise TypeError('pretrained must be a str or 
None') + + def forward(self, x: torch.Tensor) \ + -> Union[torch.Tensor, Tuple[torch.Tensor]]: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + Union[torch.Tensor or Tuple[torch.Tensor]]: The feature of the + input samples extracted by the backbone. + """ + x = self.conv1(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.bn.eval() + for m in self.conv1.modules(): + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def _partial_bn(self) -> None: + """Freezing BatchNorm2D except the first one.""" + logger = MMLogger.get_current_instance() + logger.info('Freezing BatchNorm2D except the first one.') + count_bn = 0 + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + count_bn += 1 + if count_bn >= 2: + m.eval() + # shutdown update in frozen mode + m.weight.requires_grad = False + m.bias.requires_grad = False + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + if mode and self.partial_bn: + self._partial_bn() diff --git a/mmaction/models/backbones/resnet2plus1d.py b/mmaction/models/backbones/resnet2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..61d36e921dbe67b63ab18dbe627fb0136774495a --- /dev/null +++ b/mmaction/models/backbones/resnet2plus1d.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmaction.registry import MODELS +from mmaction.utils import get_str_type +from .resnet3d import ResNet3d + + +@MODELS.register_module() +class ResNet2Plus1d(ResNet3d): + """ResNet (2+1)d backbone. + + This model is proposed in `A Closer Look at Spatiotemporal Convolutions for + Action Recognition `_ + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.pretrained2d is False + assert get_str_type(self.conv_cfg['type']) == 'Conv2plus1d' + + def _freeze_stages(self): + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. 
+ """ + x = self.conv1(x) + x = self.maxpool(x) + for layer_name in self.res_layers: + res_layer = getattr(self, layer_name) + # no pool2 in R(2+1)d + x = res_layer(x) + + return x diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py new file mode 100644 index 0000000000000000000000000000000000000000..aa764256081a7fee5c6137503978b54201bea98d --- /dev/null +++ b/mmaction/models/backbones/resnet3d.py @@ -0,0 +1,1060 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, NonLocal3d, build_activation_layer +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, Sequential +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm +from torch.nn.modules.utils import _ntuple, _triple + +from mmaction.registry import MODELS + + +class BasicBlock3d(BaseModule): + """BasicBlock 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. + Defaults to 1. + temporal_stride (int): Temporal stride in the conv3d layer. + Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module or None): Downsample layer. Defaults to None. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. + inflate (bool): Whether to inflate kernel. Defaults to True. + non_local (bool): Determine whether to apply non-local module in this + block. Defaults to False. + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. + Required keys are ``type``. Defaults to ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
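+
+    Note:
+        When ``inflate`` is True, both convs use 3x3x3 kernels; otherwise the
+        temporal kernel size is 1 and both convs use 1x3x3 kernels.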
+ """ + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + spatial_stride: int = 1, + temporal_stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + inflate: bool = True, + non_local: bool = False, + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + assert style in ['pytorch', 'caffe'] + # make sure that only ``inflate_style`` is passed into kwargs + assert set(kwargs).issubset(['inflate_style']) + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.style = style + self.inflate = inflate + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + self.conv1_stride_s = spatial_stride + self.conv2_stride_s = 1 + self.conv1_stride_t = temporal_stride + self.conv2_stride_t = 1 + + if self.inflate: + conv1_kernel_size = (3, 3, 3) + conv1_padding = (1, dilation, dilation) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, 1, 1) + else: + conv1_kernel_size = (1, 3, 3) + conv1_padding = (0, dilation, dilation) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, 1, 1) + + self.conv1 = ConvModule( + inplanes, + planes, + conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=conv1_padding, + dilation=(1, dilation, dilation), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv2 = ConvModule( + planes, + planes * self.expansion, + conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.downsample = downsample + self.relu = build_activation_layer(self.act_cfg) + + if self.non_local: + self.non_local_block = NonLocal3d(self.conv2.norm.num_features, + **self.non_local_cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +class Bottleneck3d(BaseModule): + """Bottleneck 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. + Defaults to 1. + temporal_stride (int): Temporal stride in the conv3d layer. + Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module, optional): Downsample layer. Defaults to None. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. 
Defaults to ``'pytorch'``. + inflate (bool): Whether to inflate kernel. Defaults to True. + inflate_style (str): '3x1x1' or '3x3x3'. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Defaults to ``'3x1x1'``. + non_local (bool): Determine whether to apply non-local module in this + block. Defaults to False. + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required + keys are ``type``. Defaults to ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + spatial_stride: int = 1, + temporal_stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + inflate: bool = True, + inflate_style: str = '3x1x1', + non_local: bool = False, + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + assert style in ['pytorch', 'caffe'] + assert inflate_style in ['3x1x1', '3x3x3'] + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.style = style + self.inflate = inflate + self.inflate_style = inflate_style + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + if self.style == 'pytorch': + self.conv1_stride_s = 1 + self.conv2_stride_s = spatial_stride + self.conv1_stride_t = 1 + self.conv2_stride_t = temporal_stride + else: + self.conv1_stride_s = spatial_stride + self.conv2_stride_s = 1 + self.conv1_stride_t = temporal_stride + self.conv2_stride_t = 1 + + if self.inflate: + if inflate_style == '3x1x1': + conv1_kernel_size = (3, 1, 1) + conv1_padding = (1, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + + self.conv1 = ConvModule( + inplanes, + planes, + conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=conv1_padding, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv2 = ConvModule( + planes, + planes, + conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + dilation=(1, dilation, dilation), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv3 = ConvModule( + planes, + planes * self.expansion, + 1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + # No activation in the third ConvModule for bottleneck + 
act_cfg=None) + + self.downsample = downsample + self.relu = build_activation_layer(self.act_cfg) + + if self.non_local: + self.non_local_block = NonLocal3d(self.conv3.norm.num_features, + **self.non_local_cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +@MODELS.register_module() +class ResNet3d(BaseModule): + """ResNet 3d backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + Defaults to 50. + pretrained (str, optional): Name of pretrained model. Defaults to None. + stage_blocks (tuple, optional): Set number of stages for each res + layer. Defaults to None. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + in_channels (int): Channel num of input features. Defaults to 3. + num_stages (int): Resnet stages. Defaults to 4. + base_channels (int): Channel num of stem output features. + Defaults to 64. + out_indices (Sequence[int]): Indices of output feature. + Defaults to ``(3, )``. + spatial_strides (Sequence[int]): + Spatial strides of residual blocks of each stage. + Defaults to ``(1, 2, 2, 2)``. + temporal_strides (Sequence[int]): + Temporal strides of residual blocks of each stage. + Defaults to ``(1, 1, 1, 1)``. + dilations (Sequence[int]): Dilation of each stage. + Defaults to ``(1, 1, 1, 1)``. + conv1_kernel (Sequence[int]): Kernel size of the first conv layer. + Defaults to ``(3, 7, 7)``. + conv1_stride_s (int): Spatial stride of the first conv layer. + Defaults to 2. + conv1_stride_t (int): Temporal stride of the first conv layer. + Defaults to 1. + pool1_stride_s (int): Spatial stride of the first pooling layer. + Defaults to 2. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Defaults to 1. + with_pool2 (bool): Whether to use pool2. Defaults to True. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Defaults to -1. + inflate (Sequence[int]): Inflate Dims of each block. + Defaults to ``(1, 1, 1, 1)``. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Defaults to ``3x1x1``. + conv_cfg (dict): Config for conv layers. + Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. + Required keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (``mean`` and ``var``). Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. 
+ non_local (Sequence[int]): Determine whether to apply non-local module + in the corresponding block of each stages. + Defaults to ``(0, 0, 0, 0)``. + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + arch_settings = { + 18: (BasicBlock3d, (2, 2, 2, 2)), + 34: (BasicBlock3d, (3, 4, 6, 3)), + 50: (Bottleneck3d, (3, 4, 6, 3)), + 101: (Bottleneck3d, (3, 4, 23, 3)), + 152: (Bottleneck3d, (3, 8, 36, 3)) + } + + def __init__(self, + depth: int = 50, + pretrained: Optional[str] = None, + stage_blocks: Optional[Tuple] = None, + pretrained2d: bool = True, + in_channels: int = 3, + num_stages: int = 4, + base_channels: int = 64, + out_indices: Sequence[int] = (3, ), + spatial_strides: Sequence[int] = (1, 2, 2, 2), + temporal_strides: Sequence[int] = (1, 1, 1, 1), + dilations: Sequence[int] = (1, 1, 1, 1), + conv1_kernel: Sequence[int] = (3, 7, 7), + conv1_stride_s: int = 2, + conv1_stride_t: int = 1, + pool1_stride_s: int = 2, + pool1_stride_t: int = 1, + with_pool1: bool = True, + with_pool2: bool = True, + style: str = 'pytorch', + frozen_stages: int = -1, + inflate: Sequence[int] = (1, 1, 1, 1), + inflate_style: str = '3x1x1', + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + with_cp: bool = False, + non_local: Sequence[int] = (0, 0, 0, 0), + non_local_cfg: Dict = dict(), + zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.in_channels = in_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.stage_blocks = stage_blocks + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.spatial_strides = spatial_strides + self.temporal_strides = temporal_strides + self.dilations = dilations + assert len(spatial_strides) == len(temporal_strides) == len( + dilations) == num_stages + if self.stage_blocks is not None: + assert len(self.stage_blocks) == num_stages + + self.conv1_kernel = conv1_kernel + self.conv1_stride_s = conv1_stride_s + self.conv1_stride_t = conv1_stride_t + self.pool1_stride_s = pool1_stride_s + self.pool1_stride_t = pool1_stride_t + self.with_pool1 = with_pool1 + self.with_pool2 = with_pool2 + self.style = style + self.frozen_stages = frozen_stages + self.stage_inflations = _ntuple(num_stages)(inflate) + self.non_local_stages = _ntuple(num_stages)(non_local) + self.inflate_style = inflate_style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + self.block, stage_blocks = self.arch_settings[depth] + + if self.stage_blocks is None: + self.stage_blocks = stage_blocks[:num_stages] + + self.inplanes = self.base_channels + + self.non_local_cfg = non_local_cfg + + self._make_stem_layer() + + self.res_layers = [] + lateral_inplanes = getattr(self, 'lateral_inplanes', [0, 0, 0, 0]) + + for i, num_blocks in enumerate(self.stage_blocks): + spatial_stride = 
spatial_strides[i] + temporal_stride = temporal_strides[i] + dilation = dilations[i] + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + self.block, + self.inplanes + lateral_inplanes[i], + planes, + num_blocks, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + style=self.style, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + non_local=self.non_local_stages[i], + non_local_cfg=self.non_local_cfg, + inflate=self.stage_inflations[i], + inflate_style=self.inflate_style, + with_cp=with_cp, + **kwargs) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * \ + self.base_channels * 2 ** (len(self.stage_blocks) - 1) + + @staticmethod + def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + spatial_stride: Union[int, Sequence[int]] = 1, + temporal_stride: Union[int, Sequence[int]] = 1, + dilation: int = 1, + style: str = 'pytorch', + inflate: Union[int, Sequence[int]] = 1, + inflate_style: str = '3x1x1', + non_local: Union[int, Sequence[int]] = 0, + non_local_cfg: Dict = dict(), + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = None, + conv_cfg: Optional[Dict] = None, + with_cp: bool = False, + **kwargs) -> nn.Module: + """Build residual layer for ResNet3D. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature + in each block. + planes (int): Number of channels for the output feature + in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides in + residual and conv layers. Defaults to 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer,otherwise the + stride-two layer is the first 1x1 conv layer. + Defaults to ``'pytorch'``. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Defaults to 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and conv2 + in each block. Default: ``'3x1x1'``. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Defaults to 0. + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + conv_cfg (dict, optional): Config for conv layers. + Defaults to None. + norm_cfg (dict, optional): Config for norm layers. + Defaults to None. + act_cfg (dict, optional): Config for activate layers. + Defaults to None. + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Defaults to False. + + Returns: + nn.Module: A residual layer for the given config. 
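+
+        Examples:
+            A minimal sketch of building one stage by hand. The explicit
+            ``conv_cfg``/``norm_cfg``/``act_cfg`` values mirror what
+            ``ResNet3d.__init__`` passes in; the input size is an
+            illustrative assumption.
+
+            >>> import torch
+            >>> from mmaction.models.backbones.resnet3d import (Bottleneck3d,
+            ...                                                  ResNet3d)
+            >>> layer = ResNet3d.make_res_layer(
+            ...     Bottleneck3d, inplanes=64, planes=64, blocks=3,
+            ...     conv_cfg=dict(type='Conv3d'),
+            ...     norm_cfg=dict(type='BN3d', requires_grad=True),
+            ...     act_cfg=dict(type='ReLU', inplace=True))
+            >>> x = torch.rand(1, 64, 4, 32, 32)  # (N, C, T, H, W)
+            >>> layer(x).shape
+            torch.Size([1, 256, 4, 32, 32])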
+ """ + inflate = inflate if not isinstance(inflate, int) \ + else (inflate,) * blocks + non_local = non_local if not isinstance(non_local, int) \ + else (non_local,) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + downsample = None + if spatial_stride != 1 or inplanes != planes * block.expansion: + downsample = ConvModule( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + downsample=downsample, + style=style, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=dilation, + style=style, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + return Sequential(*layers) + + @staticmethod + def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict, + module_name_2d: str, + inflated_param_names: List[str]) -> None: + """Inflate a conv module from 2d to 3d. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + + conv2d_weight = state_dict_2d[weight_2d_name] + kernel_t = conv3d.weight.data.shape[2] + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + @staticmethod + def _inflate_bn_params(bn3d: nn.Module, state_dict_2d: OrderedDict, + module_name_2d: str, + inflated_param_names: List[str]) -> None: + """Inflate a norm module from 2d to 3d. + + Args: + bn3d (nn.Module): The destination bn3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding bn module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + for param_name, param in bn3d.named_parameters(): + param_2d_name = f'{module_name_2d}.{param_name}' + param_2d = state_dict_2d[param_2d_name] + if param.data.shape != param_2d.shape: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. 
') + return + + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + for param_name, param in bn3d.named_buffers(): + param_2d_name = f'{module_name_2d}.{param_name}' + # some buffers like num_batches_tracked may not exist in old + # checkpoints + if param_2d_name in state_dict_2d: + param_2d = state_dict_2d[param_2d_name] + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + @staticmethod + def _inflate_weights(self, logger: MMLogger) -> None: + """Inflate the resnet2d parameters to resnet3d. + + The differences between resnet3d and resnet2d mainly lie in an extra + axis of conv kernel. To utilize the pretrained parameters in 2d model, + the weight of conv2d models should be inflated to fit in the shapes of + the 3d counterpart. + + Args: + logger (MMLogger): The logger used to print + debugging information. + """ + + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_r2d: + state_dict_r2d = state_dict_r2d['state_dict'] + + inflated_param_names = [] + for name, module in self.named_modules(): + if isinstance(module, ConvModule): + # we use a ConvModule to wrap conv+bn+relu layers, thus the + # name mapping is needed + if 'downsample' in name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + original_conv_name = name + '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + original_bn_name = name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + original_conv_name = name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + original_bn_name = name.replace('conv', 'bn') + if original_conv_name + '.weight' not in state_dict_r2d: + logger.warning(f'Module not exist in the state_dict_r2d' + f': {original_conv_name}') + else: + shape_2d = state_dict_r2d[original_conv_name + + '.weight'].shape + shape_3d = module.conv.weight.data.shape + if shape_2d != shape_3d[:2] + shape_3d[3:]: + logger.warning(f'Weight shape mismatch for ' + f': {original_conv_name} : ' + f'3d weight shape: {shape_3d}; ' + f'2d weight shape: {shape_2d}. 
') + else: + self._inflate_conv_params(module.conv, state_dict_r2d, + original_conv_name, + inflated_param_names) + + if original_bn_name + '.weight' not in state_dict_r2d: + logger.warning(f'Module not exist in the state_dict_r2d' + f': {original_bn_name}') + else: + self._inflate_bn_params(module.bn, state_dict_r2d, + original_bn_name, + inflated_param_names) + + # check if any parameters in the 2d checkpoint are not loaded + remaining_names = set( + state_dict_r2d.keys()) - set(inflated_param_names) + if remaining_names: + logger.info(f'These parameters in the 2d checkpoint are not loaded' + f': {remaining_names}') + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate weights.""" + self._inflate_weights(self, logger) + + def _make_stem_layer(self) -> None: + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + self.conv1 = ConvModule( + self.in_channels, + self.base_channels, + kernel_size=self.conv1_kernel, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.maxpool = nn.MaxPool3d( + kernel_size=(1, 3, 3), + stride=(self.pool1_stride_t, self.pool1_stride_s, + self.pool1_stride_s), + padding=(0, 1, 1)) + + self.pool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1)) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + @staticmethod + def _init_weights(self, pretrained: Optional[str] = None) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch. + + Args: + pretrained (str | None): The path of the pretrained weight. Will + override the original `pretrained` if set. The arg is added to + be compatible with mmdet. Defaults to None. + """ + if pretrained: + self.pretrained = pretrained + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + if self.pretrained2d: + # Inflate 2D model into 3D model. + self.inflate_weights(logger) + else: + # Directly load 3D model. + load_checkpoint( + self, self.pretrained, strict=False, logger=logger) + + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck3d): + constant_init(m.conv3.bn, 0) + elif isinstance(m, BasicBlock3d): + constant_init(m.conv2.bn, 0) + else: + raise TypeError('pretrained must be a str or None') + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initialize weights.""" + self._init_weights(self, pretrained) + + def forward(self, x: torch.Tensor) \ + -> Union[torch.Tensor, Tuple[torch.Tensor]]: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor or tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. 
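+
+        Examples:
+            A shape-only sketch with the default configuration; the input
+            size is an arbitrary assumption and the output shape follows
+            the default strides documented above.
+
+            >>> import torch
+            >>> from mmaction.models.backbones.resnet3d import ResNet3d
+            >>> model = ResNet3d(depth=50, pretrained=None)
+            >>> inputs = torch.rand(1, 3, 8, 64, 64)  # (N, C, T, H, W)
+            >>> model(inputs).shape
+            torch.Size([1, 2048, 4, 2, 2])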
+ """ + x = self.conv1(x) + if self.with_pool1: + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i == 0 and self.with_pool2: + x = self.pool2(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + +@MODELS.register_module() +class ResNet3dLayer(BaseModule): + """ResNet 3d Layer. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + pretrained (str, optional): Name of pretrained model. Defaults to None. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + stage (int): The index of Resnet stage. Defaults to 3. + base_channels (int): Channel num of stem output features. + Defaults to 64. + spatial_stride (int): The 1st res block's spatial stride. + Defaults to 2. + temporal_stride (int): The 1st res block's temporal stride. + Defaults to 1. + dilation (int): The dilation. Defaults to 1. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. + all_frozen (bool): Frozen all modules in the layer. Defaults to False. + inflate (int): Inflate dims of each block. Defaults to 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Defaults to ``'3x1x1'``. + conv_cfg (dict): Config for conv layers. + Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. + Required keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (``mean`` and ``var``). Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
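+
+    Examples:
+        A construction sketch for the last stage of a ResNet3d-50; the
+        input resolution is an illustrative assumption.
+
+        >>> import torch
+        >>> from mmaction.models.backbones.resnet3d import ResNet3dLayer
+        >>> layer = ResNet3dLayer(depth=50, pretrained=None, stage=3)
+        >>> x = torch.rand(1, 1024, 4, 7, 7)  # features from stage 2
+        >>> layer(x).shape
+        torch.Size([1, 2048, 4, 4, 4])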
+ """ + + def __init__(self, + depth: int, + pretrained: Optional[str] = None, + pretrained2d: bool = True, + stage: int = 3, + base_channels: int = 64, + spatial_stride: int = 2, + temporal_stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + all_frozen: bool = False, + inflate: int = 1, + inflate_style: str = '3x1x1', + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + with_cp: bool = False, + zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.arch_settings = ResNet3d.arch_settings + assert depth in self.arch_settings + + self.make_res_layer = ResNet3d.make_res_layer + self._inflate_conv_params = ResNet3d._inflate_conv_params + self._inflate_bn_params = ResNet3d._inflate_bn_params + self._inflate_weights = ResNet3d._inflate_weights + self._init_weights = ResNet3d._init_weights + + self.depth = depth + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.stage = stage + # stage index is 0 based + assert 0 <= stage <= 3 + self.base_channels = base_channels + + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + + self.style = style + self.all_frozen = all_frozen + + self.stage_inflation = inflate + self.inflate_style = inflate_style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + block, stage_blocks = self.arch_settings[depth] + stage_block = stage_blocks[stage] + planes = 64 * 2**stage + inplanes = 64 * 2**(stage - 1) * block.expansion + + res_layer = self.make_res_layer( + block, + inplanes, + planes, + stage_block, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + style=self.style, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + inflate=self.stage_inflation, + inflate_style=self.inflate_style, + with_cp=with_cp, + **kwargs) + + self.layer_name = f'layer{stage + 1}' + self.add_module(self.layer_name, res_layer) + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate weights.""" + self._inflate_weights(self, logger) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.all_frozen: + layer = getattr(self, self.layer_name) + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initialize weights.""" + self._init_weights(self, pretrained) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the residual layer. 
+ """ + res_layer = getattr(self, self.layer_name) + out = res_layer(x) + return out + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/backbones/resnet3d_csn.py b/mmaction/models/backbones/resnet3d_csn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d67f6ceadd413c1908b433a610cef2f988ccd49 --- /dev/null +++ b/mmaction/models/backbones/resnet3d_csn.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS +from .resnet3d import Bottleneck3d, ResNet3d + + +class CSNBottleneck3d(Bottleneck3d): + """Channel-Separated Bottleneck Block. + + This module is proposed in + "Video Classification with Channel-Separated Convolutional Networks" + Link: https://arxiv.org/pdf/1711.11248.pdf + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + bottleneck_mode (str): Determine which ways to factorize a 3D + bottleneck block using channel-separated convolutional networks. + If set to 'ip', it will replace the 3x3x3 conv2 layer with a + 1x1x1 traditional convolution and a 3x3x3 depthwise + convolution, i.e., Interaction-preserved channel-separated + bottleneck block. + If set to 'ir', it will replace the 3x3x3 conv2 layer with a + 3x3x3 depthwise convolution, which is derived from preserved + bottleneck block by removing the extra 1x1x1 convolution, + i.e., Interaction-reduced channel-separated bottleneck block. + Default: 'ir'. + args (position arguments): Position arguments for Bottleneck. + kwargs (dict, optional): Keyword arguments for Bottleneck. + """ + + def __init__(self, + inplanes, + planes, + *args, + bottleneck_mode='ir', + **kwargs): + super(CSNBottleneck3d, self).__init__(inplanes, planes, *args, + **kwargs) + self.bottleneck_mode = bottleneck_mode + conv2 = [] + if self.bottleneck_mode == 'ip': + conv2.append( + ConvModule( + planes, + planes, + 1, + stride=1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None)) + conv2_kernel_size = self.conv2.conv.kernel_size + conv2_stride = self.conv2.conv.stride + conv2_padding = self.conv2.conv.padding + conv2_dilation = self.conv2.conv.dilation + conv2_bias = bool(self.conv2.conv.bias) + self.conv2 = ConvModule( + planes, + planes, + conv2_kernel_size, + stride=conv2_stride, + padding=conv2_padding, + dilation=conv2_dilation, + bias=conv2_bias, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + groups=planes) + conv2.append(self.conv2) + self.conv2 = nn.Sequential(*conv2) + + +@MODELS.register_module() +class ResNet3dCSN(ResNet3d): + """ResNet backbone for CSN. + + Args: + depth (int): Depth of ResNetCSN, from {18, 34, 50, 101, 152}. + pretrained (str | None): Name of pretrained model. + temporal_strides (tuple[int]): + Temporal strides of residual blocks of each stage. + Default: (1, 2, 2, 2). + conv1_kernel (tuple[int]): Kernel size of the first conv layer. + Default: (3, 7, 7). + conv1_stride_t (int): Temporal stride of the first conv layer. + Default: 1. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Default: 1. + norm_cfg (dict): Config for norm layers. 
required keys are `type` and + `requires_grad`. + Default: dict(type='BN3d', requires_grad=True, eps=1e-3). + inflate_style (str): `3x1x1` or `3x3x3`. which determines the kernel + sizes and padding strides for conv1 and conv2 in each block. + Default: '3x3x3'. + bottleneck_mode (str): Determine which ways to factorize a 3D + bottleneck block using channel-separated convolutional networks. + If set to 'ip', it will replace the 3x3x3 conv2 layer with a + 1x1x1 traditional convolution and a 3x3x3 depthwise + convolution, i.e., Interaction-preserved channel-separated + bottleneck block. + If set to 'ir', it will replace the 3x3x3 conv2 layer with a + 3x3x3 depthwise convolution, which is derived from preserved + bottleneck block by removing the extra 1x1x1 convolution, + i.e., Interaction-reduced channel-separated bottleneck block. + Default: 'ip'. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + def __init__(self, + depth, + pretrained, + temporal_strides=(1, 2, 2, 2), + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_cfg=dict(type='BN3d', requires_grad=True, eps=1e-3), + inflate_style='3x3x3', + bottleneck_mode='ir', + bn_frozen=False, + **kwargs): + self.arch_settings = { + # 18: (BasicBlock3d, (2, 2, 2, 2)), + # 34: (BasicBlock3d, (3, 4, 6, 3)), + 50: (CSNBottleneck3d, (3, 4, 6, 3)), + 101: (CSNBottleneck3d, (3, 4, 23, 3)), + 152: (CSNBottleneck3d, (3, 8, 36, 3)) + } + self.bn_frozen = bn_frozen + if bottleneck_mode not in ['ip', 'ir']: + raise ValueError(f'Bottleneck mode must be "ip" or "ir",' + f'but got {bottleneck_mode}.') + super(ResNet3dCSN, self).__init__( + depth, + pretrained, + temporal_strides=temporal_strides, + conv1_kernel=conv1_kernel, + conv1_stride_t=conv1_stride_t, + pool1_stride_t=pool1_stride_t, + norm_cfg=norm_cfg, + inflate_style=inflate_style, + bottleneck_mode=bottleneck_mode, + **kwargs) + + def train(self, mode=True): + """Set the optimization status when training.""" + super(ResNet3d, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + if self.bn_frozen: + for param in m.parameters(): + param.requires_grad = False diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py new file mode 100644 index 0000000000000000000000000000000000000000..3d069625eed4473fa33117d536bab13e90bd0995 --- /dev/null +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -0,0 +1,510 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule +from mmengine.model.weight_init import kaiming_init +from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint + +from mmaction.registry import MODELS +from .resnet3d import ResNet3d + + +class DeConvModule(BaseModule): + """A deconv module that bundles deconv/norm/activation layers. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int | tuple[int]): Size of the convolving kernel. + stride (int | tuple[int]): Stride of the convolution. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. + bias (bool): Whether to add a learnable bias to the output. 
+ Defaults to False. + with_bn (bool): Whether to add a BN layer. Defaults to True. + with_relu (bool): Whether to add a ReLU layer. Defaults to True. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: Union[int, Tuple[int]] = (1, 1, 1), + padding: Union[int, Tuple[int]] = 0, + bias: bool = False, + with_bn: bool = True, + with_relu: bool = True) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.bias = bias + self.with_bn = with_bn + self.with_relu = with_relu + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=bias) + self.bn = nn.BatchNorm3d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + # x should be a 5-d tensor + assert len(x.shape) == 5 + N, C, T, H, W = x.shape + out_shape = (N, self.out_channels, self.stride[0] * T, + self.stride[1] * H, self.stride[2] * W) + x = self.conv(x, output_size=out_shape) + if self.with_bn: + x = self.bn(x) + if self.with_relu: + x = self.relu(x) + return x + + +class ResNet3dPathway(ResNet3d): + """A pathway of Slowfast based on ResNet3d. + + Args: + lateral (bool): Determines whether to enable the lateral connection + from another pathway. Defaults to False. + lateral_inv (bool): Whether to use deconv to upscale the time + dimension of features from another pathway. Defaults to False. + lateral_norm (bool): Determines whether to enable the lateral norm + in lateral layers. Defaults to False. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + ``alpha`` in the paper. Defaults to 8. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to ``beta`` in the paper. + Defaults to 8. + fusion_kernel (int): The kernel size of lateral fusion. + Defaults to 5. + lateral_infl (int): The ratio of the inflated channels. + Defaults to 2. + lateral_activate (list[int]): Flags for activating the lateral + connection. Defaults to ``[1, 1, 1, 1]``. 
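+
+    Examples:
+        A configuration sketch (construction only): the fast pathway is a
+        narrow pathway without lateral connections, while the slow pathway
+        enables them. A lateral pathway is only meant to be forwarded
+        inside ``ResNet3dSlowFast``, which feeds it the concatenated
+        slow/fast features; the values below mirror the defaults of
+        ``ResNet3dSlowFast``.
+
+        >>> from mmaction.models.backbones.resnet3d_slowfast import (
+        ...     ResNet3dPathway)
+        >>> fast_path = ResNet3dPathway(
+        ...     depth=50, lateral=False, base_channels=8,
+        ...     conv1_kernel=(5, 7, 7), conv1_stride_t=1, pool1_stride_t=1)
+        >>> slow_path = ResNet3dPathway(
+        ...     depth=50, lateral=True, speed_ratio=8, channel_ratio=8,
+        ...     conv1_kernel=(1, 7, 7), conv1_stride_t=1, pool1_stride_t=1,
+        ...     inflate=(0, 0, 1, 1))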
+ """ + + def __init__(self, + lateral: bool = False, + lateral_inv: bool = False, + lateral_norm: bool = False, + speed_ratio: int = 8, + channel_ratio: int = 8, + fusion_kernel: int = 5, + lateral_infl: int = 2, + lateral_activate: List[int] = [1, 1, 1, 1], + **kwargs) -> None: + self.lateral = lateral + self.lateral_inv = lateral_inv + self.lateral_norm = lateral_norm + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + self.fusion_kernel = fusion_kernel + self.lateral_infl = lateral_infl + self.lateral_activate = lateral_activate + self._calculate_lateral_inplanes(kwargs) + + super().__init__(**kwargs) + self.inplanes = self.base_channels + if self.lateral and self.lateral_activate[0] == 1: + if self.lateral_inv: + self.conv1_lateral = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + with_bn=True, + with_relu=True) + else: + self.conv1_lateral = ConvModule( + self.inplanes // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg if self.lateral_norm else None, + act_cfg=self.act_cfg if self.lateral_norm else None) + + self.lateral_connections = [] + for i in range(len(self.stage_blocks)): + planes = self.base_channels * 2**i + self.inplanes = planes * self.block.expansion + + if lateral and i != self.num_stages - 1 \ + and self.lateral_activate[i + 1]: + # no lateral connection needed in final stage + lateral_name = f'layer{(i + 1)}_lateral' + if self.lateral_inv: + conv_module = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + with_bn=True, + with_relu=True) + else: + conv_module = ConvModule( + self.inplanes // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg if self.lateral_norm else None, + act_cfg=self.act_cfg if self.lateral_norm else None) + setattr(self, lateral_name, conv_module) + self.lateral_connections.append(lateral_name) + + def _calculate_lateral_inplanes(self, kwargs): + """Calculate inplanes for lateral connection.""" + depth = kwargs.get('depth', 50) + expansion = 1 if depth < 50 else 4 + base_channels = kwargs.get('base_channels', 64) + lateral_inplanes = [] + for i in range(kwargs.get('num_stages', 4)): + if expansion % 2 == 0: + planes = base_channels * (2 ** i) * \ + ((expansion // 2) ** (i > 0)) + else: + planes = base_channels * (2**i) // (2**(i > 0)) + if self.lateral and self.lateral_activate[i]: + if self.lateral_inv: + lateral_inplane = planes * \ + self.channel_ratio // self.lateral_infl + else: + lateral_inplane = planes * \ + self.lateral_infl // self.channel_ratio + else: + lateral_inplane = 0 + lateral_inplanes.append(lateral_inplane) + self.lateral_inplanes = lateral_inplanes + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate the resnet2d parameters to resnet3d pathway. 
+ + The differences between resnet3d and resnet2d mainly lie in an extra + axis of conv kernel. To utilize the pretrained parameters in 2d model, + the weight of conv2d models should be inflated to fit in the shapes of + the 3d counterpart. For pathway the ``lateral_connection`` part should + not be inflated from 2d weights. + + Args: + logger (MMLogger): The logger used to print + debugging information. + """ + + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_r2d: + state_dict_r2d = state_dict_r2d['state_dict'] + + inflated_param_names = [] + for name, module in self.named_modules(): + if 'lateral' in name: + continue + if isinstance(module, ConvModule): + # we use a ConvModule to wrap conv+bn+relu layers, thus the + # name mapping is needed + if 'downsample' in name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + original_conv_name = name + '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + original_bn_name = name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + original_conv_name = name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + original_bn_name = name.replace('conv', 'bn') + if original_conv_name + '.weight' not in state_dict_r2d: + logger.warning(f'Module not exist in the state_dict_r2d' + f': {original_conv_name}') + else: + self._inflate_conv_params(module.conv, state_dict_r2d, + original_conv_name, + inflated_param_names) + if original_bn_name + '.weight' not in state_dict_r2d: + logger.warning(f'Module not exist in the state_dict_r2d' + f': {original_bn_name}') + else: + self._inflate_bn_params(module.bn, state_dict_r2d, + original_bn_name, + inflated_param_names) + + # check if any parameters in the 2d checkpoint are not loaded + remaining_names = set( + state_dict_r2d.keys()) - set(inflated_param_names) + if remaining_names: + logger.info(f'These parameters in the 2d checkpoint are not loaded' + f': {remaining_names}') + + def _inflate_conv_params(self, conv3d: nn.Module, + state_dict_2d: OrderedDict, module_name_2d: str, + inflated_param_names: List[str]) -> None: + """Inflate a conv module from 2d to 3d. + + The differences of conv modules betweene 2d and 3d in Pathway + mainly lie in the inplanes due to lateral connections. To fit the + shapes of the lateral connection counterpart, it will expand + parameters by concatting conv2d parameters and extra zero paddings. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + conv2d_weight = state_dict_2d[weight_2d_name] + old_shape = conv2d_weight.shape + new_shape = conv3d.weight.data.shape + kernel_t = new_shape[2] + + if new_shape[1] != old_shape[1]: + if new_shape[1] < old_shape[1]: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. 
') + return + # Inplanes may be different due to lateral connections + new_channels = new_shape[1] - old_shape[1] + pad_shape = old_shape + pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:] + # Expand parameters by concat extra channels + conv2d_weight = torch.cat( + (conv2d_weight, + torch.zeros(pad_shape).type_as(conv2d_weight).to( + conv2d_weight.device)), + dim=1) + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + `self.frozen_stages`.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if i != len(self.res_layers) and self.lateral: + # No fusion needed in the final stage + lateral_name = self.lateral_connections[i - 1] + conv_lateral = getattr(self, lateral_name) + conv_lateral.eval() + for param in conv_lateral.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if pretrained: + self.pretrained = pretrained + + # Override the init_weights of i3d + super().init_weights() + for module_name in self.lateral_connections: + layer = getattr(self, module_name) + for m in layer.modules(): + if isinstance(m, (nn.Conv3d, nn.Conv2d)): + kaiming_init(m) + + +pathway_cfg = { + 'resnet3d': ResNet3dPathway, + # TODO: BNInceptionPathway +} + + +def build_pathway(cfg: Dict, *args, **kwargs) -> nn.Module: + """Build pathway. + + Args: + cfg (dict): cfg should contain: + - type (str): identify backbone type. + + Returns: + nn.Module: Created pathway. + """ + if not (isinstance(cfg, dict) and 'type' in cfg): + raise TypeError('cfg must be a dict containing the key "type"') + cfg_ = cfg.copy() + + pathway_type = cfg_.pop('type') + if pathway_type not in pathway_cfg: + raise KeyError(f'Unrecognized pathway type {pathway_type}') + + pathway_cls = pathway_cfg[pathway_type] + pathway = pathway_cls(*args, **kwargs, **cfg_) + + return pathway + + +@MODELS.register_module() +class ResNet3dSlowFast(BaseModule): + """Slowfast backbone. + + This module is proposed in `SlowFast Networks for Video Recognition + `_ + + Args: + pretrained (str): The file path to a pretrained model. + resample_rate (int): A large temporal stride ``resample_rate`` + on input frames. The actual resample rate is calculated by + multipling the ``interval`` in ``SampleFrames`` in the + pipeline with ``resample_rate``, equivalent to the :math:`\\tau` + in the paper, i.e. it processes only one out of + ``resample_rate * interval`` frames. Defaults to 8. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + :math:`\\alpha` in the paper. Defaults to 8. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. + Defaults to 8. + slow_pathway (dict): Configuration of slow branch. 
Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=True, + conv1_kernel=(1, 7, 7), conv1_stride_t=1, pool1_stride_t=1, + inflate=(0, 0, 1, 1))``. + fast_pathway (dict): Configuration of fast branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=False, + base_channels=8, conv1_kernel=(5, 7, 7), conv1_stride_t=1, + pool1_stride_t=1)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + pretrained: Optional[str] = None, + resample_rate: int = 8, + speed_ratio: int = 8, + channel_ratio: int = 8, + slow_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1)), + fast_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.resample_rate = resample_rate + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + + if slow_pathway['lateral']: + slow_pathway['speed_ratio'] = speed_ratio + slow_pathway['channel_ratio'] = channel_ratio + + self.slow_path = build_pathway(slow_pathway) + self.fast_path = build_pathway(fast_pathway) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if pretrained: + self.pretrained = pretrained + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + msg = f'load model from: {self.pretrained}' + print_log(msg, logger=logger) + # Directly load 3D model. + load_checkpoint(self, self.pretrained, strict=True, logger=logger) + elif self.pretrained is None: + # Init two branch separately. + self.fast_path.init_weights() + self.slow_path.init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) -> tuple: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + tuple[torch.Tensor]: The feature of the input samples + extracted by the backbone. 
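+
+        Examples:
+            A shape-only sketch with the default pathway configs; the input
+            size is an assumption and the model carries random weights.
+
+            >>> import torch
+            >>> from mmaction.models.backbones.resnet3d_slowfast import (
+            ...     ResNet3dSlowFast)
+            >>> model = ResNet3dSlowFast()
+            >>> inputs = torch.rand(1, 3, 32, 64, 64)  # (N, C, T, H, W)
+            >>> x_slow, x_fast = model(inputs)
+            >>> x_slow.shape, x_fast.shape
+            (torch.Size([1, 2048, 4, 2, 2]), torch.Size([1, 256, 32, 2, 2]))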
+ """ + x_slow = nn.functional.interpolate( + x, + mode='nearest', + scale_factor=(1.0 / self.resample_rate, 1.0, 1.0)) + x_slow = self.slow_path.conv1(x_slow) + x_slow = self.slow_path.maxpool(x_slow) + + x_fast = nn.functional.interpolate( + x, + mode='nearest', + scale_factor=(1.0 / (self.resample_rate // self.speed_ratio), 1.0, + 1.0)) + x_fast = self.fast_path.conv1(x_fast) + x_fast = self.fast_path.maxpool(x_fast) + + if self.slow_path.lateral: + x_fast_lateral = self.slow_path.conv1_lateral(x_fast) + x_slow = torch.cat((x_slow, x_fast_lateral), dim=1) + + for i, layer_name in enumerate(self.slow_path.res_layers): + res_layer = getattr(self.slow_path, layer_name) + x_slow = res_layer(x_slow) + res_layer_fast = getattr(self.fast_path, layer_name) + x_fast = res_layer_fast(x_fast) + if (i != len(self.slow_path.res_layers) - 1 + and self.slow_path.lateral): + # No fusion needed in the final stage + lateral_name = self.slow_path.lateral_connections[i] + conv_lateral = getattr(self.slow_path, lateral_name) + x_fast_lateral = conv_lateral(x_fast) + x_slow = torch.cat((x_slow, x_fast_lateral), dim=1) + + out = (x_slow, x_fast) + + return out diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py new file mode 100644 index 0000000000000000000000000000000000000000..7fac766294879890135b549b69aebc21a9fb795c --- /dev/null +++ b/mmaction/models/backbones/resnet3d_slowonly.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from mmaction.registry import MODELS +from .resnet3d_slowfast import ResNet3dPathway + + +@MODELS.register_module() +class ResNet3dSlowOnly(ResNet3dPathway): + """SlowOnly backbone based on ResNet3dPathway. + + Args: + conv1_kernel (Sequence[int]): Kernel size of the first conv layer. + Defaults to ``(1, 7, 7)``. + conv1_stride_t (int): Temporal stride of the first conv layer. + Defaults to 1. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Defaults to 1. + inflate (Sequence[int]): Inflate dims of each block. + Defaults to ``(0, 0, 1, 1)``. + with_pool2 (bool): Whether to use pool2. Defaults to False. + """ + + def __init__(self, + conv1_kernel: Sequence[int] = (1, 7, 7), + conv1_stride_t: int = 1, + pool1_stride_t: int = 1, + inflate: Sequence[int] = (0, 0, 1, 1), + with_pool2: bool = False, + **kwargs) -> None: + super().__init__( + conv1_kernel=conv1_kernel, + conv1_stride_t=conv1_stride_t, + pool1_stride_t=pool1_stride_t, + inflate=inflate, + with_pool2=with_pool2, + **kwargs) + + assert not self.lateral diff --git a/mmaction/models/backbones/resnet_audio.py b/mmaction/models/backbones/resnet_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..c44b5d3950a5a250c19b9f8c817abea8a7bfcbba --- /dev/null +++ b/mmaction/models/backbones/resnet_audio.py @@ -0,0 +1,386 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm +from torch.nn.modules.utils import _ntuple + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType + + +class Bottleneck2dAudio(nn.Module): + """Bottleneck2D block for ResNet2D. 
+ + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + stride (int): Stride in the conv layer. Defaults to 2. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module, optional): Downsample layer. Defaults to None. + factorize (bool): Whether to factorize kernel. Defaults to True. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the trgaining speed. Defaults to False. + """ + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 2, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + factorize: bool = True, + norm_cfg: ConfigType = None, + with_cp: bool = False) -> None: + super().__init__() + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.factorize = factorize + self.norm_cfg = norm_cfg + self.with_cp = with_cp + + self.conv1_stride = 1 + self.conv2_stride = stride + + conv1_kernel_size = (1, 1) + conv1_padding = 0 + conv2_kernel_size = (3, 3) + conv2_padding = (dilation, dilation) + self.conv1 = ConvModule( + inplanes, + planes, + kernel_size=conv1_kernel_size, + padding=conv1_padding, + dilation=dilation, + norm_cfg=self.norm_cfg, + bias=False) + self.conv2 = ConvModule( + planes, + planes, + kernel_size=conv2_kernel_size, + stride=stride, + padding=conv2_padding, + dilation=dilation, + bias=False, + conv_cfg=dict(type='ConvAudio') if factorize else dict( + type='Conv'), + norm_cfg=None, + act_cfg=None) + self.conv3 = ConvModule( + 2 * planes if factorize else planes, + planes * self.expansion, + kernel_size=1, + bias=False, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + + def _inner_forward(x): + identity = x + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNetAudio(nn.Module): + """ResNet 2d audio backbone. Reference: + + `_. + + Args: + depth (int): Depth of resnet, from ``{50, 101, 152}``. + pretrained (str, optional): Name of pretrained model. Defaults to None. + in_channels (int): Channel num of input features. Defaults to 1. + base_channels (int): Channel num of stem output features. + Defaults to 32. + num_stages (int): Resnet stages. Defaults to 4. + strides (Sequence[int]): Strides of residual blocks of each stage. + Defaults to ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Defaults to ``(1, 1, 1, 1)``. + conv1_kernel (int): Kernel size of the first conv layer. Defaults to 9. + conv1_stride (Union[int, Tuple[int]]): Stride of the first conv layer. + Defaults to 1. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Defaults to -1. + factorize (Sequence[int]): factorize Dims of each block for audio. 
+ Defaults to ``(1, 1, 0, 0)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + conv_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + zero_init_residual (bool): Whether to use zero initialization + for residual block. Defaults to True. + """ + + arch_settings = { + # 18: (BasicBlock2dAudio, (2, 2, 2, 2)), + # 34: (BasicBlock2dAudio, (3, 4, 6, 3)), + 50: (Bottleneck2dAudio, (3, 4, 6, 3)), + 101: (Bottleneck2dAudio, (3, 4, 23, 3)), + 152: (Bottleneck2dAudio, (3, 8, 36, 3)) + } + + def __init__(self, + depth: int, + pretrained: str = None, + in_channels: int = 1, + num_stages: int = 4, + base_channels: int = 32, + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + conv1_kernel: int = 9, + conv1_stride: int = 1, + frozen_stages: int = -1, + factorize: Sequence[int] = (1, 1, 0, 0), + norm_eval: bool = False, + with_cp: bool = False, + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN2d', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + zero_init_residual: bool = True) -> None: + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.pretrained = pretrained + self.in_channels = in_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.dilations = dilations + self.conv1_kernel = conv1_kernel + self.conv1_stride = conv1_stride + self.frozen_stages = frozen_stages + self.stage_factorization = _ntuple(num_stages)(factorize) + self.norm_eval = norm_eval + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.zero_init_residual = zero_init_residual + + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = self.base_channels + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + factorize=self.stage_factorization[i], + norm_cfg=self.norm_cfg, + with_cp=with_cp) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * self.base_channels * 2**( + len(self.stage_blocks) - 1) + + @staticmethod + def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + factorize: int = 1, + norm_cfg: Optional[ConfigType] = None, + with_cp: bool = False) -> nn.Module: + """Build residual layer for ResNetAudio. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature + in each block. 
+ planes (int): Number of channels for the output feature + in each block. + blocks (int): Number of residual blocks. + stride (int): Strides of residual blocks of each stage. + Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + factorize (Uninon[int, Sequence[int]]): Determine whether to + factorize for each block. Defaults to 1. + norm_cfg (Union[dict, ConfigDict], optional): Config for norm + layers. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Defaults to False. + + Returns: + nn.Module: A residual layer for the given config. + """ + factorize = factorize if not isinstance( + factorize, int) else (factorize, ) * blocks + assert len(factorize) == blocks + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = ConvModule( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + factorize=(factorize[0] == 1), + norm_cfg=norm_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + dilation, + factorize=(factorize[i] == 1), + norm_cfg=norm_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + def _make_stem_layer(self) -> None: + """Construct the stem layers consists of a ``conv+norm+act`` module and + a pooling layer.""" + self.conv1 = ConvModule( + self.in_channels, + self.base_channels, + kernel_size=self.conv1_kernel, + stride=self.conv1_stride, + bias=False, + conv_cfg=dict(type='ConvAudio', op='sum'), + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.bn.eval() + for m in [self.conv1.conv, self.conv1.bn]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck2dAudio): + constant_init(m.conv3.bn, 0) + + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input samples extracted + by the backbone. 
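+
+        Examples:
+            A rough sketch for a spectrogram-like input. It assumes the
+            custom ``ConvAudio`` operator is registered (hence the
+            ``register_all_modules`` call) and only checks the channel
+            dimension against ``feat_dim``; the input size is illustrative.
+
+            >>> import torch
+            >>> from mmaction.utils import register_all_modules
+            >>> register_all_modules()
+            >>> from mmaction.models import ResNetAudio
+            >>> model = ResNetAudio(depth=50, pretrained=None)
+            >>> spec = torch.rand(1, 1, 128, 80)  # (N, C, T, F)
+            >>> feat = model(spec)
+            >>> feat.shape[1] == model.feat_dim
+            True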
+ """ + x = self.conv1(x) + for layer_name in self.res_layers: + res_layer = getattr(self, layer_name) + x = res_layer(x) + return x + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/backbones/resnet_omni.py b/mmaction/models/backbones/resnet_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..77c5cfac93113a3b280a10ebcefb1a5fa15e8eed --- /dev/null +++ b/mmaction/models/backbones/resnet_omni.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule +from mmengine.runner import CheckpointLoader + +from mmaction.registry import MODELS +from mmaction.utils import OptConfigType + + +def batch_norm(inputs: torch.Tensor, + module: nn.modules.batchnorm, + training: Optional[bool] = None) -> torch.Tensor: + """Applies Batch Normalization for each channel across a batch of data + using params from the given batch normalization module. + + Args: + inputs (Tensor): The input data. + module (nn.modules.batchnorm): a batch normalization module. Will use + params from this batch normalization module to do the operation. + training (bool, optional): if true, apply the train mode batch + normalization. Defaults to None and will use the training mode of + the module. + """ + if training is None: + training = module.training + return F.batch_norm( + input=inputs, + running_mean=None if training else module.running_mean, + running_var=None if training else module.running_var, + weight=module.weight, + bias=module.bias, + training=training, + momentum=module.momentum, + eps=module.eps) + + +class BottleNeck(BaseModule): + """Building block for Omni-ResNet. + + Args: + inplanes (int): Number of channels for the input in first conv layer. + planes (int): Number of channels for the input in second conv layer. + temporal_kernel (int): Temporal kernel in the conv layer. Should be + either 1 or 3. Defaults to 1. + spatial_stride (int): Spatial stride in the conv layer. Defaults to 1. + init_cfg (dict or ConfigDict, optional): The Config for initialization. + Defaults to None. 
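+
+    Examples:
+        A sketch showing that the same block handles video (5D) and image
+        (4D) tensors; the sizes are illustrative assumptions.
+
+        >>> import torch
+        >>> from mmaction.models.backbones.resnet_omni import BottleNeck
+        >>> block = BottleNeck(64, 64)
+        >>> block(torch.rand(1, 64, 4, 16, 16)).shape  # video tensor
+        torch.Size([1, 256, 4, 16, 16])
+        >>> block(torch.rand(1, 64, 16, 16)).shape  # image tensor
+        torch.Size([1, 256, 16, 16])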
+ """ + + def __init__(self, + inplanes: int, + planes: int, + temporal_kernel: int = 3, + spatial_stride: int = 1, + init_cfg: OptConfigType = None, + **kwargs) -> None: + super(BottleNeck, self).__init__(init_cfg=init_cfg) + assert temporal_kernel in [1, 3] + + self.conv1 = nn.Conv3d( + inplanes, + planes, + kernel_size=(temporal_kernel, 1, 1), + padding=(temporal_kernel // 2, 0, 0), + bias=False) + self.conv2 = nn.Conv3d( + planes, + planes, + stride=(1, spatial_stride, spatial_stride), + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + bias=False) + + self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) + + self.bn1 = nn.BatchNorm3d(planes, momentum=0.01) + self.bn2 = nn.BatchNorm3d(planes, momentum=0.01) + self.bn3 = nn.BatchNorm3d(planes * 4, momentum=0.01) + + if inplanes != planes * 4 or spatial_stride != 1: + downsample = [ + nn.Conv3d( + inplanes, + planes * 4, + kernel_size=1, + stride=(1, spatial_stride, spatial_stride), + bias=False), + nn.BatchNorm3d(planes * 4, momentum=0.01) + ] + self.downsample = nn.Sequential(*downsample) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Accept both 3D (BCTHW for videos) and 2D (BCHW for images) tensors. + """ + if x.ndim == 4: + return self.forward_2d(x) + + # Forward call for 3D tensors. + out = self.conv1(x) + out = self.bn1(out).relu_() + + out = self.conv2(out) + out = self.bn2(out).relu_() + + out = self.conv3(out) + out = self.bn3(out) + + if hasattr(self, 'downsample'): + x = self.downsample(x) + + return out.add_(x).relu_() + + def forward_2d(self, x: torch.Tensor) -> torch.Tensor: + """Forward call for 2D tensors.""" + out = F.conv2d(x, self.conv1.weight.sum(2)) + out = batch_norm(out, self.bn1).relu_() + + out = F.conv2d( + out, + self.conv2.weight.squeeze(2), + stride=self.conv2.stride[-1], + padding=1) + out = batch_norm(out, self.bn2).relu_() + + out = F.conv2d(out, self.conv3.weight.squeeze(2)) + out = batch_norm(out, self.bn3) + + if hasattr(self, 'downsample'): + x = F.conv2d( + x, + self.downsample[0].weight.squeeze(2), + stride=self.downsample[0].stride[-1]) + x = batch_norm(x, self.downsample[1]) + + return out.add_(x).relu_() + + +@MODELS.register_module() +class OmniResNet(BaseModel): + """Omni-ResNet that accepts both image and video inputs. + + Args: + layers (List[int]): number of layers in each residual stages. Defaults + to [3, 4, 6, 3]. + pretrain_2d (str, optional): path to the 2D pretraining checkpoints. + Defaults to None. + init_cfg (dict or ConfigDict, optional): The Config for initialization. + Defaults to None. 
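+
+    Examples:
+        A shape-only sketch for both input types; the input sizes are
+        assumptions and no 2D pretraining is loaded.
+
+        >>> import torch
+        >>> from mmaction.models.backbones.resnet_omni import OmniResNet
+        >>> model = OmniResNet()
+        >>> model(torch.rand(1, 3, 8, 64, 64)).shape  # video batch
+        torch.Size([1, 2048, 8, 2, 2])
+        >>> model(torch.rand(1, 3, 64, 64)).shape  # image batch
+        torch.Size([1, 2048, 2, 2])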
+ """ + + def __init__(self, + layers: List[int] = [3, 4, 6, 3], + pretrain_2d: Optional[str] = None, + init_cfg: OptConfigType = None) -> None: + super(OmniResNet, self).__init__(init_cfg=init_cfg) + + self.inplanes = 64 + self.conv1 = nn.Conv3d( + 3, + self.inplanes, + kernel_size=(1, 7, 7), + stride=(1, 2, 2), + padding=(0, 3, 3), + bias=False) + self.bn1 = nn.BatchNorm3d(self.inplanes, momentum=0.01) + + self.pool3d = nn.MaxPool3d((1, 3, 3), (1, 2, 2), (0, 1, 1)) + self.pool2d = nn.MaxPool2d(3, 2, 1) + + self.temporal_kernel = 1 + self.layer1 = self._make_layer(64, layers[0]) + self.layer2 = self._make_layer(128, layers[1], stride=2) + self.temporal_kernel = 3 + self.layer3 = self._make_layer(256, layers[2], stride=2) + self.layer4 = self._make_layer(512, layers[3], stride=2) + + if pretrain_2d is not None: + self.init_from_2d(pretrain_2d) + + def _make_layer(self, + planes: int, + num_blocks: int, + stride: int = 1) -> nn.Module: + layers = [ + BottleNeck( + self.inplanes, + planes, + spatial_stride=stride, + temporal_kernel=self.temporal_kernel) + ] + self.inplanes = planes * 4 + for _ in range(1, num_blocks): + layers.append( + BottleNeck( + self.inplanes, + planes, + temporal_kernel=self.temporal_kernel)) + return nn.Sequential(*layers) + + def init_from_2d(self, pretrain: str) -> None: + param2d = CheckpointLoader.load_checkpoint( + pretrain, map_location='cpu') + param3d = self.state_dict() + for key in param3d: + if key in param2d: + weight = param2d[key] + if weight.ndim == 4: + t = param3d[key].shape[2] + weight = weight.unsqueeze(2) + weight = weight.expand(-1, -1, t, -1, -1) + weight = weight / t + param3d[key] = weight + self.load_state_dict(param3d) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Accept both 3D (BCTHW for videos) and 2D (BCHW for images) tensors. + """ + if x.ndim == 4: + return self.forward_2d(x) + + # Forward call for 3D tensors. + x = self.conv1(x) + x = self.bn1(x).relu_() + x = self.pool3d(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + return x + + def forward_2d(self, x: torch.Tensor) -> torch.Tensor: + """Forward call for 2D tensors.""" + x = F.conv2d( + x, + self.conv1.weight.squeeze(2), + stride=self.conv1.stride[-1], + padding=self.conv1.padding[-1]) + x = batch_norm(x, self.bn1).relu_() + x = self.pool2d(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x diff --git a/mmaction/models/backbones/resnet_tin.py b/mmaction/models/backbones/resnet_tin.py new file mode 100644 index 0000000000000000000000000000000000000000..3040e70337511131d6f9519a9dd9d185bc83bcd9 --- /dev/null +++ b/mmaction/models/backbones/resnet_tin.py @@ -0,0 +1,370 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.registry import MODELS +from .resnet_tsm import ResNetTSM + + +def linear_sampler(data, offset): + """Differentiable Temporal-wise Frame Sampling, which is essentially a + linear interpolation process. + + It gets the feature map which has been split into several groups + and shift them by different offsets according to their groups. + Then compute the weighted sum along with the temporal dimension. + + Args: + data (torch.Tensor): Split data for certain group in shape + [N, num_segments, C, H, W]. + offset (torch.Tensor): Data offsets for this group data in shape + [N, num_segments]. 
+ """ + # [N, num_segments, C, H, W] + n, t, c, h, w = data.shape + + # offset0, offset1: [N, num_segments] + offset0 = torch.floor(offset).int() + offset1 = offset0 + 1 + + # data, data0, data1: [N, num_segments, C, H * W] + data = data.view(n, t, c, h * w).contiguous() + + try: + from mmcv.ops import tin_shift + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `tin_shift` from `mmcv.ops`. You ' + 'will be unable to use TIN. ') + + data0 = tin_shift(data, offset0) + data1 = tin_shift(data, offset1) + + # weight0, weight1: [N, num_segments] + weight0 = 1 - (offset - offset0.float()) + weight1 = 1 - weight0 + + # weight0, weight1: + # [N, num_segments] -> [N, num_segments, C // num_segments] -> [N, C] + group_size = offset.shape[1] + weight0 = weight0[:, :, None].repeat(1, 1, c // group_size) + weight0 = weight0.view(weight0.size(0), -1) + weight1 = weight1[:, :, None].repeat(1, 1, c // group_size) + weight1 = weight1.view(weight1.size(0), -1) + + # weight0, weight1: [N, C] -> [N, 1, C, 1] + weight0 = weight0[:, None, :, None] + weight1 = weight1[:, None, :, None] + + # output: [N, num_segments, C, H * W] -> [N, num_segments, C, H, W] + output = weight0 * data0 + weight1 * data1 + output = output.view(n, t, c, h, w) + + return output + + +class CombineNet(nn.Module): + """Combine Net. + + It combines Temporal interlace module with some part of ResNet layer. + + Args: + net1 (nn.module): Temporal interlace module. + net2 (nn.module): Some part of ResNet layer. + """ + + def __init__(self, net1, net2): + super().__init__() + self.net1 = net1 + self.net2 = net2 + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + # input shape: [num_batches * num_segments, C, H, W] + # output x shape: [num_batches * num_segments, C, H, W] + x = self.net1(x) + # [num_batches * num_segments, C, H, W] + x = self.net2(x) + return x + + +class WeightNet(nn.Module): + """WeightNet in Temporal interlace module. + + The WeightNet consists of two parts: one convolution layer + and a sigmoid function. Following the convolution layer, the sigmoid + function and rescale module can scale our output to the range (0, 2). + Here we set the initial bias of the convolution layer to 0, and the + final initial output will be 1.0. + + Args: + in_channels (int): Channel num of input features. + groups (int): Number of groups for fc layer outputs. + """ + + def __init__(self, in_channels, groups): + super().__init__() + self.sigmoid = nn.Sigmoid() + self.groups = groups + + self.conv = nn.Conv1d(in_channels, groups, 3, padding=1) + + self.init_weights() + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + # we set the initial bias of the convolution + # layer to 0, and the final initial output will be 1.0 + self.conv.bias.data[...] = 0 + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + # calculate weight + # [N, C, T] + n, _, t = x.shape + # [N, groups, T] + x = self.conv(x) + x = x.view(n, self.groups, t) + # [N, T, groups] + x = x.permute(0, 2, 1) + + # scale the output to range (0, 2) + x = 2 * self.sigmoid(x) + # [N, T, groups] + return x + + +class OffsetNet(nn.Module): + """OffsetNet in Temporal interlace module. 
+ + The OffsetNet consists of one convolution layer and two fc layers + with a relu activation following with a sigmoid function. Following + the convolution layer, two fc layers and relu are applied to the output. + Then, apply the sigmoid function with a multiply factor and a minus 0.5 + to transform the output to (-4, 4). + + Args: + in_channels (int): Channel num of input features. + groups (int): Number of groups for fc layer outputs. + num_segments (int): Number of frame segments. + """ + + def __init__(self, in_channels, groups, num_segments): + super().__init__() + self.sigmoid = nn.Sigmoid() + # hard code ``kernel_size`` and ``padding`` according to original repo. + kernel_size = 3 + padding = 1 + + self.conv = nn.Conv1d(in_channels, 1, kernel_size, padding=padding) + self.fc1 = nn.Linear(num_segments, num_segments) + self.relu = nn.ReLU() + self.fc2 = nn.Linear(num_segments, groups) + + self.init_weights() + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + # The bias of the last fc layer is initialized to + # make the post-sigmoid output start from 1 + self.fc2.bias.data[...] = 0.5108 + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + # calculate offset + # [N, C, T] + n, _, t = x.shape + # [N, 1, T] + x = self.conv(x) + # [N, T] + x = x.view(n, t) + # [N, T] + x = self.relu(self.fc1(x)) + # [N, groups] + x = self.fc2(x) + # [N, 1, groups] + x = x.view(n, 1, -1) + + # to make sure the output is in (-t/2, t/2) + # where t = num_segments = 8 + x = 4 * (self.sigmoid(x) - 0.5) + # [N, 1, groups] + return x + + +class TemporalInterlace(nn.Module): + """Temporal interlace module. + + This module is proposed in `Temporal Interlacing Network + `_ + + Args: + in_channels (int): Channel num of input features. + num_segments (int): Number of frame segments. Default: 3. + shift_div (int): Number of division parts for shift. Default: 1. + """ + + def __init__(self, in_channels, num_segments=3, shift_div=1): + super().__init__() + self.num_segments = num_segments + self.shift_div = shift_div + self.in_channels = in_channels + # hard code ``deform_groups`` according to original repo. + self.deform_groups = 2 + + self.offset_net = OffsetNet(in_channels // shift_div, + self.deform_groups, num_segments) + self.weight_net = WeightNet(in_channels // shift_div, + self.deform_groups) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. 
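+
+        Note:
+            Only the first ``C // shift_div`` channels are interlaced: they
+            are average-pooled over space, fed to ``OffsetNet`` and
+            ``WeightNet`` to predict per-group offsets and weights, shifted
+            by ``linear_sampler`` and re-weighted. The remaining channels
+            are copied through unchanged.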
+ """ + # x: [N, C, H, W], + # where N = num_batches x num_segments, C = shift_div * num_folds + n, c, h, w = x.size() + num_batches = n // self.num_segments + num_folds = c // self.shift_div + + # x_out: [num_batches x num_segments, C, H, W] + x_out = torch.zeros((n, c, h, w), device=x.device) + # x_descriptor: [num_batches, num_segments, num_folds, H, W] + x_descriptor = x[:, :num_folds, :, :].view(num_batches, + self.num_segments, + num_folds, h, w) + + # x should only obtain information on temporal and channel dimensions + # x_pooled: [num_batches, num_segments, num_folds, W] + x_pooled = torch.mean(x_descriptor, 3) + # x_pooled: [num_batches, num_segments, num_folds] + x_pooled = torch.mean(x_pooled, 3) + # x_pooled: [num_batches, num_folds, num_segments] + x_pooled = x_pooled.permute(0, 2, 1).contiguous() + + # Calculate weight and bias, here groups = 2 + # x_offset: [num_batches, groups] + x_offset = self.offset_net(x_pooled).view(num_batches, -1) + # x_weight: [num_batches, num_segments, groups] + x_weight = self.weight_net(x_pooled) + + # x_offset: [num_batches, 2 * groups] + x_offset = torch.cat([x_offset, -x_offset], 1) + # x_shift: [num_batches, num_segments, num_folds, H, W] + x_shift = linear_sampler(x_descriptor, x_offset) + + # x_weight: [num_batches, num_segments, groups, 1] + x_weight = x_weight[:, :, :, None] + # x_weight: + # [num_batches, num_segments, groups * 2, c // self.shift_div // 4] + x_weight = x_weight.repeat(1, 1, 2, num_folds // 2 // 2) + # x_weight: + # [num_batches, num_segments, c // self.shift_div = num_folds] + x_weight = x_weight.view(x_weight.size(0), x_weight.size(1), -1) + + # x_weight: [num_batches, num_segments, num_folds, 1, 1] + x_weight = x_weight[:, :, :, None, None] + # x_shift: [num_batches, num_segments, num_folds, H, W] + x_shift = x_shift * x_weight + # x_shift: [num_batches, num_segments, num_folds, H, W] + x_shift = x_shift.contiguous().view(n, num_folds, h, w) + + # x_out: [num_batches x num_segments, C, H, W] + x_out[:, :num_folds, :] = x_shift + x_out[:, num_folds:, :] = x[:, num_folds:, :] + + return x_out + + +@MODELS.register_module() +class ResNetTIN(ResNetTSM): + """ResNet backbone for TIN. + + Args: + depth (int): Depth of ResNet, from {18, 34, 50, 101, 152}. + num_segments (int): Number of frame segments. Default: 8. + is_tin (bool): Whether to apply temporal interlace. Default: True. + shift_div (int): Number of division parts for shift. Default: 4. + kwargs (dict, optional): Arguments for ResNet. + """ + + def __init__(self, depth, is_tin=True, **kwargs): + self.is_tin = is_tin + super().__init__(depth, **kwargs) + + def init_structure(self): + if self.is_tin: + self.make_temporal_interlace() + if len(self.non_local_cfg) != 0: + self.make_non_local() + + def _get_wrap_prefix(self): + return ['.net2'] + + def make_temporal_interlace(self): + """Make temporal interlace for some layers.""" + num_segment_list = [self.num_segments] * 4 + assert num_segment_list[-1] > 0 + + n_round = 1 + if len(list(self.layer3.children())) >= 23: + print(f'=> Using n_round {n_round} to insert temporal shift.') + + def make_block_interlace(stage, num_segments, shift_div): + """Apply Deformable shift for a ResNet layer module. + + Args: + stage (nn.module): A ResNet layer to be deformed. + num_segments (int): Number of frame segments. + shift_div (int): Number of division parts for shift. + + Returns: + nn.Sequential: A Sequential container consisted of + deformed Interlace blocks. 
+ """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + if i % n_round == 0: + tds = TemporalInterlace( + b.conv1.in_channels, + num_segments=num_segments, + shift_div=shift_div) + blocks[i].conv1.conv = CombineNet(tds, + blocks[i].conv1.conv) + return nn.Sequential(*blocks) + + self.layer1 = make_block_interlace(self.layer1, num_segment_list[0], + self.shift_div) + self.layer2 = make_block_interlace(self.layer2, num_segment_list[1], + self.shift_div) + self.layer3 = make_block_interlace(self.layer3, num_segment_list[2], + self.shift_div) + self.layer4 = make_block_interlace(self.layer4, num_segment_list[3], + self.shift_div) diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..5f87923e373a2542fc135306ca9faaf476f0ccf4 --- /dev/null +++ b/mmaction/models/backbones/resnet_tsm.py @@ -0,0 +1,375 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, NonLocal3d +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint +from torch.nn.modules.utils import _ntuple + +from mmaction.registry import MODELS +from .resnet import ResNet + + +class NL3DWrapper(nn.Module): + """3D Non-local wrapper for ResNet50. + + Wrap ResNet layers with 3D NonLocal modules. + + Args: + block (nn.Module): Residual blocks to be built. + num_segments (int): Number of frame segments. + non_local_cfg (dict): Config for non-local layers. Default: ``dict()``. + """ + + def __init__(self, block, num_segments, non_local_cfg=dict()): + super(NL3DWrapper, self).__init__() + self.block = block + self.non_local_cfg = non_local_cfg + self.non_local_block = NonLocal3d(self.block.conv3.norm.num_features, + **self.non_local_cfg) + self.num_segments = num_segments + + def forward(self, x): + """Defines the computation performed at every call.""" + x = self.block(x) + + n, c, h, w = x.size() + x = x.view(n // self.num_segments, self.num_segments, c, h, + w).transpose(1, 2).contiguous() + x = self.non_local_block(x) + x = x.transpose(1, 2).contiguous().view(n, c, h, w) + return x + + +class TemporalShift(nn.Module): + """Temporal shift module. + + This module is proposed in + `TSM: Temporal Shift Module for Efficient Video Understanding + `_ + + Args: + net (nn.module): Module to make temporal shift. + num_segments (int): Number of frame segments. Default: 3. + shift_div (int): Number of divisions for shift. Default: 8. + """ + + def __init__(self, net, num_segments=3, shift_div=8): + super().__init__() + self.net = net + self.num_segments = num_segments + self.shift_div = shift_div + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + x = self.shift(x, self.num_segments, shift_div=self.shift_div) + return self.net(x) + + @staticmethod + def shift(x, num_segments, shift_div=3): + """Perform temporal shift operation on the feature. + + Args: + x (torch.Tensor): The input feature to be shifted. + num_segments (int): Number of frame segments. + shift_div (int): Number of divisions for shift. Default: 3. + + Returns: + torch.Tensor: The shifted feature. 
+ """ + # [N, C, H, W] + n, c, h, w = x.size() + + # [N // num_segments, num_segments, C, H*W] + # can't use 5 dimensional array on PPL2D backend for caffe + x = x.view(-1, num_segments, c, h * w) + + # get shift fold + fold = c // shift_div + + # split c channel into three parts: + # left_split, mid_split, right_split + left_split = x[:, :, :fold, :] + mid_split = x[:, :, fold:2 * fold, :] + right_split = x[:, :, 2 * fold:, :] + + # can't use torch.zeros(*A.shape) or torch.zeros_like(A) + # because array on caffe inference must be got by computing + + # shift left on num_segments channel in `left_split` + zeros = left_split - left_split + blank = zeros[:, :1, :, :] + left_split = left_split[:, 1:, :, :] + left_split = torch.cat((left_split, blank), 1) + + # shift right on num_segments channel in `mid_split` + zeros = mid_split - mid_split + blank = zeros[:, :1, :, :] + mid_split = mid_split[:, :-1, :, :] + mid_split = torch.cat((blank, mid_split), 1) + + # right_split: no shift + + # concatenate + out = torch.cat((left_split, mid_split, right_split), 2) + + # [N, C, H, W] + # restore the original dimension + return out.view(n, c, h, w) + + +@MODELS.register_module() +class ResNetTSM(ResNet): + """ResNet backbone for TSM. + + Args: + num_segments (int): Number of frame segments. Defaults to 8. + is_shift (bool): Whether to make temporal shift in reset layers. + Defaults to True. + non_local (Sequence[int]): Determine whether to apply non-local module + in the corresponding block of each stages. + Defaults to (0, 0, 0, 0). + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + shift_div (int): Number of div for shift. Defaults to 8. + shift_place (str): Places in resnet layers for shift, which is chosen + from ['block', 'blockres']. + If set to 'block', it will apply temporal shift to all child blocks + in each resnet layer. + If set to 'blockres', it will apply temporal shift to each `conv1` + layer of all child blocks in each resnet layer. + Defaults to 'blockres'. + temporal_pool (bool): Whether to add temporal pooling. + Defaults to False. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + **kwargs (keyword arguments, optional): Arguments for ResNet. 
+ """ + + def __init__(self, + depth, + num_segments=8, + is_shift=True, + non_local=(0, 0, 0, 0), + non_local_cfg=dict(), + shift_div=8, + shift_place='blockres', + temporal_pool=False, + pretrained2d=True, + **kwargs): + super().__init__(depth, **kwargs) + self.num_segments = num_segments + self.is_shift = is_shift + self.shift_div = shift_div + self.shift_place = shift_place + self.temporal_pool = temporal_pool + self.non_local = non_local + self.non_local_stages = _ntuple(self.num_stages)(non_local) + self.non_local_cfg = non_local_cfg + self.pretrained2d = pretrained2d + self.init_structure() + + def init_structure(self): + """Initialize structure for tsm.""" + if self.is_shift: + self.make_temporal_shift() + if len(self.non_local_cfg) != 0: + self.make_non_local() + if self.temporal_pool: + self.make_temporal_pool() + + def make_temporal_shift(self): + """Make temporal shift for some layers.""" + if self.temporal_pool: + num_segment_list = [ + self.num_segments, self.num_segments // 2, + self.num_segments // 2, self.num_segments // 2 + ] + else: + num_segment_list = [self.num_segments] * 4 + if num_segment_list[-1] <= 0: + raise ValueError('num_segment_list[-1] must be positive') + + if self.shift_place == 'block': + + def make_block_temporal(stage, num_segments): + """Make temporal shift on some blocks. + + Args: + stage (nn.Module): Model layers to be shifted. + num_segments (int): Number of frame segments. + + Returns: + nn.Module: The shifted blocks. + """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + blocks[i] = TemporalShift( + b, num_segments=num_segments, shift_div=self.shift_div) + return nn.Sequential(*blocks) + + self.layer1 = make_block_temporal(self.layer1, num_segment_list[0]) + self.layer2 = make_block_temporal(self.layer2, num_segment_list[1]) + self.layer3 = make_block_temporal(self.layer3, num_segment_list[2]) + self.layer4 = make_block_temporal(self.layer4, num_segment_list[3]) + + elif 'blockres' in self.shift_place: + n_round = 1 + if len(list(self.layer3.children())) >= 23: + n_round = 2 + + def make_block_temporal(stage, num_segments): + """Make temporal shift on some blocks. + + Args: + stage (nn.Module): Model layers to be shifted. + num_segments (int): Number of frame segments. + + Returns: + nn.Module: The shifted blocks. + """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + if i % n_round == 0: + blocks[i].conv1.conv = TemporalShift( + b.conv1.conv, + num_segments=num_segments, + shift_div=self.shift_div) + return nn.Sequential(*blocks) + + self.layer1 = make_block_temporal(self.layer1, num_segment_list[0]) + self.layer2 = make_block_temporal(self.layer2, num_segment_list[1]) + self.layer3 = make_block_temporal(self.layer3, num_segment_list[2]) + self.layer4 = make_block_temporal(self.layer4, num_segment_list[3]) + + else: + raise NotImplementedError + + def make_temporal_pool(self): + """Make temporal pooling between layer1 and layer2, using a 3D max + pooling layer.""" + + class TemporalPool(nn.Module): + """Temporal pool module. + + Wrap layer2 in ResNet50 with a 3D max pooling layer. + + Args: + net (nn.Module): Module to make temporal pool. + num_segments (int): Number of frame segments. 
+ """ + + def __init__(self, net, num_segments): + super().__init__() + self.net = net + self.num_segments = num_segments + self.max_pool3d = nn.MaxPool3d( + kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0)) + + def forward(self, x): + """Defines the computation performed at every call.""" + # [N, C, H, W] + n, c, h, w = x.size() + # [N // num_segments, C, num_segments, H, W] + x = x.view(n // self.num_segments, self.num_segments, c, h, + w).transpose(1, 2) + # [N // num_segmnets, C, num_segments // 2, H, W] + x = self.max_pool3d(x) + # [N // 2, C, H, W] + x = x.transpose(1, 2).contiguous().view(n // 2, c, h, w) + return self.net(x) + + self.layer2 = TemporalPool(self.layer2, self.num_segments) + + def make_non_local(self): + """Wrap resnet layer into non local wrapper.""" + # This part is for ResNet50 + for i in range(self.num_stages): + non_local_stage = self.non_local_stages[i] + if sum(non_local_stage) == 0: + continue + + layer_name = f'layer{i + 1}' + res_layer = getattr(self, layer_name) + + for idx, non_local in enumerate(non_local_stage): + if non_local: + res_layer[idx] = NL3DWrapper(res_layer[idx], + self.num_segments, + self.non_local_cfg) + + def _get_wrap_prefix(self): + return ['.net', '.block'] + + def load_original_weights(self, logger): + """Load weights from original checkpoint, which required converting + keys.""" + state_dict_torchvision = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_torchvision: + state_dict_torchvision = state_dict_torchvision['state_dict'] + + wrapped_layers_map = dict() + for name, module in self.named_modules(): + # convert torchvision keys + ori_name = name + for wrap_prefix in self._get_wrap_prefix(): + if wrap_prefix in ori_name: + ori_name = ori_name.replace(wrap_prefix, '') + wrapped_layers_map[ori_name] = name + + if isinstance(module, ConvModule): + if 'downsample' in ori_name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + tv_conv_name = ori_name + '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + tv_bn_name = ori_name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + tv_conv_name = ori_name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + tv_bn_name = ori_name.replace('conv', 'bn') + + for conv_param in ['.weight', '.bias']: + if tv_conv_name + conv_param in state_dict_torchvision: + state_dict_torchvision[ori_name+'.conv'+conv_param] = \ + state_dict_torchvision.pop(tv_conv_name+conv_param) + + for bn_param in [ + '.weight', '.bias', '.running_mean', '.running_var' + ]: + if tv_bn_name + bn_param in state_dict_torchvision: + state_dict_torchvision[ori_name+'.bn'+bn_param] = \ + state_dict_torchvision.pop(tv_bn_name+bn_param) + + # convert wrapped keys + for param_name in list(state_dict_torchvision.keys()): + layer_name = '.'.join(param_name.split('.')[:-1]) + if layer_name in wrapped_layers_map: + wrapped_name = param_name.replace( + layer_name, wrapped_layers_map[layer_name]) + print(f'wrapped_name {wrapped_name}') + state_dict_torchvision[ + wrapped_name] = state_dict_torchvision.pop(param_name) + + msg = self.load_state_dict(state_dict_torchvision, strict=False) + logger.info(msg) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + self.load_original_weights(logger) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() diff --git 
a/mmaction/models/backbones/rgbposeconv3d.py b/mmaction/models/backbones/rgbposeconv3d.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4bdd9b89c746579784f4d5df4d7870da71a4bc --- /dev/null +++ b/mmaction/models/backbones/rgbposeconv3d.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner.checkpoint import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS +from .resnet3d_slowfast import ResNet3dPathway + + +@MODELS.register_module() +class RGBPoseConv3D(BaseModule): + """RGBPoseConv3D backbone. + + Args: + pretrained (str): The file path to a pretrained model. + Defaults to None. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + :math:`\\alpha` in the paper. Defaults to 4. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. + Defaults to 4. + rgb_detach (bool): Whether to detach the gradients from the pose path. + Defaults to False. + pose_detach (bool): Whether to detach the gradients from the rgb path. + Defaults to False. + rgb_drop_path (float): The drop rate for dropping the features from + the pose path. Defaults to 0. + pose_drop_path (float): The drop rate for dropping the features from + the rgb path. Defaults to 0. + rgb_pathway (dict): Configuration of rgb branch. Defaults to + ``dict(num_stages=4, lateral=True, lateral_infl=1, + lateral_activate=(0, 0, 1, 1), fusion_kernel=7, base_channels=64, + conv1_kernel=(1, 7, 7), inflate=(0, 0, 1, 1), with_pool2=False)``. + pose_pathway (dict): Configuration of pose branch. Defaults to + ``dict(num_stages=3, stage_blocks=(4, 6, 3), lateral=True, + lateral_inv=True, lateral_infl=16, lateral_activate=(0, 1, 1), + fusion_kernel=7, in_channels=17, base_channels=32, + out_indices=(2, ), conv1_kernel=(1, 7, 7), conv1_stride_s=1, + conv1_stride_t=1, pool1_stride_s=1, pool1_stride_t=1, + inflate=(0, 1, 1), spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), with_pool2=False)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
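+
+    Example:
+        An illustrative sketch with hypothetical input sizes (8 RGB frames
+        at 224x224 and 32 heatmap frames at 56x56, matching the default
+        ``speed_ratio`` of 4):
+
+        >>> import torch
+        >>> model = RGBPoseConv3D()
+        >>> imgs = torch.randn(1, 3, 8, 224, 224)
+        >>> heatmap_imgs = torch.randn(1, 17, 32, 56, 56)
+        >>> x_rgb, x_pose = model(imgs, heatmap_imgs)
+        >>> # x_rgb: [1, 2048, 8, 7, 7], x_pose: [1, 512, 32, 7, 7]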
+ """ + + def __init__(self, + pretrained: Optional[str] = None, + speed_ratio: int = 4, + channel_ratio: int = 4, + rgb_detach: bool = False, + pose_detach: bool = False, + rgb_drop_path: float = 0, + pose_drop_path: float = 0, + rgb_pathway: Dict = dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=(0, 0, 1, 1), + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway: Dict = dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + + if rgb_pathway['lateral']: + rgb_pathway['speed_ratio'] = speed_ratio + rgb_pathway['channel_ratio'] = channel_ratio + + if pose_pathway['lateral']: + pose_pathway['speed_ratio'] = speed_ratio + pose_pathway['channel_ratio'] = channel_ratio + + self.rgb_path = ResNet3dPathway(**rgb_pathway) + self.pose_path = ResNet3dPathway(**pose_pathway) + self.rgb_detach = rgb_detach + self.pose_detach = pose_detach + assert 0 <= rgb_drop_path <= 1 + assert 0 <= pose_drop_path <= 1 + self.rgb_drop_path = rgb_drop_path + self.pose_drop_path = pose_drop_path + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + msg = f'load model from: {self.pretrained}' + print_log(msg, logger=logger) + load_checkpoint(self, self.pretrained, strict=True, logger=logger) + elif self.pretrained is None: + # Init two branch separately. + self.rgb_path.init_weights() + self.pose_path.init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, imgs: torch.Tensor, heatmap_imgs: torch.Tensor) -> tuple: + """Defines the computation performed at every call. + + Args: + imgs (torch.Tensor): The input data. + heatmap_imgs (torch.Tensor): The input data. + + Returns: + tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. + """ + if self.training: + rgb_drop_path = torch.rand(1) < self.rgb_drop_path + pose_drop_path = torch.rand(1) < self.pose_drop_path + else: + rgb_drop_path, pose_drop_path = False, False + # We assume base_channel for RGB and Pose are 64 and 32. 
+ x_rgb = self.rgb_path.conv1(imgs) + x_rgb = self.rgb_path.maxpool(x_rgb) + # N x 64 x 8 x 56 x 56 + x_pose = self.pose_path.conv1(heatmap_imgs) + x_pose = self.pose_path.maxpool(x_pose) + + x_rgb = self.rgb_path.layer1(x_rgb) + x_rgb = self.rgb_path.layer2(x_rgb) + x_pose = self.pose_path.layer1(x_pose) + + if hasattr(self.rgb_path, 'layer2_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer2_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer1_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer1_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer2_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer1_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer3(x_rgb) + x_pose = self.pose_path.layer2(x_pose) + + if hasattr(self.rgb_path, 'layer3_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer3_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer2_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer2_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer3_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer2_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer4(x_rgb) + x_pose = self.pose_path.layer3(x_pose) + + return x_rgb, x_pose diff --git a/mmaction/models/backbones/stgcn.py b/mmaction/models/backbones/stgcn.py new file mode 100644 index 0000000000000000000000000000000000000000..9011a6262407305d4454c0b3517be11fdddce9f3 --- /dev/null +++ b/mmaction/models/backbones/stgcn.py @@ -0,0 +1,238 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.model import BaseModule, ModuleList + +from mmaction.registry import MODELS +from ..utils import Graph, mstcn, unit_gcn, unit_tcn + +EPS = 1e-4 + + +class STGCNBlock(BaseModule): + """The basic block of STGCN. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + A (torch.Tensor): The adjacency matrix defined in the graph + with shape of `(num_subsets, num_nodes, num_nodes)`. + stride (int): Stride of the temporal convolution. Defaults to 1. + residual (bool): Whether to use residual connection. Defaults to True. + init_cfg (dict or list[dict], optional): Config to control + the initialization. Defaults to None. 
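+
+    Example:
+        An illustrative sketch with a random adjacency matrix (a real one
+        is built by the ``Graph`` utility used in ``STGCN``):
+
+        >>> import torch
+        >>> A = torch.rand(3, 17, 17)  # (num_subsets, num_nodes, num_nodes)
+        >>> block = STGCNBlock(3, 64, A, residual=False)
+        >>> x = torch.randn(2, 3, 30, 17)  # (N * M, C, T, V)
+        >>> block(x).shape
+        torch.Size([2, 64, 30, 17])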
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + A: torch.Tensor, + stride: int = 1, + residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + + gcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'gcn_'} + tcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'tcn_'} + kwargs = { + k: v + for k, v in kwargs.items() if k[:4] not in ['gcn_', 'tcn_'] + } + assert len(kwargs) == 0, f'Invalid arguments: {kwargs}' + + tcn_type = tcn_kwargs.pop('type', 'unit_tcn') + assert tcn_type in ['unit_tcn', 'mstcn'] + gcn_type = gcn_kwargs.pop('type', 'unit_gcn') + assert gcn_type in ['unit_gcn'] + + self.gcn = unit_gcn(in_channels, out_channels, A, **gcn_kwargs) + + if tcn_type == 'unit_tcn': + self.tcn = unit_tcn( + out_channels, out_channels, 9, stride=stride, **tcn_kwargs) + elif tcn_type == 'mstcn': + self.tcn = mstcn( + out_channels, out_channels, stride=stride, **tcn_kwargs) + self.relu = nn.ReLU() + + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + res = self.residual(x) + x = self.tcn(self.gcn(x)) + res + return self.relu(x) + + +@MODELS.register_module() +class STGCN(BaseModule): + """STGCN backbone. + + Spatial Temporal Graph Convolutional + Networks for Skeleton-Based Action Recognition. + More details can be found in the `paper + `__ . + + Args: + graph_cfg (dict): Config for building the graph. + in_channels (int): Number of input channels. Defaults to 3. + base_channels (int): Number of base channels. Defaults to 64. + data_bn_type (str): Type of the data bn layer. Defaults to ``'VC'``. + ch_ratio (int): Inflation ratio of the number of channels. + Defaults to 2. + num_person (int): Maximum number of people. Only used when + data_bn_type == 'MVC'. Defaults to 2. + num_stages (int): Total number of stages. Defaults to 10. + inflate_stages (list[int]): Stages to inflate the number of channels. + Defaults to ``[5, 8]``. + down_stages (list[int]): Stages to perform downsampling in + the time dimension. Defaults to ``[5, 8]``. + stage_cfgs (dict): Extra config dict for each stage. + Defaults to ``dict()``. + init_cfg (dict or list[dict], optional): Config to control + the initialization. Defaults to None. + + Examples: + >>> import torch + >>> from mmaction.models import STGCN + >>> + >>> mode = 'stgcn_spatial' + >>> batch_size, num_person, num_frames = 2, 2, 150 + >>> + >>> # openpose-18 layout + >>> num_joints = 18 + >>> model = STGCN(graph_cfg=dict(layout='openpose', mode=mode)) + >>> model.init_weights() + >>> inputs = torch.randn(batch_size, num_person, + ... num_frames, num_joints, 3) + >>> output = model(inputs) + >>> print(output.shape) + >>> + >>> # nturgb+d layout + >>> num_joints = 25 + >>> model = STGCN(graph_cfg=dict(layout='nturgb+d', mode=mode)) + >>> model.init_weights() + >>> inputs = torch.randn(batch_size, num_person, + ... num_frames, num_joints, 3) + >>> output = model(inputs) + >>> print(output.shape) + >>> + >>> # coco layout + >>> num_joints = 17 + >>> model = STGCN(graph_cfg=dict(layout='coco', mode=mode)) + >>> model.init_weights() + >>> inputs = torch.randn(batch_size, num_person, + ... 
num_frames, num_joints, 3) + >>> output = model(inputs) + >>> print(output.shape) + >>> + >>> # custom settings + >>> # instantiate STGCN++ + >>> model = STGCN(graph_cfg=dict(layout='coco', mode='spatial'), + ... gcn_adaptive='init', gcn_with_res=True, + ... tcn_type='mstcn') + >>> model.init_weights() + >>> output = model(inputs) + >>> print(output.shape) + torch.Size([2, 2, 256, 38, 18]) + torch.Size([2, 2, 256, 38, 25]) + torch.Size([2, 2, 256, 38, 17]) + torch.Size([2, 2, 256, 38, 17]) + """ + + def __init__(self, + graph_cfg: Dict, + in_channels: int = 3, + base_channels: int = 64, + data_bn_type: str = 'VC', + ch_ratio: int = 2, + num_person: int = 2, + num_stages: int = 10, + inflate_stages: List[int] = [5, 8], + down_stages: List[int] = [5, 8], + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + + self.graph = Graph(**graph_cfg) + A = torch.tensor( + self.graph.A, dtype=torch.float32, requires_grad=False) + self.data_bn_type = data_bn_type + + if data_bn_type == 'MVC': + self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1)) + elif data_bn_type == 'VC': + self.data_bn = nn.BatchNorm1d(in_channels * A.size(1)) + else: + self.data_bn = nn.Identity() + + lw_kwargs = [cp.deepcopy(kwargs) for i in range(num_stages)] + for k, v in kwargs.items(): + if isinstance(v, (tuple, list)) and len(v) == num_stages: + for i in range(num_stages): + lw_kwargs[i][k] = v[i] + lw_kwargs[0].pop('tcn_dropout', None) + + self.in_channels = in_channels + self.base_channels = base_channels + self.ch_ratio = ch_ratio + self.inflate_stages = inflate_stages + self.down_stages = down_stages + + modules = [] + if self.in_channels != self.base_channels: + modules = [ + STGCNBlock( + in_channels, + base_channels, + A.clone(), + 1, + residual=False, + **lw_kwargs[0]) + ] + + inflate_times = 0 + for i in range(2, num_stages + 1): + stride = 1 + (i in down_stages) + in_channels = base_channels + if i in inflate_stages: + inflate_times += 1 + out_channels = int(self.base_channels * + self.ch_ratio**inflate_times + EPS) + base_channels = out_channels + modules.append( + STGCNBlock(in_channels, out_channels, A.clone(), stride, + **lw_kwargs[i - 1])) + + if self.in_channels == self.base_channels: + num_stages -= 1 + + self.num_stages = num_stages + self.gcn = ModuleList(modules) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous() + if self.data_bn_type == 'MVC': + x = self.data_bn(x.view(N, M * V * C, T)) + else: + x = self.data_bn(x.view(N * M, V * C, T)) + x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, + 2).contiguous().view(N * M, C, T, V) + + for i in range(self.num_stages): + x = self.gcn[i](x) + + x = x.reshape((N, M) + x.shape[1:]) + return x diff --git a/mmaction/models/backbones/swin.py b/mmaction/models/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..4bf57cbc41b5bc25c56dbde6b0ddde1b3d2702ed --- /dev/null +++ b/mmaction/models/backbones/swin.py @@ -0,0 +1,1022 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from functools import lru_cache, reduce +from operator import mul +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from einops import rearrange +from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner.checkpoint import _load_checkpoint + +from mmaction.registry import MODELS + + +def window_partition(x: torch.Tensor, + window_size: Sequence[int]) -> torch.Tensor: + """ + Args: + x (torch.Tensor): The input features of shape :math:`(B, D, H, W, C)`. + window_size (Sequence[int]): The window size, :math:`(w_d, w_h, w_w)`. + + Returns: + torch.Tensor: The partitioned windows of shape + :math:`(B*num_windows, w_d*w_h*w_w, C)`. + """ + B, D, H, W, C = x.shape + x = x.view(B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C) + windows = x.permute(0, 1, 3, 5, 2, 4, 6, + 7).contiguous().view(-1, reduce(mul, window_size), C) + return windows + + +def window_reverse(windows: torch.Tensor, window_size: Sequence[int], B: int, + D: int, H: int, W: int) -> torch.Tensor: + """ + Args: + windows (torch.Tensor): Input windows of shape + :meth:`(B*num_windows, w_d, w_h, w_w, C)`. + window_size (Sequence[int]): The window size, :meth:`(w_d, w_h, w_w)`. + B (int): Batch size of feature maps. + D (int): Temporal length of feature maps. + H (int): Height of feature maps. + W (int): Width of feature maps. + + Returns: + torch.Tensor: The feature maps reversed from windows of + shape :math:`(B, D, H, W, C)`. + """ + x = windows.view(B, D // window_size[0], H // window_size[1], + W // window_size[2], window_size[0], window_size[1], + window_size[2], -1) + x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1) + return x + + +def get_window_size( + x_size: Sequence[int], + window_size: Sequence[int], + shift_size: Optional[Sequence[int]] = None +) -> Union[Tuple[int], Tuple[Tuple[int]]]: + """Calculate window size and shift size according to the input size. + + Args: + x_size (Sequence[int]): The input size. + window_size (Sequence[int]): The expected window size. + shift_size (Sequence[int], optional): The expected shift size. + Defaults to None. + + Returns: + tuple: The calculated window size and shift size. + """ + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +# cache each stage results +@lru_cache() +def compute_mask(D: int, H: int, W: int, window_size: Sequence[int], + shift_size: Sequence[int], + device: Union[str, torch.device]) -> torch.Tensor: + """Compute attention mask. + + Args: + D (int): Temporal length of feature maps. + H (int): Height of feature maps. + W (int): Width of feature maps. + window_size (Sequence[int]): The window size. + shift_size (Sequence[int]): The shift size. + device (str or :obj:`torch.device`): The device of the mask. 
+ + Returns: + torch.Tensor: The attention mask used for shifted window attention. + """ + img_mask = torch.zeros((1, D, H, W, 1), device=device) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + return attn_mask + + +class WindowAttention3D(BaseModule): + """Window based multi-head self attention (W-MSA) module with relative + position bias. It supports both of shifted and non-shifted window. + + Args: + embed_dims (int): Number of input channels. + window_size (Sequence[int]): The temporal length, height and + width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, + key, value. Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop (float): Dropout ratio of attention weight. Defaults to 0.0. + proj_drop (float): Dropout ratio of output. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims: int, + window_size: Sequence[int], + num_heads: int, + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + attn_drop: float = 0., + proj_drop: float = 0., + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = embed_dims // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + # # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * + (2 * window_size[2] - 1), num_heads)) + + # get pair-wise relative position index for + # each token inside the window + coords_d = torch.arange(self.window_size[0]) + coords_h = torch.arange(self.window_size[1]) + coords_w = torch.arange(self.window_size[2]) + coords = torch.stack(torch.meshgrid( + coords_d, + coords_h, + coords_w, + )) # 3, Wd, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww + # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = \ + coords_flatten[:, :, None] - coords_flatten[:, None, :] + # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + # shift to start from 0 + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * \ + (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + 
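+        # The single qkv projection above is split into per-head queries,
+        # keys and values in ``forward``; the two dropouts below act on the
+        # attention map and on the projected output respectively.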
self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + """Forward function. + + Args: + x (torch.Tensor): Input feature maps of shape + :meth:`(B*num_windows, N, C)`. + mask (torch.Tensor, optional): (0/-inf) mask of shape + :meth:`(num_windows, N, N)`. Defaults to None. + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape(-1)].reshape( + N, N, -1) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Mlp(BaseModule): + """Multilayer perceptron. + + Args: + in_features (int): Number of input features. + hidden_features (int, optional): Number of hidden features. + Defaults to None. + out_features (int, optional): Number of output features. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_cfg: Dict = dict(type='GELU'), + drop: float = 0., + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class SwinTransformerBlock3D(BaseModule): + """Swin Transformer Block. + + Args: + embed_dims (int): Number of feature channels. + num_heads (int): Number of attention heads. + window_size (Sequence[int]): Window size. Defaults to ``(8, 7, 7)``. + shift_size (Sequence[int]): Shift size for SW-MSA or W-MSA. + Defaults to ``(0, 0, 0)``. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Defaults to 4.0. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_path (float): Stochastic depth rate. Defaults to 0.1. 
+ act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + norm_cfg (dict): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Defaults to False. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims: int, + num_heads: int, + window_size: Sequence[int] = (8, 7, 7), + shift_size: Sequence[int] = (0, 0, 0), + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop: float = 0., + attn_drop: float = 0., + drop_path: float = 0.1, + act_cfg: Dict = dict(type='GELU'), + norm_cfg: Dict = dict(type='LN'), + with_cp: bool = False, + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.with_cp = with_cp + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], 'shift_size[0] must in [0, window_size[0])' + assert 0 <= self.shift_size[1] < self.window_size[ + 1], 'shift_size[1] must in [0, window_size[0])' + assert 0 <= self.shift_size[2] < self.window_size[ + 2], 'shift_size[2] must in [0, window_size[0])' + + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + + _attn_cfg = { + 'embed_dims': embed_dims, + 'window_size': window_size, + 'num_heads': num_heads, + 'qkv_bias': qkv_bias, + 'qk_scale': qk_scale, + 'attn_drop': attn_drop, + 'proj_drop': drop + } + self.attn = WindowAttention3D(**_attn_cfg) + + self.drop_path = DropPath(drop_path) \ + if drop_path > 0. else nn.Identity() + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + _mlp_cfg = { + 'in_features': embed_dims, + 'hidden_features': int(embed_dims * mlp_ratio), + 'act_cfg': act_cfg, + 'drop': drop + } + self.mlp = Mlp(**_mlp_cfg) + + def forward_part1(self, x: torch.Tensor, + mask_matrix: torch.Tensor) -> torch.Tensor: + """Forward function part1.""" + B, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1)) + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = torch.roll( + x, + shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), + dims=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.view(-1, *(window_size + (C, ))) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = torch.roll( + shifted_x, + shifts=(shift_size[0], shift_size[1], shift_size[2]), + dims=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :].contiguous() + return x + + def forward_part2(self, x: torch.Tensor) -> torch.Tensor: 
+ """Forward function part2.""" + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x: torch.Tensor, + mask_matrix: torch.Tensor) -> torch.Tensor: + """ + Args: + x (torch.Tensor): Input features of shape :math:`(B, D, H, W, C)`. + mask_matrix (torch.Tensor): Attention mask for cyclic shift. + """ + + shortcut = x + if self.with_cp: + x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix) + else: + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + + if self.with_cp: + x = x + checkpoint.checkpoint(self.forward_part2, x) + else: + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(BaseModule): + """Patch Merging Layer. + + Args: + embed_dims (int): Number of input channels. + norm_cfg (dict): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims: int, + norm_cfg: Dict = dict(type='LN'), + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.mid_embed_dims = 4 * embed_dims + self.out_embed_dims = 2 * embed_dims + self.reduction = nn.Linear( + self.mid_embed_dims, self.out_embed_dims, bias=False) + self.norm = build_norm_layer(norm_cfg, self.mid_embed_dims)[1] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Perform patch merging. + + Args: + x (torch.Tensor): Input feature maps of shape + :math:`(B, D, H, W, C)`. + + Returns: + torch.Tensor: The merged feature maps of shape + :math:`(B, D, H/2, W/2, 2*C)`. + """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(BaseModule): + """A basic Swin Transformer layer for one stage. + + Args: + embed_dims (int): Number of feature channels. + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (Sequence[int]): Local window size. + Defaults to ``(8, 7, 7)``. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_paths (float or Sequence[float]): Stochastic depth rates. + Defaults to 0.0. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + norm_cfg (dict, optional): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + downsample (:class:`PatchMerging`, optional): Downsample layer + at the end of the layer. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will + save some memory while slowing down the training speed. + Defaults to False. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims: int, + depth: int, + num_heads: int, + window_size: Sequence[int] = (8, 7, 7), + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop: float = 0., + attn_drop: float = 0., + drop_paths: Union[float, Sequence[float]] = 0., + act_cfg: Dict = dict(type='GELU'), + norm_cfg: Dict = dict(type='LN'), + downsample: Optional[PatchMerging] = None, + with_cp: bool = False, + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.with_cp = with_cp + + if not isinstance(drop_paths, Sequence): + drop_paths = [drop_paths] * depth + + # build blocks + self.blocks = ModuleList() + for i in range(depth): + _block_cfg = { + 'embed_dims': embed_dims, + 'num_heads': num_heads, + 'window_size': window_size, + 'shift_size': (0, 0, 0) if (i % 2 == 0) else self.shift_size, + 'mlp_ratio': mlp_ratio, + 'qkv_bias': qkv_bias, + 'qk_scale': qk_scale, + 'drop': drop, + 'attn_drop': attn_drop, + 'drop_path': drop_paths[i], + 'act_cfg': act_cfg, + 'norm_cfg': norm_cfg, + 'with_cp': with_cp + } + + block = SwinTransformerBlock3D(**_block_cfg) + self.blocks.append(block) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample( + embed_dims=embed_dims, norm_cfg=norm_cfg) + + def forward(self, + x: torch.Tensor, + do_downsample: bool = True) -> torch.Tensor: + """Forward function. + + Args: + x (torch.Tensor): Input feature maps of shape + :math:`(B, C, D, H, W)`. + do_downsample (bool): Whether to downsample the output of + the current layer. Defaults to True. + """ + # calculate attention mask for SW-MSA + B, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + x = rearrange(x, 'b c d h w -> b d h w c') + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device) + for blk in self.blocks: + x = blk(x, attn_mask) + + if self.downsample is not None and do_downsample: + x = self.downsample(x) + return x + + @property + def out_embed_dims(self): + if self.downsample is not None: + return self.downsample.out_embed_dims + else: + return self.embed_dims + + +class PatchEmbed3D(BaseModule): + """Video to Patch Embedding. + + Args: + patch_size (Sequence[int] or int]): Patch token size. + Defaults to ``(2, 4, 4)``. + in_channels (int): Number of input video channels. Defaults to 3. + embed_dims (int): Dimensions of embedding. Defaults to 96. + conv_cfg: (dict): Config dict for convolution layer. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict, optional): Config dict for norm layer. + Defaults to None. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + patch_size: Union[Sequence[int], int] = (2, 4, 4), + in_channels: int = 3, + embed_dims: int = 96, + norm_cfg: Optional[Dict] = None, + conv_cfg: Dict = dict(type='Conv3d'), + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.patch_size = patch_size + self.in_channels = in_channels + self.embed_dims = embed_dims + + self.proj = build_conv_layer( + conv_cfg, + in_channels, + embed_dims, + kernel_size=patch_size, + stride=patch_size) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Perform video to patch embedding. + + Args: + x (torch.Tensor): The input videos of shape + :math:`(B, C, D, H, W)`. In most cases, C is 3. + + Returns: + torch.Tensor: The video patches of shape + :math:`(B, embed_dims, Dp, Hp, Wp)`. + """ + + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, + (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, 0, 0, + self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # B C Dp Wp Wp + if self.norm is not None: + Dp, Hp, Wp = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) # B Dp*Hp*Wp C + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dims, Dp, Hp, Wp) + + return x + + +@MODELS.register_module() +class SwinTransformer3D(BaseModule): + """Video Swin Transformer backbone. + + A pytorch implement of: `Video Swin Transformer + `_ + + Args: + arch (str or dict): Video Swin Transformer architecture. If use string, + choose from 'tiny', 'small', 'base' and 'large'. If use dict, it + should have below keys: + - **embed_dims** (int): The dimensions of embedding. + - **depths** (Sequence[int]): The number of blocks in each stage. + - **num_heads** (Sequence[int]): The number of heads in attention + modules of each stage. + pretrained (str, optional): Name of pretrained model. + Defaults to None. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + patch_size (int or Sequence(int)): Patch size. + Defaults to ``(2, 4, 4)``. + in_channels (int): Number of input image channels. Defaults to 3. + window_size (Sequence[int]): Window size. Defaults to ``(8, 7, 7)``. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout rate. Defaults to 0.0. + attn_drop_rate (float): Attention dropout rate. Defaults to 0.0. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.1. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + norm_cfg (dict): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + patch_norm (bool): If True, add normalization after patch embedding. + Defaults to True. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Defaults to -1. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Defaults to False. + out_indices (Sequence[int]): Indices of output feature. + Defaults to ``(3, )``. 
+ out_after_downsample (bool): Whether to output the feature map of a + stage after the following downsample layer. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. + """ + arch_zoo = { + **dict.fromkeys(['t', 'tiny'], + {'embed_dims': 96, + 'depths': [2, 2, 6, 2], + 'num_heads': [3, 6, 12, 24]}), + **dict.fromkeys(['s', 'small'], + {'embed_dims': 96, + 'depths': [2, 2, 18, 2], + 'num_heads': [3, 6, 12, 24]}), + **dict.fromkeys(['b', 'base'], + {'embed_dims': 128, + 'depths': [2, 2, 18, 2], + 'num_heads': [4, 8, 16, 32]}), + **dict.fromkeys(['l', 'large'], + {'embed_dims': 192, + 'depths': [2, 2, 18, 2], + 'num_heads': [6, 12, 24, 48]}), + } # yapf: disable + + def __init__( + self, + arch: Union[str, Dict], + pretrained: Optional[str] = None, + pretrained2d: bool = True, + patch_size: Union[int, Sequence[int]] = (2, 4, 4), + in_channels: int = 3, + window_size: Sequence[int] = (8, 7, 7), + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0.1, + act_cfg: Dict = dict(type='GELU'), + norm_cfg: Dict = dict(type='LN'), + patch_norm: bool = True, + frozen_stages: int = -1, + with_cp: bool = False, + out_indices: Sequence[int] = (3, ), + out_after_downsample: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = {'embed_dims', 'depths', 'num_heads'} + assert isinstance(arch, dict) and set(arch) == essential_keys, \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.depths = self.arch_settings['depths'] + self.num_heads = self.arch_settings['num_heads'] + assert len(self.depths) == len(self.num_heads) + self.num_layers = len(self.depths) + assert 1 <= self.num_layers <= 4 + self.out_indices = out_indices + assert max(out_indices) < self.num_layers + self.out_after_downsample = out_after_downsample + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + _patch_cfg = { + 'patch_size': patch_size, + 'in_channels': in_channels, + 'embed_dims': self.embed_dims, + 'norm_cfg': norm_cfg if patch_norm else None, + 'conv_cfg': dict(type='Conv3d') + } + self.patch_embed = PatchEmbed3D(**_patch_cfg) + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + total_depth = sum(self.depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] # stochastic depth decay rule + + # build layers + self.layers = ModuleList() + embed_dims = [self.embed_dims] + for i, (depth, num_heads) in \ + enumerate(zip(self.depths, self.num_heads)): + downsample = PatchMerging if i < self.num_layers - 1 else None + _layer_cfg = { + 'embed_dims': embed_dims[-1], + 'depth': depth, + 'num_heads': num_heads, + 'window_size': window_size, + 'mlp_ratio': mlp_ratio, + 'qkv_bias': qkv_bias, + 
'qk_scale': qk_scale, + 'drop': drop_rate, + 'attn_drop': attn_drop_rate, + 'drop_paths': dpr[:depth], + 'act_cfg': act_cfg, + 'norm_cfg': norm_cfg, + 'downsample': downsample, + 'with_cp': with_cp + } + + layer = BasicLayer(**_layer_cfg) + self.layers.append(layer) + + dpr = dpr[depth:] + embed_dims.append(layer.out_embed_dims) + + if self.out_after_downsample: + self.num_features = embed_dims[1:] + else: + self.num_features = embed_dims[:-1] + + for i in out_indices: + if norm_cfg is not None: + norm_layer = build_norm_layer(norm_cfg, + self.num_features[i])[1] + else: + norm_layer = nn.Identity() + + self.add_module(f'norm{i}', norm_layer) + + self._freeze_stages() + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate the swin2d parameters to swin3d. + + The differences between swin3d and swin2d mainly lie in an extra + axis. To utilize the pretrained parameters in 2d model, the weight + of swin2d models should be inflated to fit in the shapes of the + 3d counterpart. + + Args: + logger (MMLogger): The logger used to print debugging information. + """ + checkpoint = _load_checkpoint(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [ + k for k in state_dict.keys() if 'relative_position_index' in k + ] + for k in relative_position_index_keys: + del state_dict[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k] + for k in attn_mask_keys: + del state_dict[k] + state_dict['patch_embed.proj.weight'] = \ + state_dict['patch_embed.proj.weight'].unsqueeze(2).\ + repeat(1, 1, self.patch_size[0], 1, 1) / self.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if 'relative_position_bias_table' in k + ] + for k in relative_position_bias_table_keys: + relative_position_bias_table_pretrained = state_dict[k] + relative_position_bias_table_current = self.state_dict()[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1) + wd = self.window_size[0] + if nH1 != nH2: + logger.warning(f'Error in loading {k}, passing') + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = \ + torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute( + 1, 0).view(1, nH1, S1, S1), + size=(2 * self.window_size[1] - 1, + 2 * self.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = \ + relative_position_bias_table_pretrained_resized. \ + view(nH2, L2).permute(1, 0) + state_dict[k] = relative_position_bias_table_pretrained.repeat( + 2 * wd - 1, 1) + + # In the original swin2d checkpoint, the last layer of the + # backbone is the norm layer, and the original attribute + # name is `norm`. 
We changed it to `norm3` which means it + # is the last norm layer of stage 4. + if hasattr(self, 'norm3'): + state_dict['norm3.weight'] = state_dict['norm.weight'] + state_dict['norm3.bias'] = state_dict['norm.bias'] + del state_dict['norm.weight'] + del state_dict['norm.bias'] + + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + + def init_weights(self) -> None: + """Initialize the weights in backbone.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + # Inflate 2D model into 3D model. + self.inflate_weights(logger) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + + def forward(self, x: torch.Tensor) -> \ + Union[Tuple[torch.Tensor], torch.Tensor]: + """Forward function for Swin3d Transformer.""" + x = self.patch_embed(x) + + x = self.pos_drop(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x.contiguous(), do_downsample=self.out_after_downsample) + + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(x) + out = rearrange(out, 'b d h w c -> b c d h w').contiguous() + outs.append(out) + + if layer.downsample is not None and not self.out_after_downsample: + x = layer.downsample(x) + + if i < self.num_layers - 1: + x = rearrange(x, 'b d h w c -> b c d h w') + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode: bool = True) -> None: + """Convert the model into training mode while keep layers frozen.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/mmaction/models/backbones/tanet.py b/mmaction/models/backbones/tanet.py new file mode 100644 index 0000000000000000000000000000000000000000..95575f4fbcbb50f77f1868a42ef0c336cc70b722 --- /dev/null +++ b/mmaction/models/backbones/tanet.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from typing import Optional + +import torch +import torch.nn as nn +from torch.utils import checkpoint as cp + +from mmaction.registry import MODELS +from ..common import TAM +from .resnet import Bottleneck, ResNet + + +class TABlock(nn.Module): + """Temporal Adaptive Block (TA-Block) for TANet. + + This block is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO + RECOGNITION `_ + + The temporal adaptive module (TAM) is embedded into ResNet-Block + after the first Conv2D, which turns the vanilla ResNet-Block + into TA-Block. + + Args: + block (nn.Module): Residual blocks to be substituted. + num_segments (int): Number of frame segments. + tam_cfg (dict): Config for temporal adaptive module (TAM). 
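+
+    Examples:
+        An illustrative sketch of wrapping a single ResNet ``Bottleneck``
+        (the input of shape ``(num_segments, C, H, W)`` below stands for
+        one clip of 8 frame segments):
+
+        >>> import torch
+        >>> from mmaction.models.backbones.resnet import Bottleneck
+        >>> block = Bottleneck(64, 16)
+        >>> ta_block = TABlock(block, num_segments=8, tam_cfg=dict())
+        >>> x = torch.rand(8, 64, 32, 32)
+        >>> ta_block(x).shape
+        torch.Size([8, 64, 32, 32])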
+ """ + + def __init__(self, block: nn.Module, num_segments: int, + tam_cfg: dict) -> None: + super().__init__() + self.tam_cfg = deepcopy(tam_cfg) + self.block = block + self.num_segments = num_segments + self.tam = TAM( + in_channels=block.conv1.out_channels, + num_segments=num_segments, + **self.tam_cfg) + + if not isinstance(self.block, Bottleneck): + raise NotImplementedError('TA-Blocks have not been fully ' + 'implemented except the pattern based ' + 'on Bottleneck block.') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + assert isinstance(self.block, Bottleneck) + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.block.conv1(x) + out = self.tam(out) + out = self.block.conv2(out) + out = self.block.conv3(out) + + if self.block.downsample is not None: + identity = self.block.downsample(x) + + out = out + identity + + return out + + if self.block.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.block.relu(out) + + return out + + +@MODELS.register_module() +class TANet(ResNet): + """Temporal Adaptive Network (TANet) backbone. + + This backbone is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO + RECOGNITION `_ + + Embedding the temporal adaptive module (TAM) into ResNet to + instantiate TANet. + + Args: + depth (int): Depth of resnet, from ``{18, 34, 50, 101, 152}``. + num_segments (int): Number of frame segments. + tam_cfg (dict, optional): Config for temporal adaptive module (TAM). + Defaults to None. + """ + + def __init__(self, + depth: int, + num_segments: int, + tam_cfg: Optional[dict] = None, + **kwargs) -> None: + super().__init__(depth, **kwargs) + assert num_segments >= 3 + self.num_segments = num_segments + tam_cfg = dict() if tam_cfg is None else tam_cfg + self.tam_cfg = deepcopy(tam_cfg) + super().init_weights() + self.make_tam_modeling() + + def init_weights(self): + """Initialize weights.""" + pass + + def make_tam_modeling(self): + """Replace ResNet-Block with TA-Block.""" + + def make_tam_block(stage, num_segments, tam_cfg=dict()): + blocks = list(stage.children()) + for i, block in enumerate(blocks): + blocks[i] = TABlock(block, num_segments, deepcopy(tam_cfg)) + return nn.Sequential(*blocks) + + for i in range(self.num_stages): + layer_name = f'layer{i + 1}' + res_layer = getattr(self, layer_name) + setattr(self, layer_name, + make_tam_block(res_layer, self.num_segments, self.tam_cfg)) diff --git a/mmaction/models/backbones/timesformer.py b/mmaction/models/backbones/timesformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a051282fa65049655146064413c69d3d5b98c7b1 --- /dev/null +++ b/mmaction/models/backbones/timesformer.py @@ -0,0 +1,294 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmengine import ConfigDict +from mmengine.logging import MMLogger +from mmengine.model.weight_init import kaiming_init, trunc_normal_ +from mmengine.runner.checkpoint import _load_checkpoint, load_state_dict +from torch.nn.modules.utils import _pair + +from mmaction.registry import MODELS + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding. + + Args: + img_size (int | tuple): Size of input image. + patch_size (int): Size of one patch. 
+ in_channels (int): Channel num of input features. Defaults to 3. + embed_dims (int): Dimensions of embedding. Defaults to 768. + conv_cfg (dict | None): Config dict for convolution layer. Defaults to + `dict(type='Conv2d')`. + """ + + def __init__(self, + img_size, + patch_size, + in_channels=3, + embed_dims=768, + conv_cfg=dict(type='Conv2d')): + super().__init__() + self.img_size = _pair(img_size) + self.patch_size = _pair(patch_size) + + num_patches = (self.img_size[1] // self.patch_size[1]) * ( + self.img_size[0] // self.patch_size[0]) + assert num_patches * self.patch_size[0] * self.patch_size[1] == \ + self.img_size[0] * self.img_size[1], \ + 'The image size H*W must be divisible by patch size' + self.num_patches = num_patches + + # Use conv layer to embed + self.projection = build_conv_layer( + conv_cfg, + in_channels, + embed_dims, + kernel_size=patch_size, + stride=patch_size) + + self.init_weights() + + def init_weights(self): + """Initialize weights.""" + # Lecun norm from ClassyVision + kaiming_init(self.projection, mode='fan_in', nonlinearity='linear') + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The output of the module. + """ + x = rearrange(x, 'b c t h w -> (b t) c h w') + x = self.projection(x).flatten(2).transpose(1, 2) + return x + + +@MODELS.register_module() +class TimeSformer(nn.Module): + """TimeSformer. A PyTorch impl of `Is Space-Time Attention All You Need for + Video Understanding? `_ + + Args: + num_frames (int): Number of frames in the video. + img_size (int | tuple): Size of input image. + patch_size (int): Size of one patch. + pretrained (str | None): Name of pretrained model. Default: None. + embed_dims (int): Dimensions of embedding. Defaults to 768. + num_heads (int): Number of parallel attention heads in + TransformerCoder. Defaults to 12. + num_transformer_layers (int): Number of transformer layers. Defaults to + 12. + in_channels (int): Channel num of input features. Defaults to 3. + dropout_ratio (float): Probability of dropout layer. Defaults to 0.. + transformer_layers (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict` | None): Config of transformerlayer in + TransformerCoder. If it is obj:`mmcv.ConfigDict`, it would be + repeated `num_transformer_layers` times to a + list[obj:`mmcv.ConfigDict`]. Defaults to None. + attention_type (str): Type of attentions in TransformerCoder. Choices + are 'divided_space_time', 'space_only' and 'joint_space_time'. + Defaults to 'divided_space_time'. + norm_cfg (dict): Config for norm layers. Defaults to + `dict(type='LN', eps=1e-6)`. 
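+
+    Examples:
+        An illustrative sketch with the default divided space-time
+        attention (8 frames of 224x224 RGB input; the backbone returns
+        the class token):
+
+        >>> import torch
+        >>> model = TimeSformer(num_frames=8, img_size=224, patch_size=16)
+        >>> videos = torch.rand(1, 3, 8, 224, 224)  # (B, C, T, H, W)
+        >>> model(videos).shape
+        torch.Size([1, 768])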
+ """ + supported_attention_types = [ + 'divided_space_time', 'space_only', 'joint_space_time' + ] + + def __init__(self, + num_frames, + img_size, + patch_size, + pretrained=None, + embed_dims=768, + num_heads=12, + num_transformer_layers=12, + in_channels=3, + dropout_ratio=0., + transformer_layers=None, + attention_type='divided_space_time', + norm_cfg=dict(type='LN', eps=1e-6), + **kwargs): + super().__init__(**kwargs) + assert attention_type in self.supported_attention_types, ( + f'Unsupported Attention Type {attention_type}!') + assert transformer_layers is None or isinstance( + transformer_layers, (dict, list)) + + self.num_frames = num_frames + self.pretrained = pretrained + self.embed_dims = embed_dims + self.num_transformer_layers = num_transformer_layers + self.attention_type = attention_type + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dims=embed_dims) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + 1, embed_dims)) + self.drop_after_pos = nn.Dropout(p=dropout_ratio) + if self.attention_type != 'space_only': + self.time_embed = nn.Parameter( + torch.zeros(1, num_frames, embed_dims)) + self.drop_after_time = nn.Dropout(p=dropout_ratio) + + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + if transformer_layers is None: + # stochastic depth decay rule + dpr = np.linspace(0, 0.1, num_transformer_layers) + + if self.attention_type == 'divided_space_time': + _transformerlayers_cfg = [ + dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='DividedTemporalAttentionWithNorm', + embed_dims=embed_dims, + num_heads=num_heads, + num_frames=num_frames, + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i]), + norm_cfg=dict(type='LN', eps=1e-6)), + dict( + type='DividedSpatialAttentionWithNorm', + embed_dims=embed_dims, + num_heads=num_heads, + num_frames=num_frames, + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i]), + norm_cfg=dict(type='LN', eps=1e-6)) + ], + ffn_cfgs=dict( + type='FFNWithNorm', + embed_dims=embed_dims, + feedforward_channels=embed_dims * 4, + num_fcs=2, + act_cfg=dict(type='GELU'), + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i]), + norm_cfg=dict(type='LN', eps=1e-6)), + operation_order=('self_attn', 'self_attn', 'ffn')) + for i in range(num_transformer_layers) + ] + else: + # Sapce Only & Joint Space Time + _transformerlayers_cfg = [ + dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=embed_dims, + num_heads=num_heads, + batch_first=True, + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i])) + ], + ffn_cfgs=dict( + type='FFN', + embed_dims=embed_dims, + feedforward_channels=embed_dims * 4, + num_fcs=2, + act_cfg=dict(type='GELU'), + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i])), + operation_order=('norm', 'self_attn', 'norm', 'ffn'), + norm_cfg=dict(type='LN', eps=1e-6), + batch_first=True) + for i in range(num_transformer_layers) + ] + + transformer_layers = ConfigDict( + dict( + type='TransformerLayerSequence', + transformerlayers=_transformerlayers_cfg, + num_layers=num_transformer_layers)) + + self.transformer_layers = build_transformer_layer_sequence( + transformer_layers) + + def init_weights(self, pretrained=None): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + trunc_normal_(self.pos_embed, std=.02) + 
trunc_normal_(self.cls_token, std=.02) + + if pretrained: + self.pretrained = pretrained + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + state_dict = _load_checkpoint(self.pretrained, map_location='cpu') + if 'state_dict' in state_dict: + state_dict = state_dict['state_dict'] + + if self.attention_type == 'divided_space_time': + # modify the key names of norm layers + old_state_dict_keys = list(state_dict.keys()) + for old_key in old_state_dict_keys: + if 'norms' in old_key: + new_key = old_key.replace('norms.0', + 'attentions.0.norm') + new_key = new_key.replace('norms.1', 'ffns.0.norm') + state_dict[new_key] = state_dict.pop(old_key) + + # copy the parameters of space attention to time attention + old_state_dict_keys = list(state_dict.keys()) + for old_key in old_state_dict_keys: + if 'attentions.0' in old_key: + new_key = old_key.replace('attentions.0', + 'attentions.1') + state_dict[new_key] = state_dict[old_key].clone() + + load_state_dict(self, state_dict, strict=False, logger=logger) + + def forward(self, x): + """Defines the computation performed at every call.""" + # x [batch_size * num_frames, num_patches, embed_dims] + batches = x.shape[0] + x = self.patch_embed(x) + + # x [batch_size * num_frames, num_patches + 1, embed_dims] + cls_tokens = self.cls_token.expand(x.size(0), -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.drop_after_pos(x) + + # Add Time Embedding + if self.attention_type != 'space_only': + # x [batch_size, num_patches * num_frames + 1, embed_dims] + cls_tokens = x[:batches, 0, :].unsqueeze(1) + x = rearrange(x[:, 1:, :], '(b t) p m -> (b p) t m', b=batches) + x = x + self.time_embed + x = self.drop_after_time(x) + x = rearrange(x, '(b p) t m -> b (p t) m', b=batches) + x = torch.cat((cls_tokens, x), dim=1) + + x = self.transformer_layers(x, None, None) + + if self.attention_type == 'space_only': + # x [batch_size, num_patches + 1, embed_dims] + x = x.view(-1, self.num_frames, *x.size()[-2:]) + x = torch.mean(x, 1) + + x = self.norm(x) + + # Return Class Token + return x[:, 0] diff --git a/mmaction/models/backbones/uniformer.py b/mmaction/models/backbones/uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..78d0cd80c4f072aa3e958161e91f148736a971f5 --- /dev/null +++ b/mmaction/models/backbones/uniformer.py @@ -0,0 +1,669 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.runner.checkpoint import _load_checkpoint +from mmengine.utils import to_2tuple + +from mmaction.registry import MODELS + +logger = MMLogger.get_current_instance() + +MODEL_PATH = 'https://download.openmmlab.com/mmaction/v1.0/recognition' +_MODELS = { + 'uniformer_small_in1k': + os.path.join(MODEL_PATH, + 'uniformerv1/uniformer_small_in1k_20221219-fe0a7ae0.pth'), + 'uniformer_base_in1k': + os.path.join(MODEL_PATH, + 'uniformerv1/uniformer_base_in1k_20221219-82c01015.pth'), +} + + +def conv_3xnxn(inp: int, + oup: int, + kernel_size: int = 3, + stride: int = 3, + groups: int = 1): + """3D convolution with kernel size of 3xnxn. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + kernel_size (int): The spatial kernel size (i.e., n). + Defaults to 3. 
+ stride (int): The spatial stride. + Defaults to 3. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d( + inp, + oup, (3, kernel_size, kernel_size), (2, stride, stride), (1, 0, 0), + groups=groups) + + +def conv_1xnxn(inp: int, + oup: int, + kernel_size: int = 3, + stride: int = 3, + groups: int = 1): + """3D convolution with kernel size of 1xnxn. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + kernel_size (int): The spatial kernel size (i.e., n). + Defaults to 3. + stride (int): The spatial stride. + Defaults to 3. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d( + inp, + oup, (1, kernel_size, kernel_size), (1, stride, stride), (0, 0, 0), + groups=groups) + + +def conv_1x1x1(inp: int, oup: int, groups: int = 1): + """3D convolution with kernel size of 1x1x1. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d(inp, oup, (1, 1, 1), (1, 1, 1), (0, 0, 0), groups=groups) + + +def conv_3x3x3(inp: int, oup: int, groups: int = 1): + """3D convolution with kernel size of 3x3x3. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d(inp, oup, (3, 3, 3), (1, 1, 1), (1, 1, 1), groups=groups) + + +def conv_5x5x5(inp: int, oup: int, groups: int = 1): + """3D convolution with kernel size of 5x5x5. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d(inp, oup, (5, 5, 5), (1, 1, 1), (2, 2, 2), groups=groups) + + +def bn_3d(dim): + """3D batch normalization. + + Args: + dim (int): Dimension of input features. + """ + return nn.BatchNorm3d(dim) + + +class Mlp(BaseModule): + """Multilayer perceptron. + + Args: + in_features (int): Number of input features. + hidden_features (int): Number of hidden features. + Defaults to None. + out_features (int): Number of output features. + Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + in_features: int, + hidden_features: int = None, + out_features: int = None, + drop: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(BaseModule): + """Self-Attention. + + Args: + dim (int): Number of input features. + num_heads (int): Number of attention heads. + Defaults to 8. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop (float): Attention dropout rate. + Defaults to 0.0. + proj_drop (float): Dropout rate. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. 
+ Defaults to None. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + qk_scale: float = None, + attn_drop: float = 0., + proj_drop: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, + # can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CMlp(BaseModule): + """Multilayer perceptron via convolution. + + Args: + in_features (int): Number of input features. + hidden_features (int): Number of hidden features. + Defaults to None. + out_features (int): Number of output features. + Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + drop=0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = conv_1x1x1(in_features, hidden_features) + self.act = nn.GELU() + self.fc2 = conv_1x1x1(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class CBlock(BaseModule): + """Convolution Block. + + Args: + dim (int): Number of input features. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + drop (float): Dropout rate. + Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + dim: int, + mlp_ratio: float = 4., + drop: float = 0., + drop_path: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pos_embed = conv_3x3x3(dim, dim, groups=dim) + self.norm1 = bn_3d(dim) + self.conv1 = conv_1x1x1(dim, dim, 1) + self.conv2 = conv_1x1x1(dim, dim, 1) + self.attn = conv_5x5x5(dim, dim, groups=dim) + # NOTE: drop path for stochastic depth, + # we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = bn_3d(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = CMlp( + in_features=dim, hidden_features=mlp_hidden_dim, drop=drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.pos_embed(x) + x = x + self.drop_path( + self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SABlock(BaseModule): + """Self-Attention Block. + + Args: + dim (int): Number of input features. + num_heads (int): Number of attention heads. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4., + qkv_bias: bool = False, + qk_scale: float = None, + drop: float = 0., + attn_drop: float = 0., + drop_path: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pos_embed = conv_3x3x3(dim, dim, groups=dim) + self.norm1 = nn.LayerNorm(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, + # we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = nn.LayerNorm(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, drop=drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.pos_embed(x) + B, C, T, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.transpose(1, 2).reshape(B, C, T, H, W) + return x + + +class SpeicalPatchEmbed(BaseModule): + """Image to Patch Embedding. + + Add extra temporal downsampling via temporal kernel size of 3. + + Args: + img_size (int): Number of input size. + Defaults to 224. + patch_size (int): Number of patch size. + Defaults to 16. + in_chans (int): Number of input features. + Defaults to 3. + embed_dim (int): Number of output features. + Defaults to 768. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
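+
+    Examples:
+        An illustrative shape check (note the extra temporal stride of 2
+        introduced by the 3xnxn projection):
+
+        >>> import torch
+        >>> patch_embed = SpeicalPatchEmbed(patch_size=4, embed_dim=64)
+        >>> videos = torch.rand(1, 3, 16, 224, 224)  # (B, C, T, H, W)
+        >>> patch_embed(videos).shape
+        torch.Size([1, 64, 8, 56, 56])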
+ """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj = conv_3xnxn( + in_chans, + embed_dim, + kernel_size=patch_size[0], + stride=patch_size[0]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + B, _, T, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.reshape(B, T, H, W, -1).permute(0, 4, 1, 2, 3).contiguous() + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + Args: + img_size (int): Number of input size. + Defaults to 224. + patch_size (int): Number of patch size. + Defaults to 16. + in_chans (int): Number of input features. + Defaults to 3. + embed_dim (int): Number of output features. + Defaults to 768. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj = conv_1xnxn( + in_chans, + embed_dim, + kernel_size=patch_size[0], + stride=patch_size[0]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + B, _, T, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.reshape(B, T, H, W, -1).permute(0, 4, 1, 2, 3).contiguous() + return x + + +@MODELS.register_module() +class UniFormer(BaseModule): + """UniFormer. + + A pytorch implement of: `UniFormer: Unified Transformer + for Efficient Spatiotemporal Representation Learning + ` + + Args: + depth (List[int]): List of depth in each stage. + Defaults to [5, 8, 20, 7]. + img_size (int): Number of input size. + Defaults to 224. + in_chans (int): Number of input features. + Defaults to 3. + head_dim (int): Dimension of attention head. + Defaults to 64. + embed_dim (List[int]): List of embedding dimension in each layer. + Defaults to [64, 128, 320, 512]. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout rate. Defaults to 0.0. + attn_drop_rate (float): Attention dropout rate. Defaults to 0.0. + drop_path_rate (float): Stochastic depth rates. + Defaults to 0.0. + pretrained2d (bool): Whether to load pretrained from 2D model. + Defaults to True. + pretrained (str): Name of pretrained model. + Defaults to None. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. 
+ """ + + def __init__( + self, + depth: List[int] = [5, 8, 20, 7], + img_size: int = 224, + in_chans: int = 3, + embed_dim: List[int] = [64, 128, 320, 512], + head_dim: int = 64, + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: float = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + pretrained2d: bool = True, + pretrained: Optional[str] = None, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.patch_embed1 = SpeicalPatchEmbed( + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depth)) + ] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dim] + self.blocks1 = ModuleList([ + CBlock( + dim=embed_dim[0], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[i]) for i in range(depth[0]) + ]) + self.blocks2 = ModuleList([ + CBlock( + dim=embed_dim[1], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[i + depth[0]]) for i in range(depth[1]) + ]) + self.blocks3 = ModuleList([ + SABlock( + dim=embed_dim[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1]]) + for i in range(depth[2]) + ]) + self.blocks4 = ModuleList([ + SABlock( + dim=embed_dim[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1] + depth[2]]) + for i in range(depth[3]) + ]) + self.norm = bn_3d(embed_dim[-1]) + + def _inflate_weight(self, + weight_2d: torch.Tensor, + time_dim: int, + center: bool = True) -> torch.Tensor: + logger.info(f'Init center: {center}') + if center: + weight_3d = torch.zeros(*weight_2d.shape) + weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + middle_idx = time_dim // 2 + weight_3d[:, :, middle_idx, :, :] = weight_2d + else: + weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + weight_3d = weight_3d / time_dim + return weight_3d + + def _load_pretrained(self, pretrained: str = None) -> None: + """Load ImageNet-1K pretrained model. + + The model is pretrained with ImageNet-1K. + https://github.com/Sense-X/UniFormer + + Args: + pretrained (str): Model name of ImageNet-1K pretrained model. + Defaults to None. 
+ """ + if pretrained is not None: + model_path = _MODELS[pretrained] + logger.info(f'Load ImageNet pretrained model from {model_path}') + state_dict = _load_checkpoint(model_path, map_location='cpu') + state_dict_3d = self.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys( + ) and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 2: + logger.info(f'Ignore: {k}') + continue + logger.info(f'Inflate: {k}, {state_dict[k].shape}' + + f' => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = self._inflate_weight( + state_dict[k], time_dim) + self.load_state_dict(state_dict, strict=False) + + def init_weights(self): + """Initialize the weights in backbone.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + self._load_pretrained(self.pretrained) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed1(x) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + x = self.patch_embed2(x) + for blk in self.blocks2: + x = blk(x) + x = self.patch_embed3(x) + for blk in self.blocks3: + x = blk(x) + x = self.patch_embed4(x) + for blk in self.blocks4: + x = blk(x) + x = self.norm(x) + return x diff --git a/mmaction/models/backbones/uniformerv2.py b/mmaction/models/backbones/uniformerv2.py new file mode 100644 index 0000000000000000000000000000000000000000..4d188da848af855166a64f0074ef77c95014b0b6 --- /dev/null +++ b/mmaction/models/backbones/uniformerv2.py @@ -0,0 +1,597 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from collections import OrderedDict +from typing import Dict, List, Optional, Union + +import torch +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.runner.checkpoint import _load_checkpoint +from torch import nn + +from mmaction.registry import MODELS + +logger = MMLogger.get_current_instance() + +MODEL_PATH = 'https://download.openmmlab.com/mmaction/v1.0/recognition' +_MODELS = { + 'ViT-B/16': + os.path.join(MODEL_PATH, 'uniformerv2/clipVisualEncoder', + 'vit-base-p16-res224_clip-rgb_20221219-b8a5da86.pth'), + 'ViT-L/14': + os.path.join(MODEL_PATH, 'uniformerv2/clipVisualEncoder', + 'vit-large-p14-res224_clip-rgb_20221219-9de7543e.pth'), + 'ViT-L/14_336': + os.path.join(MODEL_PATH, 'uniformerv2/clipVisualEncoder', + 'vit-large-p14-res336_clip-rgb_20221219-d370f9e5.pth'), +} + + +class QuickGELU(BaseModule): + """Quick GELU function. Forked from https://github.com/openai/CLIP/blob/d50 + d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py. + + Args: + x (torch.Tensor): The input features of shape :math:`(B, N, C)`. + """ + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(1.702 * x) + + +class Local_MHRA(BaseModule): + """Local MHRA. + + Args: + d_model (int): Number of input channels. + dw_reduction (float): Downsample ratio of input channels. + Defaults to 1.5. + pos_kernel_size (int): Kernel size of local MHRA. + Defaults to 3. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__( + self, + d_model: int, + dw_reduction: float = 1.5, + pos_kernel_size: int = 3, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + padding = pos_kernel_size // 2 + re_d_model = int(d_model // dw_reduction) + self.pos_embed = nn.Sequential( + nn.BatchNorm3d(d_model), + nn.Conv3d(d_model, re_d_model, kernel_size=1, stride=1, padding=0), + nn.Conv3d( + re_d_model, + re_d_model, + kernel_size=(pos_kernel_size, 1, 1), + stride=(1, 1, 1), + padding=(padding, 0, 0), + groups=re_d_model), + nn.Conv3d(re_d_model, d_model, kernel_size=1, stride=1, padding=0), + ) + + # init zero + logger.info('Init zero for Conv in pos_emb') + nn.init.constant_(self.pos_embed[3].weight, 0) + nn.init.constant_(self.pos_embed[3].bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.pos_embed(x) + + +class ResidualAttentionBlock(BaseModule): + """Local UniBlock. + + Args: + d_model (int): Number of input channels. + n_head (int): Number of attention head. + drop_path (float): Stochastic depth rate. + Defaults to 0.0. + dw_reduction (float): Downsample ratio of input channels. + Defaults to 1.5. + no_lmhra (bool): Whether removing local MHRA. + Defaults to False. + double_lmhra (bool): Whether using double local MHRA. + Defaults to True. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + d_model: int, + n_head: int, + drop_path: float = 0.0, + dw_reduction: float = 1.5, + no_lmhra: bool = False, + double_lmhra: bool = True, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.n_head = n_head + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + logger.info(f'Drop path rate: {drop_path}') + + self.no_lmhra = no_lmhra + self.double_lmhra = double_lmhra + logger.info(f'No L_MHRA: {no_lmhra}') + logger.info(f'Double L_MHRA: {double_lmhra}') + if not no_lmhra: + self.lmhra1 = Local_MHRA(d_model, dw_reduction=dw_reduction) + if double_lmhra: + self.lmhra2 = Local_MHRA(d_model, dw_reduction=dw_reduction) + + # spatial + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = nn.LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = nn.LayerNorm(d_model) + + def attention(self, x: torch.Tensor) -> torch.Tensor: + return self.attn(x, x, x, need_weights=False, attn_mask=None)[0] + + def forward(self, x: torch.Tensor, T: int = 8) -> torch.Tensor: + # x: 1+HW, NT, C + if not self.no_lmhra: + # Local MHRA + tmp_x = x[1:, :, :] + L, NT, C = tmp_x.shape + N = NT // T + H = W = int(L**0.5) + tmp_x = tmp_x.view(H, W, N, T, C).permute(2, 4, 3, 0, + 1).contiguous() + tmp_x = tmp_x + self.drop_path(self.lmhra1(tmp_x)) + tmp_x = tmp_x.view(N, C, T, + L).permute(3, 0, 2, + 1).contiguous().view(L, NT, C) + x = torch.cat([x[:1, :, :], tmp_x], dim=0) + # MHSA + x = x + self.drop_path(self.attention(self.ln_1(x))) + # Local MHRA + if not self.no_lmhra and self.double_lmhra: + tmp_x = x[1:, :, :] + tmp_x = tmp_x.view(H, W, N, T, C).permute(2, 4, 3, 0, + 1).contiguous() + tmp_x = tmp_x + self.drop_path(self.lmhra2(tmp_x)) + tmp_x = tmp_x.view(N, C, T, + L).permute(3, 0, 2, + 1).contiguous().view(L, NT, C) + x = torch.cat([x[:1, :, :], tmp_x], dim=0) + # FFN + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Extractor(BaseModule): + """Global UniBlock. 
+ + Args: + d_model (int): Number of input channels. + n_head (int): Number of attention head. + mlp_factor (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + drop_out (float): Stochastic dropout rate. + Defaults to 0.0. + drop_path (float): Stochastic depth rate. + Defaults to 0.0. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + d_model: int, + n_head: int, + mlp_factor: float = 4.0, + dropout: float = 0.0, + drop_path: float = 0.0, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + logger.info(f'Drop path rate: {drop_path}') + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = nn.LayerNorm(d_model) + d_mlp = round(mlp_factor * d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_mlp)), + ('gelu', QuickGELU()), + ('dropout', nn.Dropout(dropout)), + ('c_proj', nn.Linear(d_mlp, d_model))])) + self.ln_2 = nn.LayerNorm(d_model) + self.ln_3 = nn.LayerNorm(d_model) + + # zero init + nn.init.xavier_uniform_(self.attn.in_proj_weight) + nn.init.constant_(self.attn.out_proj.weight, 0.) + nn.init.constant_(self.attn.out_proj.bias, 0.) + nn.init.xavier_uniform_(self.mlp[0].weight) + nn.init.constant_(self.mlp[-1].weight, 0.) + nn.init.constant_(self.mlp[-1].bias, 0.) + + def attention(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + d_model = self.ln_1.weight.size(0) + q = (x @ self.attn.in_proj_weight[:d_model].T + ) + self.attn.in_proj_bias[:d_model] + + k = (y @ self.attn.in_proj_weight[d_model:-d_model].T + ) + self.attn.in_proj_bias[d_model:-d_model] + v = (y @ self.attn.in_proj_weight[-d_model:].T + ) + self.attn.in_proj_bias[-d_model:] + Tx, Ty, N = q.size(0), k.size(0), q.size(1) + q = q.view(Tx, N, self.attn.num_heads, + self.attn.head_dim).permute(1, 2, 0, 3) + k = k.view(Ty, N, self.attn.num_heads, + self.attn.head_dim).permute(1, 2, 0, 3) + v = v.view(Ty, N, self.attn.num_heads, + self.attn.head_dim).permute(1, 2, 0, 3) + aff = (q @ k.transpose(-2, -1) / (self.attn.head_dim**0.5)) + + aff = aff.softmax(dim=-1) + out = aff @ v + out = out.permute(2, 0, 1, 3).flatten(2) + out = self.attn.out_proj(out) + return out + + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + x = x + self.drop_path(self.attention(self.ln_1(x), self.ln_3(y))) + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Transformer(BaseModule): + """Backbone: + + Args: + width (int): Number of input channels in local UniBlock. + layers (int): Number of layers of local UniBlock. + heads (int): Number of attention head in local UniBlock. + backbone_drop_path_rate (float): Stochastic depth rate + in local UniBlock. Defaults to 0.0. + t_size (int): Number of temporal dimension after patch embedding. + Defaults to 8. + dw_reduction (float): Downsample ratio of input channels in local MHRA. + Defaults to 1.5. + no_lmhra (bool): Whether removing local MHRA in local UniBlock. + Defaults to False. + double_lmhra (bool): Whether using double local MHRA + in local UniBlock. Defaults to True. + return_list (List[int]): Layer index of input features + for global UniBlock. Defaults to [8, 9, 10, 11]. + n_dim (int): Number of layers of global UniBlock. + Defaults to 4. + n_dim (int): Number of layers of global UniBlock. + Defaults to 4. + n_dim (int): Number of input channels in global UniBlock. + Defaults to 768. 
+ n_head (int): Number of attention head in global UniBlock. + Defaults to 12. + mlp_factor (float): Ratio of hidden dimensions in MLP layers + in global UniBlock. Defaults to 4.0. + drop_path_rate (float): Stochastic depth rate in global UniBlock. + Defaults to 0.0. + mlp_dropout (List[float]): Stochastic dropout rate in each MLP layer + in global UniBlock. Defaults to [0.5, 0.5, 0.5, 0.5]. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + width: int, + layers: int, + heads: int, + backbone_drop_path_rate: float = 0., + t_size: int = 8, + dw_reduction: float = 1.5, + no_lmhra: bool = True, + double_lmhra: bool = False, + return_list: List[int] = [8, 9, 10, 11], + n_layers: int = 4, + n_dim: int = 768, + n_head: int = 12, + mlp_factor: float = 4.0, + drop_path_rate: float = 0., + mlp_dropout: List[float] = [0.5, 0.5, 0.5, 0.5], + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.T = t_size + self.return_list = return_list + # backbone + b_dpr = [ + x.item() + for x in torch.linspace(0, backbone_drop_path_rate, layers) + ] + self.resblocks = ModuleList([ + ResidualAttentionBlock( + width, + heads, + drop_path=b_dpr[i], + dw_reduction=dw_reduction, + no_lmhra=no_lmhra, + double_lmhra=double_lmhra, + ) for i in range(layers) + ]) + + # global block + assert n_layers == len(return_list) + self.temporal_cls_token = nn.Parameter(torch.zeros(1, 1, n_dim)) + self.dpe = ModuleList([ + nn.Conv3d( + n_dim, + n_dim, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=n_dim) for _ in range(n_layers) + ]) + for m in self.dpe: + nn.init.constant_(m.bias, 0.) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, n_layers)] + self.dec = ModuleList([ + Extractor( + n_dim, + n_head, + mlp_factor=mlp_factor, + dropout=mlp_dropout[i], + drop_path=dpr[i], + ) for i in range(n_layers) + ]) + # weight sum + self.norm = nn.LayerNorm(n_dim) + self.balance = nn.Parameter(torch.zeros((n_dim))) + self.sigmoid = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + T_down = self.T + L, NT, C = x.shape + N = NT // T_down + H = W = int((L - 1)**0.5) + cls_token = self.temporal_cls_token.repeat(1, N, 1) + + j = -1 + for i, resblock in enumerate(self.resblocks): + x = resblock(x, T_down) + if i in self.return_list: + j += 1 + tmp_x = x.clone() + tmp_x = tmp_x.view(L, N, T_down, C) + # dpe + _, tmp_feats = tmp_x[:1], tmp_x[1:] + tmp_feats = tmp_feats.permute(1, 3, 2, + 0).reshape(N, C, T_down, H, W) + tmp_feats = self.dpe[j](tmp_feats.clone()).view( + N, C, T_down, L - 1).permute(3, 0, 2, 1).contiguous() + tmp_x[1:] = tmp_x[1:] + tmp_feats + # global block + tmp_x = tmp_x.permute(2, 0, 1, 3).flatten(0, 1) # T * L, N, C + cls_token = self.dec[j](cls_token, tmp_x) + + weight = self.sigmoid(self.balance) + residual = x.view(L, N, T_down, C)[0].mean(1) # L, N, T, C + out = self.norm((1 - weight) * cls_token[0, :, :] + weight * residual) + return out + + +@MODELS.register_module() +class UniFormerV2(BaseModule): + """UniFormerV2: + + A pytorch implement of: `UniFormerV2: Spatiotemporal + Learning by Arming Image ViTs with Video UniFormer + ` + + Args: + input_resolution (int): Number of input resolution. + Defaults to 224. + patch_size (int): Number of patch size. + Defaults to 16. + width (int): Number of input channels in local UniBlock. + Defaults to 768. + layers (int): Number of layers of local UniBlock. + Defaults to 12. 
+        heads (int): Number of attention heads in the local UniBlock.
+            Defaults to 12.
+        backbone_drop_path_rate (float): Stochastic depth rate
+            in the local UniBlock. Defaults to 0.0.
+        t_size (int): Size of the temporal dimension after patch embedding.
+            Defaults to 8.
+        kernel_size (int): Temporal kernel size of the patch embedding
+            convolution used when ``temporal_downsample`` is True.
+            Defaults to 3.
+        temporal_downsample (bool): Whether to downsample the temporal
+            dimension. Defaults to False.
+        dw_reduction (float): Downsample ratio of input channels in local MHRA.
+            Defaults to 1.5.
+        no_lmhra (bool): Whether to remove local MHRA in the local UniBlock.
+            Defaults to True.
+        double_lmhra (bool): Whether to use double local MHRA in the local
+            UniBlock. Defaults to False.
+        return_list (List[int]): Layer indices of the input features
+            for the global UniBlock. Defaults to [8, 9, 10, 11].
+        n_layers (int): Number of layers of the global UniBlock.
+            Defaults to 4.
+        n_dim (int): Number of input channels in the global UniBlock.
+            Defaults to 768.
+        n_head (int): Number of attention heads in the global UniBlock.
+            Defaults to 12.
+        mlp_factor (float): Ratio of hidden dimensions in MLP layers
+            in the global UniBlock. Defaults to 4.0.
+        drop_path_rate (float): Stochastic depth rate in the global UniBlock.
+            Defaults to 0.0.
+        mlp_dropout (List[float]): Stochastic dropout rate in each MLP layer
+            in the global UniBlock. Defaults to [0.5, 0.5, 0.5, 0.5].
+        clip_pretrained (bool): Whether to load the pretrained CLIP visual
+            encoder. Defaults to True.
+        pretrained (str, optional): Name of the pretrained model.
+            Defaults to None.
+        init_cfg (dict or list[dict]): Initialization config dict. Defaults to
+            ``[
+            dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
+            dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+            ]``.
+    """
+
+    def __init__(
+        self,
+        # backbone
+        input_resolution: int = 224,
+        patch_size: int = 16,
+        width: int = 768,
+        layers: int = 12,
+        heads: int = 12,
+        backbone_drop_path_rate: float = 0.,
+        t_size: int = 8,
+        kernel_size: int = 3,
+        dw_reduction: float = 1.5,
+        temporal_downsample: bool = False,
+        no_lmhra: bool = True,
+        double_lmhra: bool = False,
+        # global block
+        return_list: List[int] = [8, 9, 10, 11],
+        n_layers: int = 4,
+        n_dim: int = 768,
+        n_head: int = 12,
+        mlp_factor: float = 4.0,
+        drop_path_rate: float = 0.,
+        mlp_dropout: List[float] = [0.5, 0.5, 0.5, 0.5],
+        # pretrain
+        clip_pretrained: bool = True,
+        pretrained: Optional[str] = None,
+        init_cfg: Optional[Union[Dict, List[Dict]]] = [
+            dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
+            dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+ ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pretrained = pretrained + self.clip_pretrained = clip_pretrained + self.input_resolution = input_resolution + padding = (kernel_size - 1) // 2 + if temporal_downsample: + self.conv1 = nn.Conv3d( + 3, + width, (kernel_size, patch_size, patch_size), + (2, patch_size, patch_size), (padding, 0, 0), + bias=False) + t_size = t_size // 2 + else: + self.conv1 = nn.Conv3d( + 3, + width, (1, patch_size, patch_size), + (1, patch_size, patch_size), (0, 0, 0), + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = nn.LayerNorm(width) + + self.transformer = Transformer( + width, + layers, + heads, + dw_reduction=dw_reduction, + backbone_drop_path_rate=backbone_drop_path_rate, + t_size=t_size, + no_lmhra=no_lmhra, + double_lmhra=double_lmhra, + return_list=return_list, + n_layers=n_layers, + n_dim=n_dim, + n_head=n_head, + mlp_factor=mlp_factor, + drop_path_rate=drop_path_rate, + mlp_dropout=mlp_dropout, + ) + + def _inflate_weight(self, + weight_2d: torch.Tensor, + time_dim: int, + center: bool = True) -> torch.Tensor: + logger.info(f'Init center: {center}') + if center: + weight_3d = torch.zeros(*weight_2d.shape) + weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + middle_idx = time_dim // 2 + weight_3d[:, :, middle_idx, :, :] = weight_2d + else: + weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + weight_3d = weight_3d / time_dim + return weight_3d + + def _load_pretrained(self, pretrained: str = None) -> None: + """Load CLIP pretrained visual encoder. + + The visual encoder is extracted from CLIP. + https://github.com/openai/CLIP + + Args: + pretrained (str): Model name of pretrained CLIP visual encoder. + Defaults to None. 
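`_inflate_weight` above converts a 2D image-convolution kernel into a 3D video kernel, either by placing it at the centre temporal slice (`center=True`) or by replicating and rescaling it over time. A standalone sketch of both modes with made-up tensor sizes:

import torch

w2d = torch.randn(64, 3, 16, 16)  # (out_c, in_c, kH, kW), e.g. a ViT patch embedding
time_dim = 2

# 'center' inflation: zeros everywhere except the middle temporal slice
w3d_center = torch.zeros(*w2d.shape).unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
w3d_center[:, :, time_dim // 2] = w2d

# 'average' inflation: replicate along time and divide by time_dim so that the
# summed response on a temporally constant input matches the 2D kernel
w3d_avg = w2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) / time_dim

print(w3d_center.shape, w3d_avg.shape)  # torch.Size([64, 3, 2, 16, 16]) twice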
+ """ + assert pretrained is not None, \ + 'please specify clip pretraied checkpoint' + + model_path = _MODELS[pretrained] + logger.info(f'Load CLIP pretrained model from {model_path}') + state_dict = _load_checkpoint(model_path, map_location='cpu') + state_dict_3d = self.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys( + ) and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 2: + logger.info(f'Ignore: {k}') + continue + logger.info(f'Inflate: {k}, {state_dict[k].shape}' + + f' => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = self._inflate_weight(state_dict[k], time_dim) + self.load_state_dict(state_dict, strict=False) + + def init_weights(self): + """Initialize the weights in backbone.""" + if self.clip_pretrained: + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + self._load_pretrained(self.pretrained) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv1(x) # shape = [*, width, grid, grid] + N, C, T, H, W = x.shape + x = x.permute(0, 2, 3, 4, 1).reshape(N * T, H * W, C) + + x = torch.cat([ + self.class_embedding.to(x.dtype) + torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x + ], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + out = self.transformer(x) + return out diff --git a/mmaction/models/backbones/vit_mae.py b/mmaction/models/backbones/vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..03111d61ce3ed57f0d3806b5fab9b1fa794b5a90 --- /dev/null +++ b/mmaction/models/backbones/vit_mae.py @@ -0,0 +1,383 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import torch +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmcv.cnn.bricks.transformer import FFN, PatchEmbed +from mmengine.model import BaseModule, ModuleList +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, OptConfigType + + +class Attention(BaseModule): + """Multi-head Self-attention. + + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads. + qkv_bias (bool): If True, add a learnable bias to q and v. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop_rate (float): Dropout ratio of attention weight. + Defaults to 0. + drop_rate (float): Dropout ratio of output. Defaults to 0. + init_cfg (dict or ConfigDict, optional): The Config + for initialization. Defaults to None. 
+ """ + + def __init__(self, + embed_dims: int, + num_heads: int = 8, + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + attn_drop_rate: float = 0., + drop_rate: float = 0., + init_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + + self.scale = qk_scale or head_embed_dims**-0.5 + + if qkv_bias: + self._init_qv_bias() + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(drop_rate) + + def _init_qv_bias(self) -> None: + self.q_bias = nn.Parameter(torch.zeros(self.embed_dims)) + self.v_bias = nn.Parameter(torch.zeros(self.embed_dims)) + + def forward(self, x: Tensor) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data with size of (B, N, C). + Returns: + Tensor: The output of the attention block, same size as inputs. + """ + B, N, C = x.shape + + if hasattr(self, 'q_bias'): + k_bias = torch.zeros_like(self.v_bias, requires_grad=False) + qkv_bias = torch.cat((self.q_bias, k_bias, self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + else: + qkv = self.qkv(x) + + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(BaseModule): + """The basic block in the Vision Transformer. + + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads. + mlp_ratio (int): The ratio between the hidden layer and the + input layer in the FFN. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to q and v. + Defaults to True. + qk_scale (float): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout ratio of output. Defaults to 0. + attn_drop_rate (float): Dropout ratio of attention weight. + Defaults to 0. + drop_path_rate (float): Dropout ratio of the residual branch. + Defaults to 0. + init_values (float): Value to init the multiplier of the + residual branch. Defaults to 0. + act_cfg (dict or ConfigDict): Config for activation layer in FFN. + Defaults to `dict(type='GELU')`. + norm_cfg (dict or ConfigDict): Config for norm layers. + Defaults to `dict(type='LN', eps=1e-6)`. + init_cfg (dict or ConfigDict, optional): The Config + for initialization. Defaults to None. 
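The `forward` above assembles the packed qkv bias on the fly so that only the query and value projections carry a learnable bias while the key bias stays fixed at zero, as in the original VideoMAE implementation. A small illustrative sketch of that construction (all sizes are arbitrary):

import torch
import torch.nn.functional as F

embed_dims, B, N = 8, 2, 5
qkv_weight = torch.randn(3 * embed_dims, embed_dims)
q_bias = torch.randn(embed_dims)
v_bias = torch.randn(embed_dims)
x = torch.randn(B, N, embed_dims)

# key bias is all zeros and never receives gradients
k_bias = torch.zeros_like(v_bias, requires_grad=False)
qkv_bias = torch.cat((q_bias, k_bias, v_bias))

qkv = F.linear(x, weight=qkv_weight, bias=qkv_bias)       # (B, N, 3 * embed_dims)
q, k, v = qkv.reshape(B, N, 3, embed_dims).unbind(dim=2)  # each (B, N, embed_dims)
print(q.shape, k.shape, v.shape)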
+ """ + + def __init__(self, + embed_dims: int, + num_heads: int, + mlp_ratio: int = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + init_values: float = 0.0, + act_cfg: ConfigType = dict(type='GELU'), + norm_cfg: ConfigType = dict(type='LN', eps=1e-6), + init_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = Attention( + embed_dims, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate) + + self.drop_path = nn.Identity() + if drop_path_rate > 0.: + self.drop_path = DropPath(drop_path_rate) + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + mlp_hidden_dim = int(embed_dims * mlp_ratio) + self.mlp = FFN( + embed_dims=embed_dims, + feedforward_channels=mlp_hidden_dim, + act_cfg=act_cfg, + ffn_drop=drop_rate, + add_identity=False) + + self._init_gammas(init_values, embed_dims) + + def _init_gammas(self, init_values: float, dim: int) -> None: + if type(init_values) == float and init_values > 0: + self.gamma_1 = nn.Parameter( + init_values * torch.ones(dim), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones(dim), requires_grad=True) + + def forward(self, x: Tensor) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data with size of (B, N, C). + Returns: + Tensor: The output of the transformer block, same size as inputs. + """ + if hasattr(self, 'gamma_1'): + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +def get_sinusoid_encoding(n_position: int, embed_dims: int) -> Tensor: + """Generate sinusoid encoding table. + + Sinusoid encoding is a kind of relative position encoding method came from + `Attention Is All You Need`_. + Args: + n_position (int): The length of the input token. + embed_dims (int): The position embedding dimension. + Returns: + :obj:`torch.FloatTensor`: The sinusoid encoding table of size + (1, n_position, embed_dims) + """ + + vec = torch.arange(embed_dims, dtype=torch.float64) + vec = (vec - vec % 2) / embed_dims + vec = torch.pow(10000, -vec).view(1, -1) + + sinusoid_table = torch.arange(n_position).view(-1, 1) * vec + sinusoid_table[:, 0::2].sin_() # dim 2i + sinusoid_table[:, 1::2].cos_() # dim 2i+1 + + sinusoid_table = sinusoid_table.to(torch.float32) + + return sinusoid_table.unsqueeze(0) + + +@MODELS.register_module() +class VisionTransformer(BaseModule): + """Vision Transformer with support for patch or hybrid CNN input stage. An + impl of `VideoMAE: Masked Autoencoders are Data-Efficient Learners for + Self-Supervised Video Pre-Training `_ + + Args: + img_size (int or tuple): Size of input image. + Defaults to 224. + patch_size (int): Spatial size of one patch. Defaults to 16. + in_channels (int): The number of channels of he input. + Defaults to 3. + embed_dims (int): Dimensions of embedding. Defaults to 768. + depth (int): number of blocks in the transformer. + Defaults to 12. + num_heads (int): Number of parallel attention heads in + TransformerCoder. Defaults to 12. + mlp_ratio (int): The ratio between the hidden layer and the + input layer in the FFN. Defaults to 4. 
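`get_sinusoid_encoding` above builds the fixed sine-cosine table with a vectorised trick: per-dimension frequencies `10000^(-2i/d)` followed by in-place `sin_`/`cos_` on alternating columns. An equivalent, more explicit sketch for comparison (a re-derivation, not the module's API):

import torch

def sinusoid_table(n_position: int, embed_dims: int) -> torch.Tensor:
    pos = torch.arange(n_position, dtype=torch.float64).unsqueeze(1)  # (P, 1)
    i = torch.arange(embed_dims, dtype=torch.float64)
    freq = torch.pow(10000, -(i - i % 2) / embed_dims).unsqueeze(0)   # (1, D)
    table = pos * freq
    table[:, 0::2] = table[:, 0::2].sin()  # even dims -> sin
    table[:, 1::2] = table[:, 1::2].cos()  # odd dims  -> cos
    return table.to(torch.float32).unsqueeze(0)                       # (1, P, D)

print(sinusoid_table(196, 768).shape)  # torch.Size([1, 196, 768])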
+ qkv_bias (bool): If True, add a learnable bias to q and v. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout ratio of output. Defaults to 0. + attn_drop_rate (float): Dropout ratio of attention weight. + Defaults to 0. + drop_path_rate (float): Dropout ratio of the residual branch. + Defaults to 0. + norm_cfg (dict or Configdict): Config for norm layers. + Defaults to `dict(type='LN', eps=1e-6)`. + init_values (float): Value to init the multiplier of the residual + branch. Defaults to 0. + use_learnable_pos_emb (bool): If True, use learnable positional + embedding, othersize use sinusoid encoding. Defaults to False. + num_frames (int): Number of frames in the video. Defaults to 16. + tubelet_size (int): Temporal size of one patch. Defaults to 2. + use_mean_pooling (bool): If True, take the mean pooling over all + positions. Defaults to True. + pretrained (str, optional): Name of pretrained model. Default: None. + return_feat_map (bool): If True, return the feature in the shape of + `[B, C, T, H, W]`. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. + """ + + def __init__(self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dims: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: int = 4., + qkv_bias: bool = True, + qk_scale: int = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + norm_cfg: ConfigType = dict(type='LN', eps=1e-6), + init_values: int = 0., + use_learnable_pos_emb: bool = False, + num_frames: int = 16, + tubelet_size: int = 2, + use_mean_pooling: int = True, + pretrained: Optional[str] = None, + return_feat_map: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict( + type='TruncNormal', layer='Linear', std=0.02, + bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) 
+ ], + **kwargs) -> None: + + if pretrained: + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + self.patch_size = patch_size + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv3d', + kernel_size=(tubelet_size, patch_size, patch_size), + stride=(tubelet_size, patch_size, patch_size), + padding=(0, 0, 0), + dilation=(1, 1, 1)) + + grid_size = img_size // patch_size + num_patches = grid_size**2 * (num_frames // tubelet_size) + self.grid_size = (grid_size, grid_size) + + if use_learnable_pos_emb: + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches, embed_dims)) + nn.init.trunc_normal_(self.pos_embed, std=.02) + else: + # sine-cosine positional embeddings is on the way + pos_embed = get_sinusoid_encoding(num_patches, embed_dims) + self.register_buffer('pos_embed', pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + self.blocks = ModuleList([ + Block( + embed_dims=embed_dims, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i], + norm_cfg=norm_cfg, + init_values=init_values) for i in range(depth) + ]) + + if use_mean_pooling: + self.norm = nn.Identity() + self.fc_norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + self.fc_norm = None + + self.return_feat_map = return_feat_map + + def forward(self, x: Tensor) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + Returns: + Tensor: The feature of the input + samples extracted by the backbone. + """ + b, _, _, h, w = x.shape + h //= self.patch_size + w //= self.patch_size + x = self.patch_embed(x)[0] + if (h, w) != self.grid_size: + pos_embed = self.pos_embed.reshape(-1, *self.grid_size, + self.embed_dims) + pos_embed = pos_embed.permute(0, 3, 1, 2) + pos_embed = F.interpolate( + pos_embed, size=(h, w), mode='bicubic', align_corners=False) + pos_embed = pos_embed.permute(0, 2, 3, 1).flatten(1, 2) + pos_embed = pos_embed.reshape(1, -1, self.embed_dims) + else: + pos_embed = self.pos_embed + + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + if self.return_feat_map: + x = x.reshape(b, -1, h, w, self.embed_dims) + x = x.permute(0, 4, 1, 2, 3) + return x + + if self.fc_norm is not None: + return self.fc_norm(x.mean(1)) + + return x[:, 0] diff --git a/mmaction/models/backbones/x3d.py b/mmaction/models/backbones/x3d.py new file mode 100644 index 0000000000000000000000000000000000000000..f9cb0aa0c693ae4c07018f879765dec0df29ce8a --- /dev/null +++ b/mmaction/models/backbones/x3d.py @@ -0,0 +1,533 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
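The `forward` of `VisionTransformer` earlier in vit_mae.py resizes the positional embedding with bicubic interpolation whenever the spatial grid of the input differs from the training grid. A minimal sketch of just that resizing step, assuming a 14x14 training grid and 8 temporal positions (values chosen for illustration only):

import torch
import torch.nn.functional as F

embed_dims, grid = 768, (14, 14)                               # 224 / 16 = 14
pos_embed = torch.randn(1, 8 * grid[0] * grid[1], embed_dims)  # 8 temporal positions

h, w = 20, 20                                                  # e.g. a 320x320 crop
pe = pos_embed.reshape(-1, *grid, embed_dims).permute(0, 3, 1, 2)  # (T', C, 14, 14)
pe = F.interpolate(pe, size=(h, w), mode='bicubic', align_corners=False)
pe = pe.permute(0, 2, 3, 1).flatten(1, 2).reshape(1, -1, embed_dims)
print(pe.shape)  # torch.Size([1, 3200, 768]) = (1, 8 * 20 * 20, 768)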
+import math + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, Swish, build_activation_layer +from mmengine.logging import MMLogger +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS + + +class SEModule(nn.Module): + + def __init__(self, channels, reduction): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool3d(1) + self.bottleneck = self._round_width(channels, reduction) + self.fc1 = nn.Conv3d( + channels, self.bottleneck, kernel_size=1, padding=0) + self.relu = nn.ReLU() + self.fc2 = nn.Conv3d( + self.bottleneck, channels, kernel_size=1, padding=0) + self.sigmoid = nn.Sigmoid() + + @staticmethod + def _round_width(width, multiplier, min_width=8, divisor=8): + """Round width of filters based on width multiplier.""" + width *= multiplier + min_width = min_width or divisor + width_out = max(min_width, + int(width + divisor / 2) // divisor * divisor) + if width_out < 0.9 * width: + width_out += divisor + return int(width_out) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The output of the module. + """ + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class BlockX3D(nn.Module): + """BlockX3D 3d building block for X3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + outplanes (int): Number of channels produced by final the conv3d layer. + spatial_stride (int): Spatial stride in the conv3d layer. Default: 1. + downsample (nn.Module | None): Downsample layer. Default: None. + se_ratio (float | None): The reduction ratio of squeeze and excitation + unit. If set as None, it means not using SE unit. Default: None. + use_swish (bool): Whether to use swish as the activation function + before and after the 3x3x3 conv. Default: True. + conv_cfg (dict): Config dict for convolution layer. + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type``, + Default: ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + inplanes, + planes, + outplanes, + spatial_stride=1, + downsample=None, + se_ratio=None, + use_swish=True, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + + self.inplanes = inplanes + self.planes = planes + self.outplanes = outplanes + self.spatial_stride = spatial_stride + self.downsample = downsample + self.se_ratio = se_ratio + self.use_swish = use_swish + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.act_cfg_swish = dict(type='Swish') + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=inplanes, + out_channels=planes, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + # Here we use the channel-wise conv + self.conv2 = ConvModule( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=(1, self.spatial_stride, self.spatial_stride), + padding=1, + groups=planes, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.swish = Swish() + + self.conv3 = ConvModule( + in_channels=planes, + out_channels=outplanes, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + + if self.se_ratio is not None: + self.se_module = SEModule(planes, self.se_ratio) + + self.relu = build_activation_layer(self.act_cfg) + + def forward(self, x): + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + if self.se_ratio is not None: + out = self.se_module(out) + + out = self.swish(out) + + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + out = self.relu(out) + return out + + +# We do not support initialize with 2D pretrain weight for X3D +@MODELS.register_module() +class X3D(nn.Module): + """X3D backbone. https://arxiv.org/pdf/2004.04730.pdf. + + Args: + gamma_w (float): Global channel width expansion factor. Default: 1. + gamma_b (float): Bottleneck channel width expansion factor. Default: 1. + gamma_d (float): Network depth expansion factor. Default: 1. + pretrained (str | None): Name of pretrained model. Default: None. + in_channels (int): Channel num of input features. Default: 3. + num_stages (int): Resnet stages. Default: 4. + spatial_strides (Sequence[int]): + Spatial strides of residual blocks of each stage. + Default: ``(1, 2, 2, 2)``. + frozen_stages (int): Stages to be frozen (all param fixed). If set to + -1, it means not freezing any parameters. Default: -1. + se_style (str): The style of inserting SE modules into BlockX3D, 'half' + denotes insert into half of the blocks, while 'all' denotes insert + into all blocks. Default: 'half'. + se_ratio (float | None): The reduction ratio of squeeze and excitation + unit. If set as None, it means not using SE unit. Default: 1 / 16. + use_swish (bool): Whether to use swish as the activation function + before and after the 3x3x3 conv. Default: True. + conv_cfg (dict): Config for conv layers. required keys are ``type`` + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. 
+ Default: ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Default: True. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + def __init__(self, + gamma_w=1.0, + gamma_b=1.0, + gamma_d=1.0, + pretrained=None, + in_channels=3, + num_stages=4, + spatial_strides=(2, 2, 2, 2), + frozen_stages=-1, + se_style='half', + se_ratio=1 / 16, + use_swish=True, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True, + **kwargs): + super().__init__() + self.gamma_w = gamma_w + self.gamma_b = gamma_b + self.gamma_d = gamma_d + + self.pretrained = pretrained + self.in_channels = in_channels + # Hard coded, can be changed by gamma_w + self.base_channels = 24 + self.stage_blocks = [1, 2, 5, 3] + + # apply parameters gamma_w and gamma_d + self.base_channels = self._round_width(self.base_channels, + self.gamma_w) + + self.stage_blocks = [ + self._round_repeats(x, self.gamma_d) for x in self.stage_blocks + ] + + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.spatial_strides = spatial_strides + assert len(spatial_strides) == num_stages + self.frozen_stages = frozen_stages + + self.se_style = se_style + assert self.se_style in ['all', 'half'] + self.se_ratio = se_ratio + assert (self.se_ratio is None) or (self.se_ratio > 0) + self.use_swish = use_swish + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + self.block = BlockX3D + self.stage_blocks = self.stage_blocks[:num_stages] + self.layer_inplanes = self.base_channels + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + spatial_stride = spatial_strides[i] + inplanes = self.base_channels * 2**i + planes = int(inplanes * self.gamma_b) + + res_layer = self.make_res_layer( + self.block, + self.layer_inplanes, + inplanes, + planes, + num_blocks, + spatial_stride=spatial_stride, + se_style=self.se_style, + se_ratio=self.se_ratio, + use_swish=self.use_swish, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + with_cp=with_cp, + **kwargs) + self.layer_inplanes = inplanes + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.base_channels * 2**(len(self.stage_blocks) - 1) + self.conv5 = ConvModule( + self.feat_dim, + int(self.feat_dim * self.gamma_b), + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.feat_dim = int(self.feat_dim * self.gamma_b) + + @staticmethod + def _round_width(width, multiplier, min_depth=8, divisor=8): + """Round width of filters based on width multiplier.""" + if not multiplier: + return width + + width *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(width + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * width: + 
new_filters += divisor + return int(new_filters) + + @staticmethod + def _round_repeats(repeats, multiplier): + """Round number of layers based on depth multiplier.""" + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + # the module is parameterized with gamma_b + # no temporal_stride + def make_res_layer(self, + block, + layer_inplanes, + inplanes, + planes, + blocks, + spatial_stride=1, + se_style='half', + se_ratio=None, + use_swish=True, + norm_cfg=None, + act_cfg=None, + conv_cfg=None, + with_cp=False, + **kwargs): + """Build residual layer for ResNet3D. + + Args: + block (nn.Module): Residual module to be built. + layer_inplanes (int): Number of channels for the input feature + of the res layer. + inplanes (int): Number of channels for the input feature in each + block, which equals to base_channels * gamma_w. + planes (int): Number of channels for the output feature in each + block, which equals to base_channel * gamma_w * gamma_b. + blocks (int): Number of residual blocks. + spatial_stride (int): Spatial strides in residual and conv layers. + Default: 1. + se_style (str): The style of inserting SE modules into BlockX3D, + 'half' denotes insert into half of the blocks, while 'all' + denotes insert into all blocks. Default: 'half'. + se_ratio (float | None): The reduction ratio of squeeze and + excitation unit. If set as None, it means not using SE unit. + Default: None. + use_swish (bool): Whether to use swish as the activation function + before and after the 3x3x3 conv. Default: True. + conv_cfg (dict | None): Config for norm layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool | None): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. 
+ """ + downsample = None + if spatial_stride != 1 or layer_inplanes != inplanes: + downsample = ConvModule( + layer_inplanes, + inplanes, + kernel_size=1, + stride=(1, spatial_stride, spatial_stride), + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + use_se = [False] * blocks + if self.se_style == 'all': + use_se = [True] * blocks + elif self.se_style == 'half': + use_se = [i % 2 == 0 for i in range(blocks)] + else: + raise NotImplementedError + + layers = [] + layers.append( + block( + layer_inplanes, + planes, + inplanes, + spatial_stride=spatial_stride, + downsample=downsample, + se_ratio=se_ratio if use_se[0] else None, + use_swish=use_swish, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + inplanes, + spatial_stride=1, + se_ratio=se_ratio if use_se[i] else None, + use_swish=use_swish, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + return nn.Sequential(*layers) + + def _make_stem_layer(self): + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + self.conv1_s = ConvModule( + self.in_channels, + self.base_channels, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=None, + act_cfg=None) + self.conv1_t = ConvModule( + self.base_channels, + self.base_channels, + kernel_size=(5, 1, 1), + stride=(1, 1, 1), + padding=(2, 0, 0), + groups=self.base_channels, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _freeze_stages(self): + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1_s.eval() + self.conv1_t.eval() + for param in self.conv1_s.parameters(): + param.requires_grad = False + for param in self.conv1_t.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, BlockX3D): + constant_init(m.conv3.bn, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. 
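X3D derives its per-stage channel widths and block counts from the hard-coded base (24 channels, `[1, 2, 5, 3]` blocks) via the `_round_width`/`_round_repeats` helpers above. A standalone sketch with an illustrative multiplier of 2.0 (not an official X3D variant):

import math

def round_width(width, multiplier, min_width=8, divisor=8):
    if not multiplier:
        return width
    width *= multiplier
    out = max(min_width, int(width + divisor / 2) // divisor * divisor)
    if out < 0.9 * width:  # keep at least 90% of the requested width
        out += divisor
    return int(out)

def round_repeats(repeats, multiplier):
    return int(math.ceil(multiplier * repeats)) if multiplier else repeats

base_channels, stage_blocks = 24, [1, 2, 5, 3]
gamma_w, gamma_d = 2.0, 2.0
widths = [round_width(base_channels, gamma_w) * 2**i for i in range(4)]
blocks = [round_repeats(b, gamma_d) for b in stage_blocks]
print(widths)  # [48, 96, 192, 384]
print(blocks)  # [2, 4, 10, 6]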
+ """ + x = self.conv1_s(x) + x = self.conv1_t(x) + for layer_name in self.res_layers: + res_layer = getattr(self, layer_name) + x = res_layer(x) + x = self.conv5(x) + return x + + def train(self, mode=True): + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/common/__init__.py b/mmaction/models/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b91cb4f537d0a2daefccaf015ce9c0b606c38f1 --- /dev/null +++ b/mmaction/models/common/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .conv2plus1d import Conv2plus1d +from .conv_audio import ConvAudio +from .sub_batchnorm3d import SubBatchNorm3D +from .tam import TAM +from .transformer import (DividedSpatialAttentionWithNorm, + DividedTemporalAttentionWithNorm, FFNWithNorm) + +__all__ = [ + 'Conv2plus1d', 'TAM', 'DividedSpatialAttentionWithNorm', + 'DividedTemporalAttentionWithNorm', 'FFNWithNorm', 'SubBatchNorm3D', + 'ConvAudio' +] diff --git a/mmaction/models/common/__pycache__/__init__.cpython-310.pyc b/mmaction/models/common/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07e85eaa94ababafbbdbccad7e0b96f511b9d6d5 Binary files /dev/null and b/mmaction/models/common/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/common/__pycache__/conv2plus1d.cpython-310.pyc b/mmaction/models/common/__pycache__/conv2plus1d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f596cb843d7af5431ddb48348528e57a5e252ca Binary files /dev/null and b/mmaction/models/common/__pycache__/conv2plus1d.cpython-310.pyc differ diff --git a/mmaction/models/common/__pycache__/conv_audio.cpython-310.pyc b/mmaction/models/common/__pycache__/conv_audio.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0f06b10ac6f3230acc3c5fd8e9aea445ba107a5 Binary files /dev/null and b/mmaction/models/common/__pycache__/conv_audio.cpython-310.pyc differ diff --git a/mmaction/models/common/__pycache__/sub_batchnorm3d.cpython-310.pyc b/mmaction/models/common/__pycache__/sub_batchnorm3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3951e2fde09191a04271d0d5d95cf1cb16a04f7f Binary files /dev/null and b/mmaction/models/common/__pycache__/sub_batchnorm3d.cpython-310.pyc differ diff --git a/mmaction/models/common/__pycache__/tam.cpython-310.pyc b/mmaction/models/common/__pycache__/tam.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73360aea4bbe05d855bff631587e38438e7df76f Binary files /dev/null and b/mmaction/models/common/__pycache__/tam.cpython-310.pyc differ diff --git a/mmaction/models/common/__pycache__/transformer.cpython-310.pyc b/mmaction/models/common/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b189dc1af949567f1e0d82b547ecef8fd77c1cc8 Binary files /dev/null and b/mmaction/models/common/__pycache__/transformer.cpython-310.pyc differ diff --git a/mmaction/models/common/conv2plus1d.py b/mmaction/models/common/conv2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..67e481a90ada086b8b5711ff4407421e1b6cb07b --- /dev/null +++ b/mmaction/models/common/conv2plus1d.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmengine.model.weight_init import constant_init, kaiming_init +from torch.nn.modules.utils import _triple + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType + + +@MODELS.register_module() +class Conv2plus1d(nn.Module): + """(2+1)d Conv module for R(2+1)d backbone. + + https://arxiv.org/pdf/1711.11248.pdf. + + Args: + in_channels (int): Same as ``nn.Conv3d``. + out_channels (int): Same as ``nn.Conv3d``. + kernel_size (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. + stride (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. Defaults to 1. + padding (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. Defaults to 0. + dilation (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. + Defaults to 1. + groups (int): Same as ``nn.Conv3d``. Defaults to 1. + bias (Union[bool, str]): If specified as `auto`, it will be decided by + the norm_cfg. Bias will be set as True if norm_cfg is None, + otherwise False. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='BN3d')``. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 0, + dilation: Union[int, Tuple[int]] = 1, + groups: int = 1, + bias: Union[bool, str] = True, + norm_cfg: ConfigType = dict(type='BN3d') + ) -> None: + super().__init__() + + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + assert len(kernel_size) == len(stride) == len(padding) == 3 + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.bias = bias + self.norm_cfg = norm_cfg + self.output_padding = (0, 0, 0) + self.transposed = False + + # The middle-plane is calculated according to: + # M_i = \floor{\frac{t * d^2 N_i-1 * N_i} + # {d^2 * N_i-1 + t * N_i}} + # where d, t are spatial and temporal kernel, and + # N_i, N_i-1 are planes + # and inplanes. https://arxiv.org/pdf/1711.11248.pdf + mid_channels = 3 * ( + in_channels * out_channels * kernel_size[1] * kernel_size[2]) + mid_channels /= ( + in_channels * kernel_size[1] * kernel_size[2] + 3 * out_channels) + mid_channels = int(mid_channels) + + self.conv_s = nn.Conv3d( + in_channels, + mid_channels, + kernel_size=(1, kernel_size[1], kernel_size[2]), + stride=(1, stride[1], stride[2]), + padding=(0, padding[1], padding[2]), + bias=bias) + _, self.bn_s = build_norm_layer(self.norm_cfg, mid_channels) + self.relu = nn.ReLU(inplace=True) + self.conv_t = nn.Conv3d( + mid_channels, + out_channels, + kernel_size=(kernel_size[0], 1, 1), + stride=(stride[0], 1, 1), + padding=(padding[0], 0, 0), + bias=bias) + + self.init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. 
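The comment in `Conv2plus1d.__init__` above encodes the R(2+1)D rule for the intermediate channel count: choose M_i = floor(t * d^2 * N_{i-1} * N_i / (d^2 * N_{i-1} + t * N_i)) so that the factorised spatial (1 x d x d) plus temporal (t x 1 x 1) pair has roughly as many parameters as a full t x d x d 3D kernel; the module hard-codes t = 3. A quick numeric check with illustrative channel counts:

def mid_channels(in_channels: int, out_channels: int, t: int, d: int) -> int:
    # M_i from Tran et al., "A Closer Look at Spatiotemporal Convolutions" (2018)
    return (t * d * d * in_channels * out_channels) // (
        d * d * in_channels + t * out_channels)

m = mid_channels(64, 128, t=3, d=3)
print(m)                                        # 230
# parameter count (ignoring bias): factorised pair vs full 3D kernel
print(9 * 64 * m + 3 * m * 128, 27 * 64 * 128)  # 220800 vs 221184 -> nearly equal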
+ """ + x = self.conv_s(x) + x = self.bn_s(x) + x = self.relu(x) + x = self.conv_t(x) + return x + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + kaiming_init(self.conv_s) + kaiming_init(self.conv_t) + constant_init(self.bn_s, 1, bias=0) diff --git a/mmaction/models/common/conv_audio.py b/mmaction/models/common/conv_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..c53aad1d0907e90d3b73fa9f77c840b68a9d8d75 --- /dev/null +++ b/mmaction/models/common/conv_audio.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import constant_init, kaiming_init +from torch.nn.modules.utils import _pair + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class ConvAudio(nn.Module): + """Conv2d module for AudioResNet backbone. + + `_. + + Args: + in_channels (int): Same as ``nn.Conv2d``. + out_channels (int): Same as ``nn.Conv2d``. + kernel_size (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. + op (str): Operation to merge the output of freq + and time feature map. Choices are ``sum`` and ``concat``. + Defaults to ``concat``. + stride (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. Defaults to 1. + padding (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. Defaults to 0. + dilation (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. + Defaults to 1. + groups (int): Same as ``nn.Conv2d``. Defaults to 1. + bias (Union[bool, str]): If specified as ``auto``, it will be decided + by the ``norm_cfg``. Bias will be set as True if ``norm_cfg`` + is None, otherwise False. Defaults to False. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]], + op: str = 'concat', + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 0, + dilation: Union[int, Tuple[int]] = 1, + groups: int = 1, + bias: Union[bool, str] = False) -> None: + super().__init__() + + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + assert op in ['concat', 'sum'] + self.op = op + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.bias = bias + self.output_padding = (0, 0) + self.transposed = False + + self.conv_1 = ConvModule( + in_channels, + out_channels, + kernel_size=(kernel_size[0], 1), + stride=stride, + padding=(kernel_size[0] // 2, 0), + bias=bias, + conv_cfg=dict(type='Conv'), + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')) + + self.conv_2 = ConvModule( + in_channels, + out_channels, + kernel_size=(1, kernel_size[1]), + stride=stride, + padding=(0, kernel_size[1] // 2), + bias=bias, + conv_cfg=dict(type='Conv'), + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')) + + self.init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. 
+ """ + x_1 = self.conv_1(x) + x_2 = self.conv_2(x) + if self.op == 'concat': + out = torch.cat([x_1, x_2], 1) + else: + out = x_1 + x_2 + return out + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + kaiming_init(self.conv_1.conv) + kaiming_init(self.conv_2.conv) + constant_init(self.conv_1.bn, 1, bias=0) + constant_init(self.conv_2.bn, 1, bias=0) diff --git a/mmaction/models/common/sub_batchnorm3d.py b/mmaction/models/common/sub_batchnorm3d.py new file mode 100644 index 0000000000000000000000000000000000000000..60c7e80d60cb051e9941913f2ef7f448eec8c9a3 --- /dev/null +++ b/mmaction/models/common/sub_batchnorm3d.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy + +import torch +import torch.nn as nn + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class SubBatchNorm3D(nn.Module): + """Sub BatchNorm3d splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently). During evaluation, it aggregates + the stats from all splits into one BN. + + Args: + num_features (int): Dimensions of BatchNorm. + """ + + def __init__(self, num_features, **cfg): + super(SubBatchNorm3D, self).__init__() + + self.num_features = num_features + self.cfg_ = deepcopy(cfg) + self.num_splits = self.cfg_.pop('num_splits', 1) + self.num_features_split = self.num_features * self.num_splits + # only keep one set of affine params, not in .bn or .split_bn + self.cfg_['affine'] = False + self.bn = nn.BatchNorm3d(num_features, **self.cfg_) + self.split_bn = nn.BatchNorm3d(self.num_features_split, **self.cfg_) + self.init_weights(cfg) + + def init_weights(self, cfg): + """Initialize weights.""" + if cfg.get('affine', True): + self.weight = torch.nn.Parameter(torch.ones(self.num_features)) + self.bias = torch.nn.Parameter(torch.zeros(self.num_features)) + self.affine = True + else: + self.affine = False + + def _get_aggregated_mean_std(self, means, stds, n): + """Calculate aggregated mean and std.""" + mean = means.view(n, -1).sum(0) / n + std = stds.view(n, -1).sum(0) / n + ( + (means.view(n, -1) - mean)**2).view(n, -1).sum(0) / n + return mean.detach(), std.detach() + + def aggregate_stats(self): + """Synchronize running_mean, and running_var to self.bn. + + Call this before eval, then call model.eval(); When eval, forward + function will call self.bn instead of self.split_bn, During this time + the running_mean, and running_var of self.bn has been obtained from + self.split_bn. 
+ """ + if self.split_bn.track_running_stats: + aggre_func = self._get_aggregated_mean_std + self.bn.running_mean.data, self.bn.running_var.data = aggre_func( + self.split_bn.running_mean, self.split_bn.running_var, + self.num_splits) + self.bn.num_batches_tracked = self.split_bn.num_batches_tracked.detach( + ) + + def forward(self, x): + """Defines the computation performed at every call.""" + if self.training: + n, c, t, h, w = x.shape + assert n % self.num_splits == 0 + x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) + x = self.split_bn(x) + x = x.view(n, c, t, h, w) + else: + x = self.bn(x) + if self.affine: + x = x * self.weight.view(-1, 1, 1, 1) + x = x + self.bias.view(-1, 1, 1, 1) + return x diff --git a/mmaction/models/common/tam.py b/mmaction/models/common/tam.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce4bbedb29f1a614c3668de46f32aced44f9496 --- /dev/null +++ b/mmaction/models/common/tam.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class TAM(nn.Module): + """Temporal Adaptive Module(TAM) for TANet. + + This module is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO + RECOGNITION `_ + + Args: + in_channels (int): Channel num of input features. + num_segments (int): Number of frame segments. + alpha (int): ``alpha`` in the paper and is the ratio of the + intermediate channel number to the initial channel number in the + global branch. Defaults to 2. + adaptive_kernel_size (int): ``K`` in the paper and is the size of the + adaptive kernel size in the global branch. Defaults to 3. + beta (int): ``beta`` in the paper and is set to control the model + complexity in the local branch. Defaults to 4. + conv1d_kernel_size (int): Size of the convolution kernel of Conv1d in + the local branch. Defaults to 3. + adaptive_convolution_stride (int): The first dimension of strides in + the adaptive convolution of ``Temporal Adaptive Aggregation``. + Defaults to 1. + adaptive_convolution_padding (int): The first dimension of paddings in + the adaptive convolution of ``Temporal Adaptive Aggregation``. + Defaults to 1. + init_std (float): Std value for initiation of `nn.Linear`. Defaults to + 0.001. 
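`SubBatchNorm3D.aggregate_stats` earlier in sub_batchnorm3d.py folds the per-split statistics into one BatchNorm with the mean of the split means and, for the variance, the mean of the split variances plus the variance of the split means (law of total variance). A small numerical check of that identity for equal-sized splits; the module applies the same formula to BN running stats, where it is an approximation rather than an exact batch statistic:

import torch

torch.manual_seed(0)
num_splits, per_split, c = 4, 16, 8
x = torch.randn(num_splits, per_split, c)

split_means = x.mean(dim=1)                # (num_splits, c)
split_vars = x.var(dim=1, unbiased=False)  # biased variances, as in batch stats

mean = split_means.mean(dim=0)
var = split_vars.mean(dim=0) + ((split_means - mean) ** 2).mean(dim=0)

# matches the statistics of the whole batch when splits have equal size
print(torch.allclose(mean, x.reshape(-1, c).mean(0), atol=1e-5))                  # True
print(torch.allclose(var, x.reshape(-1, c).var(0, unbiased=False), atol=1e-5))    # True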
+ """ + + def __init__(self, + in_channels: int, + num_segments: int, + alpha: int = 2, + adaptive_kernel_size: int = 3, + beta: int = 4, + conv1d_kernel_size: int = 3, + adaptive_convolution_stride: int = 1, + adaptive_convolution_padding: int = 1, + init_std: float = 0.001) -> None: + super().__init__() + + assert beta > 0 and alpha > 0 + self.in_channels = in_channels + self.num_segments = num_segments + self.alpha = alpha + self.adaptive_kernel_size = adaptive_kernel_size + self.beta = beta + self.conv1d_kernel_size = conv1d_kernel_size + self.adaptive_convolution_stride = adaptive_convolution_stride + self.adaptive_convolution_padding = adaptive_convolution_padding + self.init_std = init_std + + self.G = nn.Sequential( + nn.Linear(num_segments, num_segments * alpha, bias=False), + nn.BatchNorm1d(num_segments * alpha), nn.ReLU(inplace=True), + nn.Linear(num_segments * alpha, adaptive_kernel_size, bias=False), + nn.Softmax(-1)) + + self.L = nn.Sequential( + nn.Conv1d( + in_channels, + in_channels // beta, + conv1d_kernel_size, + stride=1, + padding=conv1d_kernel_size // 2, + bias=False), nn.BatchNorm1d(in_channels // beta), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels // beta, in_channels, 1, bias=False), + nn.Sigmoid()) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + # [n, c, h, w] + n, c, h, w = x.size() + num_segments = self.num_segments + num_batches = n // num_segments + assert c == self.in_channels + + # [num_batches, c, num_segments, h, w] + x = x.view(num_batches, num_segments, c, h, w) + x = x.permute(0, 2, 1, 3, 4).contiguous() + + # [num_batches * c, num_segments, 1, 1] + theta_out = F.adaptive_avg_pool2d( + x.view(-1, num_segments, h, w), (1, 1)) + + # [num_batches * c, 1, adaptive_kernel_size, 1] + conv_kernel = self.G(theta_out.view(-1, num_segments)).view( + num_batches * c, 1, -1, 1) + + # [num_batches, c, num_segments, 1, 1] + local_activation = self.L(theta_out.view(-1, c, num_segments)).view( + num_batches, c, num_segments, 1, 1) + + # [num_batches, c, num_segments, h, w] + new_x = x * local_activation + + # [1, num_batches * c, num_segments, h * w] + y = F.conv2d( + new_x.view(1, num_batches * c, num_segments, h * w), + conv_kernel, + bias=None, + stride=(self.adaptive_convolution_stride, 1), + padding=(self.adaptive_convolution_padding, 0), + groups=num_batches * c) + + # [n, c, h, w] + y = y.view(num_batches, c, num_segments, h, w) + y = y.permute(0, 2, 1, 3, 4).contiguous().view(n, c, h, w) + + return y diff --git a/mmaction/models/common/transformer.py b/mmaction/models/common/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5af874f05f5b33ebe8bc57a345153fa2ee6df825 --- /dev/null +++ b/mmaction/models/common/transformer.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from einops import rearrange +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmengine.model import BaseModule +from mmengine.model.weight_init import constant_init +from mmengine.utils import digit_version + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DividedTemporalAttentionWithNorm(BaseModule): + """Temporal Attention in Divided Space Time Attention. + + Args: + embed_dims (int): Dimensions of embedding. 
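The core of `TAM.forward` earlier in tam.py is its temporal adaptive aggregation: a grouped `F.conv2d` in which every (sample, channel) pair gets its own dynamically predicted temporal kernel. A stripped-down sketch of just that trick, with arbitrary sizes and a random kernel standing in for the output of the global branch:

import torch
import torch.nn.functional as F

n_batches, c, t, hw, k = 2, 3, 8, 4, 3
feats = torch.randn(n_batches, c, t, hw)
# one dynamic temporal kernel per (sample, channel), normalised like TAM's softmax
kernels = torch.softmax(torch.randn(n_batches * c, 1, k, 1), dim=2)

out = F.conv2d(
    feats.reshape(1, n_batches * c, t, hw),  # fold (sample, channel) into groups
    kernels,
    bias=None,
    stride=(1, 1),
    padding=(k // 2, 0),
    groups=n_batches * c)                    # each group convolves exactly one channel
print(out.reshape(n_batches, c, t, hw).shape)  # torch.Size([2, 3, 8, 4])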
+ num_heads (int): Number of parallel attention heads in + TransformerCoder. + num_frames (int): Number of frames in the video. + attn_drop (float): A Dropout layer on attn_output_weights. Defaults to + 0.. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Defaults to 0.. + dropout_layer (dict): The dropout_layer used when adding the shortcut. + Defaults to `dict(type='DropPath', drop_prob=0.1)`. + norm_cfg (dict): Config dict for normalization layer. Defaults to + `dict(type='LN')`. + init_cfg (dict | None): The Config for initialization. Defaults to + None. + """ + + def __init__(self, + embed_dims, + num_heads, + num_frames, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='DropPath', drop_prob=0.1), + norm_cfg=dict(type='LN'), + init_cfg=None, + **kwargs): + super().__init__(init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.num_frames = num_frames + self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1] + + if digit_version(torch.__version__) < digit_version('1.9.0'): + kwargs.pop('batch_first', None) + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + self.temporal_fc = nn.Linear(self.embed_dims, self.embed_dims) + + self.init_weights() + + def init_weights(self): + """Initialize weights.""" + constant_init(self.temporal_fc, val=0, bias=0) + + def forward(self, query, key=None, value=None, residual=None, **kwargs): + """Defines the computation performed at every call.""" + assert residual is None, ( + 'Always adding the shortcut in the forward function') + + init_cls_token = query[:, 0, :].unsqueeze(1) + identity = query_t = query[:, 1:, :] + + # query_t [batch_size, num_patches * num_frames, embed_dims] + b, pt, m = query_t.size() + p, t = pt // self.num_frames, self.num_frames + + # res_temporal [batch_size * num_patches, num_frames, embed_dims] + query_t = self.norm(query_t.reshape(b * p, t, m)).permute(1, 0, 2) + res_temporal = self.attn(query_t, query_t, query_t)[0].permute(1, 0, 2) + res_temporal = self.dropout_layer( + self.proj_drop(res_temporal.contiguous())) + res_temporal = self.temporal_fc(res_temporal) + + # res_temporal [batch_size, num_patches * num_frames, embed_dims] + res_temporal = res_temporal.reshape(b, p * t, m) + + # ret_value [batch_size, num_patches * num_frames + 1, embed_dims] + new_query_t = identity + res_temporal + new_query = torch.cat((init_cls_token, new_query_t), 1) + return new_query + + +@MODELS.register_module() +class DividedSpatialAttentionWithNorm(BaseModule): + """Spatial Attention in Divided Space Time Attention. + + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads in + TransformerCoder. + num_frames (int): Number of frames in the video. + attn_drop (float): A Dropout layer on attn_output_weights. Defaults to + 0.. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Defaults to 0.. + dropout_layer (dict): The dropout_layer used when adding the shortcut. + Defaults to `dict(type='DropPath', drop_prob=0.1)`. + norm_cfg (dict): Config dict for normalization layer. Defaults to + `dict(type='LN')`. + init_cfg (dict | None): The Config for initialization. Defaults to + None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + num_frames, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='DropPath', drop_prob=0.1), + norm_cfg=dict(type='LN'), + init_cfg=None, + **kwargs): + super().__init__(init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.num_frames = num_frames + self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1] + if digit_version(torch.__version__) < digit_version('1.9.0'): + kwargs.pop('batch_first', None) + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + self.init_weights() + + def init_weights(self): + """init DividedSpatialAttentionWithNorm by default.""" + pass + + def forward(self, query, key=None, value=None, residual=None, **kwargs): + """Defines the computation performed at every call.""" + assert residual is None, ( + 'Always adding the shortcut in the forward function') + + identity = query + init_cls_token = query[:, 0, :].unsqueeze(1) + query_s = query[:, 1:, :] + + # query_s [batch_size, num_patches * num_frames, embed_dims] + b, pt, m = query_s.size() + p, t = pt // self.num_frames, self.num_frames + + # cls_token [batch_size * num_frames, 1, embed_dims] + cls_token = init_cls_token.repeat(1, t, 1).reshape(b * t, + m).unsqueeze(1) + + # query_s [batch_size * num_frames, num_patches + 1, embed_dims] + query_s = rearrange(query_s, 'b (p t) m -> (b t) p m', p=p, t=t) + query_s = torch.cat((cls_token, query_s), 1) + + # res_spatial [batch_size * num_frames, num_patches + 1, embed_dims] + query_s = self.norm(query_s).permute(1, 0, 2) + res_spatial = self.attn(query_s, query_s, query_s)[0].permute(1, 0, 2) + res_spatial = self.dropout_layer( + self.proj_drop(res_spatial.contiguous())) + + # cls_token [batch_size, 1, embed_dims] + cls_token = res_spatial[:, 0, :].reshape(b, t, m) + cls_token = torch.mean(cls_token, 1, True) + + # res_spatial [batch_size * num_frames, num_patches + 1, embed_dims] + res_spatial = rearrange( + res_spatial[:, 1:, :], '(b t) p m -> b (p t) m', p=p, t=t) + res_spatial = torch.cat((cls_token, res_spatial), 1) + + new_query = identity + res_spatial + return new_query + + +@MODELS.register_module() +class FFNWithNorm(FFN): + """FFN with pre normalization layer. + + FFNWithNorm is implemented to be compatible with `BaseTransformerLayer` + when using `DividedTemporalAttentionWithNorm` and + `DividedSpatialAttentionWithNorm`. + + FFNWithNorm has one main difference with FFN: + + - It apply one normalization layer before forwarding the input data to + feed-forward networks. + + Args: + embed_dims (int): Dimensions of embedding. Defaults to 256. + feedforward_channels (int): Hidden dimension of FFNs. Defaults to 1024. + num_fcs (int, optional): Number of fully-connected layers in FFNs. + Defaults to 2. + act_cfg (dict): Config for activate layers. + Defaults to `dict(type='ReLU')` + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Defaults to 0.. + add_residual (bool, optional): Whether to add the + residual connection. Defaults to `True`. + dropout_layer (dict | None): The dropout_layer used when adding the + shortcut. Defaults to None. + init_cfg (dict): The Config for initialization. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. Defaults to + `dict(type='LN')`. 
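`DividedTemporalAttentionWithNorm` and `DividedSpatialAttentionWithNorm` above differ mainly in how they regroup the `num_patches * num_frames` token axis before calling `nn.MultiheadAttention`: over frames for each patch location, or over patches for each frame. A toy sketch of the two regroupings and the inverse rearrange (sizes are arbitrary, cls-token handling omitted):

import torch
from einops import rearrange

b, p, t, m = 2, 4, 3, 8            # batch, patches per frame, frames, embed dims
tokens = torch.randn(b, p * t, m)  # patch tokens, ordered patch-major as in the modules

# temporal attention: each spatial location attends over its t frames
temporal = tokens.reshape(b * p, t, m)

# spatial attention: each frame attends over its p patches
spatial = rearrange(tokens, 'b (p t) m -> (b t) p m', p=p, t=t)

print(temporal.shape, spatial.shape)  # (8, 3, 8) and (6, 4, 8)

# inverse used after attention to restore the original token layout
restored = rearrange(spatial, '(b t) p m -> b (p t) m', p=p, t=t)
print(torch.equal(restored, tokens))  # True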
+ """ + + def __init__(self, *args, norm_cfg=dict(type='LN'), **kwargs): + super().__init__(*args, **kwargs) + self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1] + + def forward(self, x, residual=None): + """Defines the computation performed at every call.""" + assert residual is None, ('Cannot apply pre-norm with FFNWithNorm') + return super().forward(self.norm(x), x) diff --git a/mmaction/models/data_preprocessors/__init__.py b/mmaction/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d8ab6cd02f64b4f4fce318e384d708482425f2 --- /dev/null +++ b/mmaction/models/data_preprocessors/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_preprocessor import ActionDataPreprocessor +from .multimodal_data_preprocessor import MultiModalDataPreprocessor + +__all__ = ['ActionDataPreprocessor', 'MultiModalDataPreprocessor'] diff --git a/mmaction/models/data_preprocessors/__pycache__/__init__.cpython-310.pyc b/mmaction/models/data_preprocessors/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4d6b6398d29de3a40cd986a9a2dff8a94798541 Binary files /dev/null and b/mmaction/models/data_preprocessors/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/data_preprocessors/__pycache__/data_preprocessor.cpython-310.pyc b/mmaction/models/data_preprocessors/__pycache__/data_preprocessor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a62aeaa60d20b8c10e673d5cc7a2ac650c0a9bb Binary files /dev/null and b/mmaction/models/data_preprocessors/__pycache__/data_preprocessor.cpython-310.pyc differ diff --git a/mmaction/models/data_preprocessors/__pycache__/multimodal_data_preprocessor.cpython-310.pyc b/mmaction/models/data_preprocessors/__pycache__/multimodal_data_preprocessor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ecd5a6d4c95040aeb0ff507d2323f4ae2b13284 Binary files /dev/null and b/mmaction/models/data_preprocessors/__pycache__/multimodal_data_preprocessor.cpython-310.pyc differ diff --git a/mmaction/models/data_preprocessors/data_preprocessor.py b/mmaction/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..499cdd730fc53d66dcb5a657f5476f6f25fc8536 --- /dev/null +++ b/mmaction/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch + +from mmaction.registry import MODELS +from mmaction.utils import SampleList + + +@MODELS.register_module() +class ActionDataPreprocessor(BaseDataPreprocessor): + """Data pre-processor for action recognition tasks. + + Args: + mean (Sequence[float or int], optional): The pixel mean of channels + of images or stacked optical flow. Defaults to None. + std (Sequence[float or int], optional): The pixel standard deviation + of channels of images or stacked optical flow. Defaults to None. + to_rgb (bool): Whether to convert image from BGR to RGB. + Defaults to False. + to_float32 (bool): Whether to convert data to float32. + Defaults to True. + blending (dict, optional): Config for batch blending. + Defaults to None. + format_shape (str): Format shape of input data. + Defaults to ``'NCHW'``. 
+ """ + + def __init__(self, + mean: Optional[Sequence[Union[float, int]]] = None, + std: Optional[Sequence[Union[float, int]]] = None, + to_rgb: bool = False, + to_float32: bool = True, + blending: Optional[dict] = None, + format_shape: str = 'NCHW') -> None: + super().__init__() + self.to_rgb = to_rgb + self.to_float32 = to_float32 + self.format_shape = format_shape + + if mean is not None: + assert std is not None, 'To enable the normalization in ' \ + 'preprocessing, please specify both ' \ + '`mean` and `std`.' + # Enable the normalization in preprocessing. + self._enable_normalize = True + if self.format_shape == 'NCHW': + normalizer_shape = (-1, 1, 1) + elif self.format_shape in ['NCTHW', 'MIX2d3d']: + normalizer_shape = (-1, 1, 1, 1) + else: + raise ValueError(f'Invalid format shape: {format_shape}') + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(normalizer_shape), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(normalizer_shape), + False) + else: + self._enable_normalize = False + + if blending is not None: + self.blending = MODELS.build(blending) + else: + self.blending = None + + def forward(self, + data: Union[dict, Tuple[dict]], + training: bool = False) -> Union[dict, Tuple[dict]]: + """Perform normalization, padding, bgr2rgb conversion and batch + augmentation based on ``BaseDataPreprocessor``. + + Args: + data (dict or Tuple[dict]): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict or Tuple[dict]: Data in the same format as the model input. + """ + data = self.cast_data(data) + if isinstance(data, dict): + return self.forward_onesample(data, training=training) + elif isinstance(data, (tuple, list)): + outputs = [] + for data_sample in data: + output = self.forward_onesample(data_sample, training=training) + outputs.append(output) + return tuple(outputs) + else: + raise TypeError(f'Unsupported data type: {type(data)}!') + + def forward_onesample(self, data, training: bool = False) -> dict: + """Perform normalization, padding, bgr2rgb conversion and batch + augmentation on one data sample. + + Args: + data (dict): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. 
+ """ + inputs, data_samples = data['inputs'], data['data_samples'] + inputs, data_samples = self.preprocess(inputs, data_samples, training) + data['inputs'] = inputs + data['data_samples'] = data_samples + return data + + def preprocess(self, + inputs: List[torch.Tensor], + data_samples: SampleList, + training: bool = False) -> Tuple: + # --- Pad and stack -- + batch_inputs = stack_batch(inputs) + + if self.format_shape == 'MIX2d3d': + if batch_inputs.ndim == 4: + format_shape, view_shape = 'NCHW', (-1, 1, 1) + else: + format_shape, view_shape = 'NCTHW', None + else: + format_shape, view_shape = self.format_shape, None + + # ------ To RGB ------ + if self.to_rgb: + if format_shape == 'NCHW': + batch_inputs = batch_inputs[..., [2, 1, 0], :, :] + elif format_shape == 'NCTHW': + batch_inputs = batch_inputs[..., [2, 1, 0], :, :, :] + else: + raise ValueError(f'Invalid format shape: {format_shape}') + + # -- Normalization --- + if self._enable_normalize: + if view_shape is None: + batch_inputs = (batch_inputs - self.mean) / self.std + else: + mean = self.mean.view(view_shape) + std = self.std.view(view_shape) + batch_inputs = (batch_inputs - mean) / std + elif self.to_float32: + batch_inputs = batch_inputs.to(torch.float32) + + # ----- Blending ----- + if training and self.blending is not None: + batch_inputs, data_samples = self.blending(batch_inputs, + data_samples) + + return batch_inputs, data_samples diff --git a/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..c4353c447244251f2000cdf6a1efc8df3135a349 --- /dev/null +++ b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +from mmengine.model import BaseDataPreprocessor, ModuleDict + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class MultiModalDataPreprocessor(BaseDataPreprocessor): + """Multi-Modal data pre-processor for action recognition tasks.""" + + def __init__(self, preprocessors: Dict) -> None: + super().__init__() + self.preprocessors = ModuleDict() + for name, pre_cfg in preprocessors.items(): + assert 'type' in pre_cfg, ( + 'Each data preprocessor should contain the key type, ' + f'but got {pre_cfg}') + self.preprocessors[name] = MODELS.build(pre_cfg) + + def forward(self, data: Dict, training: bool = False) -> Dict: + """Preprocesses the data into the model input format. + + Args: + data (dict): Data returned by dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + for modality, modality_data in inputs.items(): + preprocessor = self.preprocessors[modality] + modality_data, data_samples = preprocessor.preprocess( + modality_data, data_samples, training) + inputs[modality] = modality_data + + data['inputs'] = inputs + data['data_samples'] = data_samples + return data diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b03f72f37fe383eb691f00625bdd6f904a9504cf --- /dev/null +++ b/mmaction/models/heads/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import BaseHead +from .feature_head import FeatureHead +from .gcn_head import GCNHead +from .i3d_head import I3DHead +from .mvit_head import MViTHead +from .omni_head import OmniHead +from .rgbpose_head import RGBPoseHead +from .slowfast_head import SlowFastHead +from .timesformer_head import TimeSformerHead +from .tpn_head import TPNHead +from .trn_head import TRNHead +from .tsm_head import TSMHead +from .tsn_audio_head import TSNAudioHead +from .tsn_head import TSNHead +from .uniformer_head import UniFormerHead +from .x3d_head import X3DHead + +__all__ = [ + 'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead', + 'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead', + 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead', 'FeatureHead' +] diff --git a/mmaction/models/heads/__pycache__/__init__.cpython-310.pyc b/mmaction/models/heads/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c8e71e7f1c376906cea0f1e3b39af8194846cad Binary files /dev/null and b/mmaction/models/heads/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/base.cpython-310.pyc b/mmaction/models/heads/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2e5beb8a532c3d1d27d842bc62552506539150a Binary files /dev/null and b/mmaction/models/heads/__pycache__/base.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/feature_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/feature_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..964dec24a7195e5930f226c3de8893c2e0f7c780 Binary files /dev/null and b/mmaction/models/heads/__pycache__/feature_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/gcn_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/gcn_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85d5d76bd9e7751e198e980cd5e3aadfe6e3457c Binary files /dev/null and b/mmaction/models/heads/__pycache__/gcn_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/i3d_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/i3d_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84a0df9d841022ca24f15f203aa1007e548f3f5b Binary files /dev/null and b/mmaction/models/heads/__pycache__/i3d_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/mvit_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/mvit_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54d1e833e9ad9e16280c2368c2f2edce0109c541 Binary files /dev/null and b/mmaction/models/heads/__pycache__/mvit_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/omni_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/omni_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab47b71fe6076b1a2797d3bed4a69b373c9b79e Binary files /dev/null and b/mmaction/models/heads/__pycache__/omni_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/rgbpose_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/rgbpose_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a76937e67d6aa32486260698e2dbdecf8662b07d Binary files /dev/null and b/mmaction/models/heads/__pycache__/rgbpose_head.cpython-310.pyc differ diff --git 
a/mmaction/models/heads/__pycache__/slowfast_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/slowfast_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc6bde8805b5f3cfb9e2a9af252951d7d369ee68 Binary files /dev/null and b/mmaction/models/heads/__pycache__/slowfast_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/timesformer_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/timesformer_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7d5c0c07acd9c33214e024bc78b635a75170abf Binary files /dev/null and b/mmaction/models/heads/__pycache__/timesformer_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/tpn_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/tpn_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fa7e4a8b7fe43c5d45f76bedfbdf2fcf2f5ea02 Binary files /dev/null and b/mmaction/models/heads/__pycache__/tpn_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/trn_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/trn_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bcc342d339d71f53ed166ab68b08082dc213f50 Binary files /dev/null and b/mmaction/models/heads/__pycache__/trn_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/tsm_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/tsm_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c255aa3d9672f53c0b19b21a67c9d20fc7721525 Binary files /dev/null and b/mmaction/models/heads/__pycache__/tsm_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/tsn_audio_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/tsn_audio_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a08f6e5cede94a7e7796513d72e45fd2103aef83 Binary files /dev/null and b/mmaction/models/heads/__pycache__/tsn_audio_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/tsn_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/tsn_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cf92d8774c5863cfedb1f1f04c46ab1ba2002d4 Binary files /dev/null and b/mmaction/models/heads/__pycache__/tsn_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/uniformer_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/uniformer_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b35463c7f4f89b64f77b17d76b7529cb7bba7d81 Binary files /dev/null and b/mmaction/models/heads/__pycache__/uniformer_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/__pycache__/x3d_head.cpython-310.pyc b/mmaction/models/heads/__pycache__/x3d_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23d361912221918f8c39160fdd7b5f212539d875 Binary files /dev/null and b/mmaction/models/heads/__pycache__/x3d_head.cpython-310.pyc differ diff --git a/mmaction/models/heads/base.py b/mmaction/models/heads/base.py new file mode 100644 index 0000000000000000000000000000000000000000..98ee11ee745fc965fc10d99c46989fad35e7261d --- /dev/null +++ b/mmaction/models/heads/base.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import ForwardResults, SampleList + + +class AvgConsensus(nn.Module): + """Average consensus module. + + Args: + dim (int): Decide which dim consensus function to apply. + Defaults to 1. + """ + + def __init__(self, dim: int = 1) -> None: + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + return x.mean(dim=self.dim, keepdim=True) + + +class BaseHead(BaseModule, metaclass=ABCMeta): + """Base class for head. + + All Head should subclass it. + All subclass should overwrite: + - :meth:`forward`, supporting to forward both for training and testing. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss', loss_weight=1.0)``. + multi_class (bool): Determines whether it is a multi-class + recognition task. Defaults to False. + label_smooth_eps (float): Epsilon used in label smooth. + Reference: arxiv.org/abs/1906.02629. Defaults to 0. + topk (int or tuple): Top-k accuracy. Defaults to ``(1, 5)``. + average_clips (dict, optional): Config for averaging class + scores over multiple clips. Defaults to None. + init_cfg (dict, optional): Config to control the initialization. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: Dict = dict( + type='CrossEntropyLoss', loss_weight=1.0), + multi_class: bool = False, + label_smooth_eps: float = 0.0, + topk: Union[int, Tuple[int]] = (1, 5), + average_clips: Optional[Dict] = None, + init_cfg: Optional[Dict] = None) -> None: + super(BaseHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.in_channels = in_channels + self.loss_cls = MODELS.build(loss_cls) + self.multi_class = multi_class + self.label_smooth_eps = label_smooth_eps + self.average_clips = average_clips + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + for _topk in topk: + assert _topk > 0, 'Top-k should be larger than 0' + self.topk = topk + + @abstractmethod + def forward(self, x, **kwargs) -> ForwardResults: + """Defines the computation performed at every call.""" + raise NotImplementedError + + def loss(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> Dict: + """Perform forward propagation of head and loss calculation on the + features of the upstream network. + + Args: + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + cls_scores = self(feats, **kwargs) + return self.loss_by_feat(cls_scores, data_samples) + + def loss_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (torch.Tensor): Classification prediction results of + all class, has shape (batch_size, num_classes). + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. 
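The `loss_by_feat` implementation that follows smooths hard labels into soft targets before computing the classification loss. A small numeric sketch of that smoothing formula (toy class count and epsilon):

```python
import torch
import torch.nn.functional as F

num_classes, eps = 5, 0.1
labels = torch.tensor([2, 0])                  # hard labels, shape [B]

soft = F.one_hot(labels, num_classes=num_classes).float()
soft = (1 - eps) * soft + eps / num_classes    # same formula as in the head
print(soft[0])  # tensor([0.0200, 0.0200, 0.9200, 0.0200, 0.0200])
```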
+ """ + labels = [x.gt_label for x in data_samples] + labels = torch.stack(labels).to(cls_scores.device) + labels = labels.squeeze() + + losses = dict() + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and labels.size()[0] == self.num_classes \ + and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_score` share the same + # shape. + labels = labels.unsqueeze(0) + + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses + + def predict(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> SampleList: + """Perform forward propagation of head and predict recognition results + on the features of the upstream network. + + Args: + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + cls_scores = self(feats, **kwargs) + return self.predict_by_feat(cls_scores, data_samples) + + def predict_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> SampleList: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The + annotation data of every samples. It usually includes + information such as `gt_label`. + + Returns: + List[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + num_segs = cls_scores.shape[0] // len(data_samples) + cls_scores = self.average_clip(cls_scores, num_segs=num_segs) + pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach() + + for data_sample, score, pred_label in zip(data_samples, cls_scores, + pred_labels): + data_sample.set_pred_score(score) + data_sample.set_pred_label(pred_label) + return data_samples + + def average_clip(self, + cls_scores: torch.Tensor, + num_segs: int = 1) -> torch.Tensor: + """Averaging class scores over multiple clips. + + Using different averaging types ('score' or 'prob' or None, + which defined in test_cfg) to computed the final averaged + class score. Only called in test mode. + + Args: + cls_scores (torch.Tensor): Class scores to be averaged. + num_segs (int): Number of clips for each input sample. + + Returns: + torch.Tensor: Averaged class scores. + """ + + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. 
'
+                             f'Currently supported ones are '
+                             f'["score", "prob", None]')
+
+        batch_size = cls_scores.shape[0]
+        cls_scores = cls_scores.view((batch_size // num_segs, num_segs) +
+                                     cls_scores.shape[1:])
+
+        if self.average_clips is None:
+            return cls_scores
+        elif self.average_clips == 'prob':
+            cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1)
+        elif self.average_clips == 'score':
+            cls_scores = cls_scores.mean(dim=1)
+
+        return cls_scores
diff --git a/mmaction/models/heads/feature_head.py b/mmaction/models/heads/feature_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b010daa65caa2da9588ea0644444fc7fc1dfc97c
--- /dev/null
+++ b/mmaction/models/heads/feature_head.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+
+from mmaction.registry import MODELS
+from .base import BaseHead
+
+
+@MODELS.register_module()
+class FeatureHead(BaseHead):
+    """General head for feature extraction.
+
+    Args:
+        spatial_type (str, optional): Pooling type in spatial dimension.
+            Default: 'avg'. If set to None, means keeping the spatial dimension,
+            and for GCN backbone, keeping the last two dimensions (T, V).
+        temporal_type (str, optional): Pooling type in temporal dimension.
+            Default: 'avg'. If set to None, means keeping the temporal dimension,
+            and for GCN backbone, keeping dimension M. Please note that the
+            channel order is kept the same as the output of the backbone,
+            [N, T, C, H, W] for 2D recognizer, and [N, M, C, T, V] for GCN
+            recognizer.
+        backbone_name (str, optional): Backbone name for specifying special
+            operations. Currently supports: `'tsm'`, `'slowfast'`, and `'gcn'`.
+            Defaults to None, which means treating the input as a normal feature.
+        num_segments (int, optional): Number of frame segments for TSM
+            backbone. Defaults to None.
+        kwargs (dict, optional): Any keyword argument to be used to initialize
+            the head.
+    """
+
+    def __init__(self,
+                 spatial_type: str = 'avg',
+                 temporal_type: str = 'avg',
+                 backbone_name: Optional[str] = None,
+                 num_segments: Optional[int] = None,
+                 **kwargs) -> None:
+        super().__init__(None, None, **kwargs)
+
+        self.temporal_type = temporal_type
+        self.backbone_name = backbone_name
+        self.num_segments = num_segments
+        if spatial_type == 'avg':
+            self.pool2d = torch.mean
+        elif spatial_type == 'max':
+            self.pool2d = torch.max
+        elif spatial_type is None:
+            self.pool2d = lambda x, dim: x
+        else:
+            raise NotImplementedError(
+                f'Unsupported spatial_type {spatial_type}')
+
+        if temporal_type == 'avg':
+            self.pool1d = torch.mean
+        elif temporal_type == 'max':
+            self.pool1d = torch.max
+        elif temporal_type is None:
+            self.pool1d = lambda x, dim: x
+        else:
+            raise NotImplementedError(
+                f'Unsupported temporal_type {temporal_type}')
+
+    def forward(self,
+                x: Tensor,
+                num_segs: Optional[int] = None,
+                **kwargs) -> Tensor:
+        """Defines the computation performed at every call.
+
+        Args:
+            x (Tensor): The input data.
+            num_segs (int): For 2D backbone. Number of segments into which
+                a video is divided. Defaults to None.
+        Returns:
+            Tensor: The output features after pooling.
+ """ + if isinstance(x, Tensor): + n_dims = x.ndim + elif isinstance(x, tuple): + n_dims = x[0].ndim + assert self.backbone_name == 'slowfast', \ + 'Only support SlowFast backbone to input tuple' + else: + raise NotImplementedError(f'Unsupported feature type: {type(x)}') + # For 2D backbone with spatial dimension + if n_dims == 4: + assert num_segs is not None + if self.backbone_name == 'tsm': + assert self.num_segments is not None, \ + 'Please Specify num_segments for TSM' + num_segs = self.num_segments + # [N, T, channels, H, W] + x = x.view((-1, num_segs) + x.shape[1:]) + feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=1) + + elif n_dims == 5: + if self.backbone_name == 'slowfast': + x_slow, x_fast = x + assert self.temporal_type is not None, \ + 'slowfast backbone has to pool temporal dimension' + x_fast = self.pool1d(self.pool2d(x_fast, dim=[-2, -1]), dim=2) + x_slow = self.pool1d(self.pool2d(x_slow, dim=[-2, -1]), dim=2) + feat = torch.cat((x_slow, x_fast), dim=1) + + # For GCN-based backbone + elif self.backbone_name == 'gcn': + # N, M, C, T, V + feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=1) + # For 3D backbone with spatial dimension + else: + # [N, channels, T, H, W] + feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=2) + # For backbone output feature without spatial and temporal dimension + elif n_dims == 2: + # [N, channels] + feat = x + + return feat + + def predict_by_feat(self, feats: Union[Tensor, Tuple[Tensor]], + data_samples) -> Tensor: + """Integrate multi-view features into one tensor. + + Args: + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + Tensor: The integrated multi-view features. + """ + num_segs = feats.shape[0] // len(data_samples) + feats = self.average_clip(feats, num_segs=num_segs) + + return feats diff --git a/mmaction/models/heads/gcn_head.py b/mmaction/models/heads/gcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1d21504640c64fad13ff714cb0d12fe9bdfb2338 --- /dev/null +++ b/mmaction/models/heads/gcn_head.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Union + +import torch +import torch.nn as nn + +from mmaction.registry import MODELS +from .base import BaseHead + + +@MODELS.register_module() +class GCNHead(BaseHead): + """The classification head for GCN. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + dropout (float): Probability of dropout layer. Defaults to 0. + init_cfg (dict or list[dict]): Config to control the initialization. + Defaults to ``dict(type='Normal', layer='Linear', std=0.01)``. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: Dict = dict(type='CrossEntropyLoss'), + dropout: float = 0., + average_clips: str = 'prob', + init_cfg: Union[Dict, List[Dict]] = dict( + type='Normal', layer='Linear', std=0.01), + **kwargs) -> None: + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + average_clips=average_clips, + init_cfg=init_cfg, + **kwargs) + self.dropout_ratio = dropout + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear(self.in_channels, self.num_classes) + + def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor: + """Forward features from the upstream network. + + Args: + x (torch.Tensor): Features from the upstream network. + + Returns: + torch.Tensor: Classification scores with shape (B, num_classes). + """ + + N, M, C, T, V = x.shape + x = x.view(N * M, C, T, V) + x = self.pool(x) + x = x.view(N, M, C) + x = x.mean(dim=1) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores diff --git a/mmaction/models/heads/i3d_head.py b/mmaction/models/heads/i3d_head.py new file mode 100644 index 0000000000000000000000000000000000000000..53ad1b4243fbc46624cbfcb149e35cd0930f1190 --- /dev/null +++ b/mmaction/models/heads/i3d_head.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class I3DHead(BaseHead): + """Classification head for I3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + dropout_ratio (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + dropout_ratio: float = 0.5, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + else: + self.avg_pool = None + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. 
+ """ + # [N, in_channels, 4, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N, in_channels, 1, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1, 1] + x = x.view(x.shape[0], -1) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d832f4a09c230ab6901f66e4a275c0e20641601a --- /dev/null +++ b/mmaction/models/heads/mvit_head.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +from mmengine.model.weight_init import constant_init, trunc_normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class MViTHead(BaseHead): + """Classification head for Multi-scale ViT. + + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. Defaults to 0.5. + init_std (float): Std value for Initiation. Defaults to 0.02. + init_scale (float): Scale factor for Initiation parameters. + Defaults to 1. + with_cls_token (bool): Whether the backbone output feature with + cls_token. Defaults to True. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.5, + init_std: float = 0.02, + init_scale: float = 1.0, + with_cls_token: bool = True, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.init_std = init_std + self.init_scale = init_scale + self.dropout_ratio = dropout_ratio + self.with_cls_token = with_cls_token + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + trunc_normal_init(self.fc_cls.weight, std=self.init_std) + constant_init(self.fc_cls.bias, 0.02) + self.fc_cls.weight.data.mul_(self.init_scale) + self.fc_cls.bias.data.mul_(self.init_scale) + + def pre_logits(self, feats: Tuple[List[Tensor]]) -> Tensor: + """The process before the final classification head. + + The input ``feats`` is a tuple of list of tensor, and each tensor is + the feature of a backbone stage. + """ + if self.with_cls_token: + _, cls_token = feats[-1] + return cls_token + else: + patch_token = feats[-1] + return patch_token.mean(dim=(2, 3, 4)) + + def forward(self, x: Tuple[List[Tensor]], **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tuple[List[Tensor]]): The input data. + + Returns: + Tensor: The classification scores for input samples. 
+ """ + x = self.pre_logits(x) + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/omni_head.py b/mmaction/models/heads/omni_head.py new file mode 100644 index 0000000000000000000000000000000000000000..04c42e603dfc88b9c2e781a2d3ea76317df5434d --- /dev/null +++ b/mmaction/models/heads/omni_head.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, SampleList +from .base import BaseHead + + +@MODELS.register_module() +class OmniHead(BaseHead): + """Classification head for OmniResNet that accepts both image and video + inputs. + + Args: + image_classes (int): Number of image classes to be classified. + video_classes (int): Number of video classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + image_dropout_ratio (float): Probability of dropout layer for the image + head. Defaults to 0.2. + video_dropout_ratio (float): Probability of dropout layer for the video + head. Defaults to 0.5. + video_nl_head (bool): if true, use a non-linear head for the video + head. Defaults to True. + """ + + def __init__(self, + image_classes: int, + video_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + image_dropout_ratio: float = 0.2, + video_dropout_ratio: float = 0.5, + video_nl_head: bool = True, + **kwargs) -> None: + super().__init__(image_classes, in_channels, loss_cls, **kwargs) + + self.fc2d = nn.Sequential( + nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.BatchNorm1d(in_channels), + nn.Dropout(image_dropout_ratio), + nn.Linear(in_channels, image_classes)) + + if video_nl_head: + self.fc3d = nn.Sequential( + nn.AdaptiveAvgPool3d(1), nn.Flatten(), + nn.Linear(in_channels, video_classes * 2), + nn.BatchNorm1d(video_classes * 2), nn.ReLU(inplace=True), + nn.Dropout(video_dropout_ratio), + nn.Linear(video_classes * 2, video_classes)) + else: + self.fc3d = nn.Sequential( + nn.AdaptiveAvgPool3d(1), nn.Flatten(), + nn.BatchNorm1d(in_channels), nn.Dropout(video_dropout_ratio), + nn.Linear(in_channels, video_classes)) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + if len(x.shape) == 4: + cls_score = self.fc2d(x) + else: + cls_score = self.fc3d(x) + return cls_score + + def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]], + data_samples: SampleList) -> dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (Tensor): Classification prediction results of + all class, has shape (batch_size, num_classes). + data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. 
+ """ + labels = [x.gt_label for x in data_samples] + labels = torch.stack(labels).to(cls_scores.device) + labels = labels.squeeze() + + losses = dict() + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_socre` share the same + # shape. + labels = labels.unsqueeze(0) + + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses diff --git a/mmaction/models/heads/rgbpose_head.py b/mmaction/models/heads/rgbpose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ff68e14842a57da5fc479f4f85149c1c8d2188e3 --- /dev/null +++ b/mmaction/models/heads/rgbpose_head.py @@ -0,0 +1,229 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model.weight_init import normal_init + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import SampleList +from .base import BaseHead + + +@MODELS.register_module() +class RGBPoseHead(BaseHead): + """The classification head for RGBPoseConv3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (tuple[int]): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + loss_components (list[str]): The components of the loss. + Defaults to ``['rgb', 'pose']``. + loss_weights (float or tuple[float]): The weights of the losses. + Defaults to 1. + dropout (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: Tuple[int], + loss_cls: Dict = dict(type='CrossEntropyLoss'), + loss_components: List[str] = ['rgb', 'pose'], + loss_weights: Union[float, Tuple[float]] = 1., + dropout: float = 0.5, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + if isinstance(dropout, float): + dropout = {'rgb': dropout, 'pose': dropout} + assert isinstance(dropout, dict) + + if loss_components is not None: + self.loss_components = loss_components + if isinstance(loss_weights, float): + loss_weights = [loss_weights] * len(loss_components) + assert len(loss_weights) == len(loss_components) + self.loss_weights = loss_weights + + self.dropout = dropout + self.init_std = init_std + + self.dropout_rgb = nn.Dropout(p=self.dropout['rgb']) + self.dropout_pose = nn.Dropout(p=self.dropout['pose']) + + self.fc_rgb = nn.Linear(self.in_channels[0], num_classes) + self.fc_pose = nn.Linear(self.in_channels[1], num_classes) + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_rgb, std=self.init_std) + normal_init(self.fc_pose, std=self.init_std) + + def forward(self, x: Tuple[torch.Tensor]) -> Dict: + """Defines the computation performed at every call.""" + x_rgb, x_pose = self.avg_pool(x[0]), self.avg_pool(x[1]) + x_rgb = x_rgb.view(x_rgb.size(0), -1) + x_pose = x_pose.view(x_pose.size(0), -1) + + x_rgb = self.dropout_rgb(x_rgb) + x_pose = self.dropout_pose(x_pose) + + cls_scores = dict() + cls_scores['rgb'] = self.fc_rgb(x_rgb) + cls_scores['pose'] = self.fc_pose(x_pose) + + return cls_scores + + def loss(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> Dict: + """Perform forward propagation of head and loss calculation on the + features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + cls_scores = self(feats, **kwargs) + return self.loss_by_feat(cls_scores, data_samples) + + def loss_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + labels = torch.stack([x.gt_label for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and labels.size()[0] == self.num_classes \ + and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_score` share the same + # shape. + labels = labels.unsqueeze(0) + + losses = dict() + for loss_name, weight in zip(self.loss_components, self.loss_weights): + cls_score = cls_scores[loss_name] + loss_cls = self.loss_by_scores(cls_score, labels) + loss_cls = {loss_name + '_' + k: v for k, v in loss_cls.items()} + loss_cls[f'{loss_name}_loss_cls'] *= weight + losses.update(loss_cls) + return losses + + def loss_by_scores(self, cls_scores: torch.Tensor, + labels: torch.Tensor) -> Dict: + """Calculate the loss based on the features extracted by the head. 
+ + Args: + cls_scores (torch.Tensor): Classification prediction + results of all class, has shape (batch_size, num_classes). + labels (torch.Tensor): The labels used to calculate the loss. + + Returns: + dict: A dictionary of loss components. + """ + losses = dict() + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses + + def predict(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Perform forward propagation of head and predict recognition results + on the features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + cls_scores = self(feats, **kwargs) + return self.predict_by_feat(cls_scores, data_samples) + + def predict_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> SampleList: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The + annotation data of every samples. It usually includes + information such as `gt_label`. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + pred_scores = [dict() for _ in range(len(data_samples))] + + for name in self.loss_components: + cls_score = cls_scores[name] + cls_score = self.predict_by_scores(cls_score, data_samples) + for pred_score, score in zip(pred_scores, cls_score): + pred_score[f'{name}'] = score + + for data_sample, pred_score, in zip(data_samples, pred_scores): + data_sample.set_pred_score(pred_score) + return data_samples + + def predict_by_scores(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> torch.Tensor: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The annotation + data of every samples. + + Returns: + torch.Tensor: The averaged classification scores. + """ + + num_segs = cls_scores.shape[0] // len(data_samples) + cls_scores = self.average_clip(cls_scores, num_segs=num_segs) + return cls_scores diff --git a/mmaction/models/heads/slowfast_head.py b/mmaction/models/heads/slowfast_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6745cf32ec4f99132d0c7b7c555c2b515f303ffb --- /dev/null +++ b/mmaction/models/heads/slowfast_head.py @@ -0,0 +1,83 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
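For the `RGBPoseHead` above, `loss_by_feat` prefixes each branch's losses with its name and scales only the classification term by the branch weight. A toy mimic of that bookkeeping with made-up loss values:

```python
# Hypothetical per-branch losses; real values come from loss_by_scores above.
loss_components, loss_weights = ['rgb', 'pose'], [1.0, 0.5]
per_branch = {'rgb': {'loss_cls': 1.2, 'top1_acc': 0.6},
              'pose': {'loss_cls': 0.8, 'top1_acc': 0.7}}

losses = {}
for name, weight in zip(loss_components, loss_weights):
    branch = {f'{name}_{k}': v for k, v in per_branch[name].items()}
    branch[f'{name}_loss_cls'] *= weight      # only the cls loss is weighted
    losses.update(branch)
print(losses)
# {'rgb_loss_cls': 1.2, 'rgb_top1_acc': 0.6,
#  'pose_loss_cls': 0.4, 'pose_top1_acc': 0.7}
```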
+from typing import Tuple + +import torch +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class SlowFastHead(BaseHead): + """The classification head for SlowFast. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss'). + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + dropout_ratio (float): Probability of dropout layer. Default: 0.8. + init_std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + dropout_ratio: float = 0.8, + init_std: float = 0.01, + **kwargs) -> None: + + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(in_channels, num_classes) + + if self.spatial_type == 'avg': + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + else: + self.avg_pool = None + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tuple[Tensor], **kwargs) -> None: + """Defines the computation performed at every call. + + Args: + x (tuple[torch.Tensor]): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + # ([N, channel_slow, T1, H, W], [(N, channel_fast, T2, H, W)]) + x_slow, x_fast = x + # ([N, channel_slow, 1, 1, 1], [N, channel_fast, 1, 1, 1]) + x_slow = self.avg_pool(x_slow) + x_fast = self.avg_pool(x_fast) + # [N, channel_fast + channel_slow, 1, 1, 1] + x = torch.cat((x_fast, x_slow), dim=1) + + if self.dropout is not None: + x = self.dropout(x) + + # [N x C] + x = x.view(x.size(0), -1) + # [N x num_classes] + cls_score = self.fc_cls(x) + + return cls_score diff --git a/mmaction/models/heads/timesformer_head.py b/mmaction/models/heads/timesformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..291fa28a86b8dcdffa2a5ef51433f4258ae02964 --- /dev/null +++ b/mmaction/models/heads/timesformer_head.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model.weight_init import trunc_normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class TimeSformerHead(BaseHead): + """Classification head for TimeSformer. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + init_std (float): Std value for Initiation. Defaults to 0.02. + dropout_ratio (float): Probability of dropout layer. + Defaults to : 0.0. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
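A hedged usage sketch of the `SlowFastHead` defined just above: `in_channels` must equal the sum of the slow and fast pathway channels, because the two pooled features are concatenated before the classifier. Sizes are illustrative and the import assumes this mmaction branch is installed:

```python
import torch
from mmaction.models.heads import SlowFastHead

# The fused channel count is channel_slow + channel_fast (2048 + 256 here).
head = SlowFastHead(num_classes=400, in_channels=2048 + 256)
x_slow = torch.randn(2, 2048, 4, 7, 7)    # slow pathway [N, C, T1, H, W]
x_fast = torch.randn(2, 256, 32, 7, 7)    # fast pathway [N, C, T2, H, W]
print(head((x_slow, x_fast)).shape)       # torch.Size([2, 400])
```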
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + init_std: float = 0.02, + dropout_ratio: float = 0.0, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.init_std = init_std + self.dropout_ratio = dropout_ratio + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + trunc_normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/tpn_head.py b/mmaction/models/heads/tpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b90d979883fa2f904edc9a439be6a9269cbcacbf --- /dev/null +++ b/mmaction/models/heads/tpn_head.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.device import get_device +from torch import Tensor + +from mmaction.registry import MODELS +from .tsn_head import TSNHead + + +@MODELS.register_module() +class TPNHead(TSNHead): + """Class head for TPN.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. + self.avg_pool3d = nn.AdaptiveAvgPool3d((1, 1, 1)) + else: + self.avg_pool3d = None + + self.avg_pool2d = None + self.new_cls = None + + def _init_new_cls(self) -> None: + self.new_cls = nn.Conv3d(self.in_channels, self.num_classes, 1, 1, 0) + self.new_cls = self.new_cls.to(get_device()) + self.new_cls.weight.copy_(self.fc_cls.weight[..., None, None, None]) + self.new_cls.bias.copy_(self.fc_cls.bias) + + def forward(self, + x, + num_segs: Optional[int] = None, + fcn_test: bool = False, + **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + num_segs (int, optional): Number of segments into which a video + is divided. Defaults to None. + fcn_test (bool): Whether to apply full convolution (fcn) testing. + Defaults to False. + + Returns: + Tensor: The classification scores for input samples. 
+ """ + if fcn_test: + if self.avg_pool3d: + x = self.avg_pool3d(x) + if self.new_cls is None: + self._init_new_cls() + x = self.new_cls(x) + cls_score_feat_map = x.view(x.size(0), -1) + return cls_score_feat_map + + if self.avg_pool2d is None: + kernel_size = (1, x.shape[-2], x.shape[-1]) + self.avg_pool2d = nn.AvgPool3d(kernel_size, stride=1, padding=0) + + if num_segs is None: + # [N, in_channels, 3, 7, 7] + x = self.avg_pool3d(x) + else: + # [N * num_segs, in_channels, 7, 7] + x = self.avg_pool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = x.reshape((-1, num_segs) + x.shape[1:]) + # [N, num_segs, in_channels, 1, 1] + x = self.consensus(x) + # [N, 1, in_channels, 1, 1] + x = x.squeeze(1) + # [N, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1] + x = x.view(x.size(0), -1) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/trn_head.py b/mmaction/models/heads/trn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2aa2b0bcd58157279fffd270a2055a1dabd1dff6 --- /dev/null +++ b/mmaction/models/heads/trn_head.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools + +import numpy as np +import torch +import torch.nn as nn +from mmengine.model.weight_init import normal_init + +from mmaction.registry import MODELS +from .base import BaseHead + + +class RelationModule(nn.Module): + """Relation Module of TRN. + + Args: + hidden_dim (int): The dimension of hidden layer of MLP in relation + module. + num_segments (int): Number of frame segments. + num_classes (int): Number of classes to be classified. + """ + + def __init__(self, hidden_dim, num_segments, num_classes): + super().__init__() + self.hidden_dim = hidden_dim + self.num_segments = num_segments + self.num_classes = num_classes + bottleneck_dim = 512 + self.classifier = nn.Sequential( + nn.ReLU(), + nn.Linear(self.num_segments * self.hidden_dim, bottleneck_dim), + nn.ReLU(), nn.Linear(bottleneck_dim, self.num_classes)) + + def init_weights(self): + """Use the default kaiming_uniform for all nn.linear layers.""" + pass + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + Returns: + Tensor: The classification scores for input samples. + """ + # [N, num_segs * hidden_dim] + x = x.view(x.size(0), -1) + x = self.classifier(x) + return x + + +class RelationModuleMultiScale(nn.Module): + """Relation Module with Multi Scale of TRN. + + Args: + hidden_dim (int): The dimension of hidden layer of MLP in relation + module. + num_segments (int): Number of frame segments. + num_classes (int): Number of classes to be classified. 
+ """ + + def __init__(self, hidden_dim, num_segments, num_classes): + super().__init__() + self.hidden_dim = hidden_dim + self.num_segments = num_segments + self.num_classes = num_classes + + # generate the multiple frame relations + self.scales = range(num_segments, 1, -1) + + self.relations_scales = [] + self.subsample_scales = [] + max_subsample = 3 + for scale in self.scales: + # select the different frame features for different scales + relations_scale = list( + itertools.combinations(range(self.num_segments), scale)) + self.relations_scales.append(relations_scale) + # sample `max_subsample` relation_scale at most + self.subsample_scales.append( + min(max_subsample, len(relations_scale))) + assert len(self.relations_scales[0]) == 1 + + bottleneck_dim = 256 + self.fc_fusion_scales = nn.ModuleList() + for scale in self.scales: + fc_fusion = nn.Sequential( + nn.ReLU(), nn.Linear(scale * self.hidden_dim, bottleneck_dim), + nn.ReLU(), nn.Linear(bottleneck_dim, self.num_classes)) + self.fc_fusion_scales.append(fc_fusion) + + def init_weights(self): + """Use the default kaiming_uniform for all nn.linear layers.""" + pass + + def forward(self, x): + # the first one is the largest scale + act_all = x[:, self.relations_scales[0][0], :] + act_all = act_all.view( + act_all.size(0), self.scales[0] * self.hidden_dim) + act_all = self.fc_fusion_scales[0](act_all) + + for scaleID in range(1, len(self.scales)): + # iterate over the scales + idx_relations_randomsample = np.random.choice( + len(self.relations_scales[scaleID]), + self.subsample_scales[scaleID], + replace=False) + for idx in idx_relations_randomsample: + act_relation = x[:, self.relations_scales[scaleID][idx], :] + act_relation = act_relation.view( + act_relation.size(0), + self.scales[scaleID] * self.hidden_dim) + act_relation = self.fc_fusion_scales[scaleID](act_relation) + act_all += act_relation + return act_all + + +@MODELS.register_module() +class TRNHead(BaseHead): + """Class head for TRN. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + num_segments (int): Number of frame segments. Default: 8. + loss_cls (dict): Config for building loss. Default: + dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + relation_type (str): The relation module type. Choices are 'TRN' or + 'TRNMultiScale'. Default: 'TRNMultiScale'. + hidden_dim (int): The dimension of hidden layer of MLP in relation + module. Default: 256. + dropout_ratio (float): Probability of dropout layer. Default: 0.8. + init_std (float): Std value for Initiation. Default: 0.001. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + + def __init__(self, + num_classes, + in_channels, + num_segments=8, + loss_cls=dict(type='CrossEntropyLoss'), + spatial_type='avg', + relation_type='TRNMultiScale', + hidden_dim=256, + dropout_ratio=0.8, + init_std=0.001, + **kwargs): + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + + self.num_classes = num_classes + self.in_channels = in_channels + self.num_segments = num_segments + self.spatial_type = spatial_type + self.relation_type = relation_type + self.hidden_dim = hidden_dim + self.dropout_ratio = dropout_ratio + self.init_std = init_std + + if self.relation_type == 'TRN': + self.consensus = RelationModule(self.hidden_dim, self.num_segments, + self.num_classes) + elif self.relation_type == 'TRNMultiScale': + self.consensus = RelationModuleMultiScale(self.hidden_dim, + self.num_segments, + self.num_classes) + else: + raise ValueError(f'Unknown Relation Type {self.relation_type}!') + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.hidden_dim) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d(1) + else: + self.avg_pool = None + + def init_weights(self): + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + self.consensus.init_weights() + + def forward(self, x, num_segs, **kwargs): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + num_segs (int): Useless in TRNHead. By default, `num_segs` + is equal to `clip_len * num_clips * num_crops`, which is + automatically generated in Recognizer forward phase and + useless in TRN models. The `self.num_segments` we need is a + hyper parameter to build TRN models. + Returns: + torch.Tensor: The classification scores for input samples. + """ + # [N * num_segs, in_channels, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N * num_segs, in_channels, 1, 1] + x = torch.flatten(x, 1) + # [N * num_segs, in_channels] + if self.dropout is not None: + x = self.dropout(x) + + # [N, num_segs, hidden_dim] + cls_score = self.fc_cls(x) + cls_score = cls_score.view((-1, self.num_segments) + + cls_score.size()[1:]) + + # [N, num_classes] + cls_score = self.consensus(cls_score) + return cls_score diff --git a/mmaction/models/heads/tsm_head.py b/mmaction/models/heads/tsm_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2d2469d86c119fe06610f0ee1c08b435115ccea1 --- /dev/null +++ b/mmaction/models/heads/tsm_head.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, get_str_type +from .base import AvgConsensus, BaseHead + + +@MODELS.register_module() +class TSMHead(BaseHead): + """Class head for TSM. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + num_segments (int): Number of frame segments. Default: 8. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + consensus (dict or ConfigDict): Consensus config dict. + dropout_ratio (float): Probability of dropout layer. Default: 0.4. + init_std (float): Std value for Initiation. 
Default: 0.01. + is_shift (bool): Indicating whether the feature is shifted. + Default: True. + temporal_pool (bool): Indicating whether feature is temporal pooled. + Default: False. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + num_segments: int = 8, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + consensus: ConfigType = dict(type='AvgConsensus', dim=1), + dropout_ratio: float = 0.8, + init_std: float = 0.001, + is_shift: bool = True, + temporal_pool: bool = False, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.num_segments = num_segments + self.init_std = init_std + self.is_shift = is_shift + self.temporal_pool = temporal_pool + + consensus_ = consensus.copy() + + consensus_type = consensus_.pop('type') + if get_str_type(consensus_type) == 'AvgConsensus': + self.consensus = AvgConsensus(**consensus_) + else: + self.consensus = None + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d(1) + else: + self.avg_pool = None + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, num_segs: int, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + num_segs (int): Useless in TSMHead. By default, `num_segs` + is equal to `clip_len * num_clips * num_crops`, which is + automatically generated in Recognizer forward phase and + useless in TSM models. The `self.num_segments` we need is a + hyper parameter to build TSM models. + Returns: + Tensor: The classification scores for input samples. + """ + # [N * num_segs, in_channels, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N * num_segs, in_channels, 1, 1] + x = torch.flatten(x, 1) + # [N * num_segs, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_segs, num_classes] + cls_score = self.fc_cls(x) + + if self.is_shift and self.temporal_pool: + # [2 * N, num_segs // 2, num_classes] + cls_score = cls_score.view((-1, self.num_segments // 2) + + cls_score.size()[1:]) + else: + # [N, num_segs, num_classes] + cls_score = cls_score.view((-1, self.num_segments) + + cls_score.size()[1:]) + # [N, 1, num_classes] + cls_score = self.consensus(cls_score) + # [N, num_classes] + return cls_score.squeeze(1) diff --git a/mmaction/models/heads/tsn_audio_head.py b/mmaction/models/heads/tsn_audio_head.py new file mode 100644 index 0000000000000000000000000000000000000000..25fa7c1095db6fdc70d43b3bd53bbb56bf52f417 --- /dev/null +++ b/mmaction/models/heads/tsn_audio_head.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.model.weight_init import normal_init + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class TSNAudioHead(BaseHead): + """Classification head for TSN on audio. + + Args: + num_classes (int): Number of classes to be classified. 
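The reshape-then-consensus step at the end of TSMHead.forward can be checked in isolation; this sketch mirrors the non-temporal-pool branch followed by AvgConsensus(dim=1) and the final squeeze, folded into one step:

import torch

N, num_segments, num_classes = 2, 8, 400
frame_scores = torch.randn(N * num_segments, num_classes)  # output of fc_cls

scores = frame_scores.view(-1, num_segments, num_classes)  # [N, num_segments, num_classes]
video_scores = scores.mean(dim=1)                          # consensus + squeeze in one step
print(video_scores.shape)                                  # torch.Size([2, 400])

When is_shift and temporal_pool are both enabled, the view instead uses num_segments // 2 and the leading dimension becomes 2 * N, as handled above.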
+ in_channels (int): Number of channels in input feature. + loss_cls (Union[dict, ConfigDict]): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + spatial_type (str): Pooling type in spatial dimension. + Defaults to ``avg``. + dropout_ratio (float): Probability of dropout layer. Defaults to 0.4. + init_std (float): Std value for Initiation. Defaults to 0.01. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + dropout_ratio: float = 0.4, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + else: + self.avg_pool = None + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The classification scores for input samples. + """ + # [N * num_segs, in_channels, h, w] + x = self.avg_pool(x) + # [N, in_channels, 1, 1] + x = x.view(x.size(0), -1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/tsn_head.py b/mmaction/models/heads/tsn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e11ce7d15bea53d453eb03c79a3a9dfff7d9a925 --- /dev/null +++ b/mmaction/models/heads/tsn_head.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, get_str_type +from .base import AvgConsensus, BaseHead + + +@MODELS.register_module() +class TSNHead(BaseHead): + """Class head for TSN. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss'). + spatial_type (str or ConfigDict): Pooling type in spatial dimension. + Default: 'avg'. + consensus (dict): Consensus config dict. + dropout_ratio (float): Probability of dropout layer. Default: 0.4. + init_std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + consensus: ConfigType = dict(type='AvgConsensus', dim=1), + dropout_ratio: float = 0.4, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + + consensus_ = consensus.copy() + + consensus_type = consensus_.pop('type') + if get_str_type(consensus_type) == 'AvgConsensus': + self.consensus = AvgConsensus(**consensus_) + else: + self.consensus = None + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + else: + self.avg_pool = None + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, num_segs: int, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + num_segs (int): Number of segments into which a video + is divided. + Returns: + Tensor: The classification scores for input samples. + """ + # [N * num_segs, in_channels, 7, 7] + if self.avg_pool is not None: + if isinstance(x, tuple): + shapes = [y.shape for y in x] + assert 1 == 0, f'x is tuple {shapes}' + x = self.avg_pool(x) + # [N * num_segs, in_channels, 1, 1] + x = x.reshape((-1, num_segs) + x.shape[1:]) + # [N, num_segs, in_channels, 1, 1] + x = self.consensus(x) + # [N, 1, in_channels, 1, 1] + x = x.squeeze(1) + # [N, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1] + x = x.view(x.size(0), -1) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/uniformer_head.py b/mmaction/models/heads/uniformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa296de665281eb9240ea1f628fe577cb019d58 --- /dev/null +++ b/mmaction/models/heads/uniformer_head.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmengine.fileio import load +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, get_str_type +from .base import BaseHead + + +@MODELS.register_module() +class UniFormerHead(BaseHead): + """Classification head for UniFormer. supports loading pretrained + Kinetics-710 checkpoint to fine-tuning on other Kinetics dataset. + + A pytorch implement of: `UniFormerV2: Spatiotemporal + Learning by Arming Image ViTs with Video UniFormer + ` + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. + Defaults to : 0.0. + channel_map (str, optional): Channel map file to selecting + channels from pretrained head with extra channels. + Defaults to None. 
+ init_cfg (dict or ConfigDict, optional): Config to control the + initialization. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.01) + ]``. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.0, + channel_map: Optional[str] = None, + init_cfg: Optional[dict] = dict( + type='TruncNormal', layer='Linear', std=0.02), + **kwargs) -> None: + super().__init__( + num_classes, in_channels, loss_cls, init_cfg=init_cfg, **kwargs) + self.channel_map = channel_map + self.dropout_ratio = dropout_ratio + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def _select_channels(self, stact_dict): + selected_channels = load(self.channel_map) + for key in stact_dict: + stact_dict[key] = stact_dict[key][selected_channels] + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + if get_str_type(self.init_cfg['type']) == 'Pretrained': + assert self.channel_map is not None, \ + 'load cls_head weights needs to specify the channel map file' + logger = MMLogger.get_current_instance() + pretrained = self.init_cfg['checkpoint'] + logger.info(f'load pretrained model from {pretrained}') + state_dict = _load_checkpoint_with_prefix( + 'cls_head.', pretrained, map_location='cpu') + self._select_channels(state_dict) + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + else: + super().init_weights() + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/x3d_head.py b/mmaction/models/heads/x3d_head.py new file mode 100644 index 0000000000000000000000000000000000000000..bee94882950c6da1443b8cc372f58d419bea18be --- /dev/null +++ b/mmaction/models/heads/x3d_head.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class X3DHead(BaseHead): + """Classification head for I3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + dropout_ratio (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. + fc1_bias (bool): If the first fc layer has bias. Default: False. 
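The channel-map logic in _select_channels above amounts to row-indexing the pretrained classifier weights; a standalone sketch with made-up indices and sizes:

import torch

state_dict = {
    'fc_cls.weight': torch.randn(710, 768),  # a Kinetics-710 classifier, hypothetical width
    'fc_cls.bias': torch.randn(710),
}
selected_channels = [3, 41, 705]  # hypothetical channel map: target-class indices in K710 order

for key in state_dict:
    state_dict[key] = state_dict[key][selected_channels]

print(state_dict['fc_cls.weight'].shape, state_dict['fc_cls.bias'].shape)
# torch.Size([3, 768]) torch.Size([3])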
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + dropout_ratio: float = 0.5, + init_std: float = 0.01, + fc1_bias: bool = False, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.in_channels = in_channels + self.mid_channels = 2048 + self.num_classes = num_classes + self.fc1_bias = fc1_bias + + self.fc1 = nn.Linear( + self.in_channels, self.mid_channels, bias=self.fc1_bias) + self.fc2 = nn.Linear(self.mid_channels, self.num_classes) + + self.relu = nn.ReLU() + + self.pool = None + if self.spatial_type == 'avg': + self.pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + elif self.spatial_type == 'max': + self.pool = nn.AdaptiveMaxPool3d((1, 1, 1)) + else: + raise NotImplementedError + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc1, std=self.init_std) + normal_init(self.fc2, std=self.init_std) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + # [N, in_channels, T, H, W] + assert self.pool is not None + x = self.pool(x) + # [N, in_channels, 1, 1, 1] + # [N, in_channels, 1, 1, 1] + x = x.view(x.shape[0], -1) + # [N, in_channels] + x = self.fc1(x) + # [N, 2048] + x = self.relu(x) + + if self.dropout is not None: + x = self.dropout(x) + + cls_score = self.fc2(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/localizers/__init__.py b/mmaction/models/localizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f42f0fd18d5ae83e3f2c3f3bf6839da30c2654ba --- /dev/null +++ b/mmaction/models/localizers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bmn import BMN +from .bsn import PEM, TEM +from .drn.drn import DRN +from .tcanet import TCANet + +__all__ = ['TEM', 'PEM', 'BMN', 'TCANet', 'DRN'] diff --git a/mmaction/models/localizers/__pycache__/__init__.cpython-310.pyc b/mmaction/models/localizers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5068c0814071663340d2833e125279d0d49b734 Binary files /dev/null and b/mmaction/models/localizers/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/localizers/__pycache__/bmn.cpython-310.pyc b/mmaction/models/localizers/__pycache__/bmn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90de7d634efd09b4f803451232c0c2d74ada24eb Binary files /dev/null and b/mmaction/models/localizers/__pycache__/bmn.cpython-310.pyc differ diff --git a/mmaction/models/localizers/__pycache__/bsn.cpython-310.pyc b/mmaction/models/localizers/__pycache__/bsn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb06eadd38f9121a1b7aeb73ccdb56e624a17d05 Binary files /dev/null and b/mmaction/models/localizers/__pycache__/bsn.cpython-310.pyc differ diff --git a/mmaction/models/localizers/bmn.py b/mmaction/models/localizers/bmn.py new file mode 100644 index 0000000000000000000000000000000000000000..e24e5b16caff45fa29eeacdf8818348d19e43003 --- /dev/null +++ b/mmaction/models/localizers/bmn.py @@ -0,0 +1,467 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +import torch.nn as nn +from mmengine.model import BaseModel + +from mmaction.registry import MODELS +from .utils import post_processing, temporal_iop, temporal_iou + + +@MODELS.register_module() +class BMN(BaseModel): + """Boundary Matching Network for temporal action proposal generation. + + Please refer `BMN: Boundary-Matching Network for Temporal Action Proposal + Generation `_. + Code Reference https://github.com/JJBOY/BMN-Boundary-Matching-Network + Args: + temporal_dim (int): Total frames selected for each video. + boundary_ratio (float): Ratio for determining video boundaries. + num_samples (int): Number of samples for each proposal. + num_samples_per_bin (int): Number of bin samples for each sample. + feat_dim (int): Feature dimension. + soft_nms_alpha (float): Soft NMS alpha. + soft_nms_low_threshold (float): Soft NMS low threshold. + soft_nms_high_threshold (float): Soft NMS high threshold. + post_process_top_k (int): Top k proposals in post process. + feature_extraction_interval (int): + Interval used in feature extraction. Default: 16. + loss_cls (dict): Config for building loss. + Default: ``dict(type='BMNLoss')``. + hidden_dim_1d (int): Hidden dim for 1d conv. Default: 256. + hidden_dim_2d (int): Hidden dim for 2d conv. Default: 128. + hidden_dim_3d (int): Hidden dim for 3d conv. Default: 512. 
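The constructor arguments documented above are normally supplied through a config dict and built via the MODELS registry; a hypothetical example (values chosen for illustration, not taken from a shipped config):

model = dict(
    type='BMN',
    temporal_dim=100,
    boundary_ratio=0.5,
    num_samples=32,
    num_samples_per_bin=3,
    feat_dim=400,
    soft_nms_alpha=0.4,
    soft_nms_low_threshold=0.5,
    soft_nms_high_threshold=0.9,
    post_process_top_k=100)
# built elsewhere with MODELS.build(model)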
+ """ + + def __init__(self, + temporal_dim, + boundary_ratio, + num_samples, + num_samples_per_bin, + feat_dim, + soft_nms_alpha, + soft_nms_low_threshold, + soft_nms_high_threshold, + post_process_top_k, + feature_extraction_interval=16, + loss_cls=dict(type='BMNLoss'), + hidden_dim_1d=256, + hidden_dim_2d=128, + hidden_dim_3d=512): + super().__init__() + + self.tscale = temporal_dim + self.boundary_ratio = boundary_ratio + self.num_samples = num_samples + self.num_samples_per_bin = num_samples_per_bin + self.feat_dim = feat_dim + self.soft_nms_alpha = soft_nms_alpha + self.soft_nms_low_threshold = soft_nms_low_threshold + self.soft_nms_high_threshold = soft_nms_high_threshold + self.post_process_top_k = post_process_top_k + self.feature_extraction_interval = feature_extraction_interval + self.loss_cls = MODELS.build(loss_cls) + self.hidden_dim_1d = hidden_dim_1d + self.hidden_dim_2d = hidden_dim_2d + self.hidden_dim_3d = hidden_dim_3d + + self._get_interp1d_mask() + + # Base Module + self.x_1d_b = nn.Sequential( + nn.Conv1d( + self.feat_dim, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True), + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True)) + + # Temporal Evaluation Module + self.x_1d_s = nn.Sequential( + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True), + nn.Conv1d(self.hidden_dim_1d, 1, kernel_size=1), nn.Sigmoid()) + self.x_1d_e = nn.Sequential( + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True), + nn.Conv1d(self.hidden_dim_1d, 1, kernel_size=1), nn.Sigmoid()) + + # Proposal Evaluation Module + self.x_1d_p = nn.Sequential( + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1), nn.ReLU(inplace=True)) + self.x_3d_p = nn.Sequential( + nn.Conv3d( + self.hidden_dim_1d, + self.hidden_dim_3d, + kernel_size=(self.num_samples, 1, 1)), nn.ReLU(inplace=True)) + self.x_2d_p = nn.Sequential( + nn.Conv2d(self.hidden_dim_3d, self.hidden_dim_2d, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + self.hidden_dim_2d, + self.hidden_dim_2d, + kernel_size=3, + padding=1), nn.ReLU(inplace=True), + nn.Conv2d( + self.hidden_dim_2d, + self.hidden_dim_2d, + kernel_size=3, + padding=1), nn.ReLU(inplace=True), + nn.Conv2d(self.hidden_dim_2d, 2, kernel_size=1), nn.Sigmoid()) + self.anchors_tmins, self.anchors_tmaxs = self._temporal_anchors( + -0.5, 1.5) + self.match_map = self._match_map() + # self.bm_mask = self._get_bm_mask() + self.register_buffer('bm_mask', self._get_bm_mask()) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + pass + + def forward(self, inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Tensor): The input tensor with shape + (N, C, ...) in general. 
+ data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + inputs = torch.stack(inputs) + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + def loss(self, batch_inputs, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as ``gt_labels``. + + Returns: + dict: A dictionary of loss components. + """ + gt_bbox = [ + sample.gt_instances['gt_bbox'] for sample in batch_data_samples + ] + label_confidence, label_start, label_end = self.generate_labels( + gt_bbox) + + device = batch_inputs.device + label_confidence = label_confidence.to(device) + label_start = label_start.to(device) + label_end = label_end.to(device) + + confidence_map, start, end = self._forward(batch_inputs) + + loss = self.loss_cls(confidence_map, start, end, label_confidence, + label_start, label_end, self.bm_mask) + loss_dict = dict(loss=loss[0]) + return loss_dict + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + """Define the computation performed at every call when testing.""" + confidence_map, start, end = self._forward(batch_inputs) + start_scores = start[0].cpu().numpy() + end_scores = end[0].cpu().numpy() + cls_confidence = (confidence_map[0][1]).cpu().numpy() + reg_confidence = (confidence_map[0][0]).cpu().numpy() + + max_start = max(start_scores) + max_end = max(end_scores) + + # generate the set of start points and end points + start_bins = np.zeros(len(start_scores)) + start_bins[0] = 1 # [1,0,0...,0,0] + end_bins = np.zeros(len(end_scores)) + end_bins[-1] = 1 # [0,0,0...,0,1] + for idx in range(1, self.tscale - 1): + if start_scores[idx] > start_scores[ + idx + 1] and start_scores[idx] > start_scores[idx - 1]: + start_bins[idx] = 1 + elif start_scores[idx] > (0.5 * max_start): + start_bins[idx] = 1 + if end_scores[idx] > end_scores[ + idx + 1] and end_scores[idx] > end_scores[idx - 1]: + end_bins[idx] = 1 + elif end_scores[idx] > (0.5 * max_end): + end_bins[idx] = 1 + + # iterate through all combinations of start_index and end_index + new_proposals = [] + for idx in range(self.tscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + 1 + if end_index < self.tscale and start_bins[ + start_index] == 1 and end_bins[end_index] == 1: + tmin = start_index / self.tscale + tmax = end_index / self.tscale + tmin_score = start_scores[start_index] + tmax_score = end_scores[end_index] + cls_score = cls_confidence[idx, jdx] + reg_score = reg_confidence[idx, jdx] + score = tmin_score * tmax_score * cls_score * reg_score + new_proposals.append([ + tmin, tmax, tmin_score, tmax_score, cls_score, + reg_score, score + ]) + new_proposals = np.stack(new_proposals) + video_info = 
batch_data_samples[0].metainfo + proposal_list = post_processing(new_proposals, video_info, + self.soft_nms_alpha, + self.soft_nms_low_threshold, + self.soft_nms_high_threshold, + self.post_process_top_k, + self.feature_extraction_interval) + output = [ + dict( + video_name=video_info['video_name'], + proposal_list=proposal_list) + ] + return output + + @staticmethod + def _get_interp1d_bin_mask(seg_tmin, seg_tmax, tscale, num_samples, + num_samples_per_bin): + """Generate sample mask for a boundary-matching pair.""" + plen = float(seg_tmax - seg_tmin) + plen_sample = plen / (num_samples * num_samples_per_bin - 1.0) + total_samples = [ + seg_tmin + plen_sample * i + for i in range(num_samples * num_samples_per_bin) + ] + p_mask = [] + for idx in range(num_samples): + bin_samples = total_samples[idx * num_samples_per_bin:(idx + 1) * + num_samples_per_bin] + bin_vector = np.zeros(tscale) + for sample in bin_samples: + sample_upper = math.ceil(sample) + sample_decimal, sample_down = math.modf(sample) + if 0 <= int(sample_down) <= (tscale - 1): + bin_vector[int(sample_down)] += 1 - sample_decimal + if 0 <= int(sample_upper) <= (tscale - 1): + bin_vector[int(sample_upper)] += sample_decimal + bin_vector = 1.0 / num_samples_per_bin * bin_vector + p_mask.append(bin_vector) + p_mask = np.stack(p_mask, axis=1) + return p_mask + + def _get_interp1d_mask(self): + """Generate sample mask for each point in Boundary-Matching Map.""" + mask_mat = [] + for start_index in range(self.tscale): + mask_mat_vector = [] + for duration_index in range(self.tscale): + if start_index + duration_index < self.tscale: + p_tmin = start_index + p_tmax = start_index + duration_index + center_len = float(p_tmax - p_tmin) + 1 + sample_tmin = p_tmin - (center_len * self.boundary_ratio) + sample_tmax = p_tmax + (center_len * self.boundary_ratio) + p_mask = self._get_interp1d_bin_mask( + sample_tmin, sample_tmax, self.tscale, + self.num_samples, self.num_samples_per_bin) + else: + p_mask = np.zeros([self.tscale, self.num_samples]) + mask_mat_vector.append(p_mask) + mask_mat_vector = np.stack(mask_mat_vector, axis=2) + mask_mat.append(mask_mat_vector) + mask_mat = np.stack(mask_mat, axis=3) + mask_mat = mask_mat.astype(np.float32) + self.sample_mask = nn.Parameter( + torch.tensor(mask_mat).view(self.tscale, -1), requires_grad=False) + + def _get_bm_mask(self): + """Generate Boundary-Matching Mask.""" + bm_mask = [] + for idx in range(self.tscale): + mask_vector = [1] * (self.tscale - idx) + [0] * idx + bm_mask.append(mask_vector) + bm_mask = torch.tensor(bm_mask, dtype=torch.float) + return bm_mask + + def _match_map(self): + """Generate match map.""" + temporal_gap = 1. / self.tscale + match_map = [] + for idx in range(self.tscale): + match_window = [] + tmin = temporal_gap * idx + for jdx in range(1, self.tscale + 1): + tmax = tmin + temporal_gap * jdx + match_window.append([tmin, tmax]) + match_map.append(match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + return match_map + + def _temporal_anchors(self, tmin_offset=0., tmax_offset=1.): + """Generate temporal anchors. + + Args: + tmin_offset (int): Offset for the minimum value of temporal anchor. + Default: 0. + tmax_offset (int): Offset for the maximum value of temporal anchor. + Default: 1. + Returns: + tuple[Sequence[float]]: The minimum and maximum values of temporal + anchors. + """ + temporal_gap = 1. 
/ self.tscale + anchors_tmins = [] + anchors_tmaxs = [] + for i in range(self.tscale): + anchors_tmins.append(temporal_gap * (i + tmin_offset)) + anchors_tmaxs.append(temporal_gap * (i + tmax_offset)) + + return anchors_tmins, anchors_tmaxs + + def _forward(self, x): + """Define the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + Returns: + torch.Tensor: The output of the module. + """ + # x.shape [batch_size, self.feat_dim, self.tscale] + base_feature = self.x_1d_b(x) + # base_feature.shape [batch_size, self.hidden_dim_1d, self.tscale] + start = self.x_1d_s(base_feature).squeeze(1) + # start.shape [batch_size, self.tscale] + end = self.x_1d_e(base_feature).squeeze(1) + # end.shape [batch_size, self.tscale] + confidence_map = self.x_1d_p(base_feature) + # [batch_size, self.hidden_dim_1d, self.tscale] + confidence_map = self._boundary_matching_layer(confidence_map) + # [batch_size, self.hidden_dim_1d,, self.num_sampls, self.tscale, self.tscale] # noqa + confidence_map = self.x_3d_p(confidence_map).squeeze(2) + # [batch_size, self.hidden_dim_3d, self.tscale, self.tscale] + confidence_map = self.x_2d_p(confidence_map) + # [batch_size, 2, self.tscale, self.tscale] + + return confidence_map, start, end + + def _boundary_matching_layer(self, x): + """Generate matching layer.""" + input_size = x.size() + out = torch.matmul(x, + self.sample_mask).reshape(input_size[0], + input_size[1], + self.num_samples, + self.tscale, self.tscale) + return out + + def generate_labels(self, gt_bbox): + """Generate training labels.""" + # TODO: do this without numpy + match_score_confidence_list = [] + match_score_start_list = [] + match_score_end_list = [] + for every_gt_bbox in gt_bbox: + gt_iou_map = [] + every_gt_bbox = every_gt_bbox.cpu() + for start, end in every_gt_bbox: + if isinstance(start, torch.Tensor): + start = start.numpy() + if isinstance(end, torch.Tensor): + end = end.numpy() + current_gt_iou_map = temporal_iou(self.match_map[:, 0], + self.match_map[:, 1], start, + end) + current_gt_iou_map = np.reshape(current_gt_iou_map, + [self.tscale, self.tscale]) + gt_iou_map.append(current_gt_iou_map) + gt_iou_map = np.array(gt_iou_map).astype(np.float32) + gt_iou_map = np.max(gt_iou_map, axis=0) + + gt_tmins = every_gt_bbox[:, 0] + gt_tmaxs = every_gt_bbox[:, 1] + + gt_len_pad = 3 * (1. 
/ self.tscale) + + gt_start_bboxs = np.stack( + (gt_tmins - gt_len_pad / 2, gt_tmins + gt_len_pad / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_tmaxs - gt_len_pad / 2, gt_tmaxs + gt_len_pad / 2), axis=1) + + match_score_start = [] + match_score_end = [] + + for anchor_tmin, anchor_tmax in zip(self.anchors_tmins, + self.anchors_tmaxs): + match_score_start.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_start_bboxs[:, 0], gt_start_bboxs[:, + 1]))) + match_score_end.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_end_bboxs[:, 0], gt_end_bboxs[:, 1]))) + match_score_confidence_list.append(gt_iou_map) + match_score_start_list.append(match_score_start) + match_score_end_list.append(match_score_end) + + def to_tensor(x): + return torch.Tensor(np.array(x)) + + match_score_confidence_list = to_tensor(match_score_confidence_list) + match_score_start_list = to_tensor(match_score_start_list) + match_score_end_list = to_tensor(match_score_end_list) + return (match_score_confidence_list, match_score_start_list, + match_score_end_list) diff --git a/mmaction/models/localizers/bsn.py b/mmaction/models/localizers/bsn.py new file mode 100644 index 0000000000000000000000000000000000000000..2f084c7970e4d536a1ba2de03eefb1697dda3ee7 --- /dev/null +++ b/mmaction/models/localizers/bsn.py @@ -0,0 +1,506 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel +from mmengine.model.weight_init import constant_init, kaiming_init + +from mmaction.registry import MODELS +from .utils import post_processing, temporal_iop + + +@MODELS.register_module() +class TEM(BaseModel): + """Temporal Evaluation Model for Boundary Sensitive Network. + + Please refer `BSN: Boundary Sensitive Network for Temporal Action + Proposal Generation `_. + Code reference + https://github.com/wzmsltw/BSN-boundary-sensitive-network + Args: + temporal_dim (int): Total frames selected for each video. + tem_feat_dim (int): Feature dimension. + tem_hidden_dim (int): Hidden layer dimension. + tem_match_threshold (float): Temporal evaluation match threshold. + loss_cls (dict): Config for building loss. + Default: ``dict(type='BinaryLogisticRegressionLoss')``. + loss_weight (float): Weight term for action_loss. Default: 2. + output_dim (int): Output dimension. Default: 3. + conv1_ratio (float): Ratio of conv1 layer output. Default: 1.0. + conv2_ratio (float): Ratio of conv2 layer output. Default: 1.0. + conv3_ratio (float): Ratio of conv3 layer output. Default: 0.01. 
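generate_labels above relies on the imported temporal_iou / temporal_iop helpers. One common definition of temporal IoU is sketched below (the imported versions may differ in broadcasting details); temporal_iop conventionally divides the same intersection by the anchor length instead of the union:

import numpy as np

def temporal_iou_1xN(anchor_min, anchor_max, gt_min, gt_max):
    """IoU between one temporal anchor and an array of ground-truth segments."""
    inter = np.clip(np.minimum(anchor_max, gt_max) - np.maximum(anchor_min, gt_min), 0, None)
    union = (anchor_max - anchor_min) + (gt_max - gt_min) - inter
    return inter / union

print(temporal_iou_1xN(0.2, 0.4, np.array([0.1, 0.5]), np.array([0.3, 0.9])))
# approximately [0.333, 0.0]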
+ """ + + def __init__(self, + temporal_dim, + boundary_ratio, + tem_feat_dim, + tem_hidden_dim, + tem_match_threshold, + loss_cls=dict(type='BinaryLogisticRegressionLoss'), + loss_weight=2, + output_dim=3, + conv1_ratio=1, + conv2_ratio=1, + conv3_ratio=0.01): + super().__init__() + + self.temporal_dim = temporal_dim + self.boundary_ratio = boundary_ratio + self.feat_dim = tem_feat_dim + self.c_hidden = tem_hidden_dim + self.match_threshold = tem_match_threshold + self.output_dim = output_dim + self.loss_cls = MODELS.build(loss_cls) + self.loss_weight = loss_weight + self.conv1_ratio = conv1_ratio + self.conv2_ratio = conv2_ratio + self.conv3_ratio = conv3_ratio + + self.conv1 = nn.Conv1d( + in_channels=self.feat_dim, + out_channels=self.c_hidden, + kernel_size=3, + stride=1, + padding=1, + groups=1) + self.conv2 = nn.Conv1d( + in_channels=self.c_hidden, + out_channels=self.c_hidden, + kernel_size=3, + stride=1, + padding=1, + groups=1) + self.conv3 = nn.Conv1d( + in_channels=self.c_hidden, + out_channels=self.output_dim, + kernel_size=1, + stride=1, + padding=0) + self.anchors_tmins, self.anchors_tmaxs = self._temporal_anchors() + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + def _temporal_anchors(self, tmin_offset=0., tmax_offset=1.): + """Generate temporal anchors. + + Args: + tmin_offset (int): Offset for the minimum value of temporal anchor. + Default: 0. + tmax_offset (int): Offset for the maximum value of temporal anchor. + Default: 1. + Returns: + tuple[Sequence[float]]: The minimum and maximum values of temporal + anchors. + """ + temporal_gap = 1. / self.temporal_dim + anchors_tmins = [] + anchors_tmaxs = [] + for i in range(self.temporal_dim): + anchors_tmins.append(temporal_gap * (i + tmin_offset)) + anchors_tmaxs.append(temporal_gap * (i + tmax_offset)) + + return anchors_tmins, anchors_tmaxs + + def _forward(self, x): + """Define the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + Returns: + torch.Tensor: The output of the module. + """ + x = F.relu(self.conv1_ratio * self.conv1(x)) + x = F.relu(self.conv2_ratio * self.conv2(x)) + x = torch.sigmoid(self.conv3_ratio * self.conv3(x)) + return x + + def loss(self, batch_inputs, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as ``gt_labels``. + + Returns: + dict: A dictionary of loss components. 
+ """ + tem_output = self._forward(batch_inputs) + + score_action = tem_output[:, 0, :] + score_start = tem_output[:, 1, :] + score_end = tem_output[:, 2, :] + + gt_bbox = [ + sample.gt_instances['gt_bbox'] for sample in batch_data_samples + ] + label_action, label_start, label_end = self.generate_labels(gt_bbox) + device = batch_inputs.device + label_action = label_action.to(device) + label_start = label_start.to(device) + label_end = label_end.to(device) + + loss_action = self.loss_cls(score_action, label_action, + self.match_threshold) + loss_start = self.loss_cls(score_start, label_start, + self.match_threshold) + loss_end = self.loss_cls(score_end, label_end, self.match_threshold) + + loss_dict = { + 'loss_action': loss_action * self.loss_weight, + 'loss_start': loss_start, + 'loss_end': loss_end + } + + return loss_dict + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + """Define the computation performed at every call when testing.""" + tem_output = self._forward(batch_inputs).cpu().numpy() + batch_action = tem_output[:, 0, :] + batch_start = tem_output[:, 1, :] + batch_end = tem_output[:, 2, :] + + video_results = [] + for batch_idx, _ in enumerate(batch_action): + video_name = batch_data_samples[batch_idx].metainfo['video_name'] + video_action = batch_action[batch_idx] + video_start = batch_start[batch_idx] + video_end = batch_end[batch_idx] + video_result = np.stack((video_action, video_start, video_end, + self.anchors_tmins, self.anchors_tmaxs), + axis=1) + video_results.append((video_name, video_result)) + return video_results + + def generate_labels(self, gt_bbox): + """Generate training labels.""" + # TODO: do this without numpy + match_score_action_list = [] + match_score_start_list = [] + match_score_end_list = [] + for every_gt_bbox in gt_bbox: + gt_tmins = every_gt_bbox[:, 0].cpu().numpy() + gt_tmaxs = every_gt_bbox[:, 1].cpu().numpy() + + gt_lens = gt_tmaxs - gt_tmins + gt_len_pad = np.maximum(1. / self.temporal_dim, + self.boundary_ratio * gt_lens) + + gt_start_bboxs = np.stack( + (gt_tmins - gt_len_pad / 2, gt_tmins + gt_len_pad / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_tmaxs - gt_len_pad / 2, gt_tmaxs + gt_len_pad / 2), axis=1) + + match_score_action = [] + match_score_start = [] + match_score_end = [] + + for anchor_tmin, anchor_tmax in zip(self.anchors_tmins, + self.anchors_tmaxs): + match_score_action.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, gt_tmins, + gt_tmaxs))) + match_score_start.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_start_bboxs[:, 0], gt_start_bboxs[:, + 1]))) + match_score_end.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_end_bboxs[:, 0], gt_end_bboxs[:, 1]))) + match_score_action_list.append(match_score_action) + match_score_start_list.append(match_score_start) + match_score_end_list.append(match_score_end) + match_score_action_list = torch.Tensor(match_score_action_list) + match_score_start_list = torch.Tensor(match_score_start_list) + match_score_end_list = torch.Tensor(match_score_end_list) + return (match_score_action_list, match_score_start_list, + match_score_end_list) + + def forward(self, inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. 
+ - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + if type(inputs) is not torch.Tensor: + inputs = torch.stack(inputs) + + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + +@MODELS.register_module() +class PEM(BaseModel): + """Proposals Evaluation Model for Boundary Sensitive Network. + + Please refer `BSN: Boundary Sensitive Network for Temporal Action + Proposal Generation `_. + Code reference + https://github.com/wzmsltw/BSN-boundary-sensitive-network + Args: + pem_feat_dim (int): Feature dimension. + pem_hidden_dim (int): Hidden layer dimension. + pem_u_ratio_m (float): Ratio for medium score proprosals to balance + data. + pem_u_ratio_l (float): Ratio for low score proprosals to balance data. + pem_high_temporal_iou_threshold (float): High IoU threshold. + pem_low_temporal_iou_threshold (float): Low IoU threshold. + soft_nms_alpha (float): Soft NMS alpha. + soft_nms_low_threshold (float): Soft NMS low threshold. + soft_nms_high_threshold (float): Soft NMS high threshold. + post_process_top_k (int): Top k proposals in post process. + feature_extraction_interval (int): + Interval used in feature extraction. Default: 16. + fc1_ratio (float): Ratio for fc1 layer output. Default: 0.1. + fc2_ratio (float): Ratio for fc2 layer output. Default: 0.1. + output_dim (int): Output dimension. Default: 1. 
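The u_ratio_m / u_ratio_l arguments above control how many medium- and low-IoU proposals are kept relative to high-IoU ones; a standalone sketch of that balancing with hypothetical IoU thresholds of 0.6 and 0.2:

import torch

iou = torch.rand(1000)                                  # reference IoU of sampled proposals
high = (iou > 0.6).float()
med = ((iou <= 0.6) & (iou > 0.2)).float()
low = (iou <= 0.2).float()

u_ratio_m, u_ratio_l = 1.0, 2.0
r_m = torch.clamp(u_ratio_m * high.sum() / med.sum(), max=1.0)
r_l = torch.clamp(u_ratio_l * high.sum() / low.sum(), max=1.0)

keep_med = (torch.rand_like(med) * med > (1.0 - r_m)).float()
keep_low = (torch.rand_like(low) * low > (1.0 - r_l)).float()
weights = high + keep_med + keep_low                    # 0/1 weights for the regression loss
print(int(weights.sum()))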
+ """ + + def __init__(self, + pem_feat_dim: int, + pem_hidden_dim: int, + pem_u_ratio_m: float, + pem_u_ratio_l: float, + pem_high_temporal_iou_threshold: float, + pem_low_temporal_iou_threshold: float, + soft_nms_alpha: float, + soft_nms_low_threshold: float, + soft_nms_high_threshold: float, + post_process_top_k: int, + feature_extraction_interval: int = 16, + fc1_ratio: float = 0.1, + fc2_ratio: float = 0.1, + output_dim: int = 1): + super().__init__() + + self.feat_dim = pem_feat_dim + self.hidden_dim = pem_hidden_dim + self.u_ratio_m = pem_u_ratio_m + self.u_ratio_l = pem_u_ratio_l + self.pem_high_temporal_iou_threshold = pem_high_temporal_iou_threshold + self.pem_low_temporal_iou_threshold = pem_low_temporal_iou_threshold + self.soft_nms_alpha = soft_nms_alpha + self.soft_nms_low_threshold = soft_nms_low_threshold + self.soft_nms_high_threshold = soft_nms_high_threshold + self.post_process_top_k = post_process_top_k + self.feature_extraction_interval = feature_extraction_interval + self.fc1_ratio = fc1_ratio + self.fc2_ratio = fc2_ratio + self.output_dim = output_dim + + self.fc1 = nn.Linear( + in_features=self.feat_dim, out_features=self.hidden_dim, bias=True) + self.fc2 = nn.Linear( + in_features=self.hidden_dim, + out_features=self.output_dim, + bias=True) + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + def _forward(self, x): + """Define the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + Returns: + torch.Tensor: The output of the module. + """ + x = F.relu(self.fc1_ratio * self.fc1(x)) + x = torch.sigmoid(self.fc2_ratio * self.fc2(x)) + return x + + def loss(self, batch_inputs, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as ``gt_labels``. + + Returns: + dict: A dictionary of loss components. + """ + device = self.fc1.weight.device + + bsp_feature = torch.cat([ + sample.gt_instances['bsp_feature'] for sample in batch_data_samples + ]).to(device) + + reference_temporal_iou = torch.cat([ + sample.gt_instances['reference_temporal_iou'] + for sample in batch_data_samples + ]).to(device) + + pem_output = self._forward(bsp_feature) + + anchors_temporal_iou = pem_output.view(-1) + u_hmask = (reference_temporal_iou > + self.pem_high_temporal_iou_threshold).float() + u_mmask = ( + (reference_temporal_iou <= self.pem_high_temporal_iou_threshold) + & (reference_temporal_iou > self.pem_low_temporal_iou_threshold) + ).float() + u_lmask = (reference_temporal_iou <= + self.pem_low_temporal_iou_threshold).float() + + num_h = torch.sum(u_hmask) + num_m = torch.sum(u_mmask) + num_l = torch.sum(u_lmask) + + r_m = self.u_ratio_m * num_h / (num_m) + r_m = torch.min(r_m, torch.Tensor([1.0]).to(device))[0] + u_smmask = torch.rand(u_hmask.size()[0], device=device) + u_smmask = u_smmask * u_mmask + u_smmask = (u_smmask > (1. - r_m)).float() + + r_l = self.u_ratio_l * num_h / (num_l) + r_l = torch.min(r_l, torch.Tensor([1.0]).to(device))[0] + u_slmask = torch.rand(u_hmask.size()[0], device=device) + u_slmask = u_slmask * u_lmask + u_slmask = (u_slmask > (1. 
- r_l)).float() + + temporal_iou_weights = u_hmask + u_smmask + u_slmask + temporal_iou_loss = F.smooth_l1_loss(anchors_temporal_iou, + reference_temporal_iou) + temporal_iou_loss = torch.sum( + temporal_iou_loss * + temporal_iou_weights) / torch.sum(temporal_iou_weights) + loss_dict = dict(temporal_iou_loss=temporal_iou_loss) + + return loss_dict + + def _parse(self, gt_instances, key): + out = torch.cat([gt[key] for gt in gt_instances]) + out = out.view(-1).cpu().numpy().reshape(-1, 1) + return out + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + """Define the computation performed at every call when testing.""" + device = self.fc1.weight.device + + bsp_feature = torch.cat([ + sample.gt_instances['bsp_feature'] for sample in batch_data_samples + ]).to(device) + + pem_output = self._forward(bsp_feature).view(-1).cpu().numpy() + pem_output = pem_output.reshape(-1, 1) + + gt_instances = [sample.gt_instances for sample in batch_data_samples] + + tmin = self._parse(gt_instances, 'tmin') + tmax = self._parse(gt_instances, 'tmax') + tmin_score = self._parse(gt_instances, 'tmin_score') + tmax_score = self._parse(gt_instances, 'tmax_score') + + score = np.array(pem_output * tmin_score * tmax_score).reshape(-1, 1) + result = np.concatenate( + (tmin, tmax, tmin_score, tmax_score, pem_output, score), axis=1) + result = result.reshape(-1, 6) + + video_info = batch_data_samples[0].metainfo + proposal_list = post_processing(result, video_info, + self.soft_nms_alpha, + self.soft_nms_low_threshold, + self.soft_nms_high_threshold, + self.post_process_top_k, + self.feature_extraction_interval) + output = [ + dict( + video_name=video_info['video_name'], + proposal_list=proposal_list) + ] + return output + + def forward(self, inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + batch_inputs (Tensor): The input tensor with shape + (N, C, ...) in general. + batch_data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + inputs = torch.stack(inputs) + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". 
' + 'Only supports loss, predict and tensor mode') diff --git a/mmaction/models/localizers/drn/__pycache__/drn.cpython-310.pyc b/mmaction/models/localizers/drn/__pycache__/drn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da3b6b91f8feea8fdcb1fc0b1d760c538f553b4e Binary files /dev/null and b/mmaction/models/localizers/drn/__pycache__/drn.cpython-310.pyc differ diff --git a/mmaction/models/localizers/drn/drn.py b/mmaction/models/localizers/drn/drn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d6af4a55bd205e7922778bfcb77995280de92f4 --- /dev/null +++ b/mmaction/models/localizers/drn/drn.py @@ -0,0 +1,260 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +import numpy as np +import torch +import torch.nn as nn +from mmengine.model import BaseModel + +from mmaction.registry import MODELS +from mmaction.utils import OptConfigType +from ..utils import soft_nms +from .drn_utils import FPN, Backbone, FCOSModule, QueryEncoder + + +@MODELS.register_module() +class DRN(BaseModel): + """Dense Regression Network for Video Grounding. + + Please refer `Dense Regression Network for Video Grounding + `_. + Code Reference: https://github.com/Alvin-Zeng/DRN + + Args: + vocab_size (int): number of all possible words in the query. + Defaults to 1301. + hidden_dim (int): the hidden dimension of the LSTM in the + language model. Defaults to 512. + embed_dim (int): the embedding dimension of the query. Defaults + to 300. + bidirection (bool): if True, use bi-direction LSTM in the + language model. Defaults to True. + first_output_dim (int): the output dimension of the first layer + in the backbone. Defaults to 256. + fpn_feature_dim (int): the output dimension of the FPN. Defaults + to 512. + feature_dim (int): the dimension of the video clip feature. + lstm_layers (int): the number of LSTM layers in the language model. + Defaults to 1. + fcos_pre_nms_top_n (int): value of Top-N in the FCOS module before + nms. Defaults to 32. + fcos_inference_thr (float): threshold in the FOCS inference. BBoxes + with scores higher than this threshold are regarded as positive. + Defaults to 0.05. + fcos_prior_prob (float): A prior probability of the positive bboexes. + Used to initialized the bias of the classification head. + Defaults to 0.01. + focal_alpha (float):Focal loss hyper-parameter alpha. + Defaults to 0.25. + focal_gamma (float): Focal loss hyper-parameter gamma. + Defaults to 2.0. + fpn_stride (Sequence[int]): the strides in the FPN. Defaults to + [1, 2, 4]. + fcos_nms_thr (float): NMS threshold in the FOCS module. + Defaults to 0.6. + fcos_conv_layers (int): number of convolution layers in FCOS. + Defaults to 1. + fcos_num_class (int): number of classes in FCOS. + Defaults to 2. + is_first_stage (bool): if true, the model is in the first stage + training. + is_second_stage (bool): if true, the model is in the second stage + training. 
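The soft_nms helper imported at the top of drn.py is used in the predict branch of forward below; for orientation, a generic Gaussian soft-NMS over (start, end, score) rows looks roughly like this (the imported helper takes alpha plus low/high thresholds and its exact decay and stopping rules may differ):

import numpy as np

def gaussian_soft_nms(props, alpha=0.4, score_thr=0.001, top_k=100):
    """props: [N, 3] array of (t_start, t_end, score); returns kept rows."""
    props = props[props[:, 2].argsort()[::-1]].copy()
    keep = []
    while len(props) > 0 and len(keep) < top_k:
        best, rest = props[0], props[1:]
        keep.append(best)
        inter = np.clip(np.minimum(best[1], rest[:, 1]) -
                        np.maximum(best[0], rest[:, 0]), 0, None)
        union = (best[1] - best[0]) + (rest[:, 1] - rest[:, 0]) - inter
        iou = inter / np.maximum(union, 1e-8)
        rest[:, 2] *= np.exp(-iou ** 2 / alpha)   # decay scores instead of hard suppression
        rest = rest[rest[:, 2] > score_thr]
        props = rest[rest[:, 2].argsort()[::-1]]
    return np.stack(keep) if keep else np.empty((0, 3))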
+ """ + + def __init__(self, + vocab_size: int = 1301, + hidden_dim: int = 512, + embed_dim: int = 300, + bidirection: bool = True, + first_output_dim: int = 256, + fpn_feature_dim: int = 512, + feature_dim: int = 4096, + lstm_layers: int = 1, + fcos_pre_nms_top_n: int = 32, + fcos_inference_thr: float = 0.05, + fcos_prior_prob: float = 0.01, + focal_alpha: float = 0.25, + focal_gamma: float = 2.0, + fpn_stride: Sequence[int] = [1, 2, 4], + fcos_nms_thr: float = 0.6, + fcos_conv_layers: int = 1, + fcos_num_class: int = 2, + is_first_stage: bool = False, + is_second_stage: bool = False, + init_cfg: OptConfigType = None, + **kwargs) -> None: + super(DRN, self).__init__(init_cfg) + + self.query_encoder = QueryEncoder( + vocab_size=vocab_size, + hidden_dim=hidden_dim, + embed_dim=embed_dim, + num_layers=lstm_layers, + bidirection=bidirection) + + channels_list = [ + (feature_dim + 256, first_output_dim, 3, 1), + (first_output_dim, first_output_dim * 2, 3, 2), + (first_output_dim * 2, first_output_dim * 4, 3, 2), + ] + self.backbone_net = Backbone(channels_list) + + self.fpn = FPN( + in_channels_list=[256, 512, 1024], out_channels=fpn_feature_dim) + + self.fcos = FCOSModule( + in_channels=fpn_feature_dim, + fcos_num_class=fcos_num_class, + fcos_conv_layers=fcos_conv_layers, + fcos_prior_prob=fcos_prior_prob, + fcos_inference_thr=fcos_inference_thr, + fcos_pre_nms_top_n=fcos_pre_nms_top_n, + fcos_nms_thr=fcos_nms_thr, + test_detections_per_img=32, + fpn_stride=fpn_stride, + focal_alpha=focal_alpha, + focal_gamma=focal_gamma, + is_first_stage=is_first_stage, + is_second_stage=is_second_stage) + + self.prop_fc = nn.Linear(feature_dim, feature_dim) + self.position_transform = nn.Linear(3, 256) + + qInput = [] + for t in range(len(channels_list)): + if t > 0: + qInput += [nn.Linear(1024, channels_list[t - 1][1])] + else: + qInput += [nn.Linear(1024, feature_dim)] + self.qInput = nn.ModuleList(qInput) + + self.is_second_stage = is_second_stage + + def forward(self, inputs, data_samples, mode, **kwargs): + props_features = torch.stack(inputs) + batch_size = props_features.shape[0] + device = props_features.device + proposals = torch.stack([ + sample.proposals['proposals'] for sample in data_samples + ]).to(device) + gt_bbox = torch.stack([ + sample.gt_instances['gt_bbox'] for sample in data_samples + ]).to(device) + + video_info = [i.metainfo for i in data_samples] + query_tokens_ = [i['query_tokens'] for i in video_info] + query_length = [i['query_length'] for i in video_info] + query_length = torch.from_numpy(np.array(query_length)) + + max_query_len = max([i.shape[0] for i in query_tokens_]) + query_tokens = torch.zeros(batch_size, max_query_len) + for idx, query_token in enumerate(query_tokens_): + query_len = query_token.shape[0] + query_tokens[idx, :query_len] = query_token + + query_tokens = query_tokens.to(device).long() + query_length = query_length.to(device).long() # should be on CPU! 
+ + sort_index = query_length.argsort(descending=True) + box_lists, loss_dict = self._forward(query_tokens[sort_index], + query_length[sort_index], + props_features[sort_index], + proposals[sort_index], + gt_bbox[sort_index]) + if mode == 'loss': + return loss_dict + elif mode == 'predict': + # only support batch size = 1 + bbox = box_lists[0] + + per_vid_detections = bbox['detections'] + per_vid_scores = bbox['scores'] + + props_pred = torch.cat( + (per_vid_detections, per_vid_scores.unsqueeze(-1)), dim=-1) + + props_pred = props_pred.cpu().numpy() + props_pred = sorted(props_pred, key=lambda x: x[-1], reverse=True) + props_pred = np.array(props_pred) + + props_pred = soft_nms( + props_pred, + alpha=0.4, + low_threshold=0.5, + high_threshold=0.9, + top_k=5) + result = { + 'vid_name': data_samples[0].metainfo['vid_name'], + 'gt': gt_bbox[0].cpu().numpy(), + 'predictions': props_pred, + } + return [result] + + raise ValueError(f'Unsupported mode {mode}!') + + def nms_temporal(self, start, end, score, overlap=0.45): + pick = [] + assert len(start) == len(score) + assert len(end) == len(score) + if len(start) == 0: + return pick + + union = end - start + # sort and get index + intervals = [ + i[0] for i in sorted(enumerate(score), key=lambda x: x[1]) + ] + + while len(intervals) > 0: + i = intervals[-1] + pick.append(i) + + xx1 = [max(start[i], start[j]) for j in intervals[:-1]] + xx2 = [min(end[i], end[j]) for j in intervals[:-1]] + inter = [max(0., k2 - k1) for k1, k2 in zip(xx1, xx2)] + o = [ + inter[u] / (union[i] + union[intervals[u]] - inter[u]) + for u in range(len(intervals) - 1) + ] + I_new = [] + for j in range(len(o)): + if o[j] <= overlap: + I_new.append(intervals[j]) + intervals = I_new + return np.array(pick) + + def _forward(self, query_tokens, query_length, props_features, + props_start_end, gt_bbox): + + position_info = [props_start_end, props_start_end] + position_feats = [] + query_features = self.query_encoder(query_tokens, query_length) + for i in range(len(query_features)): + query_features[i] = self.qInput[i](query_features[i]) + if i > 1: + position_info.append( + torch.cat([ + props_start_end[:, ::2 * (i - 1), [0]], + props_start_end[:, 1::2 * (i - 1), [1]] + ], + dim=-1)) + props_duration = position_info[i][:, :, 1] - position_info[i][:, :, + 0] + props_duration = props_duration.unsqueeze(-1) + position_feat = torch.cat((position_info[i], props_duration), + dim=-1).float() + position_feats.append( + self.position_transform(position_feat).permute(0, 2, 1)) + + props_features = self.prop_fc(props_features) + + inputs = props_features.permute(0, 2, 1) + outputs = self.backbone_net(inputs, query_features, position_feats) + outputs = self.fpn(outputs) + + if self.is_second_stage: + outputs = [_.detach() for _ in outputs] + box_lists, loss_dict = self.fcos(outputs, gt_bbox.float()) + + return box_lists, loss_dict diff --git a/mmaction/models/localizers/drn/drn_utils/FPN.py b/mmaction/models/localizers/drn/drn_utils/FPN.py new file mode 100644 index 0000000000000000000000000000000000000000..4bf1b9fcbb9b4a81ee523ffba0d961530228733a --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/FPN.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
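# [Editor's note] Minimal sketch, not part of the patch, of the 1D top-down
# merge implemented by the FPN class below: each lateral feature passes a
# 1x1 conv block, the coarser merged map is upsampled by 2 and added, and a
# 3x3 conv block produces the output at that level. Shapes are assumptions
# chosen only for the demo.
import torch
import torch.nn.functional as F

c3 = torch.randn(2, 256, 64)   # finest level, (batch, channels, temporal)
c4 = torch.randn(2, 512, 32)
c5 = torch.randn(2, 1024, 16)  # coarsest level

lateral = [torch.nn.Conv1d(c, 512, 1) for c in (256, 512, 1024)]
smooth = [torch.nn.Conv1d(512, 512, 3, padding=1) for _ in range(3)]

p5 = lateral[2](c5)
p4 = lateral[1](c4) + F.interpolate(p5, scale_factor=2, mode='nearest')
p3 = lateral[0](c3) + F.interpolate(p4, scale_factor=2, mode='nearest')
pyramid = tuple(smooth[i](p) for i, p in enumerate((p3, p4, p5)))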
+from typing import List, Tuple + +import torch.nn.functional as F +from torch import Tensor, nn + +from .backbone import conv_block + + +class FPN(nn.Module): + + def __init__(self, in_channels_list: List, out_channels: int) -> None: + super(FPN, self).__init__() + + inner_blocks = [] + layer_blocks = [] + for idx, in_channels in enumerate(in_channels_list, 1): + inner_block = conv_block(in_channels, out_channels, 1, 1) + layer_block = conv_block(out_channels, out_channels, 3, 1) + + inner_blocks.append(inner_block) + layer_blocks.append(layer_block) + + self.inner_blocks = nn.ModuleList(inner_blocks) + self.layer_blocks = nn.ModuleList(layer_blocks) + + def forward(self, x: Tensor) -> Tuple[Tensor]: + # process the last lowest resolution feat and + # first feed it into 1 x 1 conv + last_inner = self.inner_blocks[-1](x[-1]) + results = [self.layer_blocks[-1](last_inner)] + + for feature, inner_block, layer_block in zip( + x[:-1][::-1], self.inner_blocks[:-1][::-1], + self.layer_blocks[:-1][::-1]): + if not inner_block: + continue + inner_top_down = F.interpolate( + last_inner, scale_factor=2, mode='nearest') + inner_lateral = inner_block(feature) + last_inner = inner_lateral + inner_top_down + results.insert(0, layer_block(last_inner)) + + return tuple(results) diff --git a/mmaction/models/localizers/drn/drn_utils/__init__.py b/mmaction/models/localizers/drn/drn_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..863f5fbed17032f97a6ae0896e4d23a8d26ef16d --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .backbone import Backbone +from .fcos import FCOSModule +from .FPN import FPN +from .language_module import QueryEncoder + +__all__ = ['Backbone', 'FPN', 'QueryEncoder', 'FCOSModule'] diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/__init__.cpython-310.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b46973748a542234b676825f20f31b99594c3993 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/backbone.cpython-310.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/backbone.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8e6dd35faf72e72146a41a3de8e34228acac6a3 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/backbone.cpython-310.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/fcos.cpython-310.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/fcos.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47dedf107b372d7741ceae712e7d1c6065fddd8f Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/fcos.cpython-310.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/inference.cpython-310.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e7c8f5e3769f090cb49190309ef05b613d40323 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/inference.cpython-310.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/loss.cpython-310.pyc 
b/mmaction/models/localizers/drn/drn_utils/__pycache__/loss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1932df09e07a5ff0a7da978a5eb3f749a30254b8 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/loss.cpython-310.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/backbone.py b/mmaction/models/localizers/drn/drn_utils/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..5745e14d58b0cde5f5a2d4ec0bd01bb71d4a99e7 --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/backbone.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from torch import Tensor, nn + + +def conv_block(in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1) -> nn.Module: + module = nn.Sequential( + nn.Conv1d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + bias=False), nn.BatchNorm1d(out_channels), nn.ReLU()) + return module + + +class Backbone(nn.Module): + + def __init__(self, channels_list: List[tuple]) -> None: + super(Backbone, self).__init__() + + self.num_layers = len(channels_list) + layers = [] + for idx, channels_config in enumerate(channels_list): + layer = conv_block(*channels_config) + layers.append(layer) + self.layers = nn.ModuleList(layers) + + def forward(self, x: Tensor, query_fts: Tensor, + position_fts: Tensor) -> Tuple[Tensor]: + results = [] + + for idx in range(self.num_layers): + query_ft = query_fts[idx].unsqueeze(1).permute(0, 2, 1) + position_ft = position_fts[idx] + x = query_ft * x + if idx == 0: + x = torch.cat([x, position_ft], dim=1) + x = self.layers[idx](x) + results.append(x) + + return tuple(results) diff --git a/mmaction/models/localizers/drn/drn_utils/fcos.py b/mmaction/models/localizers/drn/drn_utils/fcos.py new file mode 100644 index 0000000000000000000000000000000000000000..7c33c18aaf36331848467e04e8a491405de98a6c --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/fcos.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
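# [Editor's note] Minimal sketch, not part of the patch, of the fusion rule in
# Backbone.forward above: the per-level query feature gates the video feature
# channel-wise, and at the first level the positional encoding is concatenated
# before the conv block. All shapes below are illustrative assumptions.
import torch

video_feat = torch.randn(2, 4096, 16)    # (batch, feature_dim, num_clips)
query_feat = torch.randn(2, 4096)        # per-level query embedding
position_feat = torch.randn(2, 256, 16)  # output of position_transform

gated = query_feat.unsqueeze(1).permute(0, 2, 1) * video_feat  # (2, 4096, 16)
level0_input = torch.cat([gated, position_feat], dim=1)        # (2, 4352, 16)
# level0_input then feeds the first conv_block, whose in_channels is
# feature_dim + 256, matching channels_list[0] in DRN.__init__.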
+import math + +import torch +from torch import nn + +from .inference import make_fcos_postprocessor +from .loss import make_fcos_loss_evaluator + + +class Scale(nn.Module): + + def __init__(self, init_value=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.FloatTensor([init_value])) + + def forward(self, x): + return x * self.scale + + +class FCOSHead(torch.nn.Module): + + def __init__(self, in_channels: int, fcos_num_class: int, + fcos_conv_layers: int, fcos_prior_prob: float, + is_second_stage: bool) -> None: + super(FCOSHead, self).__init__() + num_classes = fcos_num_class - 1 + + cls_tower = [] + bbox_tower = [] + for i in range(fcos_conv_layers): + cls_tower.append( + nn.Conv1d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1)) + cls_tower.append(nn.BatchNorm1d(in_channels)) + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv1d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1)) + bbox_tower.append(nn.BatchNorm1d(in_channels)) + bbox_tower.append(nn.ReLU()) + + self.cls_tower = nn.Sequential(*cls_tower) + self.bbox_tower = nn.Sequential(*bbox_tower) + self.cls_logits = nn.Conv1d( + in_channels, num_classes, kernel_size=3, stride=1, padding=1) + + self.bbox_pred = nn.Conv1d( + in_channels, 2, kernel_size=3, stride=1, padding=1) + + self.mix_fc = nn.Sequential( + nn.Conv1d(2 * in_channels, in_channels, kernel_size=1, stride=1), + nn.BatchNorm1d(in_channels), nn.ReLU()) + + self.iou_scores = nn.Sequential( + nn.Conv1d( + in_channels, + in_channels // 2, + kernel_size=3, + stride=1, + padding=1), + nn.BatchNorm1d(in_channels // 2), + nn.ReLU(), + nn.Conv1d(in_channels // 2, 1, kernel_size=1, stride=1), + ) + + # initialization + for module in self.modules(): + if isinstance(module, nn.Conv1d): + torch.nn.init.normal_(module.weight, std=0.01) + torch.nn.init.constant_(module.bias, 0) + + # initialize the bias for focal loss + bias_value = -math.log((1 - fcos_prior_prob) / fcos_prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(3)]) + self.is_second_stage = is_second_stage + + def forward(self, x): + logits = [] + bbox_reg = [] + iou_scores = [] + for idx, feature in enumerate(x): + cls_tower = self.cls_tower(feature) + box_tower = self.bbox_tower(feature) + logits.append(self.cls_logits(cls_tower)) + + bbox_reg_ = torch.exp(self.scales[idx](self.bbox_pred(box_tower))) + if self.is_second_stage: + bbox_reg_ = bbox_reg_.detach() + bbox_reg.append(bbox_reg_) + + mix_feature = torch.cat([cls_tower, box_tower], dim=1) + if self.is_second_stage: + mix_feature = mix_feature.detach() + mix_feature = self.mix_fc(mix_feature) + iou_scores.append(self.iou_scores(mix_feature)) + return logits, bbox_reg, iou_scores + + +class FCOSModule(torch.nn.Module): + + def __init__(self, in_channels: int, fcos_num_class: int, + fcos_conv_layers: int, fcos_prior_prob: float, + fcos_inference_thr: float, fcos_pre_nms_top_n: int, + fcos_nms_thr: float, test_detections_per_img: int, + fpn_stride: int, focal_alpha: float, focal_gamma: float, + is_first_stage: bool, is_second_stage: bool) -> None: + super(FCOSModule, self).__init__() + + head = FCOSHead( + in_channels=in_channels, + fcos_num_class=fcos_num_class, + fcos_conv_layers=fcos_conv_layers, + fcos_prior_prob=fcos_prior_prob, + is_second_stage=is_second_stage) + + self.is_first_stage = is_first_stage + self.is_second_stage = is_second_stage + box_selector_test = 
make_fcos_postprocessor(fcos_num_class, + fcos_inference_thr, + fcos_pre_nms_top_n, + fcos_nms_thr, + test_detections_per_img, + is_first_stage) + loss_evaluator = make_fcos_loss_evaluator(focal_alpha, focal_gamma) + self.head = head + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + self.fpn_strides = fpn_stride + + def forward(self, features, targets=None): + box_cls, box_regression, iou_scores = self.head(features) + locations = self.compute_locations(features) + + if self.training: + return self._forward_train(locations, box_cls, box_regression, + targets, iou_scores) + else: + return self._forward_test(locations, box_cls, box_regression, + targets, iou_scores) + + def _forward_train(self, locations, box_cls, box_regression, targets, + iou_scores): + loss_box_cls, loss_box_reg, loss_iou = self.loss_evaluator( + locations, box_cls, box_regression, targets, iou_scores, + self.is_first_stage) + + if self.is_second_stage: + loss_box_cls = loss_box_cls.detach() + loss_box_reg = loss_box_reg.detach() + if self.is_first_stage: + loss_iou = loss_iou.detach() + + losses = { + 'loss_cls': loss_box_cls, + 'loss_reg': loss_box_reg, + 'loss_iou': loss_iou + } + return None, losses + + def _forward_test(self, locations, box_cls, box_regression, targets, + iou_scores): + boxes = self.box_selector_test(locations, box_cls, box_regression, + iou_scores) + losses = None + return boxes, losses + + def compute_locations(self, features): + locations = [] + for level, feature in enumerate(features): + t = feature.size(-1) + locations_per_level = self.compute_locations_per_level( + t, self.fpn_strides[level], feature.device) + locations.append(locations_per_level) + return locations + + def compute_locations_per_level(self, t, stride, device): + shifts_t = torch.arange( + 0, t * stride, step=stride, dtype=torch.float32, device=device) + shifts_t = shifts_t.reshape(-1) + locations = shifts_t + stride / 2 + return locations diff --git a/mmaction/models/localizers/drn/drn_utils/inference.py b/mmaction/models/localizers/drn/drn_utils/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..6cd1b3156a0a6ce5eb19f5b83072519a3ae58756 --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/inference.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Copied from https://github.com/Alvin-Zeng/DRN/""" + +import torch + + +class FCOSPostProcessor(torch.nn.Module): + """Performs post-processing on the outputs of the RetinaNet boxes. + + This is only used in the testing. 
+ """ + + def __init__(self, pre_nms_thresh, pre_nms_top_n, nms_thresh, + fpn_post_nms_top_n, min_size, num_classes, is_first_stage): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(FCOSPostProcessor, self).__init__() + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + self.innerness_threshold = 0.15 + self.downsample_scale = 32 + self.is_first_stage = is_first_stage + + def forward_for_single_feature_map(self, locations, box_cls, + box_regression, level, iou_scores): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + N, C, T = box_cls.shape + + # put in the same format as locations + box_cls = box_cls.permute(0, 2, 1).contiguous().sigmoid() + iou_scores = iou_scores.permute(0, 2, 1).contiguous().sigmoid() + box_regression = box_regression.permute(0, 2, 1) + + # centerness = centerness.permute(0, 2, 1) + # centerness = centerness.reshape(N, -1).sigmoid() + # inner = inner.squeeze().sigmoid() + + candidate_inds = (box_cls > self.pre_nms_thresh) + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + # multiply the classification scores with centerness scores + # box_cls = box_cls * centerness[:, :, None] + # box_cls = box_cls + centerness[:, :, None] + if not self.is_first_stage: + box_cls = box_cls * iou_scores + + results = [] + for i in range(N): + + # per_centerness = centerness[i] + + per_box_cls = box_cls[i] + per_candidate_inds = candidate_inds[i] + per_box_cls = per_box_cls[per_candidate_inds] + + per_candidate_nonzeros = per_candidate_inds.nonzero() + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + 1 + + per_box_regression = box_regression[i] + per_box_regression = per_box_regression[per_box_loc] + per_locations = locations[per_box_loc] + + # per_centerness = per_centerness[per_box_loc] + + per_pre_nms_top_n = pre_nms_top_n[i] + + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_class = per_class[top_k_indices] + per_box_regression = per_box_regression[top_k_indices] + per_locations = per_locations[top_k_indices] + + # per_centerness = per_centerness[top_k_indices] + + detections = torch.stack([ + per_locations - per_box_regression[:, 0], + per_locations + per_box_regression[:, 1], + ], + dim=1) / self.downsample_scale + + detections[:, 0].clamp_(min=0, max=1) + detections[:, 1].clamp_(min=0, max=1) + + # remove small boxes + p_start, p_end = detections.unbind(dim=1) + duration = p_end - p_start + keep = (duration >= self.min_size).nonzero().squeeze(1) + detections = detections[keep] + + temp_dict = {} + temp_dict['detections'] = detections + temp_dict['labels'] = per_class + temp_dict['scores'] = torch.sqrt(per_box_cls) + temp_dict['level'] = [level] + # temp_dict['centerness'] = per_centerness + temp_dict['locations'] = per_locations / 32 + + results.append(temp_dict) + + return results + + def forward(self, locations, box_cls, box_regression, iou_scores): + """ + Arguments: + anchors: list[list[BoxList]] + box_cls: list[tensor] + box_regression: list[tensor] + image_sizes: list[(h, w)] + 
Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + for i, (l, o, b, iou_s) in enumerate( + zip(locations, box_cls, box_regression, iou_scores)): + sampled_boxes.append( + self.forward_for_single_feature_map(l, o, b, i, iou_s)) + + boxlists = list(zip(*sampled_boxes)) + # boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + boxlists = self.select_over_all_levels(boxlists) + + return boxlists + + # TODO very similar to filter_results from PostProcessor + # but filter_results is per image + # TODO Yang: solve this issue in the future. No good solution + # right now. + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + dicts = boxlists[i] + per_vid_scores = [] + per_vid_detections = [] + per_vid_labels = [] + # add level number + per_vid_level = [] + per_vid_locations = [] + # per_vid_centerness = [] + for per_scale_dict in dicts: + if len(per_scale_dict['detections']) != 0: + per_vid_detections.append(per_scale_dict['detections']) + if len(per_scale_dict['scores']) != 0: + per_vid_scores.append(per_scale_dict['scores']) + if len(per_scale_dict['level']) != 0: + per_vid_level.append(per_scale_dict['level'] * + len(per_scale_dict['detections'])) + + if len(per_scale_dict['locations']) != 0: + per_vid_locations.append(per_scale_dict['locations']) + + # if len(per_scale_dict['centerness']) != 0: + # per_vid_centerness.append(per_scale_dict['centerness']) + if len(per_vid_detections) == 0: + per_vid_detections = torch.Tensor([0, 1]).unsqueeze(0) + per_vid_scores = torch.Tensor([1]) + per_vid_level = [[-1]] + per_vid_locations = torch.Tensor([0.5]) + # per_vid_centerness = torch.Tensor([0.5]).cuda() + else: + per_vid_detections = torch.cat(per_vid_detections, dim=0) + per_vid_scores = torch.cat(per_vid_scores, dim=0) + per_vid_level = per_vid_level + per_vid_locations = torch.cat(per_vid_locations, dim=0) + # per_vid_centerness = torch.cat(per_vid_centerness, dim=0) + + temp_dict = {} + temp_dict['detections'] = per_vid_detections + temp_dict['labels'] = per_vid_labels + temp_dict['scores'] = per_vid_scores + temp_dict['level'] = per_vid_level + # temp_dict['centerness'] = per_vid_centerness + temp_dict['locations'] = per_vid_locations + results.append(temp_dict) + + return results + + +def make_fcos_postprocessor(fcos_num_class, fcos_inference_thr, + fcos_pre_nms_top_n, fcos_nms_thr, + test_detections_per_img, is_first_stage): + box_selector = FCOSPostProcessor( + pre_nms_thresh=fcos_inference_thr, + pre_nms_top_n=fcos_pre_nms_top_n, + nms_thresh=fcos_nms_thr, + fpn_post_nms_top_n=test_detections_per_img, + min_size=0, + num_classes=fcos_num_class, + is_first_stage=is_first_stage) + + return box_selector diff --git a/mmaction/models/localizers/drn/drn_utils/language_module.py b/mmaction/models/localizers/drn/drn_utils/language_module.py new file mode 100644 index 0000000000000000000000000000000000000000..6a6d03bdd9ab742853f0f50aa1c9b11b5cb58038 --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/language_module.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
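# [Editor's note] Minimal sketch, not part of the patch, of how the
# post-processor above decodes segments: each location regresses distances to
# the segment start and end, and the pair (loc - left, loc + right) is
# normalized by the downsample scale and clamped to [0, 1]. Values below are
# illustrative only.
import torch

downsample_scale = 32
locations = torch.tensor([16., 48., 80.])    # centers on the feature grid
box_regression = torch.tensor([[10., 20.],   # (left, right) offsets
                               [30., 15.],
                               [5., 40.]])

detections = torch.stack([
    locations - box_regression[:, 0],
    locations + box_regression[:, 1],
], dim=1) / downsample_scale
detections.clamp_(min=0, max=1)
# drop segments shorter than a minimum duration, as in the class above
duration = detections[:, 1] - detections[:, 0]
detections = detections[duration >= 0.0]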
+from typing import List + +import torch +from torch import Tensor, nn +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + + +class QueryEncoder(nn.Module): + + def __init__(self, + vocab_size: int, + hidden_dim: int = 512, + embed_dim: int = 300, + num_layers: int = 1, + bidirection: bool = True) -> None: + super(QueryEncoder, self).__init__() + self.hidden_dim = hidden_dim + self.embed_dim = embed_dim + self.embedding = nn.Embedding( + num_embeddings=vocab_size + 1, + embedding_dim=embed_dim, + padding_idx=0) + # self.embedding.weight.data.copy_(torch.load('glove_weights')) + self.biLSTM = nn.LSTM( + input_size=embed_dim, + hidden_size=self.hidden_dim, + num_layers=num_layers, + dropout=0.0, + batch_first=True, + bidirectional=bidirection) + + self.W3 = nn.Linear(hidden_dim * 4, hidden_dim) + self.W2 = nn.ModuleList( + [nn.Linear(hidden_dim, hidden_dim * 2) for _ in range(3)]) + self.W1 = nn.Linear(hidden_dim * 2, 1) + + def extract_textual(self, q_encoding: Tensor, lstm_outputs: Tensor, + q_length: Tensor, t: int): + q_cmd = self.W3(q_encoding).relu() + q_cmd = self.W2[t](q_cmd) + q_cmd = q_cmd[:, None, :] * lstm_outputs + raw_att = self.W1(q_cmd).squeeze(-1) + + raw_att = apply_mask1d(raw_att, q_length) + att = raw_att.softmax(dim=-1) + cmd = torch.bmm(att[:, None, :], lstm_outputs).squeeze(1) + return cmd + + def forward(self, query_tokens: Tensor, + query_length: Tensor) -> List[Tensor]: + self.biLSTM.flatten_parameters() + + query_embedding = self.embedding(query_tokens) + + # output denotes the forward and backward hidden states in Eq 2. + query_embedding = pack_padded_sequence( + query_embedding, query_length.cpu(), batch_first=True) + output, _ = self.biLSTM(query_embedding) + output, _ = pad_packed_sequence(output, batch_first=True) + + # q_vector denotes the global representation `g` in Eq 2. + q_vector_list = [] + + for i, length in enumerate(query_length): + h1 = output[i][0] + hs = output[i][length - 1] + q_vector = torch.cat((h1, hs), dim=-1) + q_vector_list.append(q_vector) + q_vector = torch.stack(q_vector_list) + # outputs denotes the query feature in Eq3 in 3 levels. + outputs = [] + for cmd_t in range(3): + query_feat = self.extract_textual(q_vector, output, query_length, + cmd_t) + outputs.append(query_feat) + + # Note: the output here is zero-padded + # we need slice the non-zero items for the following operations. + return outputs + + +def apply_mask1d(attention: Tensor, image_locs: Tensor) -> Tensor: + batch_size, num_loc = attention.size() + tmp1 = torch.arange( + num_loc, dtype=attention.dtype, device=attention.device) + tmp1 = tmp1.expand(batch_size, num_loc) + + tmp2 = image_locs.unsqueeze(dim=1).expand(batch_size, num_loc) + mask = tmp1 >= tmp2.to(tmp1.dtype) + attention = attention.masked_fill(mask, -1e30) + return attention diff --git a/mmaction/models/localizers/drn/drn_utils/loss.py b/mmaction/models/localizers/drn/drn_utils/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..28631c99b5dfbd4fce0756e7ac8834844644c70b --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/loss.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
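# [Editor's note] Minimal sketch, not part of the patch, showing what
# apply_mask1d above does: positions at or beyond each query's true length get
# a very negative logit, so they receive ~zero weight after softmax.
import torch

attention = torch.randn(2, 5)       # (batch, num_locations) raw logits
lengths = torch.tensor([3, 5])      # true lengths per query

arange = torch.arange(5).expand(2, 5)
mask = arange >= lengths.unsqueeze(1)          # True where position is padding
masked = attention.masked_fill(mask, -1e30)
weights = masked.softmax(dim=-1)               # padded positions get ~0 weight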
+"""Adapted from https://github.com/Alvin-Zeng/DRN/""" + +import torch +import torchvision +from torch import nn + +INF = 100000000 + + +def SigmoidFocalLoss(alpha, gamma): + + def loss_fn(inputs, targets): + loss = torchvision.ops.sigmoid_focal_loss( + inputs=inputs, + targets=targets, + alpha=alpha, + gamma=gamma, + reduction='sum') + return loss + + return loss_fn + + +def IOULoss(): + + def loss_fn(pred, target): + pred_left = pred[:, 0] + pred_right = pred[:, 1] + + target_left = target[:, 0] + target_right = target[:, 1] + + intersect = torch.min(pred_right, target_right) + torch.min( + pred_left, target_left) + target_area = target_left + target_right + pred_area = pred_left + pred_right + union = target_area + pred_area - intersect + + losses = -torch.log((intersect + 1e-8) / (union + 1e-8)) + return losses.mean() + + return loss_fn + + +class FCOSLossComputation(object): + """This class computes the FCOS losses.""" + + def __init__(self, focal_alpha, focal_gamma): + self.cls_loss_fn = SigmoidFocalLoss(focal_alpha, focal_gamma) + self.box_reg_loss_fn = IOULoss() + self.centerness_loss_fn = nn.BCEWithLogitsLoss() + self.iou_loss_fn = nn.SmoothL1Loss() + + def prepare_targets(self, points, targets): + object_sizes_of_interest = [ + [-1, 6], + [5.6, 11], + [11, INF], + ] + expanded_object_sizes_of_interest = [] + for idx, points_per_level in enumerate(points): + object_sizes_of_interest_per_level = \ + points_per_level.new_tensor(object_sizes_of_interest[idx]) + expanded_object_sizes_of_interest.append( + object_sizes_of_interest_per_level[None].expand( + len(points_per_level), -1)) + + expanded_object_sizes_of_interest = torch.cat( + expanded_object_sizes_of_interest, dim=0) + num_points_per_level = [ + len(points_per_level) for points_per_level in points + ] + points_all_level = torch.cat(points, dim=0) + labels, reg_targets = self.compute_targets_for_locations( + points_all_level, targets, expanded_object_sizes_of_interest) + + for i in range(len(labels)): + labels[i] = torch.split(labels[i], num_points_per_level, dim=0) + reg_targets[i] = torch.split( + reg_targets[i], num_points_per_level, dim=0) + + labels_level_first = [] + reg_targets_level_first = [] + for level in range(len(points)): + labels_level_first.append( + torch.cat([labels_per_im[level] for labels_per_im in labels], + dim=0)) + reg_targets_level_first.append( + torch.cat([ + reg_targets_per_im[level] + for reg_targets_per_im in reg_targets + ], + dim=0)) + + return labels_level_first, reg_targets_level_first + + def compute_targets_for_locations(self, locations, targets, + object_sizes_of_interest): + labels = [] + reg_targets = [] + ts = locations + + for im_i in range(len(targets)): + targets_per_im = targets[im_i] + bboxes = targets_per_im * 32 + + left = ts[:, None] - bboxes[None, 0] + right = bboxes[None, 1] - ts[:, None] + reg_targets_per_im = torch.cat([left, right], dim=1) + + is_in_boxes = reg_targets_per_im.min(dim=1)[0] > 0 + max_reg_targets_per_im = reg_targets_per_im.max(dim=1)[0] + is_cared_in_the_level = \ + (max_reg_targets_per_im >= object_sizes_of_interest[:, 0]) & \ + (max_reg_targets_per_im <= object_sizes_of_interest[:, 1]) + + locations_to_gt_area = bboxes[1] - bboxes[0] + locations_to_gt_area = locations_to_gt_area.repeat( + len(locations), 1) + locations_to_gt_area[is_in_boxes == 0] = INF + locations_to_gt_area[is_cared_in_the_level == 0] = INF + + _ = locations_to_gt_area.min(dim=1) + locations_to_min_area, locations_to_gt_inds = _ + + labels_per_im = reg_targets_per_im.new_ones( + 
len(reg_targets_per_im)) + labels_per_im[locations_to_min_area == INF] = 0 + + labels.append(labels_per_im) + reg_targets.append(reg_targets_per_im) + + return labels, reg_targets + + def __call__(self, + locations, + box_cls, + box_regression, + targets, + iou_scores, + is_first_stage=True): + N = box_cls[0].size(0) + num_classes = box_cls[0].size(1) + labels, reg_targets = self.prepare_targets(locations, targets) + + box_cls_flatten = [] + box_regression_flatten = [] + # centerness_flatten = [] + labels_flatten = [] + reg_targets_flatten = [] + + for idx in range(len(labels)): + box_cls_flatten.append(box_cls[idx].permute(0, 2, 1).reshape( + -1, num_classes)) + box_regression_flatten.append(box_regression[idx].permute( + 0, 2, 1).reshape(-1, 2)) + labels_flatten.append(labels[idx].reshape(-1)) + reg_targets_flatten.append(reg_targets[idx].reshape(-1, 2)) + + if not is_first_stage: + # [batch, 56, 2] + merged_box_regression = torch.cat( + box_regression, dim=-1).transpose(2, 1) + # [56] + merged_locations = torch.cat(locations, dim=0) + # [batch, 56] + full_locations = merged_locations[None, :].expand( + merged_box_regression.size(0), -1).contiguous() + pred_start = full_locations - merged_box_regression[:, :, 0] + pred_end = full_locations + merged_box_regression[:, :, 1] + # [batch, 56, 2] + predictions = torch.cat( + [pred_start.unsqueeze(-1), + pred_end.unsqueeze(-1)], dim=-1) / 32 + # TODO: make sure the predictions are legal. (e.g. start < end) + predictions.clamp_(min=0, max=1) + # gt: [batch, 2] + gt_box = targets[:, None, :] + + iou_target = segment_tiou(predictions, gt_box) + iou_pred = torch.cat(iou_scores, dim=-1).squeeze().sigmoid() + iou_pos_ind = iou_target > 0.9 + pos_iou_target = iou_target[iou_pos_ind] + + pos_iou_pred = iou_pred[iou_pos_ind] + + if iou_pos_ind.sum().item() == 0: + iou_loss = torch.tensor([0.]).to(iou_pos_ind.device) + else: + iou_loss = self.iou_loss_fn(pos_iou_pred, pos_iou_target) + + box_cls_flatten = torch.cat(box_cls_flatten, dim=0) + box_regression_flatten = torch.cat(box_regression_flatten, dim=0) + labels_flatten = torch.cat(labels_flatten, dim=0) + reg_targets_flatten = torch.cat(reg_targets_flatten, dim=0) + + pos_inds = torch.nonzero(labels_flatten > 0).squeeze(1) + cls_loss = self.cls_loss_fn( + box_cls_flatten, labels_flatten.unsqueeze(1)) / ( + pos_inds.numel() + N) # add N to avoid dividing by a zero + + box_regression_flatten = box_regression_flatten[pos_inds] + reg_targets_flatten = reg_targets_flatten[pos_inds] + + if pos_inds.numel() > 0: + reg_loss = self.box_reg_loss_fn( + box_regression_flatten, + reg_targets_flatten, + ) + else: + reg_loss = box_regression_flatten.sum() + + if not is_first_stage: + return cls_loss, reg_loss, iou_loss + + return cls_loss, reg_loss, torch.tensor([0.]).to(cls_loss.device) + + +def segment_tiou(box_a, box_b): + + # gt: [batch, 1, 2], detections: [batch, 56, 2] + # calculate interaction + inter_max_xy = torch.min(box_a[:, :, -1], box_b[:, :, -1]) + inter_min_xy = torch.max(box_a[:, :, 0], box_b[:, :, 0]) + inter = torch.clamp((inter_max_xy - inter_min_xy), min=0) + + # calculate union + union_max_xy = torch.max(box_a[:, :, -1], box_b[:, :, -1]) + union_min_xy = torch.min(box_a[:, :, 0], box_b[:, :, 0]) + union = torch.clamp((union_max_xy - union_min_xy), min=0) + + iou = inter / (union + 1e-6) + + return iou + + +def make_fcos_loss_evaluator(focal_alpha, focal_gamma): + loss_evaluator = FCOSLossComputation(focal_alpha, focal_gamma) + return loss_evaluator diff --git 
a/mmaction/models/localizers/tcanet.py b/mmaction/models/localizers/tcanet.py new file mode 100644 index 0000000000000000000000000000000000000000..13474edb74f2f04f5999af78b12ffe8425f30f0f --- /dev/null +++ b/mmaction/models/localizers/tcanet.py @@ -0,0 +1,513 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmengine.model import BaseModel +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import OptConfigType +from .utils import (batch_iou, bbox_se_transform_batch, bbox_se_transform_inv, + bbox_xw_transform_batch, bbox_xw_transform_inv, + post_processing) + + +class LGTE(BaseModel): + """Local-Global Temporal Encoder (LGTE) + + Args: + input_dim (int): Input feature dimension. + dropout (float): the dropout rate for the residual branch of + self-attention and ffn. + temporal_dim (int): Total frames selected for each video. + Defaults to 100. + window_size (int): the window size for Local Temporal Encoder. + Defaults to 9. + init_cfg (dict or ConfigDict, optional): The Config for + initialization. Defaults to None. + """ + + def __init__(self, + input_dim: int, + dropout: float, + temporal_dim: int = 100, + window_size: int = 9, + num_heads: int = 8, + init_cfg: OptConfigType = None, + **kwargs) -> None: + super(LGTE, self).__init__(init_cfg) + + self.atten = MultiheadAttention( + embed_dims=input_dim, + num_heads=num_heads, + proj_drop=dropout, + attn_drop=0.1) + self.ffn = FFN( + embed_dims=input_dim, feedforward_channels=256, ffn_drop=dropout) + + norm_cfg = dict(type='LN', eps=1e-6) + self.norm1 = build_norm_layer(norm_cfg, input_dim)[1] + self.norm2 = build_norm_layer(norm_cfg, input_dim)[1] + + mask = self._mask_matrix(num_heads, temporal_dim, window_size) + self.register_buffer('mask', mask) + + def forward(self, x: Tensor) -> Tensor: + """Forward call for LGTE. + + Args: + x (torch.Tensor): The input tensor with shape (B, C, L) + """ + x = x.permute(2, 0, 1) + mask = self.mask.repeat(x.size(1), 1, 1, 1) + L = x.shape[0] + x = self.atten(x, attn_mask=mask.reshape(-1, L, L)) + x = self.norm1(x) + x = self.ffn(x) + x = self.norm2(x) + x = x.permute(1, 2, 0) + return x + + @staticmethod + def _mask_matrix(num_heads: int, temporal_dim: int, + window_size: int) -> Tensor: + mask = torch.zeros(num_heads, temporal_dim, temporal_dim) + index = torch.arange(temporal_dim) + + for i in range(num_heads // 2): + for j in range(temporal_dim): + ignored = (index - j).abs() > window_size / 2 + mask[i, j] = ignored + + return mask.unsqueeze(0).bool() + + +def StartEndRegressor(sample_num: int, feat_dim: int) -> nn.Module: + """Start and End Regressor in the Temporal Boundary Regressor. + + Args: + sample_num (int): number of samples for the start & end. + feat_dim (int): feature dimension. + + Returns: + A pytorch module that works as the start and end regressor. The input + of the module should have a shape of (B, feat_dim * 2, sample_num). 
+ """ + hidden_dim = 128 + regressor = nn.Sequential( + nn.Conv1d( + feat_dim * 2, + hidden_dim * 2, + kernel_size=3, + padding=1, + groups=8, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim * 2, + hidden_dim * 2, + kernel_size=3, + padding=1, + groups=8, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d(hidden_dim * 2, 2, kernel_size=sample_num // 4, groups=2), + nn.Flatten()) + return regressor + + +def CenterWidthRegressor(temporal_len: int, feat_dim: int) -> nn.Module: + """Center Width in the Temporal Boundary Regressor. + + Args: + temporal_len (int): temporal dimension of the inputs. + feat_dim (int): feature dimension. + + Returns: + A pytorch module that works as the start and end regressor. The input + of the module should have a shape of (B, feat_dim, temporal_len). + """ + hidden_dim = 512 + regressor = nn.Sequential( + nn.Conv1d( + feat_dim, hidden_dim, kernel_size=3, padding=1, groups=4, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim, + hidden_dim, + kernel_size=3, + padding=1, + groups=4, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim, hidden_dim, kernel_size=temporal_len // 4, groups=4), + nn.ReLU(inplace=True), nn.Conv1d(hidden_dim, 3, kernel_size=1)) + return regressor + + +class TemporalTransform: + """Temporal Transform to sample temporal features.""" + + def __init__(self, prop_boundary_ratio: float, action_sample_num: int, + se_sample_num: int, temporal_interval: int): + super(TemporalTransform, self).__init__() + self.temporal_interval = temporal_interval + self.prop_boundary_ratio = prop_boundary_ratio + self.action_sample_num = action_sample_num + self.se_sample_num = se_sample_num + + def __call__(self, segments: Tensor, features: Tensor) -> List[Tensor]: + s_len = segments[:, 1] - segments[:, 0] + starts_segments = [ + segments[:, 0] - self.prop_boundary_ratio * s_len, segments[:, 0] + ] + starts_segments = torch.stack(starts_segments, dim=1) + + ends_segments = [ + segments[:, 1], segments[:, 1] + self.prop_boundary_ratio * s_len + ] + ends_segments = torch.stack(ends_segments, dim=1) + + starts_feature = self._sample_one_temporal(starts_segments, + self.se_sample_num, + features) + ends_feature = self._sample_one_temporal(ends_segments, + self.se_sample_num, features) + actions_feature = self._sample_one_temporal(segments, + self.action_sample_num, + features) + return starts_feature, actions_feature, ends_feature + + def _sample_one_temporal(self, segments: Tensor, out_len: int, + features: Tensor) -> Tensor: + segments = segments.clamp(0, 1) * 2 - 1 + theta = segments.new_zeros((features.size(0), 2, 3)) + theta[:, 1, 1] = 1.0 + theta[:, 0, 0] = (segments[:, 1] - segments[:, 0]) / 2.0 + theta[:, 0, 2] = (segments[:, 1] + segments[:, 0]) / 2.0 + + size = torch.Size((*features.shape[:2], 1, out_len)) + grid = F.affine_grid(theta, size) + stn_feature = F.grid_sample(features.unsqueeze(2), grid) + stn_feature = stn_feature.view(*features.shape[:2], out_len) + return stn_feature + + +class TBR(BaseModel): + """Temporal Boundary Regressor (TBR)""" + + def __init__(self, + se_sample_num: int, + action_sample_num: int, + temporal_dim: int, + prop_boundary_ratio: float = 0.5, + init_cfg: OptConfigType = None, + **kwargs) -> None: + super(TBR, self).__init__(init_cfg) + + hidden_dim = 512 + + self.reg1se = StartEndRegressor(se_sample_num, hidden_dim) + temporal_len = se_sample_num * 2 + action_sample_num + self.reg1xw = CenterWidthRegressor(temporal_len, hidden_dim) + self.ttn = TemporalTransform(prop_boundary_ratio, 
action_sample_num, + se_sample_num, temporal_dim) + + def forward(self, proposals: Tensor, features: Tensor, gt_boxes: Tensor, + iou_thres: float, training: bool) -> tuple: + proposals1 = proposals[:, :2] + starts_feat1, actions_feat1, ends_feat1 = self.ttn( + proposals1, features) + + reg1se = self.reg1se(torch.cat([starts_feat1, ends_feat1], dim=1)) + + features1xw = torch.cat([starts_feat1, actions_feat1, ends_feat1], + dim=2) + reg1xw = self.reg1xw(features1xw).squeeze(2) + + preds_iou1 = reg1xw[:, 2].sigmoid() + reg1xw = reg1xw[:, :2] + + if training: + proposals2xw = bbox_xw_transform_inv(proposals1, reg1xw, 0.1, 0.2) + proposals2se = bbox_se_transform_inv(proposals1, reg1se, 1.0) + + iou1 = batch_iou(proposals1, gt_boxes) + targets1se = bbox_se_transform_batch(proposals1, gt_boxes) + targets1xw = bbox_xw_transform_batch(proposals1, gt_boxes) + rloss1se = self.regress_loss(reg1se, targets1se, iou1, iou_thres) + rloss1xw = self.regress_loss(reg1xw, targets1xw, iou1, iou_thres) + rloss1 = rloss1se + rloss1xw + iloss1 = self.iou_loss(preds_iou1, iou1, iou_thres=iou_thres) + else: + proposals2xw = bbox_xw_transform_inv(proposals1, reg1xw, 0.1, 0.2) + proposals2se = bbox_se_transform_inv(proposals1, reg1se, 0.2) + rloss1 = iloss1 = 0 + proposals2 = (proposals2se + proposals2xw) / 2.0 + proposals2 = torch.clamp(proposals2, min=0.) + return preds_iou1, proposals2, rloss1, iloss1 + + def regress_loss(self, regression, targets, iou_with_gt, iou_thres): + weight = (iou_with_gt >= iou_thres).float().unsqueeze(1) + reg_loss = F.smooth_l1_loss(regression, targets, reduction='none') + if weight.sum() > 0: + reg_loss = (weight * reg_loss).sum() / weight.sum() + else: + reg_loss = (weight * reg_loss).sum() + return reg_loss + + def iou_loss(self, preds_iou, match_iou, iou_thres): + preds_iou = preds_iou.view(-1) + u_hmask = (match_iou > iou_thres).float() + u_mmask = ((match_iou <= iou_thres) & (match_iou > 0.3)).float() + u_lmask = (match_iou <= 0.3).float() + + num_h, num_m, num_l = u_hmask.sum(), u_mmask.sum(), u_lmask.sum() + + bs, device = u_hmask.size()[0], u_hmask.device + + r_m = min(num_h / num_m, 1) + u_smmask = torch.rand(bs, device=device) * u_mmask + u_smmask = (u_smmask > (1. - r_m)).float() + + r_l = min(num_h / num_l, 1) + u_slmask = torch.rand(bs, device=device) * u_lmask + u_slmask = (u_slmask > (1. - r_l)).float() + + iou_weights = u_hmask + u_smmask + u_slmask + iou_loss = F.smooth_l1_loss(preds_iou, match_iou, reduction='none') + if iou_weights.sum() > 0: + iou_loss = (iou_loss * iou_weights).sum() / iou_weights.sum() + else: + iou_loss = (iou_loss * iou_weights).sum() + return iou_loss + + +@MODELS.register_module() +class TCANet(BaseModel): + """Temporal Context Aggregation Network. + + Please refer `Temporal Context Aggregation Network for Temporal Action + Proposal Refinement `_. 
+    Code Reference:
+    https://github.com/qinzhi-0110/Temporal-Context-Aggregation-Network-Pytorch
+    """
+
+    def __init__(self,
+                 feat_dim: int = 2304,
+                 se_sample_num: int = 32,
+                 action_sample_num: int = 64,
+                 temporal_dim: int = 100,
+                 window_size: int = 9,
+                 lgte_num: int = 2,
+                 soft_nms_alpha: float = 0.4,
+                 soft_nms_low_threshold: float = 0.0,
+                 soft_nms_high_threshold: float = 0.0,
+                 post_process_top_k: int = 100,
+                 feature_extraction_interval: int = 16,
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        super(TCANet, self).__init__(init_cfg)
+
+        self.soft_nms_alpha = soft_nms_alpha
+        self.soft_nms_low_threshold = soft_nms_low_threshold
+        self.soft_nms_high_threshold = soft_nms_high_threshold
+        self.feature_extraction_interval = feature_extraction_interval
+        self.post_process_top_k = post_process_top_k
+
+        hidden_dim = 512
+        self.x_1d_b_f = nn.Sequential(
+            nn.Conv1d(
+                feat_dim, hidden_dim, kernel_size=3, padding=1, groups=4),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(
+                hidden_dim, hidden_dim, kernel_size=3, padding=1, groups=4),
+            nn.ReLU(inplace=True),
+        )
+
+        for i in (1, 2, 3):
+            tbr = TBR(
+                se_sample_num=se_sample_num,
+                action_sample_num=action_sample_num,
+                temporal_dim=temporal_dim,
+                init_cfg=init_cfg,
+                **kwargs)
+            setattr(self, f'tbr{i}', tbr)
+
+        self.lgtes = nn.ModuleList([
+            LGTE(
+                input_dim=hidden_dim,
+                dropout=0.1,
+                temporal_dim=temporal_dim,
+                window_size=window_size,
+                init_cfg=init_cfg,
+                **kwargs) for _ in range(lgte_num)
+        ])
+
+    def forward(self, inputs, data_samples, mode, **kwargs):
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes:
+
+        - ``tensor``: Forward the whole network and return a tensor or a tuple
+          of tensors without any post-processing, same as a common nn.Module.
+        - ``predict``: Forward and return the predictions, which are fully
+          processed to a list of :obj:`ActionDataSample`.
+        - ``loss``: Forward and return a dict of losses according to the given
+          inputs and data samples.
+
+        Note that this method does not handle back propagation or optimizer
+        updating; these are done in :meth:`train_step`.
+
+        Args:
+            inputs (Tensor): The input tensor with shape
+                (N, C, ...) in general.
+            data_samples (List[:obj:`ActionDataSample`]): The annotation
+                data of every sample.
+            mode (str): Which kind of value to return; one of ``'tensor'``,
+                ``'predict'`` or ``'loss'``.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensors.
+            - If ``mode="predict"``, return a list of ``ActionDataSample``.
+            - If ``mode="loss"``, return a dict of tensors.
+        """
+        if not isinstance(inputs, Tensor):
+            inputs = torch.stack(inputs)
+        if mode == 'tensor':
+            return self._forward(inputs, **kwargs)
+        elif mode == 'predict':
+            return self.predict(inputs, data_samples, **kwargs)
+        elif mode == 'loss':
+            return self.loss(inputs, data_samples, **kwargs)
+        else:
+            raise RuntimeError(f'Invalid mode "{mode}". '
+                               'Only supports loss, predict and tensor mode')
+
+    def _forward(self, x):
+        """Define the computation performed at every call.
+
+        Args:
+            x (torch.Tensor): The input data.
+
+        Returns:
+            torch.Tensor: The output of the module.
+ """ + x = self.x_1d_b_f(x) + for layer in self.lgtes: + x = layer(x) + return x + + def loss(self, batch_inputs, batch_data_samples, **kwargs): + features = self._forward(batch_inputs) + proposals_ = [ + sample.proposals['proposals'] for sample in batch_data_samples + ] + + batch_size = len(proposals_) + proposals_num = max([_.shape[0] for _ in proposals_]) + + proposals = torch.zeros((batch_size, proposals_num, 3), + device=features.device) + for i, proposal in enumerate(proposals_): + proposals[i, :proposal.shape[0]] = proposal + + gt_boxes_ = [ + sample.gt_instances['gt_bbox'] for sample in batch_data_samples + ] + gt_boxes = torch.zeros((batch_size, proposals_num, 2), + device=features.device) + for i, gt_box in enumerate(gt_boxes_): + L = gt_box.shape[0] + if L <= proposals_num: + gt_boxes[i, :L] = gt_box + else: + random_index = torch.randperm(L)[:proposals_num] + gt_boxes[i] = gt_box[random_index] + + for i in range(batch_size): + proposals[i, :, 2] = i + proposals = proposals.view(batch_size * proposals_num, 3) + proposals_select = proposals[:, 0:2].sum(dim=1) > 0 + proposals = proposals[proposals_select, :] + + features = features[proposals[:, 2].long()] + + gt_boxes = gt_boxes.view(batch_size * proposals_num, 2) + gt_boxes = gt_boxes[proposals_select, :] + + _, proposals1, rloss1, iloss1 = self.tbr1(proposals, features, + gt_boxes, 0.5, True) + _, proposals2, rloss2, iloss2 = self.tbr2(proposals1, features, + gt_boxes, 0.6, True) + _, _, rloss3, iloss3 = self.tbr3(proposals2, features, gt_boxes, 0.7, + True) + + loss_dict = dict( + rloss1=rloss1, + rloss2=rloss2, + rloss3=rloss3, + iloss1=iloss1, + iloss2=iloss2, + iloss3=iloss3) + return loss_dict + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + features = self._forward(batch_inputs) + proposals_ = [ + sample.proposals['proposals'] for sample in batch_data_samples + ] + + batch_size = len(proposals_) + proposals_num = max([_.shape[0] for _ in proposals_]) + + proposals = torch.zeros((batch_size, proposals_num, 3), + device=features.device) + for i, proposal in enumerate(proposals_): + proposals[i, :proposal.shape[0]] = proposal + + scores = proposals[:, :, 2] + for i in range(batch_size): + proposals[i, :, 2] = i + + proposals = proposals.view(batch_size * proposals_num, 3) + proposals_select = proposals[:, 0:2].sum(dim=1) > 0 + proposals = proposals[proposals_select, :] + scores = scores.view(-1)[proposals_select] + + features = features[proposals[:, 2].long()] + + preds_iou1, proposals1 = self.tbr1(proposals, features, None, 0.5, + False)[:2] + preds_iou2, proposals2 = self.tbr2(proposals1, features, None, 0.6, + False)[:2] + preds_iou3, proposals3 = self.tbr3(proposals2, features, None, 0.7, + False)[:2] + + all_proposals = [] + # all_proposals = [proposals] + all_proposals += [ + torch.cat([proposals1, (scores * preds_iou1).view(-1, 1)], dim=1) + ] + all_proposals += [ + torch.cat([proposals2, (scores * preds_iou2).view(-1, 1)], dim=1) + ] + all_proposals += [ + torch.cat([proposals3, (scores * preds_iou3).view(-1, 1)], dim=1) + ] + + all_proposals = torch.cat(all_proposals, dim=0).cpu().numpy() + video_info = batch_data_samples[0].metainfo + proposal_list = post_processing(all_proposals, video_info, + self.soft_nms_alpha, + self.soft_nms_low_threshold, + self.soft_nms_high_threshold, + self.post_process_top_k, + self.feature_extraction_interval) + output = [ + dict( + video_name=video_info['video_name'], + proposal_list=proposal_list) + ] + return output diff --git 
a/mmaction/models/localizers/utils/__init__.py b/mmaction/models/localizers/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53880641d5e611acc208ed0fa067a977b39b7d41 --- /dev/null +++ b/mmaction/models/localizers/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bsn_utils import generate_bsp_feature, generate_candidate_proposals +from .proposal_utils import (post_processing, soft_nms, temporal_iop, + temporal_iou) +from .tcanet_utils import (batch_iou, bbox_se_transform_batch, + bbox_se_transform_inv, bbox_xw_transform_batch, + bbox_xw_transform_inv) + +__all__ = [ + 'batch_iou', 'bbox_se_transform_batch', 'bbox_se_transform_inv', + 'bbox_xw_transform_batch', 'bbox_xw_transform_inv', 'generate_bsp_feature', + 'generate_candidate_proposals', 'post_processing', 'soft_nms', + 'temporal_iop', 'temporal_iou' +] diff --git a/mmaction/models/localizers/utils/__pycache__/__init__.cpython-310.pyc b/mmaction/models/localizers/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b39c4d5f334f34d9b70ec0a14221a84a92c9a75d Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/localizers/utils/__pycache__/bsn_utils.cpython-310.pyc b/mmaction/models/localizers/utils/__pycache__/bsn_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9feb489684011d9b62c25b34b217fc0583a11eb7 Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/bsn_utils.cpython-310.pyc differ diff --git a/mmaction/models/localizers/utils/__pycache__/proposal_utils.cpython-310.pyc b/mmaction/models/localizers/utils/__pycache__/proposal_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57b3708115e6d3bf4277d34d05c2e804a26d6500 Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/proposal_utils.cpython-310.pyc differ diff --git a/mmaction/models/localizers/utils/__pycache__/tcanet_utils.cpython-310.pyc b/mmaction/models/localizers/utils/__pycache__/tcanet_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1d1044834cbb0b985d49e49505498ded2091a08 Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/tcanet_utils.cpython-310.pyc differ diff --git a/mmaction/models/localizers/utils/bsn_utils.py b/mmaction/models/localizers/utils/bsn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..247dc8a0a850110627e8d5297a02c0cca881f492 --- /dev/null +++ b/mmaction/models/localizers/utils/bsn_utils.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import numpy as np + +from .proposal_utils import temporal_iop, temporal_iou + + +def generate_candidate_proposals(video_list, + video_infos, + tem_results_dir, + temporal_scale, + peak_threshold, + tem_results_ext='.csv', + result_dict=None): + """Generate Candidate Proposals with given temporal evaluation results. + + Each proposal file will contain: + 'tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa'. + Args: + video_list (list[int]): List of video indexes to generate proposals. + video_infos (list[dict]): List of video_info dict that contains + 'video_name', 'duration_frame', 'duration_second', + 'feature_frame', and 'annotations'. + tem_results_dir (str): Directory to load temporal evaluation + results. 
+ temporal_scale (int): The number (scale) on temporal axis. + peak_threshold (float): The threshold for proposal generation. + tem_results_ext (str): File extension for temporal evaluation + model output. Default: '.csv'. + result_dict (dict | None): The dict to save the results. Default: None. + Returns: + dict: A dict contains video_name as keys and proposal list as value. + If result_dict is not None, save the results to it. + """ + if tem_results_ext != '.csv': + raise NotImplementedError('Only support csv format now.') + + tscale = temporal_scale + tgap = 1. / tscale + proposal_dict = {} + for video_index in video_list: + video_name = video_infos[video_index]['video_name'] + tem_path = osp.join(tem_results_dir, video_name + tem_results_ext) + tem_results = np.loadtxt( + tem_path, dtype=np.float32, delimiter=',', skiprows=1) + start_scores = tem_results[:, 1] + end_scores = tem_results[:, 2] + + max_start = max(start_scores) + max_end = max(end_scores) + + start_bins = np.zeros(len(start_scores)) + start_bins[[0, -1]] = 1 + end_bins = np.zeros(len(end_scores)) + end_bins[[0, -1]] = 1 + for idx in range(1, tscale - 1): + if start_scores[idx] > start_scores[ + idx + 1] and start_scores[idx] > start_scores[idx - 1]: + start_bins[idx] = 1 + elif start_scores[idx] > (peak_threshold * max_start): + start_bins[idx] = 1 + if end_scores[idx] > end_scores[ + idx + 1] and end_scores[idx] > end_scores[idx - 1]: + end_bins[idx] = 1 + elif end_scores[idx] > (peak_threshold * max_end): + end_bins[idx] = 1 + + tmin_list = [] + tmin_score_list = [] + tmax_list = [] + tmax_score_list = [] + for idx in range(tscale): + if start_bins[idx] == 1: + tmin_list.append(tgap / 2 + tgap * idx) + tmin_score_list.append(start_scores[idx]) + if end_bins[idx] == 1: + tmax_list.append(tgap / 2 + tgap * idx) + tmax_score_list.append(end_scores[idx]) + + new_props = [] + for tmax, tmax_score in zip(tmax_list, tmax_score_list): + for tmin, tmin_score in zip(tmin_list, tmin_score_list): + if tmin >= tmax: + break + new_props.append([tmin, tmax, tmin_score, tmax_score]) + + new_props = np.stack(new_props) + + score = (new_props[:, 2] * new_props[:, 3]).reshape(-1, 1) + new_props = np.concatenate((new_props, score), axis=1) + + new_props = new_props[new_props[:, -1].argsort()[::-1]] + video_info = video_infos[video_index] + video_frame = video_info['duration_frame'] + video_second = video_info['duration_second'] + feature_frame = video_info['feature_frame'] + corrected_second = float(feature_frame) / video_frame * video_second + + gt_tmins = [] + gt_tmaxs = [] + for annotations in video_info['annotations']: + gt_tmins.append(annotations['segment'][0] / corrected_second) + gt_tmaxs.append(annotations['segment'][1] / corrected_second) + + new_iou_list = [] + new_ioa_list = [] + for new_prop in new_props: + new_iou = max( + temporal_iou(new_prop[0], new_prop[1], gt_tmins, gt_tmaxs)) + new_ioa = max( + temporal_iop(new_prop[0], new_prop[1], gt_tmins, gt_tmaxs)) + new_iou_list.append(new_iou) + new_ioa_list.append(new_ioa) + + new_iou_list = np.array(new_iou_list).reshape(-1, 1) + new_ioa_list = np.array(new_ioa_list).reshape(-1, 1) + new_props = np.concatenate((new_props, new_iou_list), axis=1) + new_props = np.concatenate((new_props, new_ioa_list), axis=1) + proposal_dict[video_name] = new_props + if result_dict is not None: + result_dict[video_name] = new_props + return proposal_dict + + +def generate_bsp_feature(video_list, + video_infos, + tem_results_dir, + pgm_proposals_dir, + top_k=1000, + bsp_boundary_ratio=0.2, + 
num_sample_start=8, + num_sample_end=8, + num_sample_action=16, + num_sample_interp=3, + tem_results_ext='.csv', + pgm_proposal_ext='.csv', + result_dict=None): + """Generate Boundary-Sensitive Proposal Feature with given proposals. + + Args: + video_list (list[int]): List of video indexes to generate bsp_feature. + video_infos (list[dict]): List of video_info dict that contains + 'video_name'. + tem_results_dir (str): Directory to load temporal evaluation + results. + pgm_proposals_dir (str): Directory to load proposals. + top_k (int): Number of proposals to be considered. Default: 1000 + bsp_boundary_ratio (float): Ratio for proposal boundary + (start/end). Default: 0.2. + num_sample_start (int): Num of samples for actionness in + start region. Default: 8. + num_sample_end (int): Num of samples for actionness in end region. + Default: 8. + num_sample_action (int): Num of samples for actionness in center + region. Default: 16. + num_sample_interp (int): Num of samples for interpolation for + each sample point. Default: 3. + tem_results_ext (str): File extension for temporal evaluation + model output. Default: '.csv'. + pgm_proposal_ext (str): File extension for proposals. Default: '.csv'. + result_dict (dict | None): The dict to save the results. Default: None. + Returns: + bsp_feature_dict (dict): A dict contains video_name as keys and + bsp_feature as value. If result_dict is not None, save the + results to it. + """ + if tem_results_ext != '.csv' or pgm_proposal_ext != '.csv': + raise NotImplementedError('Only support csv format now.') + + bsp_feature_dict = {} + for video_index in video_list: + video_name = video_infos[video_index]['video_name'] + + # Load temporal evaluation results + tem_path = osp.join(tem_results_dir, video_name + tem_results_ext) + tem_results = np.loadtxt( + tem_path, dtype=np.float32, delimiter=',', skiprows=1) + score_action = tem_results[:, 0] + seg_tmins = tem_results[:, 3] + seg_tmaxs = tem_results[:, 4] + video_scale = len(tem_results) + video_gap = seg_tmaxs[0] - seg_tmins[0] + video_extend = int(video_scale / 4 + 10) + + # Load proposals results + proposal_path = osp.join(pgm_proposals_dir, + video_name + pgm_proposal_ext) + pgm_proposals = np.loadtxt( + proposal_path, dtype=np.float32, delimiter=',', skiprows=1) + pgm_proposals = pgm_proposals[:top_k] + + # Generate temporal sample points + boundary_zeros = np.zeros([video_extend]) + score_action = np.concatenate( + (boundary_zeros, score_action, boundary_zeros)) + begin_tp = [] + middle_tp = [] + end_tp = [] + for i in range(video_extend): + begin_tp.append(-video_gap / 2 - + (video_extend - 1 - i) * video_gap) + end_tp.append(video_gap / 2 + seg_tmaxs[-1] + i * video_gap) + for i in range(video_scale): + middle_tp.append(video_gap / 2 + i * video_gap) + t_points = begin_tp + middle_tp + end_tp + + bsp_feature = [] + for pgm_proposal in pgm_proposals: + tmin = pgm_proposal[0] + tmax = pgm_proposal[1] + + tlen = tmax - tmin + # Temporal range for start + tmin_0 = tmin - tlen * bsp_boundary_ratio + tmin_1 = tmin + tlen * bsp_boundary_ratio + # Temporal range for end + tmax_0 = tmax - tlen * bsp_boundary_ratio + tmax_1 = tmax + tlen * bsp_boundary_ratio + + # Generate features at start boundary + tlen_start = (tmin_1 - tmin_0) / (num_sample_start - 1) + tlen_start_sample = tlen_start / num_sample_interp + t_new = [ + tmin_0 - tlen_start / 2 + tlen_start_sample * i + for i in range(num_sample_start * num_sample_interp + 1) + ] + y_new_start_action = np.interp(t_new, t_points, score_action) + y_new_start = 
[ + np.mean(y_new_start_action[i * num_sample_interp:(i + 1) * + num_sample_interp + 1]) + for i in range(num_sample_start) + ] + # Generate features at end boundary + tlen_end = (tmax_1 - tmax_0) / (num_sample_end - 1) + tlen_end_sample = tlen_end / num_sample_interp + t_new = [ + tmax_0 - tlen_end / 2 + tlen_end_sample * i + for i in range(num_sample_end * num_sample_interp + 1) + ] + y_new_end_action = np.interp(t_new, t_points, score_action) + y_new_end = [ + np.mean(y_new_end_action[i * num_sample_interp:(i + 1) * + num_sample_interp + 1]) + for i in range(num_sample_end) + ] + # Generate features for action + tlen_action = (tmax - tmin) / (num_sample_action - 1) + tlen_action_sample = tlen_action / num_sample_interp + t_new = [ + tmin - tlen_action / 2 + tlen_action_sample * i + for i in range(num_sample_action * num_sample_interp + 1) + ] + y_new_action = np.interp(t_new, t_points, score_action) + y_new_action = [ + np.mean(y_new_action[i * num_sample_interp:(i + 1) * + num_sample_interp + 1]) + for i in range(num_sample_action) + ] + feature = np.concatenate([y_new_action, y_new_start, y_new_end]) + bsp_feature.append(feature) + bsp_feature = np.array(bsp_feature) + bsp_feature_dict[video_name] = bsp_feature + if result_dict is not None: + result_dict[video_name] = bsp_feature + return bsp_feature_dict diff --git a/mmaction/models/localizers/utils/proposal_utils.py b/mmaction/models/localizers/utils/proposal_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..925084bf37df82a4f2b2d584d628310804748d79 --- /dev/null +++ b/mmaction/models/localizers/utils/proposal_utils.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def temporal_iou(proposal_min, proposal_max, gt_min, gt_max): + """Compute IoU score between a groundtruth bbox and the proposals. + + Args: + proposal_min (list[float]): List of temporal anchor min. + proposal_max (list[float]): List of temporal anchor max. + gt_min (float): Groundtruth temporal box min. + gt_max (float): Groundtruth temporal box max. + Returns: + list[float]: List of iou scores. + """ + len_anchors = proposal_max - proposal_min + int_tmin = np.maximum(proposal_min, gt_min) + int_tmax = np.minimum(proposal_max, gt_max) + inter_len = np.maximum(int_tmax - int_tmin, 0.) + union_len = len_anchors - inter_len + gt_max - gt_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + +def temporal_iop(proposal_min, proposal_max, gt_min, gt_max): + """Compute IoP score between a groundtruth bbox and the proposals. + + Compute the IoP which is defined as the overlap ratio with + groundtruth proportional to the duration of this proposal. + Args: + proposal_min (list[float]): List of temporal anchor min. + proposal_max (list[float]): List of temporal anchor max. + gt_min (float): Groundtruth temporal box min. + gt_max (float): Groundtruth temporal box max. + Returns: + list[float]: List of intersection over anchor scores. + """ + len_anchors = np.array(proposal_max - proposal_min) + int_tmin = np.maximum(proposal_min, gt_min) + int_tmax = np.minimum(proposal_max, gt_max) + inter_len = np.maximum(int_tmax - int_tmin, 0.) + scores = np.divide(inter_len, len_anchors) + return scores + + +def soft_nms(proposals, alpha, low_threshold, high_threshold, top_k): + """Soft NMS for temporal proposals. + + Args: + proposals (np.ndarray): Proposals generated by network. + alpha (float): Alpha value of Gaussian decaying function. + low_threshold (float): Low threshold for soft nms. 
+ high_threshold (float): High threshold for soft nms. + top_k (int): Top k values to be considered. + Returns: + np.ndarray: The updated proposals. + """ + proposals = proposals[proposals[:, -1].argsort()[::-1]] + tstart = list(proposals[:, 0]) + tend = list(proposals[:, 1]) + tscore = list(proposals[:, -1]) + rstart = [] + rend = [] + rscore = [] + + while len(tscore) > 0 and len(rscore) <= top_k: + max_index = np.argmax(tscore) + max_width = tend[max_index] - tstart[max_index] + iou_list = temporal_iou(tstart[max_index], tend[max_index], + np.array(tstart), np.array(tend)) + iou_exp_list = np.exp(-np.square(iou_list) / alpha) + + for idx, _ in enumerate(tscore): + if idx != max_index: + current_iou = iou_list[idx] + if current_iou > low_threshold + (high_threshold - + low_threshold) * max_width: + tscore[idx] = tscore[idx] * iou_exp_list[idx] + + rstart.append(tstart[max_index]) + rend.append(tend[max_index]) + rscore.append(tscore[max_index]) + tstart.pop(max_index) + tend.pop(max_index) + tscore.pop(max_index) + + rstart = np.array(rstart).reshape(-1, 1) + rend = np.array(rend).reshape(-1, 1) + rscore = np.array(rscore).reshape(-1, 1) + new_proposals = np.concatenate((rstart, rend, rscore), axis=1) + return new_proposals + + +def post_processing(result, video_info, soft_nms_alpha, soft_nms_low_threshold, + soft_nms_high_threshold, post_process_top_k, + feature_extraction_interval): + """Post process for temporal proposals generation. + Args: + result (np.ndarray): Proposals generated by network. + video_info (dict): Meta data of video. Required keys are + 'duration_frame', 'duration_second'. + soft_nms_alpha (float): Alpha value of Gaussian decaying function. + soft_nms_low_threshold (float): Low threshold for soft nms. + soft_nms_high_threshold (float): High threshold for soft nms. + post_process_top_k (int): Top k values to be considered. + feature_extraction_interval (int): Interval used in feature extraction. + Returns: + list[dict]: The updated proposals, e.g. + [{'score': 0.9, 'segment': [0, 1]}, + {'score': 0.8, 'segment': [0, 2]}, + ...]. + """ + if len(result) > 1: + result = soft_nms(result, soft_nms_alpha, soft_nms_low_threshold, + soft_nms_high_threshold, post_process_top_k) + + result = result[result[:, -1].argsort()[::-1]] + video_duration = float( + video_info['duration_frame'] // feature_extraction_interval * + feature_extraction_interval + ) / video_info['duration_frame'] * video_info['duration_second'] + proposal_list = [] + + for j in range(min(post_process_top_k, len(result))): + proposal = {} + proposal['score'] = float(result[j, -1]) + proposal['segment'] = [ + max(0, result[j, 0]) * video_duration, + min(1, result[j, 1]) * video_duration + ] + proposal_list.append(proposal) + return proposal_list diff --git a/mmaction/models/localizers/utils/tcanet_utils.py b/mmaction/models/localizers/utils/tcanet_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fe387b43c523df346b457f09784a7555480747a --- /dev/null +++ b/mmaction/models/localizers/utils/tcanet_utils.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copied from +# 'https://github.com/qinzhi-0110/' +# 'Temporal-Context-Aggregation-Network-Pytorch/' +# 'blob/main/utils.py' +# TODO: refactor +import torch + + +def batch_iou(proposals, gt_boxes): + len_proposals = proposals[:, 1] - proposals[:, 0] + int_xmin = torch.max(proposals[:, 0], gt_boxes[:, 0]) + int_xmax = torch.min(proposals[:, 1], gt_boxes[:, 1]) + inter_len = torch.clamp(int_xmax - int_xmin, min=0.) 
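+    # temporal IoU: union = |proposal| + |gt| - intersection; the small
+    # epsilon added below avoids division by zero for degenerate boxes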
+ union_len = len_proposals - inter_len + gt_boxes[:, 1] - gt_boxes[:, 0] + jaccard = inter_len / (union_len + 0.00001) + return jaccard + + +def bbox_xw_transform_inv(boxes, deltas, dx_w, dw_w): + widths = boxes[:, 1] - boxes[:, 0] + ctr_x = boxes[:, 0] + 0.5 * widths + + dx = deltas[:, 0] * dx_w + dw = deltas[:, 1] * dw_w + + pred_ctr_x = dx * widths + ctr_x + pred_w = torch.exp(dw) * widths + + pred_boxes = deltas.clone() + # x1 + pred_boxes[:, 0] = pred_ctr_x - 0.5 * pred_w + # x2 + pred_boxes[:, 1] = pred_ctr_x + 0.5 * pred_w + + return pred_boxes + + +def bbox_xw_transform_batch(ex_rois, gt_rois): + ex_widths = torch.clamp(ex_rois[:, 1] - ex_rois[:, 0], min=0.00001) + ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths + + gt_widths = torch.clamp(gt_rois[:, 1] - gt_rois[:, 0], min=0.00001) + gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dw = torch.log(gt_widths / ex_widths) + targets = torch.stack((targets_dx, targets_dw), dim=1) + return targets + + +def bbox_se_transform_batch(ex_rois, gt_rois): + ex_widths = torch.clamp(ex_rois[:, 1] - ex_rois[:, 0], min=0.00001) + + s_offset = gt_rois[:, 0] - ex_rois[:, 0] + e_offset = gt_rois[:, 1] - ex_rois[:, 1] + + targets_s = s_offset / ex_widths + targets_e = e_offset / ex_widths + targets = torch.stack((targets_s, targets_e), dim=1) + return targets + + +def bbox_se_transform_inv(boxes, deltas, dse_w): + widths = boxes[:, 1] - boxes[:, 0] + s_offset = deltas[:, 0] * widths * dse_w + e_offset = deltas[:, 1] * widths * dse_w + pred_boxes = deltas.clone() + pred_boxes[:, 0] = boxes[:, 0] + s_offset + pred_boxes[:, 1] = boxes[:, 1] + e_offset + return pred_boxes diff --git a/mmaction/models/losses/__init__.py b/mmaction/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d137fe010c490299e4bcc3fbe7fcf1f9f59d694d --- /dev/null +++ b/mmaction/models/losses/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseWeightedLoss +from .binary_logistic_regression_loss import BinaryLogisticRegressionLoss +from .bmn_loss import BMNLoss +from .cross_entropy_loss import (BCELossWithLogits, CBFocalLoss, + CrossEntropyLoss) +from .hvu_loss import HVULoss +from .nll_loss import NLLLoss +from .ohem_hinge_loss import OHEMHingeLoss +from .ssn_loss import SSNLoss + +__all__ = [ + 'BaseWeightedLoss', 'CrossEntropyLoss', 'NLLLoss', 'BCELossWithLogits', + 'BinaryLogisticRegressionLoss', 'BMNLoss', 'OHEMHingeLoss', 'SSNLoss', + 'HVULoss', 'CBFocalLoss' +] diff --git a/mmaction/models/losses/base.py b/mmaction/models/losses/base.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd3c797e3a500154221293016b75e47bb208e3f --- /dev/null +++ b/mmaction/models/losses/base.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + + +class BaseWeightedLoss(nn.Module, metaclass=ABCMeta): + """Base class for loss. + + All subclass should overwrite the ``_forward()`` method which returns the + normal loss without loss weights. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Default: 1.0. + """ + + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + @abstractmethod + def _forward(self, *args, **kwargs): + """Forward function.""" + pass + + def forward(self, *args, **kwargs): + """Defines the computation performed at every call. 
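+
+        The raw loss returned by ``_forward`` is scaled by
+        ``self.loss_weight``; when ``_forward`` returns a dict, only the
+        entries whose key contains ``'loss'`` are scaled.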
+ + Args: + *args: The positional arguments for the corresponding + loss. + **kwargs: The keyword arguments for the corresponding + loss. + + Returns: + torch.Tensor: The calculated loss. + """ + ret = self._forward(*args, **kwargs) + if isinstance(ret, dict): + for k in ret: + if 'loss' in k: + ret[k] *= self.loss_weight + else: + ret *= self.loss_weight + return ret diff --git a/mmaction/models/losses/binary_logistic_regression_loss.py b/mmaction/models/losses/binary_logistic_regression_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..537d03f12f9957aaa5b9dec7f737d2a93f6f1bb9 --- /dev/null +++ b/mmaction/models/losses/binary_logistic_regression_loss.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.registry import MODELS + + +def binary_logistic_regression_loss(reg_score, + label, + threshold=0.5, + ratio_range=(1.05, 21), + eps=1e-5): + """Binary Logistic Regression Loss.""" + label = label.view(-1).to(reg_score.device) + reg_score = reg_score.contiguous().view(-1) + + pmask = (label > threshold).float().to(reg_score.device) + num_positive = max(torch.sum(pmask), 1) + num_entries = len(label) + ratio = num_entries / num_positive + # clip ratio value between ratio_range + ratio = min(max(ratio, ratio_range[0]), ratio_range[1]) + + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + loss = coef_1 * pmask * torch.log(reg_score + eps) + coef_0 * ( + 1.0 - pmask) * torch.log(1.0 - reg_score + eps) + loss = -torch.mean(loss) + return loss + + +@MODELS.register_module() +class BinaryLogisticRegressionLoss(nn.Module): + """Binary Logistic Regression Loss. + + It will calculate binary logistic regression loss given reg_score and + label. + """ + + def forward(self, + reg_score, + label, + threshold=0.5, + ratio_range=(1.05, 21), + eps=1e-5): + """Calculate Binary Logistic Regression Loss. + + Args: + reg_score (torch.Tensor): Predicted score by model. + label (torch.Tensor): Groundtruth labels. + threshold (float): Threshold for positive instances. + Default: 0.5. + ratio_range (tuple): Lower bound and upper bound for ratio. + Default: (1.05, 21) + eps (float): Epsilon for small value. Default: 1e-5. + + Returns: + torch.Tensor: Returned binary logistic loss. + """ + + return binary_logistic_regression_loss(reg_score, label, threshold, + ratio_range, eps) diff --git a/mmaction/models/losses/bmn_loss.py b/mmaction/models/losses/bmn_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..df9bacc842c2e22cd2149373bd882466e2b9f150 --- /dev/null +++ b/mmaction/models/losses/bmn_loss.py @@ -0,0 +1,181 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmaction.registry import MODELS +from .binary_logistic_regression_loss import binary_logistic_regression_loss + + +@MODELS.register_module() +class BMNLoss(nn.Module): + """BMN Loss. + + From paper https://arxiv.org/abs/1907.09702, + code https://github.com/JJBOY/BMN-Boundary-Matching-Network. + It will calculate loss for BMN Model. This loss is a weighted sum of + + 1) temporal evaluation loss based on confidence score of start and + end positions. + 2) proposal evaluation regression loss based on confidence scores of + candidate proposals. + 3) proposal evaluation classification loss based on classification + results of candidate proposals. 
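+
+    The three terms are combined in ``forward`` as ``weight_tem * tem_loss +
+    weight_pem_reg * pem_reg_loss + weight_pem_cls * pem_cls_loss``.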
+    """
+
+    @staticmethod
+    def tem_loss(pred_start, pred_end, gt_start, gt_end):
+        """Calculate Temporal Evaluation Module Loss.
+
+        This function calculates the binary_logistic_regression_loss for start
+        and end respectively and returns the sum of their losses.
+
+        Args:
+            pred_start (torch.Tensor): Predicted start score by BMN model.
+            pred_end (torch.Tensor): Predicted end score by BMN model.
+            gt_start (torch.Tensor): Groundtruth confidence score for start.
+            gt_end (torch.Tensor): Groundtruth confidence score for end.
+
+        Returns:
+            torch.Tensor: Returned binary logistic loss.
+        """
+        loss_start = binary_logistic_regression_loss(pred_start, gt_start)
+        loss_end = binary_logistic_regression_loss(pred_end, gt_end)
+        loss = loss_start + loss_end
+        return loss
+
+    @staticmethod
+    def pem_reg_loss(pred_score,
+                     gt_iou_map,
+                     mask,
+                     high_temporal_iou_threshold=0.7,
+                     low_temporal_iou_threshold=0.3):
+        """Calculate Proposal Evaluation Module Regression Loss.
+
+        Args:
+            pred_score (torch.Tensor): Predicted temporal_iou score by BMN.
+            gt_iou_map (torch.Tensor): Groundtruth temporal_iou score.
+            mask (torch.Tensor): Boundary-Matching mask.
+            high_temporal_iou_threshold (float): Higher threshold of
+                temporal_iou. Default: 0.7.
+            low_temporal_iou_threshold (float): Lower threshold of
+                temporal_iou. Default: 0.3.
+
+        Returns:
+            torch.Tensor: Proposal evaluation regression loss.
+        """
+        u_hmask = (gt_iou_map > high_temporal_iou_threshold).float()
+        u_mmask = ((gt_iou_map <= high_temporal_iou_threshold) &
+                   (gt_iou_map > low_temporal_iou_threshold)).float()
+        u_lmask = ((gt_iou_map <= low_temporal_iou_threshold) &
+                   (gt_iou_map > 0.)).float()
+        u_lmask = u_lmask * mask
+
+        num_h = torch.sum(u_hmask)
+        num_m = torch.sum(u_mmask)
+        num_l = torch.sum(u_lmask)
+
+        r_m = num_h / num_m
+        u_smmask = torch.rand_like(gt_iou_map)
+        u_smmask = u_mmask * u_smmask
+        u_smmask = (u_smmask > (1. - r_m)).float()
+
+        r_l = num_h / num_l
+        u_slmask = torch.rand_like(gt_iou_map)
+        u_slmask = u_lmask * u_slmask
+        u_slmask = (u_slmask > (1. - r_l)).float()
+
+        weights = u_hmask + u_smmask + u_slmask
+
+        loss = F.mse_loss(pred_score * weights, gt_iou_map * weights)
+        loss = 0.5 * torch.sum(
+            loss * torch.ones_like(weights)) / torch.sum(weights)
+
+        return loss
+
+    @staticmethod
+    def pem_cls_loss(pred_score,
+                     gt_iou_map,
+                     mask,
+                     threshold=0.9,
+                     ratio_range=(1.05, 21),
+                     eps=1e-5):
+        """Calculate Proposal Evaluation Module Classification Loss.
+
+        Args:
+            pred_score (torch.Tensor): Predicted temporal_iou score by BMN.
+            gt_iou_map (torch.Tensor): Groundtruth temporal_iou score.
+            mask (torch.Tensor): Boundary-Matching mask.
+            threshold (float): Threshold of temporal_iou for positive
+                instances. Default: 0.9.
+            ratio_range (tuple): Lower bound and upper bound for ratio.
+                Default: (1.05, 21)
+            eps (float): Epsilon for small value. Default: 1e-5
+
+        Returns:
+            torch.Tensor: Proposal evaluation classification loss.
+ """ + pmask = (gt_iou_map > threshold).float() + nmask = (gt_iou_map <= threshold).float() + nmask = nmask * mask + + num_positive = max(torch.sum(pmask), 1) + num_entries = num_positive + torch.sum(nmask) + ratio = num_entries / num_positive + ratio = torch.clamp(ratio, ratio_range[0], ratio_range[1]) + + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + + loss_pos = coef_1 * torch.log(pred_score + eps) * pmask + loss_neg = coef_0 * torch.log(1.0 - pred_score + eps) * nmask + loss = -1 * torch.sum(loss_pos + loss_neg) / num_entries + return loss + + def forward(self, + pred_bm, + pred_start, + pred_end, + gt_iou_map, + gt_start, + gt_end, + bm_mask, + weight_tem=1.0, + weight_pem_reg=10.0, + weight_pem_cls=1.0): + """Calculate Boundary Matching Network Loss. + + Args: + pred_bm (torch.Tensor): Predicted confidence score for boundary + matching map. + pred_start (torch.Tensor): Predicted confidence score for start. + pred_end (torch.Tensor): Predicted confidence score for end. + gt_iou_map (torch.Tensor): Groundtruth score for boundary matching + map. + gt_start (torch.Tensor): Groundtruth temporal_iou score for start. + gt_end (torch.Tensor): Groundtruth temporal_iou score for end. + bm_mask (torch.Tensor): Boundary-Matching mask. + weight_tem (float): Weight for tem loss. Default: 1.0. + weight_pem_reg (float): Weight for pem regression loss. + Default: 10.0. + weight_pem_cls (float): Weight for pem classification loss. + Default: 1.0. + + Returns: + tuple([torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]): + (loss, tem_loss, pem_reg_loss, pem_cls_loss). Loss is the bmn + loss, tem_loss is the temporal evaluation loss, pem_reg_loss is + the proposal evaluation regression loss, pem_cls_loss is the + proposal evaluation classification loss. + """ + pred_bm_reg = pred_bm[:, 0].contiguous() + pred_bm_cls = pred_bm[:, 1].contiguous() + gt_iou_map = gt_iou_map * bm_mask + + pem_reg_loss = self.pem_reg_loss(pred_bm_reg, gt_iou_map, bm_mask) + pem_cls_loss = self.pem_cls_loss(pred_bm_cls, gt_iou_map, bm_mask) + tem_loss = self.tem_loss(pred_start, pred_end, gt_start, gt_end) + loss = ( + weight_tem * tem_loss + weight_pem_reg * pem_reg_loss + + weight_pem_cls * pem_cls_loss) + return loss, tem_loss, pem_reg_loss, pem_cls_loss diff --git a/mmaction/models/losses/cross_entropy_loss.py b/mmaction/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7324f20619851c0fa06b64458f14720175ae8229 --- /dev/null +++ b/mmaction/models/losses/cross_entropy_loss.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +import numpy as np +import torch +import torch.nn.functional as F + +from mmaction.registry import MODELS +from .base import BaseWeightedLoss + + +@MODELS.register_module() +class CrossEntropyLoss(BaseWeightedLoss): + """Cross Entropy Loss. + + Support two kinds of labels and their corresponding loss type. It's worth + mentioning that loss type will be detected by the shape of ``cls_score`` + and ``label``. + 1) Hard label: This label is an integer array and all of the elements are + in the range [0, num_classes - 1]. This label's shape should be + ``cls_score``'s shape with the `num_classes` dimension removed. + 2) Soft label(probability distribution over classes): This label is a + probability distribution and all of the elements are in the range + [0, 1]. This label's shape must be the same as ``cls_score``. For now, + only 2-dim soft label is supported. 
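+
+    A minimal shape-level sketch (inputs are random and purely illustrative):
+
+    Examples:
+        >>> import torch
+        >>> loss = CrossEntropyLoss()
+        >>> cls_score = torch.randn(4, 3)
+        >>> hard_label = torch.randint(0, 3, (4, ))
+        >>> soft_label = torch.softmax(torch.randn(4, 3), dim=1)
+        >>> loss(cls_score, hard_label).shape
+        torch.Size([])
+        >>> loss(cls_score, soft_label).shape
+        torch.Size([])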
+ + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Defaults to 1.0. + class_weight (list[float] | None): Loss weight for each class. If set + as None, use the same weight 1 for all classes. Only applies + to CrossEntropyLoss and BCELossWithLogits (should not be set when + using other losses). Defaults to None. + """ + + def __init__(self, + loss_weight: float = 1.0, + class_weight: Optional[List[float]] = None) -> None: + super().__init__(loss_weight=loss_weight) + self.class_weight = None + if class_weight is not None: + self.class_weight = torch.Tensor(class_weight) + + def _forward(self, cls_score: torch.Tensor, label: torch.Tensor, + **kwargs) -> torch.Tensor: + """Forward function. + + Args: + cls_score (torch.Tensor): The class score. + label (torch.Tensor): The ground truth label. + kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + + Returns: + torch.Tensor: The returned CrossEntropy loss. + """ + if cls_score.size() == label.size(): + # calculate loss for soft label + + assert cls_score.dim() == 2, 'Only support 2-dim soft label' + assert len(kwargs) == 0, \ + ('For now, no extra args are supported for soft label, ' + f'but get {kwargs}') + + lsm = F.log_softmax(cls_score, 1) + if self.class_weight is not None: + self.class_weight = self.class_weight.to(cls_score.device) + lsm = lsm * self.class_weight.unsqueeze(0) + loss_cls = -(label * lsm).sum(1) + + # default reduction 'mean' + if self.class_weight is not None: + # Use weighted average as pytorch CrossEntropyLoss does. + # For more information, please visit https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html # noqa + loss_cls = loss_cls.sum() / torch.sum( + self.class_weight.unsqueeze(0) * label) + else: + loss_cls = loss_cls.mean() + else: + # calculate loss for hard label + + if self.class_weight is not None: + assert 'weight' not in kwargs, \ + "The key 'weight' already exists." + kwargs['weight'] = self.class_weight.to(cls_score.device) + loss_cls = F.cross_entropy(cls_score, label, **kwargs) + + return loss_cls + + +@MODELS.register_module() +class BCELossWithLogits(BaseWeightedLoss): + """Binary Cross Entropy Loss with logits. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Defaults to 1.0. + class_weight (list[float] | None): Loss weight for each class. If set + as None, use the same weight 1 for all classes. Only applies + to CrossEntropyLoss and BCELossWithLogits (should not be set when + using other losses). Defaults to None. + """ + + def __init__(self, + loss_weight: float = 1.0, + class_weight: Optional[List[float]] = None) -> None: + super().__init__(loss_weight=loss_weight) + self.class_weight = None + if class_weight is not None: + self.class_weight = torch.Tensor(class_weight) + + def _forward(self, cls_score: torch.Tensor, label: torch.Tensor, + **kwargs) -> torch.Tensor: + """Forward function. + + Args: + cls_score (torch.Tensor): The class score. + label (torch.Tensor): The ground truth label. + kwargs: Any keyword argument to be used to calculate + bce loss with logits. + + Returns: + torch.Tensor: The returned bce loss with logits. + """ + if self.class_weight is not None: + assert 'weight' not in kwargs, "The key 'weight' already exists." + kwargs['weight'] = self.class_weight.to(cls_score.device) + loss_cls = F.binary_cross_entropy_with_logits(cls_score, label, + **kwargs) + return loss_cls + + +@MODELS.register_module() +class CBFocalLoss(BaseWeightedLoss): + """Class Balanced Focal Loss. 
Adapted from https://github.com/abhinanda- + punnakkal/BABEL/. This loss is used in the skeleton-based action + recognition baseline for BABEL. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Defaults to 1.0. + samples_per_cls (list[int]): The number of samples per class. + Defaults to []. + beta (float): Hyperparameter that controls the per class loss weight. + Defaults to 0.9999. + gamma (float): Hyperparameter of the focal loss. Defaults to 2.0. + """ + + def __init__(self, + loss_weight: float = 1.0, + samples_per_cls: List[int] = [], + beta: float = 0.9999, + gamma: float = 2.) -> None: + super().__init__(loss_weight=loss_weight) + self.samples_per_cls = samples_per_cls + self.beta = beta + self.gamma = gamma + effective_num = 1.0 - np.power(beta, samples_per_cls) + weights = (1.0 - beta) / np.array(effective_num) + weights = weights / np.sum(weights) * len(weights) + self.weights = weights + self.num_classes = len(weights) + + def _forward(self, cls_score: torch.Tensor, label: torch.Tensor, + **kwargs) -> torch.Tensor: + """Forward function. + + Args: + cls_score (torch.Tensor): The class score. + label (torch.Tensor): The ground truth label. + kwargs: Any keyword argument to be used to calculate + bce loss with logits. + + Returns: + torch.Tensor: The returned bce loss with logits. + """ + weights = torch.tensor(self.weights).float().to(cls_score.device) + label_one_hot = F.one_hot(label, self.num_classes).float() + weights = weights.unsqueeze(0) + weights = weights.repeat(label_one_hot.shape[0], 1) * label_one_hot + weights = weights.sum(1) + weights = weights.unsqueeze(1) + weights = weights.repeat(1, self.num_classes) + + BCELoss = F.binary_cross_entropy_with_logits( + input=cls_score, target=label_one_hot, reduction='none') + + modulator = 1.0 + if self.gamma: + modulator = torch.exp(-self.gamma * label_one_hot * cls_score - + self.gamma * + torch.log(1 + torch.exp(-1.0 * cls_score))) + + loss = modulator * BCELoss + weighted_loss = weights * loss + + focal_loss = torch.sum(weighted_loss) + focal_loss /= torch.sum(label_one_hot) + + return focal_loss diff --git a/mmaction/models/losses/hvu_loss.py b/mmaction/models/losses/hvu_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a79803fbcfc4eebfa8b4b89f7b4a98343533f911 --- /dev/null +++ b/mmaction/models/losses/hvu_loss.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F +from mmengine.device import get_device + +from mmaction.registry import MODELS +from .base import BaseWeightedLoss + + +@MODELS.register_module() +class HVULoss(BaseWeightedLoss): + """Calculate the BCELoss for HVU. + + Args: + categories (tuple[str]): Names of tag categories, tags are organized in + this order. Default: ['action', 'attribute', 'concept', 'event', + 'object', 'scene']. + category_nums (tuple[int]): Number of tags for each category. Default: + (739, 117, 291, 69, 1678, 248). + category_loss_weights (tuple[float]): Loss weights of categories, it + applies only if `loss_type == 'individual'`. The loss weights will + be normalized so that the sum equals to 1, so that you can give any + positive number as loss weight. Default: (1, 1, 1, 1, 1, 1). + loss_type (str): The loss type we calculate, we can either calculate + the BCELoss for all tags, or calculate the BCELoss for tags in each + category. Choices are 'individual' or 'all'. Default: 'all'. + with_mask (bool): Since some tag categories are missing for some video + clips. 
If `with_mask == True`, we will not calculate loss for these + missing categories. Otherwise, these missing categories are treated + as negative samples. + reduction (str): Reduction way. Choices are 'mean' or 'sum'. Default: + 'mean'. + loss_weight (float): The loss weight. Default: 1.0. + """ + + def __init__(self, + categories=('action', 'attribute', 'concept', 'event', + 'object', 'scene'), + category_nums=(739, 117, 291, 69, 1678, 248), + category_loss_weights=(1, 1, 1, 1, 1, 1), + loss_type='all', + with_mask=False, + reduction='mean', + loss_weight=1.0): + + super().__init__(loss_weight) + self.categories = categories + self.category_nums = category_nums + self.category_loss_weights = category_loss_weights + assert len(self.category_nums) == len(self.category_loss_weights) + for category_loss_weight in self.category_loss_weights: + assert category_loss_weight >= 0 + self.loss_type = loss_type + self.with_mask = with_mask + self.reduction = reduction + self.category_startidx = [0] + for i in range(len(self.category_nums) - 1): + self.category_startidx.append(self.category_startidx[-1] + + self.category_nums[i]) + assert self.loss_type in ['individual', 'all'] + assert self.reduction in ['mean', 'sum'] + + def _forward(self, cls_score, label, mask, category_mask): + """Forward function. + + Args: + cls_score (torch.Tensor): The class score. + label (torch.Tensor): The ground truth label. + mask (torch.Tensor): The mask of tags. 0 indicates that the + category of this tag is missing in the label of the video. + category_mask (torch.Tensor): The category mask. For each sample, + it's a tensor with length `len(self.categories)`, denotes that + if the category is labeled for this video. + + Returns: + torch.Tensor: The returned CrossEntropy loss. + """ + + if self.loss_type == 'all': + loss_cls = F.binary_cross_entropy_with_logits( + cls_score, label, reduction='none') + if self.with_mask: + w_loss_cls = mask * loss_cls + w_loss_cls = torch.sum(w_loss_cls, dim=1) + if self.reduction == 'mean': + w_loss_cls = w_loss_cls / torch.sum(mask, dim=1) + w_loss_cls = torch.mean(w_loss_cls) + return dict(loss_cls=w_loss_cls) + + if self.reduction == 'sum': + loss_cls = torch.sum(loss_cls, dim=-1) + return dict(loss_cls=torch.mean(loss_cls)) + + if self.loss_type == 'individual': + losses = {} + loss_weights = {} + for name, num, start_idx in zip(self.categories, + self.category_nums, + self.category_startidx): + category_score = cls_score[:, start_idx:start_idx + num] + category_label = label[:, start_idx:start_idx + num] + category_loss = F.binary_cross_entropy_with_logits( + category_score, category_label, reduction='none') + if self.reduction == 'mean': + category_loss = torch.mean(category_loss, dim=1) + elif self.reduction == 'sum': + category_loss = torch.sum(category_loss, dim=1) + + idx = self.categories.index(name) + if self.with_mask: + category_mask_i = category_mask[:, idx].reshape(-1) + # there should be at least one sample which contains tags + # in this category + if torch.sum(category_mask_i) < 0.5: + losses[f'{name}_LOSS'] = torch.tensor( + .0, device=get_device()) + loss_weights[f'{name}_LOSS'] = .0 + continue + category_loss = torch.sum(category_loss * category_mask_i) + category_loss = category_loss / torch.sum(category_mask_i) + else: + category_loss = torch.mean(category_loss) + # We name the loss of each category as 'LOSS', since we only + # want to monitor them, not backward them. 
We will also provide + # the loss used for backward in the losses dictionary + losses[f'{name}_LOSS'] = category_loss + loss_weights[f'{name}_LOSS'] = self.category_loss_weights[idx] + loss_weight_sum = sum(loss_weights.values()) + loss_weights = { + k: v / loss_weight_sum + for k, v in loss_weights.items() + } + loss_cls = sum([losses[k] * loss_weights[k] for k in losses]) + losses['loss_cls'] = loss_cls + # We also trace the loss weights + losses.update({ + k + '_weight': torch.tensor(v).to(losses[k].device) + for k, v in loss_weights.items() + }) + # Note that the loss weights are just for reference. + return losses + else: + raise ValueError("loss_type should be 'all' or 'individual', " + f'but got {self.loss_type}') diff --git a/mmaction/models/losses/nll_loss.py b/mmaction/models/losses/nll_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..48577199464540ee612387684928490e0d0d7bb1 --- /dev/null +++ b/mmaction/models/losses/nll_loss.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F + +from mmaction.registry import MODELS +from .base import BaseWeightedLoss + + +@MODELS.register_module() +class NLLLoss(BaseWeightedLoss): + """NLL Loss. + + It will calculate NLL loss given cls_score and label. + """ + + def _forward(self, cls_score, label, **kwargs): + """Forward function. + + Args: + cls_score (torch.Tensor): The class score. + label (torch.Tensor): The ground truth label. + kwargs: Any keyword argument to be used to calculate nll loss. + + Returns: + torch.Tensor: The returned nll loss. + """ + loss_cls = F.nll_loss(cls_score, label, **kwargs) + return loss_cls diff --git a/mmaction/models/losses/ohem_hinge_loss.py b/mmaction/models/losses/ohem_hinge_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..11d6da7c698e280c1ac0fbed95503816381927a3 --- /dev/null +++ b/mmaction/models/losses/ohem_hinge_loss.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +class OHEMHingeLoss(torch.autograd.Function): + """This class is the core implementation for the completeness loss in + paper. + + It compute class-wise hinge loss and performs online hard example mining + (OHEM). + """ + + @staticmethod + def forward(ctx, pred, labels, is_positive, ohem_ratio, group_size): + """Calculate OHEM hinge loss. + + Args: + pred (torch.Tensor): Predicted completeness score. + labels (torch.Tensor): Groundtruth class label. + is_positive (int): Set to 1 when proposals are positive and + set to -1 when proposals are incomplete. + ohem_ratio (float): Ratio of hard examples. + group_size (int): Number of proposals sampled per video. + + Returns: + torch.Tensor: Returned class-wise hinge loss. 
+        """
+        num_samples = pred.size(0)
+        if num_samples != len(labels):
+            raise ValueError(f'Number of samples should be equal to that '
+                             f'of labels, but got {num_samples} samples and '
+                             f'{len(labels)} labels.')
+
+        losses = torch.zeros(num_samples, device=pred.device)
+        slopes = torch.zeros(num_samples, device=pred.device)
+        for i in range(num_samples):
+            losses[i] = max(0, 1 - is_positive * pred[i, labels[i] - 1])
+            slopes[i] = -is_positive if losses[i] != 0 else 0
+
+        losses = losses.view(-1, group_size).contiguous()
+        sorted_losses, indices = torch.sort(losses, dim=1, descending=True)
+        keep_length = int(group_size * ohem_ratio)
+        loss = torch.zeros(1, device=pred.device)
+        for i in range(losses.size(0)):
+            loss += sorted_losses[i, :keep_length].sum()
+        ctx.loss_index = indices[:, :keep_length]
+        ctx.labels = labels
+        ctx.slopes = slopes
+        ctx.shape = pred.size()
+        ctx.group_size = group_size
+        ctx.num_groups = losses.size(0)
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Defines a formula for differentiating the operation with backward
+        mode automatic differentiation."""
+
+        labels = ctx.labels
+        slopes = ctx.slopes
+
+        grad_in = torch.zeros(ctx.shape, device=ctx.slopes.device)
+        for group in range(ctx.num_groups):
+            for idx in ctx.loss_index[group]:
+                loc = idx + group * ctx.group_size
+                grad_in[loc, labels[loc] - 1] = (
+                    slopes[loc] * grad_output.data[0])
+        return torch.autograd.Variable(grad_in), None, None, None, None
diff --git a/mmaction/models/losses/ssn_loss.py b/mmaction/models/losses/ssn_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ac90deb29450cc28446e8f1427650e175ce797
--- /dev/null
+++ b/mmaction/models/losses/ssn_loss.py
@@ -0,0 +1,180 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmaction.registry import MODELS
+from .ohem_hinge_loss import OHEMHingeLoss
+
+
+@MODELS.register_module()
+class SSNLoss(nn.Module):
+
+    @staticmethod
+    def activity_loss(activity_score, labels, activity_indexer):
+        """Activity Loss.
+
+        It will calculate activity loss given activity_score and label.
+
+        Args:
+            activity_score (torch.Tensor): Predicted activity score.
+            labels (torch.Tensor): Groundtruth class label.
+            activity_indexer (torch.Tensor): Index slices of proposals.
+
+        Returns:
+            torch.Tensor: Returned cross entropy loss.
+        """
+        pred = activity_score[activity_indexer, :]
+        gt = labels[activity_indexer]
+        return F.cross_entropy(pred, gt)
+
+    @staticmethod
+    def completeness_loss(completeness_score,
+                          labels,
+                          completeness_indexer,
+                          positive_per_video,
+                          incomplete_per_video,
+                          ohem_ratio=0.17):
+        """Completeness Loss.
+
+        It will calculate completeness loss given completeness_score and label.
+
+        Args:
+            completeness_score (torch.Tensor): Predicted completeness score.
+            labels (torch.Tensor): Groundtruth class label.
+            completeness_indexer (torch.Tensor): Index slices of positive and
+                incomplete proposals.
+            positive_per_video (int): Number of positive proposals sampled
+                per video.
+            incomplete_per_video (int): Number of incomplete proposals sampled
+                per video.
+            ohem_ratio (float): Ratio of online hard example mining.
+                Default: 0.17.
+
+        Returns:
+            torch.Tensor: Returned class-wise completeness loss.
+        """
+        pred = completeness_score[completeness_indexer, :]
+        gt = labels[completeness_indexer]
+
+        pred_dim = pred.size(1)
+        pred = pred.view(-1, positive_per_video + incomplete_per_video,
+                         pred_dim)
+        gt = gt.view(-1, positive_per_video + incomplete_per_video)
+
+        # yapf:disable
+        positive_pred = pred[:, :positive_per_video, :].contiguous().view(-1, pred_dim)  # noqa:E501
+        incomplete_pred = pred[:, positive_per_video:, :].contiguous().view(-1, pred_dim)  # noqa:E501
+        # yapf:enable
+
+        positive_loss = OHEMHingeLoss.apply(
+            positive_pred, gt[:, :positive_per_video].contiguous().view(-1), 1,
+            1.0, positive_per_video)
+        incomplete_loss = OHEMHingeLoss.apply(
+            incomplete_pred, gt[:, positive_per_video:].contiguous().view(-1),
+            -1, ohem_ratio, incomplete_per_video)
+        num_positives = positive_pred.size(0)
+        num_incompletes = int(incomplete_pred.size(0) * ohem_ratio)
+
+        return ((positive_loss + incomplete_loss) /
+                float(num_positives + num_incompletes))
+
+    @staticmethod
+    def classwise_regression_loss(bbox_pred, labels, bbox_targets,
+                                  regression_indexer):
+        """Classwise Regression Loss.
+
+        It will calculate classwise_regression loss given
+        bbox_pred and targets.
+
+        Args:
+            bbox_pred (torch.Tensor): Predicted interval center and span
+                of positive proposals.
+            labels (torch.Tensor): Groundtruth class label.
+            bbox_targets (torch.Tensor): Groundtruth center and span
+                of positive proposals.
+            regression_indexer (torch.Tensor): Index slices of
+                positive proposals.
+
+        Returns:
+            torch.Tensor: Returned class-wise regression loss.
+        """
+        pred = bbox_pred[regression_indexer, :, :]
+        gt = labels[regression_indexer]
+        reg_target = bbox_targets[regression_indexer, :]
+
+        class_idx = gt.data - 1
+        classwise_pred = pred[:, class_idx, :]
+        classwise_reg_pred = torch.cat(
+            (torch.diag(classwise_pred[:, :, 0]).view(
+                -1, 1), torch.diag(classwise_pred[:, :, 1]).view(-1, 1)),
+            dim=1)
+        loss = F.smooth_l1_loss(
+            classwise_reg_pred.view(-1), reg_target.view(-1)) * 2
+        return loss
+
+    def forward(self, activity_score, completeness_score, bbox_pred,
+                proposal_type, labels, bbox_targets, train_cfg):
+        """Calculate Structured Segment Network Loss.
+
+        Args:
+            activity_score (torch.Tensor): Predicted activity score.
+            completeness_score (torch.Tensor): Predicted completeness score.
+            bbox_pred (torch.Tensor): Predicted interval center and span
+                of positive proposals.
+            proposal_type (torch.Tensor): Type index slices of proposals.
+            labels (torch.Tensor): Groundtruth class label.
+            bbox_targets (torch.Tensor): Groundtruth center and span
+                of positive proposals.
+            train_cfg (dict): Config for training.
+
+        Returns:
+            dict([torch.Tensor, torch.Tensor, torch.Tensor]):
+                (loss_activity, loss_completeness, loss_reg).
+                Loss_activity is the activity loss, loss_completeness is
+                the class-wise completeness loss,
+                loss_reg is the class-wise regression loss.
+ """ + self.sampler = train_cfg.ssn.sampler + self.loss_weight = train_cfg.ssn.loss_weight + losses = dict() + + proposal_type = proposal_type.view(-1) + labels = labels.view(-1) + activity_indexer = ((proposal_type == 0) + + (proposal_type == 2)).nonzero().squeeze(1) + completeness_indexer = ((proposal_type == 0) + + (proposal_type == 1)).nonzero().squeeze(1) + + total_ratio = ( + self.sampler.positive_ratio + self.sampler.background_ratio + + self.sampler.incomplete_ratio) + positive_per_video = int(self.sampler.num_per_video * + (self.sampler.positive_ratio / total_ratio)) + background_per_video = int( + self.sampler.num_per_video * + (self.sampler.background_ratio / total_ratio)) + incomplete_per_video = ( + self.sampler.num_per_video - positive_per_video - + background_per_video) + + losses['loss_activity'] = self.activity_loss(activity_score, labels, + activity_indexer) + + losses['loss_completeness'] = self.completeness_loss( + completeness_score, + labels, + completeness_indexer, + positive_per_video, + incomplete_per_video, + ohem_ratio=positive_per_video / incomplete_per_video) + losses['loss_completeness'] *= self.loss_weight.comp_loss_weight + + if bbox_pred is not None: + regression_indexer = (proposal_type == 0).nonzero().squeeze(1) + bbox_targets = bbox_targets.view(-1, 2) + losses['loss_reg'] = self.classwise_regression_loss( + bbox_pred, labels, bbox_targets, regression_indexer) + losses['loss_reg'] *= self.loss_weight.reg_loss_weight + + return losses diff --git a/mmaction/models/multimodal/__init__.py b/mmaction/models/multimodal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7a34211fc5f01ae4ec882b4dd18b7fd854d61993 --- /dev/null +++ b/mmaction/models/multimodal/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmaction.utils.dependency import WITH_MULTIMODAL + +if WITH_MULTIMODAL: + from .vindlu import * # noqa: F401,F403 + +else: + from mmaction.registry import MODELS + from mmaction.utils.dependency import register_multimodal_placeholder + + register_multimodal_placeholder( + ['VindLUVQA', 'VindLURetrievalMC', 'VindLURetrieval'], MODELS) diff --git a/mmaction/models/multimodal/vindlu/__init__.py b/mmaction/models/multimodal/vindlu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78fb497a6bdef65dca416cf9ae97b091b52bfe0b --- /dev/null +++ b/mmaction/models/multimodal/vindlu/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .beit3d import BeitModel3D +from .tokenizer import VindLUTokenizer +from .vindlu_ret import VindLURetrieval +from .vindlu_ret_mc import VindLURetrievalMC +from .vindlu_vqa import VindLUVQA +from .xbert import BertDecoder, BertModel + +__all__ = [ + 'VindLUVQA', 'VindLURetrievalMC', 'VindLURetrieval', 'VindLUTokenizer', + 'BeitModel3D', 'BertDecoder', 'BertModel' +] diff --git a/mmaction/models/multimodal/vindlu/beit3d.py b/mmaction/models/multimodal/vindlu/beit3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0e29f42cae779f0345c4869d9afdd42debddee13 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/beit3d.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
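+# 3D (video) adaptations of the HuggingFace BEiT modules: the 2D embeddings,
+# layers, pooler and relative position bias are extended with temporal
+# position embeddings and optional temporal modeling blocks.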
+import importlib +from typing import Dict, Optional, Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers.models.beit import BeitConfig, BeitModel +from transformers.models.beit.modeling_beit import BeitAttention, BeitDropPath +from transformers.models.beit.modeling_beit import \ + BeitEmbeddings as BeitEmbeddings2D +from transformers.models.beit.modeling_beit import BeitLayer as BeitLayer2D +from transformers.models.beit.modeling_beit import BeitRelativePositionBias +from transformers.models.beit.modeling_beit import \ + BeitRelativePositionBias as BeitRelativePositionBias2D + +from mmaction.registry import MODELS +from .temporal_model import (X_CLIP, STAdapter, TemporalAttention, + WindowTemporalAttention) + + +def interpolate_temporal_pos_embed(temp_embed_old, num_frames_new): + """ + temp_embed_old: (1, num_frames_old, 1, d) + Returns: + temp_embed_new: (1, num_frames_new, 1, d) + """ + temp_embed_old = temp_embed_old.squeeze(2).permute( + 0, 2, 1) # (1, d, num_frames_old) + temp_embed_new = F.interpolate( + temp_embed_old, num_frames_new, + mode='linear') # (1, d, num_frames_new) + temp_embed_new = temp_embed_new.permute(0, 2, 1).unsqueeze( + 2) # (1, num_frames_new, 1, d) + return temp_embed_new + + +class TemporalAttentionBeit(nn.Module): + """temporal attention using BeitAttention.""" + + def __init__(self, config: BeitConfig): + """TODO: to be defined.""" + super().__init__() + + self.layernorm_before = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.attention = BeitAttention(config, window_size=None) + self.scale = nn.Parameter( + config.temporal_model_init_value * torch.ones( + (config.hidden_size)), + requires_grad=True, + ) + self.drop_path = BeitDropPath(config.drop_path_rate) + + def forward(self, hidden_states: torch.Tensor): + """forward function. + + Args: + hidden_states (torch.Tensor): The input. 
Shape: [b,t,l,c] + + Returns: TODO + """ + b = hidden_states.shape[0] + output = einops.rearrange(hidden_states, 'b t l c -> (b l) t c') + output = self.layernorm_before(output) + output = self.attention(output) + output = einops.rearrange(output[0], '(b l) t c -> b t l c', b=b) + return hidden_states + self.drop_path(output[0]) * self.scale + + +class BeitPooler3D(nn.Module): + + def __init__(self, config: BeitConfig) -> None: + super().__init__() + self.num_prompts = config.add_k_prompts + self.layernorm = ( + nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + if config.use_mean_pooling else None) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Args: + hidden_states (torch.Tensor): Shape: [B,T,L,C] + """ + if self.layernorm is not None: + # Mean pool the final hidden states of the patch tokens + # patch_tokens = hidden_states[:, 1 + self.num_prompts :, :] + if self.num_prompts > 0: + patch_tokens = hidden_states[:, :, 1:-self.num_prompts, :] + else: + patch_tokens = hidden_states[:, :, 1:, :] + pooled_output = self.layernorm(patch_tokens.mean(2)) + else: + # Pool by simply taking the final hidden state of the [CLS] token + pooled_output = hidden_states[:, :, 0] + + return pooled_output + + +class BeitRelativePositionBias3D(BeitRelativePositionBias2D): + + def __init__(self, config: BeitConfig, window_size: tuple) -> None: + super().__init__(config, window_size) + + # add bias for prompts + self.k = config.add_k_prompts + if self.k > 0: + self.prompt_bias_table = nn.parameter.Parameter( + torch.zeros((2 + self.k) * self.k, config.num_attention_heads) + ) # k prompt-to-token, k token-to-prompt, k*k prompt-to-promt + else: + self.prompt_bias_table = None + + def forward(self) -> torch.Tensor: + # relative position bias 2d + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + + # add bias for prompts + k = self.k + if k > 0: + l = self.window_size[0] * self.window_size[1] + 1 # noqa: E741 + bias = torch.zeros(l + k, l + k, + relative_position_bias.shape[-1]).to( + relative_position_bias.device) + bias[:l, :l] = relative_position_bias + bias[l:, :l] = self.prompt_bias_table[:k].view( + k, 1, -1) # prompt to token + bias[:l, + l:] = self.prompt_bias_table[k:2 * + k].view(1, k, + -1) # token to prompt + bias[l:, l:] = self.prompt_bias_table[2 * k, :].view( + k, k, -1) # prompt to prompt + else: + bias = relative_position_bias + + return bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +class BeitEmbeddings3D(BeitEmbeddings2D): + """Construct the CLS token, position and patch embeddings. + + Optionally, also the mask token. + """ + + def __init__(self, config: BeitConfig) -> None: + super().__init__(config) + + if config.use_temporal_position_embedding: + self.temporal_position_embeddings = nn.parameter.Parameter( + torch.zeros(1, config.num_frames, 1, config.hidden_size)) + else: + self.temporal_position_embeddings = None + + if config.add_k_prompts > 0: + self.prompt_tokens = nn.parameter.Parameter( + torch.zeros(1, config.add_k_prompts, config.hidden_size)) + else: + self.prompt_tokens = None + + def forward(self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None + ) -> torch.Tensor: + """ + Args: + pixel_values (torch.Tensor): The input image patches. + Shape: [B, T, C, H, W]. 
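+
+        Returns:
+            torch.Tensor: The embedded patches. Shape: [B, T, L, C].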
+ + + """ + t = pixel_values.shape[1] + pixel_values = einops.rearrange(pixel_values, + 'b t c h w -> (b t) c h w') + + embeddings = self.patch_embeddings(pixel_values) + batch_size, seq_len, _ = embeddings.size() # [(b t) l c] + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1 - w) + mask_tokens * w + + if self.prompt_tokens is not None: + prompt_tokens = self.prompt_tokens.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings, prompt_tokens), + dim=1) + else: + embeddings = torch.cat((cls_tokens, embeddings), + dim=1) # [B*T, L, C] + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings + + embeddings = einops.rearrange(embeddings, '(b t) l c -> b t l c', t=t) + if self.temporal_position_embeddings is not None: + if t <= self.temporal_position_embeddings.shape[1]: + embeddings = embeddings + \ + self.temporal_position_embeddings[:, :t] + else: + tpe = interpolate_temporal_pos_embed( + self.temporal_position_embeddings, t) + embeddings = embeddings + tpe + + embeddings = self.dropout(embeddings) + + return embeddings + + +class BeitLayer3D(BeitLayer2D): + + def __init__(self, + config: BeitConfig, + window_size: Optional[tuple] = None, + drop_path_rate: float = 0.0) -> None: + super().__init__(config, window_size, drop_path_rate) + + self.temporal_model_position = config.temporal_model_position + if config.temporal_model_block == 'st_adapter': + self.temp_model = STAdapter(**config.temporal_model_config) + elif config.temporal_model_block == 'timesformer': + self.temp_model = TemporalAttention(**config.temporal_model_config) + elif config.temporal_model_block == 'ta_beit': + self.temp_model = TemporalAttentionBeit(config) + elif config.temporal_model_block == 'window_attention': + self.temp_model = WindowTemporalAttention( + **config.temporal_model_config) + elif config.temporal_model_block == 'xclip': + self.temp_model = X_CLIP(**config.temporal_model_config) + elif config.temporal_model_block == 'none': + self.temp_model = None + else: + raise ValueError( + f'not accepted temporal model: {config.temporal_model_block}') + + self.temporal_model_block = config.temporal_model_block + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + relative_position_bias: Optional['BeitRelativePositionBias'] = None, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + + b, t, l, c = hidden_states.shape + + if self.temporal_model_block == 'xclip': + assert (self.temporal_model_position == 'first' + and self.config.add_k_prompts + == 1), ('xclip must be put before the attention and' + 'add_k_prompts must be 1.') + + if self.temp_model is not None and \ + self.temporal_model_position == 'first': + hidden_states = self.temp_model(hidden_states) + + hidden_states = einops.rearrange(hidden_states, 'b t l c -> (b t) l c') + + self_attention_outputs = self.attention( + self.layernorm_before( + hidden_states + ), # in BEiT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + relative_position_bias=relative_position_bias, + ) + attention_output = self_attention_outputs[0] + + # add self attentions if we output attention weights + outputs = self_attention_outputs[1:] + 
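+        # lambda_1 / lambda_2 below are the LayerScale parameters inherited
+        # from the 2D BEiT layer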
+ # apply lambda_1 if present + if self.lambda_1 is not None: + attention_output = self.lambda_1 * attention_output + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in BEiT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + + layer_output = self.intermediate(layer_output) + layer_output = self.output(layer_output) + + if self.lambda_2 is not None: + layer_output = self.lambda_2 * layer_output + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + layer_output = einops.rearrange( + layer_output, '(b t) l c -> b t l c', b=b) + + # apply temporal modeling block + if self.temp_model is not None and \ + self.temporal_model_position == 'last': + layer_output = self.temp_model(layer_output) + + outputs = (layer_output, ) + outputs + + return outputs + + +class BeitConfig3D(BeitConfig): + + def __init__(self, + num_frames=1, + temporal_model_block='none', + temporal_model_position='last', + temporal_model_init_value=0.0, + temporal_model_config={}, + use_temporal_position_embedding=False, + add_k_prompts=0, + **kwargs) -> None: + + super().__init__(**kwargs) + self.temporal_model_block = temporal_model_block + self.temporal_model_config = temporal_model_config + self.temporal_model_position = temporal_model_position + self.temporal_model_init_value = temporal_model_init_value + self.use_temporal_position_embedding = use_temporal_position_embedding + self.add_k_prompts = add_k_prompts + self.num_frames = num_frames + + +@MODELS.register_module() +class BeitModel3D(BeitModel): + + def __init__(self, + config: BeitConfig, + tem_config: Dict, + add_pooling_layer: bool = True) -> None: + # hack to replace original 2D modules with 3D modules + beit_package = importlib.import_module( + 'transformers.models.beit.modeling_beit') + beit_package.BeitEmbeddings = BeitEmbeddings3D + beit_package.BeitPooler = BeitPooler3D + beit_package.BeitLayer = BeitLayer3D + beit_package.BeitRelativePositionBias = BeitRelativePositionBias3D + + config = BeitConfig3D.from_pretrained(config, **tem_config) + super().__init__(config, add_pooling_layer) diff --git a/mmaction/models/multimodal/vindlu/modeling_bert.py b/mmaction/models/multimodal/vindlu/modeling_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..de56451efd0fbd2fbe6c3ba737f462284f0d4745 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/modeling_bert.py @@ -0,0 +1,1740 @@ +# flake8: noqa +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch BERT model.""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from mmengine.logging import MMLogger +from torch import Tensor, device, dtype, nn +from torch.nn import CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +# from transformers.models.bert.configuration_bert import BertConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import (ModelOutput, add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +transformers.logging.set_verbosity_error() + +_CONFIG_FOR_DOC = 'BertConfig' +_TOKENIZER_FOR_DOC = 'BertTokenizer' + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + 'bert-base-uncased', + 'bert-large-uncased', + 'bert-base-cased', + 'bert-large-cased', + 'bert-base-multilingual-uncased', + 'bert-base-multilingual-cased', + 'bert-base-chinese', + 'bert-base-german-cased', + 'bert-large-uncased-whole-word-masking', + 'bert-large-cased-whole-word-masking', + 'bert-large-uncased-whole-word-masking-finetuned-squad', + 'bert-large-cased-whole-word-masking-finetuned-squad', + 'bert-base-cased-finetuned-mrpc', + 'bert-base-german-dbmdz-cased', + 'bert-base-german-dbmdz-uncased', + 'cl-tohoku/bert-base-japanese', + 'cl-tohoku/bert-base-japanese-whole-word-masking', + 'cl-tohoku/bert-base-japanese-char', + 'cl-tohoku/bert-base-japanese-char-whole-word-masking', + 'TurkuNLP/bert-base-finnish-cased-v1', + 'TurkuNLP/bert-base-finnish-uncased-v1', + 'wietsedv/bert-base-dutch-cased', + # See all BERT models at https://huggingface.co/models?filter=bert +] + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
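+        encoder_width (`int`, *optional*, defaults to 768):
+            VindLU-specific: dimensionality of the encoder hidden states attended to by the cross-attention
+            layers; it sizes their key/value projections.
+        cross_module (`str`, *optional*, defaults to `"ca"`):
+            VindLU-specific identifier for the cross-modal fusion variant; in this file it is only logged when
+            the encoder is built.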
+ + Examples: + + ```python + >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = 'bert' + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type='absolute', + use_cache=True, + classifier_dropout=None, + cross_module='ca', + encoder_width=768, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + self.cross_module = cross_module + self.encoder_width = encoder_width + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + logger = MMLogger.get_current_instance() + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info('Converting TensorFlow checkpoint from {}'.format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info('Loading TF weight {} with shape {}'.format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in [ + 'adam_v', + 'adam_m', + 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', + 'global_step', + ] for n in name): + logger.info('Skipping {}'.format('/'.join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] == 'kernel' or scope_names[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info('Skipping {}'.format('/'.join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + + logger.info('Initialize PyTorch weight {}'.format(name)) + pointer.data = torch.from_numpy(array) + return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type + embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + + self.config = config + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length: + seq_length + + past_key_values_length] + + 
if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if (self.position_embedding_type == 'relative_key' + or self.position_embedding_type == 'relative_key_query'): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if (self.position_embedding_type == 'relative_key' + or self.position_embedding_type == 'relative_key_query'): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = ( + attention_scores + relative_position_scores_query + + relative_position_scores_key) + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + # added `attention_scores` to return tuple + outputs = ((context_layer, attention_probs, + attention_scores) if output_attentions else + (context_layer, )) + + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, is_cross_attention=False): + super().__init__() + + self.self = BertSelfAttention(config, is_cross_attention) + + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + # add attentions if we output them + outputs = (attention_output, ) + self_outputs[1:] + return outputs # (context_layer, attention_probs, attention_scores, past_key_value,) + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) 
+ self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + + self.has_cross_attention = layer_num >= config.fusion_layer + if self.has_cross_attention: + self.crossattention = BertAttention( + config, is_cross_attention=True) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) # (context_layer, attention_probs, attention_scores, past_key_value,) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if self.has_cross_attention: + assert ( + encoder_hidden_states is not None + ), 'encoder_hidden_states must be given for cross-attention layers' + + if type(encoder_hidden_states) == list: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states[(self.layer_num - + self.config.fusion_layer) % + len(encoder_hidden_states)], + encoder_attention_mask[(self.layer_num - + self.config.fusion_layer) % + len(encoder_hidden_states)], + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] + + else: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) # (context_layer, attention_probs, attention_scores, past_key_value,) + attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output, ) + outputs + + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)]) + logger = MMLogger.get_current_instance() + logger.info(f'build bert with cross_module: {config.cross_module}') + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + mode='multi_modal', + normalize_attention=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + # all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + if (mode == 'text' or mode == 'temporal' + ): # temporal is added and used for temporal att module. + start_layer = 0 + output_layer = self.config.fusion_layer + + elif mode == 'fusion': + start_layer = self.config.fusion_layer + output_layer = self.config.num_hidden_layers + + elif mode == 'multi_modal': + start_layer = 0 + output_layer = self.config.num_hidden_layers + + for i in range(start_layer, output_layer): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if getattr(self.config, 'gradient_checkpointing', + False) and self.training: + + if use_cache: + logger = MMLogger.get_current_instance() + logger.warn( + '`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting ' + '`use_cache=False`...') + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + use_reentrant=False, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) # (context_layer, attention_probs, attention_scores, past_key_value,) + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + # whether to output normalized attention, + # note for unnormalized attention, there is a mask added + offset = int(normalize_attention) + # all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + ( + layer_outputs[2 - offset], ) + if hasattr(layer_module, 'crossattention'): + # all_cross_attentions = all_cross_attentions + (layer_outputs[3], ) + all_cross_attentions = all_cross_attentions + ( + layer_outputs[4 - offset], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first 
token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """An abstract class to handle weights initialization and a simple + interface for downloading and loading pretrained models.""" + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = 'bert' + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """Output type of :class:`~transformers.BertForPreTraining`. 
+ + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, + 1]``: + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """The model can behave as an encoder (with only self-attention) as well as + a decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in `Attention + is all you need `__ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. + + Gomez, Lukasz Kaiser and Illia Polosukhin. argument and + :obj:`add_cross_attention` set to :obj:`True`; an + :obj:`encoder_hidden_states` is then expected as an input to the forward + pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. + + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask(self, attention_mask: Tensor, + input_shape: Tuple[int], device: device, + is_decoder: bool) -> Tensor: + """Makes broadcastable attention and causal masks so that future and + masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. 
+ input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= + seq_ids[None, :, None]) + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[ + 1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = ( + causal_mask[:, None, :, :] * + attention_mask[:, None, None, :]) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + 'Wrong shape for input_ids (shape {}) or attention_mask (shape {})' + .format(input_shape, attention_mask.shape)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + normalize_attention=True, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions if output_attentions is not None else + self.config.output_attentions) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds or encoder_embeds' + ) + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] + if past_key_values is not None else 0) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) + for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode=mode, + normalize_attention=normalize_attention, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, + BERT_START_DOCSTRING, +) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=True, + reduction='mean', + mode='multi_modal', + normalize_attention=True, + soft_labels=None, + alpha=0, + return_logits=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. 
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + normalize_attention=normalize_attention, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels, + dim=-1) + loss_distill = (loss_distill * 
(labels != -100)).sum(1) + lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': + input_ids, + 'attention_mask': + attention_mask, + 'past_key_values': + past, + 'encoder_hidden_states': + model_kwargs.get('encoder_hidden_states', None), + 'encoder_attention_mask': + model_kwargs.get('encoder_attention_mask', None), + 'is_decoder': + True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@dataclass +class MaskedLMOutputWithDistill(MaskedLMOutput): + loss_aux: Optional[torch.FloatTensor] = None + loss_distill: Optional[torch.FloatTensor] = None + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def tie_aux_decoder_weights(self, module, aux_modules): + """Tie decoder weights of all `aux_modules` to `module`, (not bias)""" + for m in aux_modules: + m.predictions.decoder.weight = module.predictions.decoder.weight + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + normalize_attention=True, + soft_labels=None, + alpha=0, + return_logits=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_embeds=encoder_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + normalize_attention=normalize_attention, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + masked_lm_loss_aux = 0.0 + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(prediction_scores, dim=1) * soft_labels, dim=-1) + loss_distill = loss_distill[labels != -100].mean() + masked_lm_loss = (1 - + alpha) * masked_lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + # changed from MaskedLMOutput to MaskedLMOutputWithDistill + return MaskedLMOutputWithDistill( + loss=masked_lm_loss, + loss_aux=masked_lm_loss_aux, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert (self.config.pad_token_id + is not None), 'The PAD token should be defined for generation' + attention_mask = torch.cat([ + attention_mask, + attention_mask.new_zeros((attention_mask.shape[0], 1)) + ], + dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device, + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} diff --git a/mmaction/models/multimodal/vindlu/temporal_model.py b/mmaction/models/multimodal/vindlu/temporal_model.py new file mode 100644 index 0000000000000000000000000000000000000000..579d62c524b6ccbf325177b4ed94183275d5ad97 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/temporal_model.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
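+# Note (overview): the temporal modules below (STAdapter, TemporalAttention,
+# WindowTemporalAttention and X_CLIP) all take frame-level tokens of shape
+# [batch, num_frames, l, channels] with l = 1 + h * w (a [CLS] token followed
+# by h * w patch tokens) and return a tensor of the same shape, so they can
+# largely be swapped for one another when configuring temporal modeling.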
+import math + +import einops +import torch +from einops import rearrange +from timm.models.layers import DropPath +from torch import nn +from torch.nn import LayerNorm, Linear, MultiheadAttention + + +class STAdapter(nn.Module): + """ST Adapter.""" + + def __init__( + self, + kernel_size=(3, 3, 3), + input_dim=768, + hidden_dim=384, + img_size=224, + patch_size=16, + drop_prob=0.1, + ): + super(STAdapter, self).__init__() + self.kernel_size = kernel_size + self.input_dim = input_dim + self.hidden_dim = hidden_dim + + self.h = self.w = img_size // patch_size + + self.linear1 = nn.Linear(input_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, input_dim) + self.act = nn.ReLU() + self.conv = nn.Conv3d( + hidden_dim, + hidden_dim, + kernel_size=kernel_size, + padding='same', + groups=hidden_dim) + self.droppath = DropPath(drop_prob=drop_prob) + + self.scale = nn.parameter.Parameter(torch.zeros([])) + + def forward(self, x: torch.Tensor): + """forward. + + Args: + x (torch.Tensor): input features. + Shape: [bs, nframes, l, c]. l = 1 + h*w + + Returns: features after adapter. The same shape as input. + """ + if x.shape[1] == 1: # for single frame, return itself. + return x + + shortcut = x + x = self.linear1(x) + cls = x[:, :, :1, :] + tokens = x[:, :, 1:, :] + tokens = einops.rearrange( + tokens, 'b t (h w) c -> b c t h w', h=self.h).contiguous() + tokens = self.conv(tokens) + tokens = einops.rearrange(tokens, 'b c t h w -> b t (h w) c') + x = torch.cat([cls, tokens], dim=2) # [b, t, 1+h*w, c] + x = self.act(x) + x = self.linear2(x) + + return shortcut + self.scale * self.droppath(x) + + +class TemporalAttention(nn.Module): + """perform temporal self-attention.""" + + def __init__(self, input_dim=768, droppath_rate=0.1): + """ + + Kwargs: + input_dim (int): The input feature dimension. + + + """ + super().__init__() + + self._input_dim = input_dim + self.temporal_attn = MultiheadAttention( + input_dim, num_heads=input_dim // 64) + self.norm = LayerNorm(input_dim, eps=1e-12) + self.linear = Linear(input_dim, input_dim) + self.droppath = DropPath(droppath_rate) + self.scale = nn.parameter.Parameter(torch.zeros([])) + + def forward(self, x: torch.Tensor): + """forward. + + Args: + x (torch.Tensor): input features. + Shape: [bs, nframes, l, c]. l = 1 + h*w + + Returns: features after adapter. The same shape as input. + """ + if x.shape[1] == 1: # for single frame, return itself. + return x + + shortcut = x + x = einops.rearrange(x, 'b t l c -> t (b l) c') + x = self.norm(x) + x = self.temporal_attn(x, x, x)[0] + x = einops.rearrange(x, 't (b l) c -> b t l c', b=shortcut.shape[0]) + return shortcut + self.scale * self.droppath(x) + + +class WindowTemporalAttention(nn.Module): + """perform windowed temporal self-attention.""" + + def __init__(self, input_dim=768, droppath_rate=0.1, window_size=(2, 2)): + """ + + Kwargs: + input_dim (int): The input feature dimension. + + + """ + super().__init__() + + self._input_dim = input_dim + self.temporal_attn = MultiheadAttention( + input_dim, num_heads=input_dim // 64) + self.norm = LayerNorm(input_dim, eps=1e-12) + self.droppath = DropPath(droppath_rate) + self.scale = nn.parameter.Parameter(torch.zeros([])) + self.wh, self.ww = window_size + + def forward(self, x: torch.Tensor): + """forward. + + Args: + x (torch.Tensor): input features. + Shape: [bs, nframes, l, c]. l = 1 + h*w + + Returns: features after adapter. The same shape as input. + """ + if x.shape[1] == 1: # for single frame, return itself. 
+            return x
+        shortcut = x
+
+        h = w = int(math.sqrt(x.shape[2] - 1))
+        cls_token = x[:, :, :1, :]
+        x = einops.rearrange(
+            x[:, :, 1:, :],
+            'b t (nh wh nw ww) c -> (t wh ww) (b nh nw) c',
+            nh=h // self.wh,
+            wh=self.wh,
+            nw=w // self.ww,
+            ww=self.ww,
+        )
+        x = self.norm(x)
+        x = self.temporal_attn(x, x, x)[0]
+        x = einops.rearrange(
+            x,
+            '(t wh ww) (b nh nw) c -> b t (nh wh nw ww) c',
+            wh=self.wh,
+            ww=self.ww,
+            nh=h // self.wh,
+            nw=w // self.ww,
+        )
+        # add back cls token.
+        x = torch.concat([cls_token, x], dim=2)
+        return shortcut + self.scale * self.droppath(x)
+
+
+class X_CLIP(nn.Module):
+    """Perform cross-frame communication with a message token (X-CLIP style)."""
+
+    def __init__(self, input_dim=768, droppath_rate=0.1, num_prompts=1):
+        """
+        Args:
+            input_dim (int): The input feature dimension. Defaults to 768.
+            droppath_rate (float): Drop path rate. Defaults to 0.1.
+            num_prompts (int): Number of prompt tokens. Defaults to 1.
+        """
+        super().__init__()
+
+        d_model = input_dim
+
+        self.message_fc = nn.Linear(d_model, d_model)
+        self.message_ln = LayerNorm(d_model, eps=1e-12)
+        self.message_attn = nn.MultiheadAttention(d_model, d_model // 64)
+        self.num_prompts = num_prompts
+
+        self.droppath = DropPath(droppath_rate)
+
+    def forward(self, x: torch.Tensor):
+        """forward.
+
+        Args:
+            x (torch.Tensor): input features.
+            Shape: [bs, nframes, l, c]. l = 1 + h*w
+
+        Returns: features after cross-frame communication.
+            The same shape as input.
+        """
+        if x.shape[1] == 1:  # for single frame, return itself.
+            return x
+        msg_token = self.message_ln(self.message_fc(x[:, :,
+                                                      0, :]))  # [b, t, c]
+        msg_token = rearrange(msg_token, 'b t c -> t b c')
+        msg_token = msg_token + self.droppath(
+            self.message_attn(msg_token, msg_token, msg_token)[0])
+        msg_token = rearrange(msg_token, 't b c -> b t c')
+        # replace the last prompt token with msg_token.
+        x = torch.cat([x[:, :, :-1, :],
+                       msg_token.unsqueeze(2)], dim=2)  # [b, t, l, c]
+        return x
diff --git a/mmaction/models/multimodal/vindlu/tokenizer.py b/mmaction/models/multimodal/vindlu/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8897c4e79c71d1e2da596030b3c638e85068edb
--- /dev/null
+++ b/mmaction/models/multimodal/vindlu/tokenizer.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+from transformers import BertTokenizer
+
+from mmaction.registry import TOKENIZER
+
+
+class VindLUTokenizer(BertTokenizer):
+    """VindLUTokenizer inherits from BertTokenizer.
+
+    The main difference from BertTokenizer is that the trailing separator
+    ([SEP]) token is removed for a single sequence.
+    """
+
+    def build_inputs_with_special_tokens(
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None) -> List[int]:
+        """Build model inputs from a sequence or a pair of sequences for
+        sequence classification tasks by concatenating and adding special
+        tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with
+            the appropriate special tokens.
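+
+        Example (illustrative sketch: the token ids are arbitrary
+        placeholders, and ``from_pretrained`` assumes the BERT vocabulary
+        files are available; 101/102 are the standard BERT
+        ``[CLS]``/``[SEP]`` ids):
+
+            >>> tokenizer = VindLUTokenizer.from_pretrained('bert-base-uncased')
+            >>> tokenizer.build_inputs_with_special_tokens([5, 6])
+            [101, 5, 6]
+            >>> tokenizer.build_inputs_with_special_tokens([5, 6], [7])
+            [101, 5, 6, 102, 7, 102]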
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+
+TOKENIZER.register_module(
+    'VindLUTokenizer', module=VindLUTokenizer.from_pretrained)
diff --git a/mmaction/models/multimodal/vindlu/utils.py b/mmaction/models/multimodal/vindlu/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9116b7d9a273f3992b3e0edc418876648795d24c
--- /dev/null
+++ b/mmaction/models/multimodal/vindlu/utils.py
@@ -0,0 +1,195 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmengine.dist as dist
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmengine.logging import MMLogger
+from scipy import interpolate
+
+
+def all_gather_concat(data: torch.Tensor) -> torch.Tensor:
+    """Gather tensors with different first-dimension sizes and concat them
+    into one tensor.
+
+    Note:
+        Only the first dimension should be different.
+
+    Args:
+        data (Tensor): Tensor to be gathered.
+
+    Returns:
+        torch.Tensor: The concatenated tensor.
+    """
+    if dist.get_world_size() == 1:
+        return data
+
+    data_size = torch.tensor(data.size(0), device=data.device)
+    sizes_list = dist.all_gather(data_size)
+
+    total_length = sum(sizes_list)
+    max_length = max(sizes_list)
+    size_diff = max_length.item() - data_size.item()
+    if size_diff:
+        padding = torch.zeros(
+            size_diff, *data.size()[1:], device=data.device, dtype=data.dtype)
+        data = torch.cat((data, padding))
+
+    gather_list = dist.all_gather(data)
+
+    # gather all data according to the default DDP sampler. For instance,
+    # 8 samples on 2 GPUs, GPU0: [0,2,4,6], GPU1: [1,3,5,7], will be gathered
+    # as [0,1,2,3,4,5,6,7]
+    all_data = []
+    for gather_batch in zip(*gather_list):
+        all_data.extend(gather_batch)
+
+    return torch.stack(all_data)[:total_length]
+
+
+def interpolate_pos_embed_beit(state_dict, new_model):
+    """Interpolate the positional embeddings. The spatial positional
+    embeddings are relative while the temporal ones are absolute. Additional
+    temporal positional embeddings are padded with 0.
+
+    Args:
+        state_dict (dict): The state_dict.
+        new_model (nn.Module): The created model.
+
+    Returns: dict. The state_dict with updated positional embeddings.
+    """
+    state_dict = interpolate_pos_relative_bias_beit(
+        state_dict_old=state_dict,
+        state_dict_new=new_model.state_dict(),
+        patch_shape_new=new_model.vision_encoder.embeddings.patch_embeddings.
+        patch_shape,
+    )
+    # absolute temporal pos bias
+    temporal_pe_key = 'vision_encoder.embeddings.temporal_position_embeddings'
+    if temporal_pe_key in state_dict:
+        logger = MMLogger.get_current_instance()
+        logger.info(
+            f'interpolate temporal positional embeddings: {temporal_pe_key}')
+        state_dict[temporal_pe_key] = load_temp_embed_with_mismatch(
+            temp_embed_old=state_dict[temporal_pe_key],
+            temp_embed_new=new_model.state_dict()[temporal_pe_key],
+        )
+    return state_dict
+
+
+def load_temp_embed_with_mismatch(temp_embed_old,
+                                  temp_embed_new,
+                                  add_zero=True):
+    """Add/Remove extra temporal_embeddings as needed.
+    https://arxiv.org/abs/2104.00650 shows adding zero paddings works.
+
+    temp_embed_old: (1, num_frames_old, 1, d)
+    temp_embed_new: (1, num_frames_new, 1, d)
+    add_zero: bool, if True, add zero, else, interpolate trained embeddings.
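+
+    Returns:
+        torch.Tensor: The aligned temporal embeddings with shape
+        (1, num_frames_new, 1, d).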
+ """ + # TODO zero pad + num_frms_new = temp_embed_new.shape[1] + num_frms_old = temp_embed_old.shape[1] + logger = MMLogger.get_current_instance() + logger.info( + f'Load temporal_embeddings, lengths: {num_frms_old}-->{num_frms_new}') + if num_frms_new > num_frms_old: + if add_zero: + temp_embed_new[:, :num_frms_old] \ + = temp_embed_old # untrained embeddings are zeros. + else: + temp_embed_new = interpolate_temporal_pos_embed( + temp_embed_old, num_frms_new) + elif num_frms_new < num_frms_old: + temp_embed_new = temp_embed_old[:, :num_frms_new] + else: # = + temp_embed_new = temp_embed_old + return temp_embed_new + + +def interpolate_temporal_pos_embed(temp_embed_old, num_frames_new): + """ + temp_embed_old: (1, num_frames_old, 1, d) + Returns: + temp_embed_new: (1, num_frames_new, 1, d) + """ + temp_embed_old = temp_embed_old.squeeze(2).permute( + 0, 2, 1) # (1, d, num_frames_old) + temp_embed_new = F.interpolate( + temp_embed_old, num_frames_new, + mode='linear') # (1, d, num_frames_new) + temp_embed_new = temp_embed_new.permute(0, 2, 1).unsqueeze( + 2) # (1, num_frames_new, 1, d) + return temp_embed_new + + +def interpolate_pos_relative_bias_beit(state_dict_old, state_dict_new, + patch_shape_new): + """ + Args: + state_dict_old: loaded state dict + state_dict_new: state dict for model with new image size + patch_shape_new: new model patch_shape + ref: https://github.com/microsoft/unilm/blob/master/beit/run_class_finetuning.py # noqa: E501 + """ + all_keys = list(state_dict_old.keys()) + for key in all_keys: + if 'relative_position_index' in key: + state_dict_old.pop(key) + + if 'relative_position_bias_table' in key: + rel_pos_bias = state_dict_old[key] + src_num_pos, num_attn_heads = rel_pos_bias.size() + dst_num_pos, _ = state_dict_new[key].size() + dst_patch_shape = patch_shape_new + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, + src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to( + rel_pos_bias.device)) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + + new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), + dim=0) + state_dict_old[key] = new_rel_pos_bias + return state_dict_old diff --git a/mmaction/models/multimodal/vindlu/vindlu.py b/mmaction/models/multimodal/vindlu/vindlu.py new file mode 100644 index 0000000000000000000000000000000000000000..cc7eaf88261765ad159eb8d8cfaec649c7398321 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/vindlu.py @@ -0,0 
+1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Optional + +import torch +from mmengine.logging import MMLogger +from mmengine.model import BaseModel +from mmengine.runner.checkpoint import _load_checkpoint +from torch import nn + +from mmaction.registry import MODELS, TOKENIZER +from mmaction.utils import ForwardResults, SampleList +from .utils import (interpolate_pos_embed_beit, + interpolate_pos_relative_bias_beit) + + +class VindLUBase(BaseModel): + """VindLU base Model. + + Args: + tokenizer: (dict): The config for tokenizer. + vision_encoder (dict): Backbone for extracting image features. + text_encoder (dict): Backbone for extracting text features. + temperature (float): Temperature parameter that controls the + concentration level of the distribution. Defaults to 0.07. + gradient_checkpointing (bool): Whether to do gradient_checkpointing. + Using checkpoint will save some memory while slowing down the + training speed. Defaults to False. + data_preprocessor (Optional[dict]): The config for preprocessing input + data. + init_cfg (Optional[dict]): the config to control the initialization. + Defaults to None. + """ + + def __init__( + self, + tokenizer: dict, + vision_encoder: dict, + text_encoder: dict, + proj_dim: int = 256, + temperature: float = 0.07, + gradient_checkpointing: bool = False, + pretrined_vl: bool = True, + data_preprocessor: Optional[dict] = None, + init_cfg: Optional[dict] = None, + ): + if data_preprocessor is None: + data_preprocessor = dict(type='ActionDataPreprocessor') + super().__init__( + init_cfg=init_cfg, data_preprocessor=data_preprocessor) + + self.tokenizer = TOKENIZER.build(tokenizer) + self.vision_cfg = vision_encoder + self.text_encoder_cfg = text_encoder + self.gradient_checkpointing = gradient_checkpointing + self.text_encoder_cfg.gradient_checkpointing = gradient_checkpointing + + self.vision_width = vision_encoder.pop('encoder_width') + self.text_width = text_encoder.encoder_width + self.pretrined_vl = pretrined_vl + + if self.vision_cfg.pop('add_ln'): + self.vision_layernorm = nn.LayerNorm(self.vision_width, eps=1e-12) + else: + self.vision_layernorm = nn.Identity() + + self.vision_encoder = MODELS.build(self.vision_cfg) + + if gradient_checkpointing: + self.vision_encoder.gradient_checkpointing_enable() + + self.text_encoder = MODELS.build(self.text_encoder_cfg) + + self.vision_proj = nn.Linear(self.vision_width, proj_dim) + self.text_proj = nn.Linear(self.text_width, proj_dim) + + self.temp = nn.parameter.Parameter(torch.ones([]) * temperature) + self.itm_head = nn.Linear(self.text_width, 2) + + def extract_feat(self, inputs: torch.Tensor, **kwargs) -> ForwardResults: + """Extract features from raw inputs.""" + + @abstractmethod + def loss(self, inputs: torch.Tensor, data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + def forward(self, inputs, data_samples, mode: str = 'loss'): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. 
+ + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (List[``ActionDataSample], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + + if mode == 'tensor': + return self.extract_feat(inputs, data_samples) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}".') + + def encode_vision(self, image): + """encode image / videos as features. + + Args: + image (torch.Tensor): The input images. + + Returns: tuple. + - vision_embeds (torch.Tensor): The features of all patches. + Shape: [B,T,L,C]. + - pooled_vision_embeds (torch.Tensor): The pooled features. + Shape: [B,T,C]. + """ + output_dict = self.vision_encoder(image) + vision_embeds = self.vision_layernorm(output_dict.last_hidden_state) + pooled_vision_embeds = output_dict.pooler_output + + return vision_embeds, pooled_vision_embeds + + def encode_text(self, text): + """encode text. + Args: + text (dict): The output of huggingface's `PreTrainedTokenizer`. + contains keys: + - input_ids (torch.Tensor): Token ids to be fed to a model. + Shape: [B,L]. + - attention_mask (torch.Tensor): The mask indicate padded tokens. + Shape: [B,L]. 0 is padded token. + - other keys refer to "https://huggingface.co/docs/transformers/v4.21.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__". # noqa: E501 + Returns: tuple. + - text_embeds (torch.Tensor): The features of all tokens. Shape: [B,L,C]. + - pooled_text_embeds (torch.Tensor): The pooled features. Shape: [B,C]. + + """ + text_output = self.text_encoder( + text.input_ids, + attention_mask=text.attention_mask, + return_dict=True, + mode='text', + ) + text_embeds = text_output.last_hidden_state + pooled_text_embeds = text_embeds[:, 0] + return text_embeds, pooled_text_embeds + + @torch.no_grad() + def clip_contrastive_temperature(self, min_val=0.001, max_val=0.5): + """Seems only used during pre-training.""" + self.temp.clamp_(min_val, max_val) + + @property + def device(self): + return next(self.parameters()).device + + def preprocess_state_dict(self, state_dict): + """Preprocess pretrained checkpoint for text_encoder.""" + for key in list(state_dict.keys()): + if 'bert' in key: + encoder_key = key.replace('bert.', '') + state_dict[encoder_key] = state_dict[key] + del state_dict[key] + return state_dict + + def load_from_pretrainded_beit(self): + from transformers.models.beit.modeling_beit import BeitModel + beit2d = BeitModel.from_pretrained( + self.vision_cfg.pretrained_model_name_or_path) + ori_state_dict = beit2d.state_dict() + del beit2d + # interpolate relative pos bias + state_dict = interpolate_pos_relative_bias_beit( + state_dict_old=ori_state_dict, + state_dict_new=self.vision_encoder.state_dict(), + patch_shape_new=self.vision_encoder.embeddings.patch_embeddings. 
+ patch_shape, + ) + + for k in list(state_dict.keys()): + if 'prompt_bias_table' in k: + del state_dict[k] + + msg = self.vision_encoder.load_state_dict(state_dict, strict=False) + logger = MMLogger.get_current_instance() + logger.info(msg) + + def init_weights(self): + if self.vision_cfg.get('pretrained2d', False): + self.load_from_pretrainded_beit() + + if self.pretrined_vl: + assert self.init_cfg.get('type') == 'Pretrained', ( + 'Please specify ' + 'init_cfg to use pretrained video-language checkpoint') + self.pretrained = self.init_cfg.get('checkpoint') + checkpoint = _load_checkpoint(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + state_dict = interpolate_pos_embed_beit(state_dict, self) + state_dict = self.preprocess_state_dict(state_dict) + msg = self.load_state_dict(state_dict, strict=False) + logger = MMLogger.get_current_instance() + logger.info(msg) + else: + super().init_weights() diff --git a/mmaction/models/multimodal/vindlu/vindlu_ret.py b/mmaction/models/multimodal/vindlu/vindlu_ret.py new file mode 100644 index 0000000000000000000000000000000000000000..da65951e423019dff14e3e50c38e718a542c69c6 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/vindlu_ret.py @@ -0,0 +1,464 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +import mmengine.dist as dist +import torch +import torch.nn.functional as F +from einops import rearrange +from torch.distributed.nn import all_gather as all_gather_with_grad + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.utils import track_on_main_process +from .utils import all_gather_concat +from .vindlu import VindLUBase + + +@MODELS.register_module() +class VindLURetrieval(VindLUBase): + """VindLU retriever. + + max_txt_len (int): Max text length of input text, used for retrieval + from multiple choices. Defaults to 32. + topk (int): Select topk similarity as candidates for compute matching + scores. Defaults to 256. + negative_all_rank (bool): Whether to sample negative data from all + ranks for image text matching in training. Defaults to False. + fast_match (bool): If False, select topk similarity as candidates and + compute the matching score. If True, return the similarity as the + matching score directly. Defaults to False. + **kwargs: Other keyword arguments to initialize the VindLU base model. + """ + + def __init__(self, + max_txt_len: int = 32, + topk: int = 128, + negative_all_rank: bool = False, + fast_match: bool = False, + **kwargs): + super().__init__(**kwargs) + + self.max_txt_len = max_txt_len + self.topk = topk + self.negative_all_rank = negative_all_rank + self.fast_match = fast_match + + def loss( + self, + inputs: torch.Tensor, + data_samples: Optional[List[ActionDataSample]] = None, + ) -> Dict[str, torch.tensor]: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (dict): A batch of inputs. The input tensor with of + at least one modality. For image, the value is a tensor + of shape (N, C, ...) in general. + For text, the value is a dict of tokenized text inputs. + data_samples (Optional[List[DataSample]]): + The annotation data of every samples. Defaults to None. 
+ + Returns: + Dict[str, torch.tensor]: a dictionary of loss components of + """ + output = self.extract_feat(inputs, data_samples) + + text_embeds = output['text_embeds'] + text_attn_mask = output['text_attn_mask'] + image_embeds = output['image_embeds'] + image_feat = output['image_feat'] + text_feat = output['text_feat'] + + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(self.device) + + # ITC Loss + # B*world_size, D + image_feat_all = torch.cat(dist.all_gather(image_feat)) + # B*world_size, D + text_feat_all = torch.cat(dist.all_gather(text_feat)) + + # image to text similarity + # B, B*world_size + sim_i2t = torch.einsum('mld,nd->mln', image_feat, + text_feat_all).mean(1) / self.temp + # text-image similarity + # B, B*world_size + sim_t2i = torch.einsum('md,nld->mln', text_feat, + image_feat_all).mean(1) / self.temp + + rank = dist.get_rank() + bs = inputs.size(0) + itc_targets = torch.linspace( + rank * bs, rank * bs + bs - 1, bs, dtype=int).to(self.device) + + itc_loss = (F.cross_entropy(sim_i2t, itc_targets) + + F.cross_entropy(sim_t2i, itc_targets)) / 2 + + # prepare for itm + output_pos = self.text_encoder( + encoder_embeds=text_embeds, + attention_mask=text_attn_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + mode='fusion', + ) + + idx = torch.tensor([i.gt_video_id for i in data_samples]).view(-1, 1) + bs = idx.size(0) + if self.negative_all_rank: + idxs = torch.cat(dist.all_gather(idx)) + image_feat_world = torch.cat(dist.all_gather(image_feat)) + text_feat_world = torch.cat(dist.all_gather(text_feat)) + att_mask_world = torch.cat(dist.all_gather(text_attn_mask)) + text_embeds_world = torch.cat(all_gather_with_grad(text_embeds)) + image_embeds_world = torch.cat(all_gather_with_grad(image_embeds)) + else: + idxs = idx + image_feat_world = image_feat.detach() + text_feat_world = text_feat.detach() + image_embeds_world = image_embeds + text_embeds_world = text_embeds + att_mask_world = text_attn_mask + + with torch.no_grad(): + # compute sample similarity + sim_i2t = torch.einsum('mld,nd->mln', image_feat, + text_feat_world).mean(1) / self.temp + sim_t2i = torch.einsum('md,nld->mln', text_feat, + image_feat_world).mean(1) / self.temp + + mask = torch.eq(idx, idxs.t()).to(self.device) + weights_i2t = F.softmax(sim_i2t + 1e-4, dim=1) + weights_i2t.masked_fill_(mask, 0) + + weights_t2i = F.softmax(sim_t2i + 1e-4, dim=1) + weights_t2i.masked_fill_(mask, 0) + + # select a negative image for each text + neg_idx = torch.multinomial(weights_t2i, 1).squeeze() + image_embeds_neg = image_embeds_world[neg_idx] + + # select a negative text for each image + neg_idx = torch.multinomial(weights_i2t, 1).squeeze() + text_embeds_neg = text_embeds_world[neg_idx] + text_atts_neg = att_mask_world[neg_idx] + + text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0) + text_atts_all = torch.cat([text_attn_mask, text_atts_neg], dim=0) + + image_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0) + image_atts_all = torch.cat([image_atts, image_atts], dim=0) + + output_neg = self.text_encoder( + encoder_embeds=text_embeds_all, + attention_mask=text_atts_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + return_dict=True, + mode='fusion', + ) + + vl_embeddings = torch.cat( + [ + output_pos.last_hidden_state[:, 0, :], + output_neg.last_hidden_state[:, 0, :], + ], + dim=0, + ) + + itm_targets = torch.ones((3 * bs, ), + dtype=torch.long, + device=inputs.device) + 
itm_targets[bs:] = 0 + itm_logit = self.itm_head(vl_embeddings) + itm_loss = F.cross_entropy(itm_logit, itm_targets) + + return dict(itc_loss=itc_loss, itm_loss=itm_loss) + + def preprocess_text(self, data_samples): + sample_item = data_samples[0] + + if sample_item is not None and 'text' in sample_item: + if isinstance(sample_item.get('text'), (list, tuple)): + texts = [] + for sample in data_samples: + texts.extend(sample.get('text')) + elif isinstance(sample_item.get('text'), str): + texts = [sample.get('text') for sample in data_samples] + else: + raise TypeError('text must be a string or a list of strings') + else: + return None + + # perform tokenize first if satisfied conditions + texts = self.tokenizer( + texts, + padding='max_length', + truncation=True, + max_length=self.max_txt_len, + return_tensors='pt', + ).to(self.device) + + return texts + + def extract_feat( + self, + images: torch.Tensor = None, + data_samples: List[ActionDataSample] = None, + return_texts=True, + ) -> Dict[str, torch.Tensor]: + """Extract features from the input dict. + + Args: + images (tensor, optional): The images to extract features. + Defaults to None. + data_samples (list, optional): The data samples containing texts + to extract features. Defaults to None. + return_texts (bool): Whether to return the tokenized text and the + corresponding attention masks. Defaults to True. + + Returns: + Tuple[torch.Tensor]: The output features. + If multimodal_backbone is not exist, tuple of torch.Tensor + will be returned. + """ + if data_samples is not None: + texts = self.preprocess_text(data_samples) + else: + texts = None + + assert images is not None or texts is not None, \ + 'At least single modality should be passed as inputs.' + + results = {} + if texts is not None and return_texts: + results.update({ + 'text_ids': texts.input_ids, + 'text_attn_mask': texts.attention_mask, + }) + + # extract image features + if images is not None: + image_embeds, pooled_image_embeds = self.encode_vision(images) + # concat temporal embeds + image_embeds = rearrange(image_embeds, + 'b t l c -> b (t l) c').contiguous() + results['image_embeds'] = image_embeds + results['image_feat'] = F.normalize( + self.vision_proj(pooled_image_embeds), dim=-1) + + # extract text features + if texts is not None: + texts_output = self.text_encoder( + texts.input_ids, + attention_mask=texts.attention_mask, + return_dict=True, + mode='text') + + text_embeds = texts_output.last_hidden_state + pooled_text_feat = text_embeds[:, 0] + results['text_embeds'] = text_embeds + results['text_feat'] = F.normalize( + self.text_proj(pooled_text_feat), dim=-1) + + return results + + def predict(self, images, data_samples, cal_i2t=True, cal_t2i=True): + feats = self.extract_feat(images, data_samples) + + return self.predict_all( + feats, data_samples, cal_i2t=cal_i2t, cal_t2i=cal_t2i) + + def predict_all(self, + feats, + data_samples, + num_images=None, + num_texts=None, + cal_i2t=True, + cal_t2i=True): + text_attn_mask = feats['text_attn_mask'] + image_embeds = feats.get('image_embeds', None) + image_feat = feats['image_feat'] + text_embeds = feats['text_embeds'] + text_feat = feats['text_feat'] + + num_images = num_images or image_feat.size(0) + num_texts = num_texts or text_feat.size(0) + + image_embeds_all = all_gather_concat(image_embeds)[:num_images] + image_feat_all = all_gather_concat(image_feat)[:num_images] + text_feat_all = all_gather_concat(text_feat)[:num_texts] + text_embeds_all = all_gather_concat(text_embeds)[:num_texts] + text_attn_mask_all 
= all_gather_concat(text_attn_mask)[:num_texts] + + results = [] + if cal_i2t: + result_i2t = self.compute_score_matrix_i2t( + image_feat, + image_embeds, + text_feat_all, + text_embeds_all, + text_attn_mask_all, + ) + results.append( + self._get_predictions(result_i2t, data_samples, mode='i2t')) + if cal_t2i: + result_t2i = self.compute_score_matrix_t2i( + image_feat_all, + image_embeds_all, + text_feat, + text_embeds, + text_attn_mask, + ) + results.append( + self._get_predictions(result_t2i, data_samples, mode='t2i')) + return tuple(results) + + def compute_score_matrix_i2t(self, img_feats, img_embeds, text_feats, + text_embeds, text_atts): + """Compare the score matrix for image-to-text retrieval. Every image + should compare to all the text features. + + Args: + img_feats (torch.Tensor): The input img feats tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + img_embeds (torch.Tensor): The input img embeds tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + text_feats (torch.Tensor): The input text feats tensor with shape + (N, C). N stands for numbers of all samples on all GPUs. + text_embeds (torch.Tensor): The input tensor with shape (N, C). + text_atts (torch.Tensor): The input tensor with shape (N, C). + + Returns: + torch.Tensor: Score matrix of image-to-text retrieval. + """ + # compute i2t sim matrix + sim_matrix_i2t = torch.einsum('mld,nd->mln', img_feats, + text_feats).mean(1) + if self.fast_match: + return sim_matrix_i2t + + score_matrix_i2t = torch.full((img_feats.size(0), text_feats.size(0)), + -100.0).to(self.device) + for i in track_on_main_process( + range(img_feats.size(0)), 'Compute I2T scores...'): + sims = sim_matrix_i2t[i] + topk_sim, topk_idx = sims.topk(k=self.topk, dim=0) + topk_bz = 32 + encoder_output = img_embeds[i].repeat(topk_bz, 1, 1) + encoder_att = torch.ones( + encoder_output.size()[:-1], dtype=torch.long).to(self.device) + for j in range(0, self.topk // topk_bz): + batch_topk = topk_idx[j * topk_bz:(j + 1) * topk_bz] + output = self.text_encoder( + encoder_embeds=text_embeds[batch_topk], + attention_mask=text_atts[batch_topk], + encoder_hidden_states=encoder_output, + encoder_attention_mask=encoder_att, + return_dict=True, + mode='fusion') + score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_i2t[i, batch_topk] = score + return score_matrix_i2t + + def compute_score_matrix_t2i(self, img_feats, img_embeds, text_feats, + text_embeds, text_atts): + """Compare the score matrix for text-to-image retrieval. Every text + should compare to all the image features. + + Args: + img_feats (torch.Tensor): The input img feats tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + img_embeds (torch.Tensor): The input img embeds tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + text_feats (torch.Tensor): The input text feats tensor with shape + (N, C). N stands for numbers of all samples on all GPUs. + text_embeds (torch.Tensor): The input tensor with shape (M, C). + text_atts (torch.Tensor): The input tensor with shape (M, C). + + Returns: + torch.Tensor: Score matrix of text-to-image retrieval. 
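+
+        Note:
+            If ``fast_match`` is True, the similarity matrix is returned
+            directly; otherwise only the top-``topk`` candidate images per
+            text are re-scored with the ITM head.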
+ """ + # compute t2i sim matrix + sim_matrix_t2i = torch.einsum('md,nld->mln', text_feats, + img_feats).mean(1) + + if self.fast_match: + return sim_matrix_t2i + + score_matrix_t2i = torch.full((text_feats.size(0), img_feats.size(0)), + -100.0).to(self.device) + for i in track_on_main_process( + range(text_feats.size(0)), 'Compute T2I scores...'): + sims = sim_matrix_t2i[i] + topk_sim, topk_idx = sims.topk(k=self.topk, dim=0) + topk_bz = 32 + for j in range(0, self.topk // topk_bz): + batch_topk = topk_idx[j * topk_bz:(j + 1) * topk_bz] + encoder_output = img_embeds[batch_topk] + encoder_att = torch.ones( + encoder_output.size()[:-1], + dtype=torch.long).to(self.device) + output = self.text_encoder( + encoder_embeds=text_embeds[i].repeat(topk_bz, 1, 1), + attention_mask=text_atts[i].repeat(topk_bz, 1), + encoder_hidden_states=encoder_output, + encoder_attention_mask=encoder_att, + return_dict=True, + mode='fusion') + score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_t2i[i, batch_topk] = score + return score_matrix_t2i + + def _get_predictions(self, + result: torch.Tensor, + data_samples: List[ActionDataSample], + mode: str = 'i2t'): + """Post-process the output of retriever. + + Args: + result (torch.Tensor): Score matrix of single retrieve, + either from image or text. + data_samples (List[ActionDataSample], optional): The annotation + data of every samples. + mode (str): Retrieve mode, either `i2t` for image to text, or `t2i` + text to image. Defaults to `i2t`. + + Returns: + List[ActionDataSample]: the raw data_samples with + the predicted results. + """ + + # create data sample if not exists + if data_samples is None: + data_samples = [ActionDataSample() for _ in range(result.size(0))] + elif mode == 't2i': + # Process data samples to align with the num of texts. + new_data_samples = [] + for sample in data_samples: + if isinstance(sample.text, (list, tuple)): + texts = sample.text + else: + texts = [sample.text] + for i, text in enumerate(texts): + new_sample = ActionDataSample(text=text) + if 'gt_video_id' in sample: + new_sample.gt_label = sample.gt_video_id[i] + new_data_samples.append(new_sample) + assert len(new_data_samples) == result.size(0) + data_samples = new_data_samples + elif mode == 'i2t': + for sample in data_samples: + if 'gt_text_id' in sample: + sample.gt_label = sample.gt_text_id + else: + raise ValueError(f'Type {mode} is not supported.') + + for data_sample, score in zip(data_samples, result): + idx = score.argmax(keepdim=True).detach() + + data_sample.set_pred_score(score) + data_sample.set_pred_label(idx) + return data_samples diff --git a/mmaction/models/multimodal/vindlu/vindlu_ret_mc.py b/mmaction/models/multimodal/vindlu/vindlu_ret_mc.py new file mode 100644 index 0000000000000000000000000000000000000000..937081575928e3aa70a762ef9571e2e45704a31c --- /dev/null +++ b/mmaction/models/multimodal/vindlu/vindlu_ret_mc.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F +from einops import rearrange + +from mmaction.registry import MODELS +from .vindlu_ret import VindLURetrieval + + +@MODELS.register_module() +class VindLURetrievalMC(VindLURetrieval): + """VindLU VQA retrieval multiple choice. + + score_weight (float): Weight coefficient for itm_head score to compute the + choice score. similarity_weight (float): Weight coefficient for similarity + score to compute the choice score. 
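+
+    The final score of each caption option is computed as
+    ``score_weight * itm_score + similarity_weight * similarity``, and the
+    option with the highest score is taken as the prediction.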
+ """ + + def __init__(self, score_weight=0.7, similarity_weight=0.3, **kwargs): + kwargs.pop('text_decoder') + super().__init__(**kwargs) + self.score_weight = score_weight + self.similarity_weight = similarity_weight + + def predict(self, inputs, data_samples, **kwargs): + """Predict captions from a batch of inputs. + + Args: + images (torch.Tensor): The input images tensor with shape + (N, C, ...) in general. + data_samples (List[DataSample], optional): The annotation + data of every samples. Defaults to None. + **kwargs: Other keyword arguments accepted by the ``predict`` + + Returns: + List[ActionDataSample]: Return list of data samples. + """ + num_options_per_q = len(data_samples[0].caption_options) + for sample in data_samples: + sample.text = sample.caption_options + + output = self.extract_feat(inputs, data_samples) + + text_embeds = output['text_embeds'] + text_attn_mask = output['text_attn_mask'] + image_embeds = output['image_embeds'] + image_feat = output['image_feat'] + text_feat = output['text_feat'] + + # compute similarity between vision feat and caption feat + text_feat = rearrange( + text_feat, '(b n) c -> b c n', n=num_options_per_q) + sim = torch.matmul(image_feat.mean(1, keepdim=True), + text_feat).squeeze(1) / self.temp + sim = F.softmax(sim, dim=1).flatten() + + # cross-modal encode + encoder_output = image_embeds.repeat_interleave( + num_options_per_q, dim=0) + image_atts = torch.ones( + encoder_output.size()[:-1], dtype=torch.long).to(inputs.device) + output = self.text_encoder( + encoder_embeds=text_embeds, + attention_mask=text_attn_mask, + encoder_hidden_states=encoder_output, + encoder_attention_mask=image_atts, + return_dict=True, + mode='fusion', + ) + itm_embeds = output.last_hidden_state[:, 0] # [CLS] + + itm_score = F.softmax(self.itm_head(itm_embeds), dim=1)[:, 1] # [bs*5] + score = itm_score * self.score_weight + sim * self.similarity_weight + + pred_answers = score.view(-1, num_options_per_q).max(1)[1].cpu() + + # assemble predictions + ensemble_scores = score.view(-1, num_options_per_q).cpu() # (bsz, 5) + + out_data_samples = [] + for data_sample, ensemble_score, pred_ans in \ + zip(data_samples, ensemble_scores, pred_answers): + data_sample.pred_label = pred_ans.item() + data_sample.score = ensemble_score.numpy() + out_data_samples.append(data_sample) + + return out_data_samples diff --git a/mmaction/models/multimodal/vindlu/vindlu_vqa.py b/mmaction/models/multimodal/vindlu/vindlu_vqa.py new file mode 100644 index 0000000000000000000000000000000000000000..4c1d8a127b64eec3bcd5c51ee24b3f3ee234ac1e --- /dev/null +++ b/mmaction/models/multimodal/vindlu/vindlu_vqa.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import mmengine +import torch +import torch.nn.functional as F +from einops import rearrange + +from mmaction.registry import MODELS +from .vindlu import VindLUBase + + +@MODELS.register_module() +class VindLUVQA(VindLUBase): + """VindLU VQA. + + Args: + text_decoder (dict): Backbone for extracting + multi-modal features. We apply this part as VQA fusion module. + answer_list_path (str, optional): Path to `answer_list.json`. + max_question_len (int): Max text length of question text. + Defaults to 25. + max_answer_len (int): Max text length of answer text. Defaults to 5. + num_ans_candidates (int): Number of answer candidates, used to filter + out answers with low probability. Defaults to 128. + **kwargs: Other keyword arguments accepted by the VindLUBase. 
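+
+    Note:
+        At inference time, each answer in ``answer_list_path`` is first
+        scored by the probability of its first token, and only the top
+        ``num_ans_candidates`` answers are re-ranked with the full decoder
+        loss (see ``rank_answer``).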
+ """ + + def __init__(self, + text_decoder: dict, + answer_list_path: Optional[str] = None, + max_question_len: int = 25, + max_answer_len: int = 5, + num_ans_candidates: int = 128, + **kwargs): + super().__init__(**kwargs) + + self.max_question_len = max_question_len + self.max_answer_len = max_answer_len + self.num_ans_candidates = num_ans_candidates + self.answer_list_path = answer_list_path + self.text_decoder_cfg = text_decoder + + # for inference only + if answer_list_path: + self.answer_list = mmengine.load(answer_list_path) + + # delete extra/unnecessary modules inherited from VindLUBase + extra_attributes = ['vision_proj', 'text_proj', 'temp', 'itm_head'] + for attr in extra_attributes: + delattr(self, attr) + + self.text_decoder_cfg.gradient_checkpointing = \ + self.gradient_checkpointing + self.text_decoder = MODELS.build(self.text_decoder_cfg) + + def forward_encoder(self, inputs, data_samples): + # forward vision encoder + image_embeds, _ = self.encode_vision(inputs) + image_embeds = rearrange(image_embeds, 'b t l c -> b (t l) c') + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(inputs.device) + + # forward text encoder + questions = [sample.question for sample in data_samples] + questions = self.tokenizer( + questions, + padding='max_length', + truncation=True, + max_length=self.max_question_len, + return_tensors='pt').to(inputs.device) + + question_output = self.text_encoder( + questions.input_ids, + attention_mask=questions.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True) + + return questions, question_output + + def loss(self, inputs, data_samples): + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (dict): A batch of inputs. The input tensor with of + at least one modality. For image, the value is a tensor + of shape (N, C, ...) in general. + For text, the value is a dict of tokenized text inputs. + data_samples (Optional[List[DataSample]]): + The annotation data of every samples. Defaults to None. 
+ + Returns: + Dict[str, torch.tensor]: a dictionary of loss components of + """ + + questions, question_output = self.forward_encoder(inputs, data_samples) + + weights = torch.cat( + [torch.tensor(sample.gt_answer_weight) for sample in data_samples], + dim=0).to(inputs.device) + raw_answers = [] + for sample in data_samples: + raw_answers.extend(sample.gt_answer) + answer_count = torch.tensor([ + len(sample.gt_answer) for sample in data_samples + ]).to(inputs.device) + answers = [a + ' ' + '[SEP]' for a in raw_answers] + answers = self.tokenizer( + answers, + padding='max_length', + truncation=True, + max_length=self.max_answer_len, + return_tensors='pt').to(inputs.device) + + answer_targets = answers.input_ids.masked_fill( + answers.input_ids == self.tokenizer.pad_token_id, -100) + + question_states = [] + question_atts = [] + for b, n in enumerate(answer_count): + question_states += [question_output.last_hidden_state[b]] * n + question_atts += [questions.attention_mask[b]] * n + question_states = torch.stack(question_states, 0).to(inputs.device) + question_atts = torch.stack(question_atts, 0).to(inputs.device) + + answer_output = self.text_decoder( + answers.input_ids, + attention_mask=answers.attention_mask, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=answer_targets, + return_dict=True, + reduction='none', + ) + loss = weights * answer_output.loss + loss = loss.sum() / inputs.size(0) + + return dict(loss=loss) + + def predict(self, inputs, data_samples, **kwargs): + + questions, question_output = self.forward_encoder(inputs, data_samples) + + raw_answers = self.answer_list + answers = [a + ' ' + '[SEP]' for a in raw_answers] + answers = self.tokenizer( + answers, + padding='max_length', + truncation=True, + max_length=self.max_answer_len, + return_tensors='pt', + ).to(inputs.device) + + topk_ids, topk_probs = self.rank_answer( + question_output.last_hidden_state, questions.attention_mask, + answers.input_ids, answers.attention_mask, self.num_ans_candidates) + + out_data_samples = [] + for data_sample, topk_id, topk_prob in zip(data_samples, topk_ids, + topk_probs): + _, pred = topk_prob.max(dim=0) + data_sample.pred_answer = raw_answers[topk_id[pred]] + out_data_samples.append(data_sample) + + return out_data_samples + + def rank_answer(self, question_states, question_atts, answer_ids, + answer_atts, k): + """ + question_states: (bsz, Lq, d) + answer_ids: answer input id after tokenization, (#answers, La) + """ + num_ques = question_states.size(0) + start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token + + start_output = self.text_decoder( + start_ids, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + return_dict=True, + reduction='none', + ) + logits = start_output.logits[:, 0, :] # first token's logit + + # topk_probs: top-k probability + # topk_ids: [num_question, k] + answer_first_token = answer_ids[:, 1] + prob_first_token = F.softmax( + logits, dim=1).index_select( + dim=1, index=answer_first_token) + topk_probs, topk_ids = prob_first_token.topk(k, dim=1) + + # answer input: [num_question*k, answer_len] + input_ids = [] + input_atts = [] + for b, topk_id in enumerate(topk_ids): + input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) + input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) + input_ids = torch.cat(input_ids, dim=0) + input_atts = torch.cat(input_atts, dim=0) + + targets_ids = input_ids.masked_fill( + input_ids == self.tokenizer.pad_token_id, -100) + + 
question_states = question_states.repeat_interleave(k, dim=0) + question_atts = question_atts.repeat_interleave(k, dim=0) + + output = self.text_decoder( + input_ids, + attention_mask=input_atts, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=targets_ids, + return_dict=True, + reduction='none', + ) + + answer_loss = output.loss + answer_loss = answer_loss.view(input_ids.size(0), -1) + + # topk_prob: first token probability + topk_probs = topk_probs.view(-1, 1) + log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) + + # re-calculate log probabilities for the answer sequences + # using chain rule + log_probs_sum = log_probs.sum(1) + log_probs_sum = log_probs_sum.view(num_ques, k) + + topk_probs = F.softmax(log_probs_sum, dim=-1) + # get top-k after re-ranking + topk_probs, rerank_id = topk_probs.topk(k, dim=1) + topk_ids = torch.gather(topk_ids, 1, rerank_id) + + return topk_ids, topk_probs + + def preprocess_state_dict(self, state_dict): + """Preprocess pretrained checkpoint for text_encoder and + text_decoder.""" + for key in list(state_dict.keys()): + if 'bert' in key: + encoder_key = key.replace('bert.', '') + state_dict[encoder_key] = state_dict[key] + + # init text decoder as multimodal encoder + # (last 6 layers of model.text_encoder) + # only for generation tasks like VQA + if self.text_decoder_cfg and 'text_encoder' in key: + if 'layer' in key: + encoder_keys = key.split('.') + layer_num = int(encoder_keys[4]) + if layer_num < self.text_encoder_cfg.fusion_layer: + del state_dict[key] + continue + else: + decoder_layer_num = layer_num - 9 + encoder_keys[4] = str(decoder_layer_num) + encoder_key = '.'.join(encoder_keys) + else: + encoder_key = key + decoder_key = encoder_key.replace('text_encoder', + 'text_decoder') + state_dict[decoder_key] = state_dict[key] + del state_dict[key] + return state_dict diff --git a/mmaction/models/multimodal/vindlu/xbert.py b/mmaction/models/multimodal/vindlu/xbert.py new file mode 100644 index 0000000000000000000000000000000000000000..783d7413dd3155b5ddd3f28c578bfc7345c21493 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/xbert.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
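+# The wrappers below adapt the local BERT implementation in modeling_bert.py
+# to the MMAction registry: each one builds a HuggingFace ``BertConfig`` from
+# a pretrained model name and augments it with the VindLU-specific fields
+# ``fusion_layer`` and ``encoder_width`` before instantiating the model.
+#
+# Minimal build sketch (the values below are illustrative assumptions, not
+# taken from this repository's configs):
+#
+#   from mmaction.registry import MODELS
+#   text_encoder = MODELS.build(
+#       dict(
+#           type='XBertModel',
+#           pretrained_model_name_or_path='bert-base-uncased',
+#           fusion_layer=9,
+#           encoder_width=768,
+#           add_pooling_layer=False))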
+from mmaction.registry import MODELS +from .modeling_bert import (BertConfig, BertForMaskedLM, BertLMHeadModel, + BertModel) + + +@MODELS.register_module() +class XBertForMaskedLM(BertForMaskedLM): + + def __init__(self, pretrained_model_name_or_path, fusion_layer, + encoder_width, **kwargs): + config = BertConfig.from_pretrained(pretrained_model_name_or_path) + config.fusion_layer = fusion_layer + config.encoder_width = encoder_width + config.update(kwargs) + super().__init__(config) + + +@MODELS.register_module() +class XBertModel(BertModel): + + def __init__(self, pretrained_model_name_or_path, fusion_layer, + encoder_width, add_pooling_layer, **kwargs): + config = BertConfig.from_pretrained(pretrained_model_name_or_path) + config.fusion_layer = fusion_layer + config.encoder_width = encoder_width + config.update(kwargs) + super().__init__(config, add_pooling_layer) + + +@MODELS.register_module() +class BertDecoder(BertLMHeadModel): + + def __init__(self, pretrained_model_name_or_path, fusion_layer, + encoder_width, **kwargs): + config = BertConfig.from_pretrained(pretrained_model_name_or_path) + config.fusion_layer = fusion_layer + config.encoder_width = encoder_width + config.update(kwargs) + super().__init__(config) diff --git a/mmaction/models/necks/__init__.py b/mmaction/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..17220b517e63b9cc44f08446dbfe20bfc4f1c0f2 --- /dev/null +++ b/mmaction/models/necks/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .tpn import TPN + +__all__ = ['TPN'] diff --git a/mmaction/models/necks/tpn.py b/mmaction/models/necks/tpn.py new file mode 100644 index 0000000000000000000000000000000000000000..5d44ce1c5e2250d9f42638878131ef06c255844d --- /dev/null +++ b/mmaction/models/necks/tpn.py @@ -0,0 +1,476 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import constant_init, normal_init, xavier_init + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, OptConfigType, SampleList + + +class DownSample(nn.Module): + """DownSample modules. + + It uses convolution and maxpooling to downsample the input feature, + and specifies downsample position to determine `pool-conv` or `conv-pool`. + + Args: + in_channels (int): Channel number of input features. + out_channels (int): Channel number of output feature. + kernel_size (int or Tuple[int]): Same as :class:`ConvModule`. + Defaults to ``(3, 1, 1)``. + stride (int or Tuple[int]): Same as :class:`ConvModule`. + Defaults to ``(1, 1, 1)``. + padding (int or Tuple[int]): Same as :class:`ConvModule`. + Defaults to ``(1, 0, 0)``. + groups (int): Same as :class:`ConvModule`. Defaults to 1. + bias (bool or str): Same as :class:`ConvModule`. Defaults to False. + conv_cfg (dict or ConfigDict): Same as :class:`ConvModule`. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict or ConfigDict, optional): Same as :class:`ConvModule`. + Defaults to None. + act_cfg (dict or ConfigDict, optional): Same as :class:`ConvModule`. + Defaults to None. + downsample_position (str): Type of downsample position. Options are + ``before`` and ``after``. Defaults to ``after``. + downsample_scale (int or Tuple[int]): downsample scale for maxpooling. + It will be used for kernel size and stride of maxpooling. + Defaults to ``(1, 2, 2)``. 
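+
+    Example (illustrative sketch; the output shape depends on the chosen
+    kernel, stride and downsample settings):
+
+        >>> import torch
+        >>> layer = DownSample(64, 64, downsample_position='before')
+        >>> layer(torch.randn(1, 64, 8, 56, 56)).shape
+        torch.Size([1, 64, 8, 28, 28])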
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]] = (3, 1, 1), + stride: Union[int, Tuple[int]] = (1, 1, 1), + padding: Union[int, Tuple[int]] = (1, 0, 0), + groups: int = 1, + bias: Union[bool, str] = False, + conv_cfg: ConfigType = dict(type='Conv3d'), + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = None, + downsample_position: str = 'after', + downsample_scale: Union[int, Tuple[int]] = (1, 2, 2) + ) -> None: + super().__init__() + self.conv = ConvModule( + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=groups, + bias=bias, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + assert downsample_position in ['before', 'after'] + self.downsample_position = downsample_position + self.pool = nn.MaxPool3d( + downsample_scale, downsample_scale, (0, 0, 0), ceil_mode=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + if self.downsample_position == 'before': + x = self.pool(x) + x = self.conv(x) + else: + x = self.conv(x) + x = self.pool(x) + return x + + +class LevelFusion(nn.Module): + """Level Fusion module. + + This module is used to aggregate the hierarchical features dynamic in + visual tempos and consistent in spatial semantics. The top/bottom features + for top-down/bottom-up flow would be combined to achieve two additional + options, namely 'Cascade Flow' or 'Parallel Flow'. While applying a + bottom-up flow after a top-down flow will lead to the cascade flow, + applying them simultaneously will result in the parallel flow. + + Args: + in_channels (Tuple[int]): Channel numbers of input features tuple. + mid_channels (Tuple[int]): Channel numbers of middle features tuple. + out_channels (int): Channel numbers of output features. + downsample_scales (Tuple[int | Tuple[int]]): downsample scales for + each :class:`DownSample` module. + Defaults to ``((1, 1, 1), (1, 1, 1))``. + """ + + def __init__( + self, + in_channels: Tuple[int], + mid_channels: Tuple[int], + out_channels: int, + downsample_scales: Tuple[int, Tuple[int]] = ((1, 1, 1), (1, 1, 1)) + ) -> None: + super().__init__() + num_stages = len(in_channels) + + self.downsamples = nn.ModuleList() + for i in range(num_stages): + downsample = DownSample( + in_channels[i], + mid_channels[i], + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + bias=False, + padding=(0, 0, 0), + groups=32, + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + downsample_position='before', + downsample_scale=downsample_scales[i]) + self.downsamples.append(downsample) + + self.fusion_conv = ConvModule( + sum(mid_channels), + out_channels, + 1, + stride=1, + padding=0, + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True)) + + def forward(self, x: Tuple[torch.Tensor]) -> torch.Tensor: + """Defines the computation performed at every call.""" + out = [self.downsamples[i](feature) for i, feature in enumerate(x)] + out = torch.cat(out, 1) + out = self.fusion_conv(out) + + return out + + +class SpatialModulation(nn.Module): + """Spatial Semantic Modulation. + + This module is used to align spatial semantics of features in the + multi-depth pyramid. For each but the top-level feature, a stack + of convolutions with level-specific stride are applied to it, matching + its spatial shape and receptive field with the top one. 
+ + Args: + in_channels (Tuple[int]): Channel numbers of input features tuple. + out_channels (int): Channel numbers of output features tuple. + """ + + def __init__(self, in_channels: Tuple[int], out_channels: int) -> None: + super().__init__() + + self.spatial_modulation = nn.ModuleList() + for channel in in_channels: + downsample_scale = out_channels // channel + downsample_factor = int(np.log2(downsample_scale)) + op = nn.ModuleList() + if downsample_factor < 1: + op = nn.Identity() + else: + for factor in range(downsample_factor): + in_factor = 2**factor + out_factor = 2**(factor + 1) + op.append( + ConvModule( + channel * in_factor, + channel * out_factor, (1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True))) + self.spatial_modulation.append(op) + + def forward(self, x: Tuple[torch.Tensor]) -> list: + """Defines the computation performed at every call.""" + out = [] + for i, _ in enumerate(x): + if isinstance(self.spatial_modulation[i], nn.ModuleList): + out_ = x[i] + for op in self.spatial_modulation[i]: + out_ = op(out_) + out.append(out_) + else: + out.append(self.spatial_modulation[i](x[i])) + return out + + +class AuxHead(nn.Module): + """Auxiliary Head. + + This auxiliary head is appended to receive stronger supervision, + leading to enhanced semantics. + + Args: + in_channels (int): Channel number of input features. + out_channels (int): Channel number of output features. + loss_weight (float): weight of loss for the auxiliary head. + Defaults to 0.5. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + loss_weight: float = 0.5, + loss_cls: ConfigType = dict(type='CrossEntropyLoss') + ) -> None: + super().__init__() + + self.conv = ConvModule( + in_channels, + in_channels * 2, (1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True)) + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + self.loss_weight = loss_weight + self.dropout = nn.Dropout(p=0.5) + self.fc = nn.Linear(in_channels * 2, out_channels) + self.loss_cls = MODELS.build(loss_cls) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + for m in self.modules(): + if isinstance(m, nn.Linear): + normal_init(m, std=0.01) + if isinstance(m, nn.Conv3d): + xavier_init(m, distribution='uniform') + if isinstance(m, nn.BatchNorm3d): + constant_init(m, 1) + + def loss(self, x: torch.Tensor, + data_samples: Optional[SampleList]) -> dict: + """Calculate auxiliary loss.""" + x = self(x) + labels = [x.gt_label for x in data_samples] + labels = torch.stack(labels).to(x.device) + labels = labels.squeeze() + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + + losses = dict() + losses['loss_aux'] = self.loss_weight * self.loss_cls(x, labels) + return losses + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Auxiliary head forward function.""" + x = self.conv(x) + x = self.avg_pool(x).squeeze(-1).squeeze(-1).squeeze(-1) + x = self.dropout(x) + x = self.fc(x) + + return x + + +class TemporalModulation(nn.Module): + """Temporal Rate Modulation. + + The module is used to equip TPN with a similar flexibility for temporal + tempo modulation as in the input-level frame pyramid. 
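For instance, with ``in_channels=(1024, 2048)`` and ``out_channels=2048`` the lower level receives one stride-2 convolution (channels doubled, spatial size halved) while the top level passes through ``nn.Identity``. A small shape check, with illustrative sizes and mmcv assumed to be installed:

```python
import torch

from mmaction.models.necks.tpn import SpatialModulation

sm = SpatialModulation(in_channels=(1024, 2048), out_channels=2048)

lvl0 = torch.randn(2, 1024, 8, 14, 14)  # shallower stage: fewer channels, larger maps
lvl1 = torch.randn(2, 2048, 8, 7, 7)    # top stage: already matches the target shape

outs = sm((lvl0, lvl1))
print([o.shape for o in outs])
# [torch.Size([2, 2048, 8, 7, 7]), torch.Size([2, 2048, 8, 7, 7])]
```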
+ + Args: + in_channels (int): Channel number of input features. + out_channels (int): Channel number of output features. + downsample_scale (int): Downsample scale for maxpooling. Defaults to 8. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + downsample_scale: int = 8) -> None: + super().__init__() + + self.conv = ConvModule( + in_channels, + out_channels, (3, 1, 1), + stride=(1, 1, 1), + padding=(1, 0, 0), + bias=False, + groups=32, + conv_cfg=dict(type='Conv3d'), + act_cfg=None) + self.pool = nn.MaxPool3d((downsample_scale, 1, 1), + (downsample_scale, 1, 1), (0, 0, 0), + ceil_mode=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + x = self.conv(x) + x = self.pool(x) + return x + + +@MODELS.register_module() +class TPN(nn.Module): + """TPN neck. + + This module is proposed in `Temporal Pyramid Network for Action Recognition + `_ + + Args: + in_channels (Tuple[int]): Channel numbers of input features tuple. + out_channels (int): Channel number of output feature. + spatial_modulation_cfg (dict or ConfigDict, optional): Config for + spatial modulation layers. Required keys are ``in_channels`` and + ``out_channels``. Defaults to None. + temporal_modulation_cfg (dict or ConfigDict, optional): Config for + temporal modulation layers. Defaults to None. + upsample_cfg (dict or ConfigDict, optional): Config for upsample + layers. The keys are same as that in :class:``nn.Upsample``. + Defaults to None. + downsample_cfg (dict or ConfigDict, optional): Config for downsample + layers. Defaults to None. + level_fusion_cfg (dict or ConfigDict, optional): Config for level + fusion layers. + Required keys are ``in_channels``, ``mid_channels``, + ``out_channels``. Defaults to None. + aux_head_cfg (dict or ConfigDict, optional): Config for aux head + layers. Required keys are ``out_channels``. Defaults to None. + flow_type (str): Flow type to combine the features. Options are + ``cascade`` and ``parallel``. Defaults to ``cascade``. 
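``TemporalModulation`` above changes the channel count with a grouped 3x1x1 convolution and then shrinks the temporal axis with max pooling. A shape check with illustrative sizes (mmcv assumed installed):

```python
import torch

from mmaction.models.necks.tpn import TemporalModulation

tm = TemporalModulation(in_channels=2048, out_channels=1024, downsample_scale=8)

x = torch.randn(2, 2048, 32, 7, 7)  # (N, C, T, H, W)
print(tm(x).shape)                  # torch.Size([2, 1024, 4, 7, 7])
```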
+ """ + + def __init__(self, + in_channels: Tuple[int], + out_channels: int, + spatial_modulation_cfg: OptConfigType = None, + temporal_modulation_cfg: OptConfigType = None, + upsample_cfg: OptConfigType = None, + downsample_cfg: OptConfigType = None, + level_fusion_cfg: OptConfigType = None, + aux_head_cfg: OptConfigType = None, + flow_type: str = 'cascade') -> None: + super().__init__() + assert isinstance(in_channels, tuple) + assert isinstance(out_channels, int) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_tpn_stages = len(in_channels) + + assert spatial_modulation_cfg is None or isinstance( + spatial_modulation_cfg, dict) + assert temporal_modulation_cfg is None or isinstance( + temporal_modulation_cfg, dict) + assert upsample_cfg is None or isinstance(upsample_cfg, dict) + assert downsample_cfg is None or isinstance(downsample_cfg, dict) + assert aux_head_cfg is None or isinstance(aux_head_cfg, dict) + assert level_fusion_cfg is None or isinstance(level_fusion_cfg, dict) + + if flow_type not in ['cascade', 'parallel']: + raise ValueError( + f"flow type in TPN should be 'cascade' or 'parallel', " + f'but got {flow_type} instead.') + self.flow_type = flow_type + + self.temporal_modulation_ops = nn.ModuleList() + self.upsample_ops = nn.ModuleList() + self.downsample_ops = nn.ModuleList() + + self.level_fusion_1 = LevelFusion(**level_fusion_cfg) + self.spatial_modulation = SpatialModulation(**spatial_modulation_cfg) + + for i in range(self.num_tpn_stages): + + if temporal_modulation_cfg is not None: + downsample_scale = temporal_modulation_cfg[ + 'downsample_scales'][i] + temporal_modulation = TemporalModulation( + in_channels[-1], out_channels, downsample_scale) + self.temporal_modulation_ops.append(temporal_modulation) + + if i < self.num_tpn_stages - 1: + if upsample_cfg is not None: + upsample = nn.Upsample(**upsample_cfg) + self.upsample_ops.append(upsample) + + if downsample_cfg is not None: + downsample = DownSample(out_channels, out_channels, + **downsample_cfg) + self.downsample_ops.append(downsample) + + out_dims = level_fusion_cfg['out_channels'] + + # two pyramids + self.level_fusion_2 = LevelFusion(**level_fusion_cfg) + + self.pyramid_fusion = ConvModule( + out_dims * 2, + 2048, + 1, + stride=1, + padding=0, + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True)) + + if aux_head_cfg is not None: + self.aux_head = AuxHead(self.in_channels[-2], **aux_head_cfg) + else: + self.aux_head = None + + def init_weights(self) -> None: + """Default init_weights for conv(msra) and norm in ConvModule.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + xavier_init(m, distribution='uniform') + if isinstance(m, nn.BatchNorm3d): + constant_init(m, 1) + + if self.aux_head is not None: + self.aux_head.init_weights() + + def forward(self, + x: Tuple[torch.Tensor], + data_samples: Optional[SampleList] = None) -> tuple: + """Defines the computation performed at every call.""" + + loss_aux = dict() + # Calculate auxiliary loss if `self.aux_head` + # and `data_samples` are not None. 
+ if self.aux_head is not None and data_samples is not None: + loss_aux = self.aux_head.loss(x[-2], data_samples) + + # Spatial Modulation + spatial_modulation_outs = self.spatial_modulation(x) + + # Temporal Modulation + temporal_modulation_outs = [] + for i, temporal_modulation in enumerate(self.temporal_modulation_ops): + temporal_modulation_outs.append( + temporal_modulation(spatial_modulation_outs[i])) + + outs = [out.clone() for out in temporal_modulation_outs] + if len(self.upsample_ops) != 0: + for i in range(self.num_tpn_stages - 1, 0, -1): + outs[i - 1] = outs[i - 1] + self.upsample_ops[i - 1](outs[i]) + + # Get top-down outs + top_down_outs = self.level_fusion_1(outs) + + # Build bottom-up flow using downsample operation + if self.flow_type == 'parallel': + outs = [out.clone() for out in temporal_modulation_outs] + if len(self.downsample_ops) != 0: + for i in range(self.num_tpn_stages - 1): + outs[i + 1] = outs[i + 1] + self.downsample_ops[i](outs[i]) + + # Get bottom-up outs + botton_up_outs = self.level_fusion_2(outs) + + # fuse two pyramid outs + outs = self.pyramid_fusion( + torch.cat([top_down_outs, botton_up_outs], 1)) + + return outs, loss_aux diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ef36b8e3d0e19ae5315a2479e687ab4d2f09c7c --- /dev/null +++ b/mmaction/models/recognizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseRecognizer +from .recognizer2d import Recognizer2D +from .recognizer3d import Recognizer3D +from .recognizer3d_mm import MMRecognizer3D +from .recognizer_audio import RecognizerAudio +from .recognizer_gcn import RecognizerGCN +from .recognizer_omni import RecognizerOmni + +__all__ = [ + 'BaseRecognizer', 'RecognizerGCN', 'Recognizer2D', 'Recognizer3D', + 'RecognizerAudio', 'RecognizerOmni', 'MMRecognizer3D' +] diff --git a/mmaction/models/recognizers/base.py b/mmaction/models/recognizers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9ba7216c33b7f836c9f53c7f7e2b8440ca5b8976 --- /dev/null +++ b/mmaction/models/recognizers/base.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import warnings +from abc import ABCMeta, abstractmethod + +import torch +import torch.nn as nn +from mmengine.model import BaseModel, merge_dict + +from mmaction.registry import MODELS +from mmaction.utils import (ConfigType, ForwardResults, OptConfigType, + OptSampleList, SampleList) + + +class BaseRecognizer(BaseModel, metaclass=ABCMeta): + """Base class for recognizers. + + Args: + backbone (Union[ConfigDict, dict]): Backbone modules to + extract feature. + cls_head (Union[ConfigDict, dict], optional): Classification head to + process feature. Defaults to None. + neck (Union[ConfigDict, dict], optional): Neck for feature fusion. + Defaults to None. + train_cfg (Union[ConfigDict, dict], optional): Config for training. + Defaults to None. + test_cfg (Union[ConfigDict, dict], optional): Config for testing. + Defaults to None. + data_preprocessor (Union[ConfigDict, dict], optional): The pre-process + config of :class:`ActionDataPreprocessor`. it usually includes, + ``mean``, ``std`` and ``format_shape``. Defaults to None. 
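Putting ``tpn.py`` together, the neck could be declared in a recognizer config roughly as below. This is a hedged sketch: the numbers mirror a two-stage backbone whose last stages output 1024 and 2048 channels, and none of them are prescribed by the diff.

```python
# Illustrative TPN neck configuration; every value here is an example.
neck = dict(
    type='TPN',
    in_channels=(1024, 2048),
    out_channels=1024,
    spatial_modulation_cfg=dict(in_channels=(1024, 2048), out_channels=2048),
    temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
    upsample_cfg=dict(scale_factor=(1, 1, 1)),
    downsample_cfg=dict(downsample_scale=(1, 1, 1)),
    level_fusion_cfg=dict(
        in_channels=(1024, 1024),
        mid_channels=(1024, 1024),
        out_channels=2048,
        downsample_scales=((1, 1, 1), (1, 1, 1))),
    aux_head_cfg=dict(out_channels=400, loss_weight=0.5),
    flow_type='cascade')
```

Note that ``pyramid_fusion`` always projects to 2048 channels, so a classification head attached behind this neck takes ``in_channels=2048``, and ``TPN.forward`` returns the pair ``(fused_feature, loss_aux)`` that the recognizers below unpack.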
+ """ + + def __init__(self, + backbone: ConfigType, + cls_head: OptConfigType = None, + neck: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None) -> None: + if data_preprocessor is None: + # This preprocessor will only stack batch data samples. + data_preprocessor = dict(type='ActionDataPreprocessor') + + super(BaseRecognizer, + self).__init__(data_preprocessor=data_preprocessor) + + def is_from(module, pkg_name): + # check whether the backbone is from pkg + model_type = module['type'] + if isinstance(model_type, str): + return model_type.startswith(pkg_name) + elif inspect.isclass(model_type) or inspect.isfunction(model_type): + module_name = model_type.__module__ + return pkg_name in module_name + else: + raise TypeError( + f'Unsupported type of module {type(module["type"])}') + + # Record the source of the backbone. + self.backbone_from = 'mmaction2' + if is_from(backbone, 'mmcls.'): + try: + # Register all mmcls models. + import mmcls.models # noqa: F401 + except (ImportError, ModuleNotFoundError): + raise ImportError('Please install mmcls to use this backbone.') + self.backbone = MODELS.build(backbone) + self.backbone_from = 'mmcls' + elif is_from(backbone, 'mmpretrain.'): + try: + # Register all mmpretrain models. + import mmpretrain.models # noqa: F401 + except (ImportError, ModuleNotFoundError): + raise ImportError( + 'Please install mmpretrain to use this backbone.') + self.backbone = MODELS.build(backbone) + self.backbone_from = 'mmpretrain' + elif is_from(backbone, 'torchvision.'): + try: + import torchvision.models + except (ImportError, ModuleNotFoundError): + raise ImportError('Please install torchvision to use this ' + 'backbone.') + self.backbone_from = 'torchvision' + self.feature_shape = backbone.pop('feature_shape', None) + backbone_type = backbone.pop('type') + if isinstance(backbone_type, str): + backbone_type = backbone_type[12:] + self.backbone = torchvision.models.__dict__[backbone_type]( + **backbone) + else: + self.backbone = backbone_type(**backbone) + # disable the classifier + self.backbone.classifier = nn.Identity() + self.backbone.fc = nn.Identity() + elif is_from(backbone, 'timm.'): + # currently, only support use `str` as backbone type + try: + import timm + except (ImportError, ModuleNotFoundError): + raise ImportError('Please install timm>=0.9.0 to use this ' + 'backbone.') + self.backbone_from = 'timm' + self.feature_shape = backbone.pop('feature_shape', None) + # disable the classifier + backbone['num_classes'] = 0 + backbone_type = backbone.pop('type') + if isinstance(backbone_type, str): + backbone_type = backbone_type[5:] + self.backbone = timm.create_model(backbone_type, **backbone) + else: + raise TypeError( + f'Unsupported timm backbone type: {type(backbone_type)}') + else: + self.backbone = MODELS.build(backbone) + + if neck is not None: + self.neck = MODELS.build(neck) + + if cls_head is not None: + self.cls_head = MODELS.build(cls_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + @abstractmethod + def extract_feat(self, inputs: torch.Tensor, **kwargs) -> ForwardResults: + """Extract features from raw inputs.""" + + @property + def with_neck(self) -> bool: + """bool: whether the recognizer has a neck""" + return hasattr(self, 'neck') and self.neck is not None + + @property + def with_cls_head(self) -> bool: + """bool: whether the recognizer has a cls_head""" + return hasattr(self, 'cls_head') and self.cls_head is not None + + def 
init_weights(self) -> None: + """Initialize the model network weights.""" + if self.backbone_from in ['torchvision', 'timm']: + warnings.warn('We do not initialize weights for backbones in ' + f'{self.backbone_from}, since the weights for ' + f'backbones in {self.backbone_from} are initialized ' + 'in their __init__ functions.') + + def fake_init(): + pass + + # avoid repeated initialization + self.backbone.init_weights = fake_init + super().init_weights() + + def loss(self, inputs: torch.Tensor, data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + data_samples (List[``ActionDataSample``]): The batch + data samples. It usually includes information such + as ``gt_label``. + + Returns: + dict: A dictionary of loss components. + """ + feats, loss_kwargs = \ + self.extract_feat(inputs, + data_samples=data_samples) + + # loss_aux will be a empty dict if `self.with_neck` is False. + loss_aux = loss_kwargs.get('loss_aux', dict()) + loss_cls = self.cls_head.loss(feats, data_samples, **loss_kwargs) + losses = merge_dict(loss_cls, loss_aux) + return losses + + def predict(self, inputs: torch.Tensor, data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + data_samples (List[``ActionDataSample``]): The batch + data samples. It usually includes information such + as ``gt_label``. + + Returns: + List[``ActionDataSample``]: Return the recognition results. + The returns value is ``ActionDataSample``, which usually contains + ``pred_scores``. And the ``pred_scores`` usually contains + following keys. + + - item (torch.Tensor): Classification scores, has a shape + (num_classes, ) + """ + feats, predict_kwargs = self.extract_feat(inputs, test_mode=True) + predictions = self.cls_head.predict(feats, data_samples, + **predict_kwargs) + return predictions + + def _forward(self, + inputs: torch.Tensor, + stage: str = 'backbone', + **kwargs) -> ForwardResults: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + stage (str): Which stage to output the features. + + Returns: + Union[tuple, torch.Tensor]: Features from ``backbone`` or ``neck`` + or ``head`` forward. + """ + feats, _ = self.extract_feat(inputs, stage=stage) + return feats + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSampleList = None, + mode: str = 'tensor', + **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. 
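The prefix-based dispatch in ``__init__`` above decides where a backbone comes from. A hedged sketch of the corresponding config entries; the model names are examples only, and ``feature_shape`` ('NCHW', 'NLC' or 'NHWC') tells ``Recognizer2D`` how to pool the third-party backbone's output.

```python
# 'torchvision.' prefix: the remainder is looked up in torchvision.models and the
# classifier / fc layers are replaced with nn.Identity().
backbone_tv = dict(type='torchvision.resnet50', feature_shape='NCHW')

# 'timm.' prefix: the remainder is passed to timm.create_model with num_classes=0.
backbone_timm = dict(type='timm.vit_base_patch16_224', pretrained=True, feature_shape='NLC')
```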
+ data_samples (List[``ActionDataSample], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') diff --git a/mmaction/models/recognizers/recognizer2d.py b/mmaction/models/recognizers/recognizer2d.py new file mode 100644 index 0000000000000000000000000000000000000000..99e4e951ca76d55b998f13755ec0047b55b16810 --- /dev/null +++ b/mmaction/models/recognizers/recognizer2d.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.registry import MODELS +from mmaction.utils import SampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class Recognizer2D(BaseRecognizer): + """2D recognizer model framework.""" + + def extract_feat(self, + inputs: torch.Tensor, + stage: str = 'neck', + data_samples: SampleList = None, + test_mode: bool = False) -> tuple: + """Extract features of different stages. + + Args: + inputs (Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``neck``. + data_samples (List[:obj:`ActionDataSample`]): Action data + samples, which are only needed in training. Defaults to None. + test_mode: (bool): Whether in test mode. Defaults to False. + + Returns: + Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. These keys are usually included: + ``num_segs``, ``fcn_test``, ``loss_aux``. + """ + + # Record the kwargs required by `loss` and `predict`. + loss_predict_kwargs = dict() + + num_segs = inputs.shape[1] + loss_predict_kwargs['num_segs'] = num_segs + + # [N, num_crops * num_segs, C, H, W] -> + # [N * num_crops * num_segs, C, H, W] + # `num_crops` is calculated by: + # 1) `twice_sample` in `SampleFrames` + # 2) `num_sample_positions` in `DenseSampleFrames` + # 3) `ThreeCrop/TenCrop` in `test_pipeline` + # 4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1` + inputs = inputs.view((-1, ) + inputs.shape[2:]) + + def forward_once(batch_imgs): + # Extract features through backbone. + if (hasattr(self.backbone, 'features') + and self.backbone_from == 'torchvision'): + x = self.backbone.features(batch_imgs) + elif self.backbone_from == 'timm': + x = self.backbone.forward_features(batch_imgs) + elif self.backbone_from in ['mmcls', 'mmpretrain']: + x = self.backbone(batch_imgs) + if isinstance(x, tuple): + assert len(x) == 1 + x = x[0] + else: + x = self.backbone(batch_imgs) + + if self.backbone_from in ['torchvision', 'timm']: + if not self.feature_shape: + # Transformer-based feature shape: B x L x C. + if len(x.shape) == 3: + self.feature_shape = 'NLC' + # Resnet-based feature shape: B x C x Hs x Ws. 
+ elif len(x.shape) == 4: + self.feature_shape = 'NCHW' + + if self.feature_shape == 'NHWC': + x = nn.AdaptiveAvgPool2d(1)(x.permute(0, 3, 1, + 2)) # B x C x 1 x 1 + elif self.feature_shape == 'NCHW': + x = nn.AdaptiveAvgPool2d(1)(x) # B x C x 1 x 1 + elif self.feature_shape == 'NLC': + x = nn.AdaptiveAvgPool1d(1)(x.transpose(1, 2)) # B x C x 1 + + x = x.reshape((x.shape[0], -1)) # B x C + x = x.reshape(x.shape + (1, 1)) # B x C x 1 x 1 + return x + + # Check settings of `fcn_test`. + fcn_test = False + if test_mode: + if self.test_cfg is not None and self.test_cfg.get( + 'fcn_test', False): + fcn_test = True + num_segs = self.test_cfg.get('num_segs', + self.backbone.num_segments) + loss_predict_kwargs['fcn_test'] = fcn_test + + # inference with batch size of `max_testing_views` if set + if self.test_cfg is not None and self.test_cfg.get( + 'max_testing_views', False): + max_testing_views = self.test_cfg.get('max_testing_views') + assert isinstance(max_testing_views, int) + # backbone specify num_segments + num_segments = self.backbone.get('num_segments') + if num_segments is not None: + assert max_testing_views % num_segments == 0, \ + 'make sure that max_testing_views is a multiple of ' \ + 'num_segments, but got {max_testing_views} and '\ + '{num_segments}' + + total_views = inputs.shape[0] + view_ptr = 0 + feats = [] + while view_ptr < total_views: + batch_imgs = inputs[view_ptr:view_ptr + max_testing_views] + feat = forward_once(batch_imgs) + if self.with_neck: + feat, _ = self.neck(feat) + feats.append(feat) + view_ptr += max_testing_views + + def recursively_cat(feats): + # recursively traverse feats until it's a tensor, + # then concat + out_feats = [] + for e_idx, elem in enumerate(feats[0]): + batch_elem = [feat[e_idx] for feat in feats] + if not isinstance(elem, torch.Tensor): + batch_elem = recursively_cat(batch_elem) + else: + batch_elem = torch.cat(batch_elem) + out_feats.append(batch_elem) + + return tuple(out_feats) + + if isinstance(feats[0], tuple): + x = recursively_cat(feats) + else: + x = torch.cat(feats) + else: + x = forward_once(inputs) + else: + x = forward_once(inputs) + + # Return features extracted through backbone. + if stage == 'backbone': + return x, loss_predict_kwargs + + loss_aux = dict() + if self.with_neck: + # x is a tuple with multiple feature maps. + x = [ + each.reshape((-1, num_segs) + + each.shape[1:]).transpose(1, 2).contiguous() + for each in x + ] + x, loss_aux = self.neck(x, data_samples=data_samples) + if not fcn_test: + x = x.squeeze(2) + loss_predict_kwargs['num_segs'] = 1 + elif fcn_test: + # full convolution (fcn) testing when no neck + # [N * num_crops * num_segs, C', H', W'] -> + # [N * num_crops, C', num_segs, H', W'] + x = x.reshape((-1, num_segs) + + x.shape[1:]).transpose(1, 2).contiguous() + + loss_predict_kwargs['loss_aux'] = loss_aux + + # Return features extracted through neck. + if stage == 'neck': + return x, loss_predict_kwargs + + # Return raw logits through head. + if self.with_cls_head and stage == 'head': + # [N * num_crops, num_classes] + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py new file mode 100644 index 0000000000000000000000000000000000000000..d2af5c3a40794e8875f122dbec2091ce4ea50e3d --- /dev/null +++ b/mmaction/models/recognizers/recognizer3d.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
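The reshape at the top of ``Recognizer2D.extract_feat`` above folds every crop and segment into the batch dimension before the 2D backbone runs; a minimal illustration with plain tensors:

```python
import torch

# (N, num_crops * num_segs, C, H, W): e.g. 8 segments per video at train time.
inputs = torch.randn(2, 8, 3, 224, 224)
num_segs = inputs.shape[1]
flat = inputs.view((-1, ) + inputs.shape[2:])
print(flat.shape, num_segs)  # torch.Size([16, 3, 224, 224]) 8
```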
+import torch +from torch import Tensor + +from mmaction.registry import MODELS +from mmaction.utils import OptSampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class Recognizer3D(BaseRecognizer): + """3D recognizer model framework.""" + + def extract_feat(self, + inputs: Tensor, + stage: str = 'neck', + data_samples: OptSampleList = None, + test_mode: bool = False) -> tuple: + """Extract features of different stages. + + Args: + inputs (torch.Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``'neck'``. + data_samples (list[:obj:`ActionDataSample`], optional): Action data + samples, which are only needed in training. Defaults to None. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + torch.Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. These keys are usually included: + ``loss_aux``. + """ + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + num_segs = inputs.shape[1] + # [N, num_crops, C, T, H, W] -> + # [N * num_crops, C, T, H, W] + # `num_crops` is calculated by: + # 1) `twice_sample` in `SampleFrames` + # 2) `num_sample_positions` in `DenseSampleFrames` + # 3) `ThreeCrop/TenCrop` in `test_pipeline` + # 4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1` + inputs = inputs.view((-1, ) + inputs.shape[2:]) + + # Check settings of test + if test_mode: + if self.test_cfg is not None: + loss_predict_kwargs['fcn_test'] = self.test_cfg.get( + 'fcn_test', False) + if self.test_cfg is not None and self.test_cfg.get( + 'max_testing_views', False): + max_testing_views = self.test_cfg.get('max_testing_views') + assert isinstance(max_testing_views, int) + + total_views = inputs.shape[0] + assert num_segs == total_views, ( + 'max_testing_views is only compatible ' + 'with batch_size == 1') + view_ptr = 0 + feats = [] + while view_ptr < total_views: + batch_imgs = inputs[view_ptr:view_ptr + max_testing_views] + feat = self.backbone(batch_imgs) + if self.with_neck: + feat, _ = self.neck(feat) + feats.append(feat) + view_ptr += max_testing_views + + def recursively_cat(feats): + # recursively traverse feats until it's a tensor, + # then concat + out_feats = [] + for e_idx, elem in enumerate(feats[0]): + batch_elem = [feat[e_idx] for feat in feats] + if not isinstance(elem, torch.Tensor): + batch_elem = recursively_cat(batch_elem) + else: + batch_elem = torch.cat(batch_elem) + out_feats.append(batch_elem) + + return tuple(out_feats) + + if isinstance(feats[0], tuple): + x = recursively_cat(feats) + else: + x = torch.cat(feats) + else: + x = self.backbone(inputs) + if self.with_neck: + x, _ = self.neck(x) + + return x, loss_predict_kwargs + else: + # Return features extracted through backbone + x = self.backbone(inputs) + if stage == 'backbone': + return x, loss_predict_kwargs + + loss_aux = dict() + if self.with_neck: + x, loss_aux = self.neck(x, data_samples=data_samples) + + # Return features extracted through neck + loss_predict_kwargs['loss_aux'] = loss_aux + if stage == 'neck': + return x, loss_predict_kwargs + + # Return raw logits through head. 
+ if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer3d_mm.py b/mmaction/models/recognizers/recognizer3d_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..12541c5e1c85b0fffaa48389f809a3a33bedb428 --- /dev/null +++ b/mmaction/models/recognizers/recognizer3d_mm.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import torch + +from mmaction.registry import MODELS +from mmaction.utils import OptSampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class MMRecognizer3D(BaseRecognizer): + """Multi-modal 3D recognizer model framework.""" + + def extract_feat(self, + inputs: Dict[str, torch.Tensor], + stage: str = 'backbone', + data_samples: OptSampleList = None, + test_mode: bool = False) -> Tuple: + """Extract features. + + Args: + inputs (dict[str, torch.Tensor]): The multi-modal input data. + stage (str): Which stage to output the feature. + Defaults to ``'backbone'``. + data_samples (list[:obj:`ActionDataSample`], optional): Action data + samples, which are only needed in training. Defaults to None. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + tuple[torch.Tensor]: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. + """ + # [N, num_views, C, T, H, W] -> + # [N * num_views, C, T, H, W] + for m, m_data in inputs.items(): + m_data = m_data.reshape((-1, ) + m_data.shape[2:]) + inputs[m] = m_data + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + x = self.backbone(**inputs) + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer_audio.py b/mmaction/models/recognizers/recognizer_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..684f482e6eb16bf606a23cc7d42f119fbe3c8df6 --- /dev/null +++ b/mmaction/models/recognizers/recognizer_audio.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch import Tensor + +from mmaction.registry import MODELS +from .base import BaseRecognizer + + +@MODELS.register_module() +class RecognizerAudio(BaseRecognizer): + """Audio recognizer model framework.""" + + def extract_feat(self, + batch_inputs: Tensor, + stage: str = 'backbone', + **kwargs) -> tuple: + """Extract features of different stages. + + Args: + batch_inputs (Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``backbone``. + + Returns: + Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. This will be an empty dict in audio recognizer. 
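``MMRecognizer3D`` above passes the modality dict straight into the backbone via ``self.backbone(**inputs)``, so the dict keys must match the backbone's keyword arguments. A sketch with illustrative modality names and shapes:

```python
import torch

# Each modality is (N, num_views, C, T, H, W) and is flattened to
# (N * num_views, C, T, H, W) before the backbone call, as in extract_feat.
inputs = dict(
    imgs=torch.randn(1, 1, 3, 32, 224, 224),         # RGB clip (illustrative)
    heatmap_imgs=torch.randn(1, 1, 17, 32, 56, 56))  # pose heatmaps (illustrative)

for name, data in inputs.items():
    inputs[name] = data.reshape((-1, ) + data.shape[2:])
```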
+ """ + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + batch_inputs = batch_inputs.view((-1, ) + batch_inputs.shape[2:]) + + x = self.backbone(batch_inputs) + + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer_gcn.py b/mmaction/models/recognizers/recognizer_gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..1a8e3df3ae1a1fc03542ee13327e034d7a6034f5 --- /dev/null +++ b/mmaction/models/recognizers/recognizer_gcn.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch + +from mmaction.registry import MODELS +from .base import BaseRecognizer + + +@MODELS.register_module() +class RecognizerGCN(BaseRecognizer): + """GCN-based recognizer for skeleton-based action recognition.""" + + def extract_feat(self, + inputs: torch.Tensor, + stage: str = 'backbone', + **kwargs) -> Tuple: + """Extract features at the given stage. + + Args: + inputs (torch.Tensor): The input skeleton with shape of + `(B, num_clips, num_person, clip_len, num_joints, 3 or 2)`. + stage (str): The stage to output the features. + Defaults to ``'backbone'``. + + Returns: + tuple: THe extracted features and a dict recording the kwargs + for downstream pipeline, which is an empty dict for the + GCN-based recognizer. + """ + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + bs, nc = inputs.shape[:2] + inputs = inputs.reshape((bs * nc, ) + inputs.shape[2:]) + + x = self.backbone(inputs) + + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer_omni.py b/mmaction/models/recognizers/recognizer_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..df8154182678ead19a1eabe0ec009e6347bfe893 --- /dev/null +++ b/mmaction/models/recognizers/recognizer_omni.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Sequence, Union + +import torch +from mmengine.model import BaseModel + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, ForwardResults, SampleList + + +@MODELS.register_module() +class RecognizerOmni(BaseModel): + """An Omni-souce recognizer model framework for joint-training of image and + video recognition tasks. + + The `backbone` and `cls_head` should be able to accept both images and + videos as inputs. + """ + + def __init__(self, backbone: ConfigType, cls_head: ConfigType, + data_preprocessor: ConfigType) -> None: + super().__init__(data_preprocessor=data_preprocessor) + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def forward(self, *data_samples, mode: str, **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. 
+ + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + data_samples: should be a sequence of ``SampleList`` if + ``mode="predict"`` or ``mode="loss"``. Each ``SampleList`` is + the annotation data of one data source. + It should be a single torch tensor if ``mode="tensor"``. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + + if mode == 'loss' or mode == 'predict': + if mode == 'loss': + return self.loss(data_samples) + return self.predict(data_samples) + + elif mode == 'tensor': + + assert isinstance(data_samples, torch.Tensor) + + data_ndim = data_samples.ndim + if data_ndim not in [4, 5]: + info = f'Input is a {data_ndim}D tensor. ' + info += 'Only 4D (BCHW) or 5D (BCTHW) tensors are supported!' + raise ValueError(info) + + return self._forward(data_samples, **kwargs) + + def loss(self, data_samples: Sequence[SampleList]) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + data_samples (Sequence[SampleList]): a sequence of SampleList. Each + SampleList contains data samples from the same data source. + + Returns: + dict: A dictionary of loss components. + """ + loss_dict = {} + for idx, data in enumerate(data_samples): + inputs, data_samples = data['inputs'], data['data_samples'] + feats = self.extract_feat(inputs) + loss_cls = self.cls_head.loss(feats, data_samples) + for key in loss_cls: + loss_dict[key + f'_{idx}'] = loss_cls[key] + return loss_dict + + def predict(self, data_samples: Sequence[SampleList]) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + data_samples (Sequence[SampleList]): a sequence of SampleList. Each + SampleList contains data samples from the same data source. + + Returns: + List[``ActionDataSample``]: Return the recognition results. + The returns value is ``ActionDataSample``, which usually contains + ``pred_scores``. And the ``pred_scores`` usually contains + following keys. + + - item (torch.Tensor): Classification scores, has a shape + (num_classes, ) + """ + assert len(data_samples) == 1 + feats = self.extract_feat(data_samples[0]['inputs'], test_mode=True) + predictions = self.cls_head.predict(feats, + data_samples[0]['data_samples']) + return predictions + + def _forward(self, + inputs: torch.Tensor, + stage: str = 'backbone', + **kwargs) -> ForwardResults: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + stage (str): Which stage to output the features. + + Returns: + Union[tuple, torch.Tensor]: Features from ``backbone`` or ``head`` + forward. + """ + feats, _ = self.extract_feat(inputs, stage=stage) + return feats + + def _run_forward(self, data: Union[dict, tuple, list], + mode: str) -> Union[Dict[str, torch.Tensor], list]: + """Unpacks data for :meth:`forward` + Args: + data (dict or tuple or list): Data sampled from dataset. + mode (str): Mode of forward. + Returns: + dict or list: Results of training or testing mode. 
+ """ + if isinstance(data, dict): + data = [data] + results = self(*data, mode=mode) + elif isinstance(data, (list, tuple)): + results = self(*data, mode=mode) + else: + raise TypeError + return results + + def extract_feat(self, + inputs: torch.Tensor, + stage: str = 'backbone', + test_mode: bool = False) -> tuple: + """Extract features of different stages. + + Args: + inputs (torch.Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``'backbone'``. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + torch.Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. These keys are usually included: + ``loss_aux``. + """ + + if len(inputs.shape) == 6: + inputs = inputs.view((-1, ) + inputs.shape[2:]) + + # Check settings of test + if test_mode: + x = self.backbone(inputs) + return x + else: + # Return features extracted through backbone + x = self.backbone(inputs) + if stage == 'backbone': + return x + x = self.cls_head(x) + return x diff --git a/mmaction/models/roi_heads/__init__.py b/mmaction/models/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca01c608ed4a3473e46c35d5771d8367c61b078 --- /dev/null +++ b/mmaction/models/roi_heads/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. +try: + from mmdet.registry import MODELS as MMDET_MODELS + + from .bbox_heads import BBoxHeadAVA + from .roi_extractors import SingleRoIExtractor3D + from .roi_head import AVARoIHead + from .shared_heads import ACRNHead, FBOHead, LFBInferHead + + for module in [ + AVARoIHead, BBoxHeadAVA, SingleRoIExtractor3D, ACRNHead, FBOHead, + LFBInferHead + ]: + + MMDET_MODELS.register_module()(module) + + __all__ = [ + 'AVARoIHead', 'BBoxHeadAVA', 'SingleRoIExtractor3D', 'ACRNHead', + 'FBOHead', 'LFBInferHead' + ] + +except (ImportError, ModuleNotFoundError): + pass diff --git a/mmaction/models/roi_heads/bbox_heads/__init__.py b/mmaction/models/roi_heads/bbox_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52a7c4aebabfb90ec004b5caab1307c5c5ddea47 --- /dev/null +++ b/mmaction/models/roi_heads/bbox_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_head import BBoxHeadAVA + +__all__ = ['BBoxHeadAVA'] diff --git a/mmaction/models/roi_heads/bbox_heads/bbox_head.py b/mmaction/models/roi_heads/bbox_heads/bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..069e09ba2ece75210694cffef9a50c7cbc141890 --- /dev/null +++ b/mmaction/models/roi_heads/bbox_heads/bbox_head.py @@ -0,0 +1,415 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models.task_modules.samplers import SamplingResult +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +# Resolve cross-entropy function to support multi-target in Torch < 1.10 +# This is a very basic 'hack', with minimal functionality to support the +# procedure under prior torch versions +from packaging import version as pv +from torch import Tensor + +from mmaction.structures.bbox import bbox_target +from mmaction.utils import InstanceList + +if pv.parse(torch.__version__) < pv.parse('1.10'): + + def cross_entropy_loss(input, target, reduction='None'): + input = input.log_softmax(dim=-1) # Compute Log of Softmax + loss = -(input * target).sum(dim=-1) # Compute Loss manually + if reduction.lower() == 'mean': + return loss.mean() + elif reduction.lower() == 'sum': + return loss.sum() + else: + return loss +else: + cross_entropy_loss = F.cross_entropy + + +class BBoxHeadAVA(nn.Module): + """Simplest RoI head, with only one fc layer for classification. + + Args: + background_class (bool): Whether set class 0 as background class and + ignore it when calculate loss. + temporal_pool_type (str): The temporal pool type. Choices are ``avg`` + or ``max``. Defaults to ``avg``. + spatial_pool_type (str): The spatial pool type. Choices are ``avg`` or + ``max``. Defaults to ``max``. + in_channels (int): The number of input channels. Defaults to 2048. + focal_alpha (float): The hyper-parameter alpha for Focal Loss. + When ``alpha == 1`` and ``gamma == 0``, Focal Loss degenerates to + BCELossWithLogits. Defaults to 1. + focal_gamma (float): The hyper-parameter gamma for Focal Loss. + When ``alpha == 1`` and ``gamma == 0``, Focal Loss degenerates to + BCELossWithLogits. Defaults to 0. + num_classes (int): The number of classes. Defaults to 81. + dropout_ratio (float): A float in ``[0, 1]``, indicates the + dropout_ratio. Defaults to 0. + dropout_before_pool (bool): Dropout Feature before spatial temporal + pooling. Defaults to True. + topk (int or Tuple[int]): Parameter for evaluating Top-K accuracy. + Defaults to ``(3, 5)``. + multilabel (bool): Whether used for a multilabel task. + Defaults to True. + mlp_head (bool): Whether to use an MLP as the classification head. + Defaults to False, i.e., using a single linear head. 
+ """ + + def __init__( + self, + background_class: bool, + temporal_pool_type: str = 'avg', + spatial_pool_type: str = 'max', + in_channels: int = 2048, + focal_gamma: float = 0., + focal_alpha: float = 1., + num_classes: int = 81, # First class reserved (BBox as pos/neg) + dropout_ratio: float = 0, + dropout_before_pool: bool = True, + topk: Union[int, Tuple[int]] = (3, 5), + multilabel: bool = True, + mlp_head: bool = False) -> None: + super(BBoxHeadAVA, self).__init__() + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.temporal_pool_type = temporal_pool_type + self.spatial_pool_type = spatial_pool_type + + self.in_channels = in_channels + self.num_classes = num_classes + + self.dropout_ratio = dropout_ratio + self.dropout_before_pool = dropout_before_pool + + self.multilabel = multilabel + + self.focal_gamma = focal_gamma + self.focal_alpha = focal_alpha + + self.background_class = background_class + + if topk is None: + self.topk = () + elif isinstance(topk, int): + self.topk = (topk, ) + elif isinstance(topk, tuple): + assert all([isinstance(k, int) for k in topk]) + self.topk = topk + else: + raise TypeError('topk should be int or tuple[int], ' + f'but get {type(topk)}') + # Class 0 is ignored when calculating accuracy, + # so topk cannot be equal to num_classes. + assert all([k < num_classes for k in self.topk]) + + in_channels = self.in_channels + # Pool by default + if self.temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3d((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3d((1, None, None)) + if self.spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3d((None, 1, 1)) + + if dropout_ratio > 0: + self.dropout = nn.Dropout(dropout_ratio) + + if mlp_head: + self.fc_cls = nn.Sequential( + nn.Linear(in_channels, in_channels), nn.ReLU(), + nn.Linear(in_channels, num_classes)) + else: + self.fc_cls = nn.Linear(in_channels, num_classes) + + def init_weights(self) -> None: + """Initialize the classification head.""" + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + """Computes the classification logits given ROI features.""" + if self.dropout_before_pool and self.dropout_ratio > 0: + x = self.dropout(x) + + x = self.temporal_pool(x) + x = self.spatial_pool(x) + + if not self.dropout_before_pool and self.dropout_ratio > 0: + x = self.dropout(x) + + x = x.view(x.size(0), -1) + cls_score = self.fc_cls(x) + return cls_score + + @staticmethod + def get_targets(sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict) -> tuple: + pos_proposals = [res.pos_priors for res in sampling_results] + neg_proposals = [res.neg_priors for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_targets = bbox_target(pos_proposals, neg_proposals, pos_gt_labels, + rcnn_train_cfg) + return cls_targets + + @staticmethod + def get_recall_prec(pred_vec: Tensor, target_vec: Tensor) -> tuple: + """Computes the Recall/Precision for both multi-label and single label + scenarios. + + Note that the computation calculates the micro average. + + Note, that in both cases, the concept of correct/incorrect is the same. 
+ Args: + pred_vec (tensor[N x C]): each element is either 0 or 1 + target_vec (tensor[N x C]): each element is either 0 or 1 - for + single label it is expected that only one element is on (1) + although this is not enforced. + """ + correct = pred_vec & target_vec + recall = correct.sum(1) / target_vec.sum(1).float() # Enforce Float + prec = correct.sum(1) / (pred_vec.sum(1) + 1e-6) + return recall.mean(), prec.mean() + + @staticmethod + def topk_to_matrix(probs: Tensor, k: int) -> Tensor: + """Converts top-k to binary matrix.""" + topk_labels = probs.topk(k, 1, True, True)[1] + topk_matrix = probs.new_full(probs.size(), 0, dtype=torch.bool) + for i in range(probs.shape[0]): + topk_matrix[i, topk_labels[i]] = 1 + return topk_matrix + + def topk_accuracy(self, + pred: Tensor, + target: Tensor, + thr: float = 0.5) -> tuple: + """Computes the Top-K Accuracies for both single and multi-label + scenarios.""" + # Define Target vector: + target_bool = target > 0.5 + + # Branch on Multilabel for computing output classification + if self.multilabel: + pred = pred.sigmoid() + else: + pred = pred.softmax(dim=1) + + # Compute at threshold (K=1 for single) + if self.multilabel: + pred_bool = pred > thr + else: + pred_bool = self.topk_to_matrix(pred, 1) + recall_thr, prec_thr = self.get_recall_prec(pred_bool, target_bool) + + # Compute at various K + recalls_k, precs_k = [], [] + for k in self.topk: + pred_bool = self.topk_to_matrix(pred, k) + recall, prec = self.get_recall_prec(pred_bool, target_bool) + recalls_k.append(recall) + precs_k.append(prec) + + # Return all + return recall_thr, prec_thr, recalls_k, precs_k + + def loss_and_target(self, cls_score: Tensor, rois: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, **kwargs) -> dict: + """Calculate the loss based on the features extracted by the bbox head. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss components. + """ + cls_targets = self.get_targets(sampling_results, rcnn_train_cfg) + labels, _ = cls_targets + + losses = dict() + # Only use the cls_score + if cls_score is not None: + if self.background_class: + labels = labels[:, 1:] # Get valid labels (ignore first one) + cls_score = cls_score[:, 1:] + pos_inds = torch.sum(labels, dim=-1) > 0 + cls_score = cls_score[pos_inds] + labels = labels[pos_inds] + + # Compute First Recall/Precisions + # This has to be done first before normalising the label-space. + recall_thr, prec_thr, recall_k, prec_k = self.topk_accuracy( + cls_score, labels, thr=0.5) + losses['recall@thr=0.5'] = recall_thr + losses['prec@thr=0.5'] = prec_thr + for i, k in enumerate(self.topk): + losses[f'recall@top{k}'] = recall_k[i] + losses[f'prec@top{k}'] = prec_k[i] + + # If Single-label, need to ensure that target labels sum to 1: ie + # that they are valid probabilities. + if not self.multilabel and self.background_class: + labels = labels / labels.sum(dim=1, keepdim=True) + + # Select Loss function based on single/multi-label + # NB. 
Both losses auto-compute sigmoid/softmax on prediction + if self.multilabel: + loss_func = F.binary_cross_entropy_with_logits + else: + loss_func = cross_entropy_loss + + # Compute loss + loss = loss_func(cls_score, labels, reduction='none') + pt = torch.exp(-loss) + F_loss = self.focal_alpha * (1 - pt)**self.focal_gamma * loss + losses['loss_action_cls'] = torch.mean(F_loss) + + return dict(loss_bbox=losses, bbox_targets=cls_targets) + + def predict_by_feat(self, + rois: Tuple[Tensor], + cls_scores: Tuple[Tensor], + batch_img_metas: List[dict], + rcnn_test_cfg: Optional[ConfigDict] = None, + **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + rois (tuple[Tensor]): Tuple of boxes to be transformed. + Each has shape (num_boxes, 5). last dimension 5 arrange as + (batch_index, x1, y1, x2, y2). + cls_scores (tuple[Tensor]): Tuple of box scores, each has shape + (num_boxes, num_classes + 1). + bbox_preds (tuple[Tensor]): Tuple of box energies / deltas, each + has shape (num_boxes, num_classes * 4). + batch_img_metas (list[dict]): List of image information. + rcnn_test_cfg (obj:`ConfigDict`, optional): `test_cfg` of R-CNN. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + result_list = [] + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single( + roi=rois[img_id], + cls_score=cls_scores[img_id], + img_meta=img_meta, + rcnn_test_cfg=rcnn_test_cfg, + **kwargs) + result_list.append(results) + + return result_list + + def _predict_by_feat_single(self, + roi: Tensor, + cls_score: Tensor, + img_meta: dict, + rcnn_test_cfg: Optional[ConfigDict] = None, + **kwargs) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + + # might be used by testing w. 
augmentation + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + + # Handle Multi/Single Label + if cls_score is not None: + if self.multilabel: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(dim=-1) + else: + scores = None + + bboxes = roi[:, 1:] + assert bboxes.shape[-1] == 4 + + # First reverse the flip + img_h, img_w = img_meta['img_shape'] + if img_meta.get('flip', False): + bboxes_ = bboxes.clone() + bboxes_[:, 0] = img_w - 1 - bboxes[:, 2] + bboxes_[:, 2] = img_w - 1 - bboxes[:, 0] + bboxes = bboxes_ + + # Then normalize the bbox to [0, 1] + bboxes[:, 0::2] /= img_w + bboxes[:, 1::2] /= img_h + + def _bbox_crop_undo(bboxes, crop_quadruple): + decropped = bboxes.clone() + + if crop_quadruple is not None: + x1, y1, tw, th = crop_quadruple + decropped[:, 0::2] = bboxes[..., 0::2] * tw + x1 + decropped[:, 1::2] = bboxes[..., 1::2] * th + y1 + + return decropped + + crop_quadruple = img_meta.get('crop_quadruple', np.array([0, 0, 1, 1])) + bboxes = _bbox_crop_undo(bboxes, crop_quadruple) + + results.bboxes = bboxes + results.scores = scores + + return results diff --git a/mmaction/models/roi_heads/roi_extractors/__init__.py b/mmaction/models/roi_heads/roi_extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cf008d90b16c7298139bf9941c050f0d302fa900 --- /dev/null +++ b/mmaction/models/roi_heads/roi_extractors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .single_straight3d import SingleRoIExtractor3D + +__all__ = ['SingleRoIExtractor3D'] diff --git a/mmaction/models/roi_heads/roi_extractors/single_straight3d.py b/mmaction/models/roi_heads/roi_extractors/single_straight3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0d86b15bcdff8ae3a36a2b4fee8f9f9b40836da6 --- /dev/null +++ b/mmaction/models/roi_heads/roi_extractors/single_straight3d.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class SingleRoIExtractor3D(nn.Module): + """Extract RoI features from a single level feature map. + + Args: + roi_layer_type (str): Specify the RoI layer type. + Defaults to ``RoIAlign``. + featmap_stride (int): Strides of input feature maps. Defaults to 16. + output_size (int or tuple): Size or (Height, Width). Defaults to 16. + sampling_ratio (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + Defaults to 0. + pool_mode (str): pooling mode in each bin. Choices are ``avg`` or + ``max``. Defaults to ``avg``. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + Defaults to True. + with_temporal_pool (bool): if True, avgpool the temporal dim. + Defaults to True. + with_global (bool): if True, concatenate the RoI feature with global + feature. Defaults to False. + + Note that sampling_ratio, pool_mode, aligned only apply when roi_layer_type + is set as RoIAlign. 
+ """ + + def __init__(self, + roi_layer_type: str = 'RoIAlign', + featmap_stride: int = 16, + output_size: int = 16, + sampling_ratio: int = 0, + pool_mode: str = 'avg', + aligned: bool = True, + with_temporal_pool: bool = True, + temporal_pool_mode: str = 'avg', + with_global: bool = False) -> None: + super().__init__() + self.roi_layer_type = roi_layer_type + assert self.roi_layer_type in ['RoIPool', 'RoIAlign'] + self.featmap_stride = featmap_stride + self.spatial_scale = 1. / self.featmap_stride + + self.output_size = output_size + self.sampling_ratio = sampling_ratio + self.pool_mode = pool_mode + self.aligned = aligned + + self.with_temporal_pool = with_temporal_pool + self.temporal_pool_mode = temporal_pool_mode + + self.with_global = with_global + + try: + from mmcv.ops import RoIAlign, RoIPool + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `RoIAlign` and `RoIPool` from ' + '`mmcv.ops`. The two modules will be used in ' + '`SingleRoIExtractor3D`! ') + + if self.roi_layer_type == 'RoIPool': + self.roi_layer = RoIPool(self.output_size, self.spatial_scale) + else: + self.roi_layer = RoIAlign( + self.output_size, + self.spatial_scale, + sampling_ratio=self.sampling_ratio, + pool_mode=self.pool_mode, + aligned=self.aligned) + self.global_pool = nn.AdaptiveAvgPool2d(self.output_size) + + def forward(self, feat: Union[Tensor, Tuple[Tensor]], + rois: Tensor) -> tuple: + """Forward function for extract roi features. + + Args: + feat (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. The shape of feat is N, C, T, H, W. + rois (Tensor): Input RoIs, shape (k, 5). + + Returns: + tuple: A tuple of roi features and global features. + + - roi_feats (Tensor): Extracted bbox RoI features. + - feat (Tensor): Global features of the video clip. + """ + if not isinstance(feat, tuple): + feat = (feat, ) + + if len(feat) >= 2: + maxT = max([x.shape[2] for x in feat]) + max_shape = (maxT, ) + feat[0].shape[3:] + # resize each feat to the largest shape (w. nearest) + feat = [F.interpolate(x, max_shape).contiguous() for x in feat] + + if self.with_temporal_pool: + if self.temporal_pool_mode == 'avg': + feat = [torch.mean(x, 2, keepdim=True) for x in feat] + elif self.temporal_pool_mode == 'max': + feat = [torch.max(x, 2, keepdim=True)[0] for x in feat] + else: + raise NotImplementedError + + feat = torch.cat(feat, axis=1).contiguous() + + roi_feats = [] + for t in range(feat.size(2)): + frame_feat = feat[:, :, t].contiguous() + roi_feat = self.roi_layer(frame_feat, rois) + if self.with_global: + global_feat = self.global_pool(frame_feat.contiguous()) + inds = rois[:, 0].type(torch.int64) + global_feat = global_feat[inds] + roi_feat = torch.cat([roi_feat, global_feat], dim=1) + roi_feat = roi_feat.contiguous() + roi_feats.append(roi_feat) + + roi_feats = torch.stack(roi_feats, dim=2) + return roi_feats, feat diff --git a/mmaction/models/roi_heads/roi_head.py b/mmaction/models/roi_heads/roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..15fc61e7aee951c614c6907feb33c300f0d80c86 --- /dev/null +++ b/mmaction/models/roi_heads/roi_head.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
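As a quick sanity check of the `SingleRoIExtractor3D` defined above, a minimal shape sketch (illustrative only, not part of this patch; it assumes a compiled `mmcv.ops` installation, and the tensor sizes and RoI coordinates are made up):

import torch

from mmaction.models.roi_heads.roi_extractors import SingleRoIExtractor3D

extractor = SingleRoIExtractor3D(output_size=8, with_temporal_pool=True)
feat = torch.rand(1, 256, 4, 14, 14)  # backbone output, (N, C, T, H, W)
# One RoI in input-image coordinates: (batch_index, x1, y1, x2, y2)
rois = torch.tensor([[0., 16., 16., 112., 160.]])
roi_feats, global_feat = extractor(feat, rois)
# Temporal avg pooling collapses T to 1; RoIAlign yields output_size x output_size.
print(roi_feats.shape)    # torch.Size([1, 256, 1, 8, 8])
print(global_feat.shape)  # torch.Size([1, 256, 1, 14, 14])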
+from typing import List, Tuple, Union + +from mmdet.models.roi_heads import StandardRoIHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.structures.bbox import bbox2roi +from torch import Tensor + +from mmaction.utils import ConfigType, InstanceList, SampleList + + +class AVARoIHead(StandardRoIHead): + + def loss(self, x: Union[Tensor, + Tuple[Tensor]], rpn_results_list: InstanceList, + data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + rpn_results_list (List[:obj:`InstanceData`]): List of region + proposals. + data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + Dict[str, Tensor]: A dictionary of loss components. + """ + assert len(rpn_results_list) == len(data_samples) + batch_gt_instances = [] + for data_sample in data_samples: + batch_gt_instances.append(data_sample.gt_instances) + + # assign gts and sample proposals + num_imgs = len(data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign(rpn_results, + batch_gt_instances[i], + None) + sampling_result = self.bbox_sampler.sample(assign_result, + rpn_results, + batch_gt_instances[i]) + sampling_results.append(sampling_result) + + # LFB needs meta_info: 'img_key' + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + + losses = dict() + # bbox head forward and loss + bbox_results = self.bbox_loss(x, sampling_results, batch_img_metas) + losses.update(bbox_results['loss_bbox']) + + return losses + + def _bbox_forward(self, x: Union[Tensor, Tuple[Tensor]], rois: Tensor, + batch_img_metas: List[dict], **kwargs) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + batch_img_metas (List[dict]): List of image information. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_feats, global_feat = self.bbox_roi_extractor(x, rois) + + if self.with_shared_head: + bbox_feats = self.shared_head( + bbox_feats, + feat=global_feat, + rois=rois, + img_metas=batch_img_metas) + + cls_score = self.bbox_head(bbox_feats) + + bbox_results = dict(cls_score=cls_score, bbox_feats=bbox_feats) + return bbox_results + + def bbox_loss(self, x: Union[Tensor, Tuple[Tensor]], + sampling_results: List[SamplingResult], + batch_img_metas: List[dict], **kwargs) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + sampling_results (List[SamplingResult]): Sampling results. + batch_img_metas (List[dict]): List of image information. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. 
+ - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois, batch_img_metas) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + return bbox_results + + def predict(self, x: Union[Tensor, + Tuple[Tensor]], rpn_results_list: InstanceList, + data_samples: SampleList, **kwargs) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + rpn_results_list (List[:obj:`InstanceData`]): list of region + proposals. + data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + List[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + """ + assert self.with_bbox, 'Bbox head must be implemented.' + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + if isinstance(x, tuple): + x_shape = x[0].shape + else: + x_shape = x.shape + + assert x_shape[0] == 1, 'only accept 1 sample at test mode' + assert x_shape[0] == len(batch_img_metas) == len(rpn_results_list) + + results_list = self.predict_bbox( + x, batch_img_metas, rpn_results_list, rcnn_test_cfg=self.test_cfg) + + return results_list + + def predict_bbox(self, x: Tuple[Tensor], batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following + keys: + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + """ + proposals = [res.bboxes for res in rpn_results_list] + rois = bbox2roi(proposals) + bbox_results = self._bbox_forward(x, rois, batch_img_metas) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + + result_list = self.bbox_head.predict_by_feat( + rois=rois, + cls_scores=cls_scores, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg) + + return result_list diff --git a/mmaction/models/roi_heads/shared_heads/__init__.py b/mmaction/models/roi_heads/shared_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a6869196d9556b5f3bd6c1db9f3deb60ddbc5504 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from .acrn_head import ACRNHead +from .fbo_head import FBOHead +from .lfb_infer_head import LFBInferHead + +__all__ = ['ACRNHead', 'LFBInferHead', 'FBOHead'] diff --git a/mmaction/models/roi_heads/shared_heads/acrn_head.py b/mmaction/models/roi_heads/shared_heads/acrn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..939a3af78eb167f49f9e494b650fd14ab2255f39 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/acrn_head.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +# Note: All these heads take 5D Tensors as input (N, C, T, H, W) + + +class ACRNHead(nn.Module): + """ACRN Head: Tile + 1x1 convolution + 3x3 convolution. + + This module is proposed in + `Actor-Centric Relation Network + `_ + + Args: + in_channels (int): The input channel. + out_channels (int): The output channel. + stride (int): The spatial stride. + num_convs (int): The number of 3x3 convolutions in ACRNHead. + conv_cfg (dict): Config for norm layers. Default: dict(type='Conv'). + norm_cfg (dict): + Config for norm layers. required keys are `type` and + `requires_grad`. Default: dict(type='BN2d', requires_grad=True). + act_cfg (dict): Config for activate layers. + Default: dict(type='ReLU', inplace=True). + kwargs (dict): Other new arguments, to be compatible with MMDet update. + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + num_convs=1, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + **kwargs): + + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + self.num_convs = num_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.max_pool = nn.AdaptiveMaxPool3d(1) + + self.conv1 = ConvModule( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + assert num_convs >= 1 + self.conv2 = ConvModule( + out_channels, + out_channels, + kernel_size=(1, 3, 3), + stride=(1, stride, stride), + padding=(0, 1, 1), + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + convs = [] + for _ in range(num_convs - 1): + conv = ConvModule( + out_channels, + out_channels, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + convs.append(conv) + self.convs = nn.ModuleList(convs) + + def init_weights(self, **kwargs): + """Weight Initialization for ACRNHead.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + def forward(self, x, feat, rois, **kwargs): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The extracted RoI feature. + feat (torch.Tensor): The context feature. + rois (torch.Tensor): The regions of interest. + + Returns: + torch.Tensor: The RoI features that have interacted with context + feature. 
+        """
+        # We use max pooling by default
+        x = self.max_pool(x)
+
+        h, w = feat.shape[-2:]
+        x_tile = x.repeat(1, 1, 1, h, w)
+
+        roi_inds = rois[:, 0].type(torch.long)
+        roi_gfeat = feat[roi_inds]
+
+        new_feat = torch.cat([x_tile, roi_gfeat], dim=1)
+        new_feat = self.conv1(new_feat)
+        new_feat = self.conv2(new_feat)
+
+        for conv in self.convs:
+            new_feat = conv(new_feat)
+
+        return new_feat
diff --git a/mmaction/models/roi_heads/shared_heads/fbo_head.py b/mmaction/models/roi_heads/shared_heads/fbo_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6199701288a1c0462cc8dbe3d05130dfce8465c
--- /dev/null
+++ b/mmaction/models/roi_heads/shared_heads/fbo_head.py
@@ -0,0 +1,397 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.logging import MMLogger
+from mmengine.model.weight_init import constant_init, kaiming_init
+from mmengine.runner import load_checkpoint
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
+
+from .lfb import LFB
+
+
+class NonLocalLayer(nn.Module):
+    """Non-local layer used in `FBONonLocal`, a variation of the vanilla
+    non-local block.
+
+    Args:
+        st_feat_channels (int): Channels of short-term features.
+        lt_feat_channels (int): Channels of long-term features.
+        latent_channels (int): Channels of latent features.
+        num_st_feat (int): Number of short-term RoI features.
+        num_lt_feat (int): Number of long-term RoI features.
+        use_scale (bool): Whether to scale pairwise_weight by
+            `1/sqrt(latent_channels)`. Default: True.
+        pre_activate (bool): Whether to use the activation function before
+            upsampling. Default: True.
+        pre_activate_with_ln (bool): Whether to apply LayerNorm before the
+            activation when `pre_activate` is True. Default: True.
+        conv_cfg (Dict | None): The config dict for convolution layers. If
+            not specified, it will use `nn.Conv3d` for convolution layers.
+            Default: None.
+        norm_cfg (Dict | None): The config dict for normalization layers.
+            Default: None.
+        dropout_ratio (float, optional): Probability of dropout layer.
+            Default: 0.2.
+        zero_init_out_conv (bool): Whether to use zero initialization for
+            out_conv. Default: False.
+ """ + + def __init__(self, + st_feat_channels, + lt_feat_channels, + latent_channels, + num_st_feat, + num_lt_feat, + use_scale=True, + pre_activate=True, + pre_activate_with_ln=True, + conv_cfg=None, + norm_cfg=None, + dropout_ratio=0.2, + zero_init_out_conv=False): + super().__init__() + if conv_cfg is None: + conv_cfg = dict(type='Conv3d') + self.st_feat_channels = st_feat_channels + self.lt_feat_channels = lt_feat_channels + self.latent_channels = latent_channels + self.num_st_feat = num_st_feat + self.num_lt_feat = num_lt_feat + self.use_scale = use_scale + self.pre_activate = pre_activate + self.pre_activate_with_ln = pre_activate_with_ln + self.dropout_ratio = dropout_ratio + self.zero_init_out_conv = zero_init_out_conv + + self.st_feat_conv = ConvModule( + self.st_feat_channels, + self.latent_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.lt_feat_conv = ConvModule( + self.lt_feat_channels, + self.latent_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.global_conv = ConvModule( + self.lt_feat_channels, + self.latent_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if pre_activate: + self.ln = nn.LayerNorm([latent_channels, num_st_feat, 1, 1]) + else: + self.ln = nn.LayerNorm([st_feat_channels, num_st_feat, 1, 1]) + + self.relu = nn.ReLU() + + self.out_conv = ConvModule( + self.latent_channels, + self.st_feat_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if self.dropout_ratio > 0: + self.dropout = nn.Dropout(self.dropout_ratio) + + def init_weights(self, pretrained=None): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {pretrained}') + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + if self.zero_init_out_conv: + constant_init(self.out_conv, 0, bias=0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, st_feat, lt_feat): + """Defines the computation performed at every call.""" + n, c = st_feat.size(0), self.latent_channels + num_st_feat, num_lt_feat = self.num_st_feat, self.num_lt_feat + + theta = self.st_feat_conv(st_feat) + theta = theta.view(n, c, num_st_feat) + + phi = self.lt_feat_conv(lt_feat) + phi = phi.view(n, c, num_lt_feat) + + g = self.global_conv(lt_feat) + g = g.view(n, c, num_lt_feat) + + # (n, num_st_feat, c), (n, c, num_lt_feat) + # -> (n, num_st_feat, num_lt_feat) + theta_phi = torch.matmul(theta.permute(0, 2, 1), phi) + if self.use_scale: + theta_phi /= c**0.5 + + p = theta_phi.softmax(dim=-1) + + # (n, c, num_lt_feat), (n, num_lt_feat, num_st_feat) + # -> (n, c, num_st_feat, 1, 1) + out = torch.matmul(g, p.permute(0, 2, 1)).view(n, c, num_st_feat, 1, 1) + + # If need to activate it before out_conv, use relu here, otherwise + # use relu outside the non local layer. + if self.pre_activate: + if self.pre_activate_with_ln: + out = self.ln(out) + out = self.relu(out) + + out = self.out_conv(out) + + if not self.pre_activate: + out = self.ln(out) + if self.dropout_ratio > 0: + out = self.dropout(out) + + return out + + +class FBONonLocal(nn.Module): + """Non local feature bank operator. 
+ + Args: + st_feat_channels (int): Channels of short-term features. + lt_feat_channels (int): Channels of long-term features. + latent_channels (int): Channels of latent features. + num_st_feat (int): Number of short-term roi features. + num_lt_feat (int): Number of long-term roi features. + num_non_local_layers (int): Number of non-local layers, which is + at least 1. Default: 2. + st_feat_dropout_ratio (float): Probability of dropout layer for + short-term features. Default: 0.2. + lt_feat_dropout_ratio (float): Probability of dropout layer for + long-term features. Default: 0.2. + pre_activate (bool): Whether to use the activation function before + upsampling in non local layers. Default: True. + zero_init_out_conv (bool): Whether to use zero initialization for + out_conv in NonLocalLayer. Default: False. + """ + + def __init__(self, + st_feat_channels, + lt_feat_channels, + latent_channels, + num_st_feat, + num_lt_feat, + num_non_local_layers=2, + st_feat_dropout_ratio=0.2, + lt_feat_dropout_ratio=0.2, + pre_activate=True, + zero_init_out_conv=False, + **kwargs): + super().__init__() + assert num_non_local_layers >= 1, ( + 'At least one non_local_layer is needed.') + self.st_feat_channels = st_feat_channels + self.lt_feat_channels = lt_feat_channels + self.latent_channels = latent_channels + self.num_st_feat = num_st_feat + self.num_lt_feat = num_lt_feat + self.num_non_local_layers = num_non_local_layers + self.st_feat_dropout_ratio = st_feat_dropout_ratio + self.lt_feat_dropout_ratio = lt_feat_dropout_ratio + self.pre_activate = pre_activate + self.zero_init_out_conv = zero_init_out_conv + + self.st_feat_conv = nn.Conv3d( + st_feat_channels, latent_channels, kernel_size=1) + self.lt_feat_conv = nn.Conv3d( + lt_feat_channels, latent_channels, kernel_size=1) + + if self.st_feat_dropout_ratio > 0: + self.st_feat_dropout = nn.Dropout(self.st_feat_dropout_ratio) + + if self.lt_feat_dropout_ratio > 0: + self.lt_feat_dropout = nn.Dropout(self.lt_feat_dropout_ratio) + + if not self.pre_activate: + self.relu = nn.ReLU() + + self.non_local_layers = [] + for idx in range(self.num_non_local_layers): + layer_name = f'non_local_layer_{idx + 1}' + self.add_module( + layer_name, + NonLocalLayer( + latent_channels, + latent_channels, + latent_channels, + num_st_feat, + num_lt_feat, + pre_activate=self.pre_activate, + zero_init_out_conv=self.zero_init_out_conv)) + self.non_local_layers.append(layer_name) + + def init_weights(self, pretrained=None): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(pretrained, str): + logger = MMLogger.get_current_instance() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + kaiming_init(self.st_feat_conv) + kaiming_init(self.lt_feat_conv) + for layer_name in self.non_local_layers: + non_local_layer = getattr(self, layer_name) + non_local_layer.init_weights(pretrained=pretrained) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, st_feat, lt_feat): + """Defines the computation performed at every call.""" + # prepare st_feat + st_feat = self.st_feat_conv(st_feat) + if self.st_feat_dropout_ratio > 0: + st_feat = self.st_feat_dropout(st_feat) + + # prepare lt_feat + lt_feat = self.lt_feat_conv(lt_feat) + if self.lt_feat_dropout_ratio > 0: + lt_feat = self.lt_feat_dropout(lt_feat) + + # fuse short-term and long-term features in NonLocal Layer + for layer_name in self.non_local_layers: + identity = st_feat + non_local_layer = getattr(self, layer_name) 
+ nl_out = non_local_layer(st_feat, lt_feat) + nl_out = identity + nl_out + if not self.pre_activate: + nl_out = self.relu(nl_out) + st_feat = nl_out + + return nl_out + + +class FBOAvg(nn.Module): + """Avg pool feature bank operator.""" + + def __init__(self, **kwargs): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool3d((1, None, None)) + + def init_weights(self, pretrained=None): + # FBOAvg has no parameters to be initialized. + pass + + def forward(self, st_feat, lt_feat): + out = self.avg_pool(lt_feat) + return out + + +class FBOMax(nn.Module): + """Max pool feature bank operator.""" + + def __init__(self, **kwargs): + super().__init__() + self.max_pool = nn.AdaptiveMaxPool3d((1, None, None)) + + def init_weights(self, pretrained=None): + """FBOMax has no parameters to be initialized.""" + pass + + def forward(self, st_feat, lt_feat): + """Defines the computation performed at every call.""" + out = self.max_pool(lt_feat) + return out + + +class FBOHead(nn.Module): + """Feature Bank Operator Head. + + Add feature bank operator for the spatiotemporal detection model to fuse + short-term features and long-term features. + Args: + lfb_cfg (Dict): The config dict for LFB which is used to sample + long-term features. + fbo_cfg (Dict): The config dict for feature bank operator (FBO). The + type of fbo is also in the config dict and supported fbo type is + `fbo_dict`. + temporal_pool_type (str): The temporal pool type. Choices are 'avg' or + 'max'. Default: 'avg'. + spatial_pool_type (str): The spatial pool type. Choices are 'avg' or + 'max'. Default: 'max'. + """ + + fbo_dict = {'non_local': FBONonLocal, 'avg': FBOAvg, 'max': FBOMax} + + def __init__(self, + lfb_cfg, + fbo_cfg, + temporal_pool_type='avg', + spatial_pool_type='max'): + super().__init__() + fbo_type = fbo_cfg.pop('type', 'non_local') + assert fbo_type in FBOHead.fbo_dict + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + + self.lfb_cfg = copy.deepcopy(lfb_cfg) + self.fbo_cfg = copy.deepcopy(fbo_cfg) + + self.lfb = LFB(**self.lfb_cfg) + self.fbo = self.fbo_dict[fbo_type](**self.fbo_cfg) + + # Pool by default + if temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3d((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3d((1, None, None)) + if spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3d((None, 1, 1)) + + def init_weights(self, pretrained=None): + """Initialize the weights in the module. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Default: None. 
+ """ + self.fbo.init_weights(pretrained=pretrained) + + def sample_lfb(self, rois, img_metas): + """Sample long-term features for each ROI feature.""" + inds = rois[:, 0].type(torch.int64) + lt_feat_list = [] + for ind in inds: + lt_feat_list.append(self.lfb[img_metas[ind]['img_key']]) + lt_feat = torch.stack(lt_feat_list, dim=0) + # [N, lfb_channels, window_size * max_num_feat_per_step] + lt_feat = lt_feat.permute(0, 2, 1).contiguous() + return lt_feat.unsqueeze(-1).unsqueeze(-1) + + def forward(self, x, rois, img_metas, **kwargs): + """Defines the computation performed at every call.""" + # [N, C, 1, 1, 1] + st_feat = self.temporal_pool(x) + st_feat = self.spatial_pool(st_feat) + identity = st_feat + + # [N, C, window_size * num_feat_per_step, 1, 1] + lt_feat = self.sample_lfb(rois, img_metas).to(st_feat.device) + + fbo_feat = self.fbo(st_feat, lt_feat) + + out = torch.cat([identity, fbo_feat], dim=1) + return out diff --git a/mmaction/models/roi_heads/shared_heads/lfb.py b/mmaction/models/roi_heads/shared_heads/lfb.py new file mode 100644 index 0000000000000000000000000000000000000000..1fcb406f91fd1e138ad343f2733349ca3a2eb5c7 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/lfb.py @@ -0,0 +1,194 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os +import os.path as osp +import warnings + +import torch +import torch.distributed as dist +from mmengine.dist import get_dist_info + +try: + import lmdb + lmdb_imported = True +except (ImportError, ModuleNotFoundError): + lmdb_imported = False + + +class LFB: + """Long-Term Feature Bank (LFB). LFB is proposed in `Long-Term Feature + Banks for Detailed Video Understanding `_ + The ROI features of videos are stored in the feature bank. The feature bank + was generated by inferring with a lfb infer config. Formally, LFB is a Dict + whose keys are video IDs and its values are also Dicts whose keys are + timestamps in seconds. Example of LFB: + + .. code-block:: Python + { + '0f39OWEqJ24': { + 901: tensor([[ 1.2760, 1.1965, ..., 0.0061, -0.0639], + [-0.6320, 0.3794, ..., -1.2768, 0.5684], + [ 0.2535, 1.0049, ..., 0.4906, 1.2555], + [-0.5838, 0.8549, ..., -2.1736, 0.4162]]), + ... + 1705: tensor([[-1.0169, -1.1293, ..., 0.6793, -2.0540], + [ 1.2436, -0.4555, ..., 0.2281, -0.8219], + [ 0.2815, -0.0547, ..., -0.4199, 0.5157]]), + ... + }, + 'xmqSaQPzL1E': { + ... + }, + ... + } + Args: + lfb_prefix_path (str): The storage path of lfb. + max_num_sampled_feat (int): The max number of sampled features. + Default: 5. + window_size (int): Window size of sampling long term feature. + Default: 60. + lfb_channels (int): Number of the channels of the features stored + in LFB. Default: 2048. + dataset_modes (tuple[str] | str): Load LFB of datasets with different + modes, such as training, validation, testing datasets. If you don't + do cross validation during training, just load the training dataset + i.e. setting `dataset_modes = ('train')`. + Default: ('train', 'val'). + device (str): Where to load lfb. Choices are 'gpu', 'cpu' and 'lmdb'. + A 1.65GB half-precision ava lfb (including training and validation) + occupies about 2GB GPU memory. Default: 'gpu'. + lmdb_map_size (int): Map size of lmdb. Default: 4e9. + construct_lmdb (bool): Whether to construct lmdb. If you have + constructed lmdb of lfb, you can set to False to skip the + construction. Default: True. 
+ """ + + def __init__(self, + lfb_prefix_path, + max_num_sampled_feat=5, + window_size=60, + lfb_channels=2048, + dataset_modes=('train', 'val'), + device='gpu', + lmdb_map_size=4e9, + construct_lmdb=True): + if not osp.exists(lfb_prefix_path): + raise ValueError( + f'lfb prefix path {lfb_prefix_path} does not exist!') + self.lfb_prefix_path = lfb_prefix_path + self.max_num_sampled_feat = max_num_sampled_feat + self.window_size = window_size + self.lfb_channels = lfb_channels + if not isinstance(dataset_modes, tuple): + assert isinstance(dataset_modes, str) + dataset_modes = (dataset_modes, ) + self.dataset_modes = dataset_modes + self.device = device + + rank, world_size = get_dist_info() + # Loading LFB + if self.device == 'gpu': + if 'LOCAL_RANK' in os.environ: + local_rank = int(os.environ['LOCAL_RANK']) + else: + gpus_per_node = torch.cuda.device_count() + local_rank = rank % gpus_per_node + + self.load_lfb(f'cuda:{local_rank}') + elif self.device == 'cpu': + if world_size > 1: + warnings.warn( + 'If distributed training is used with multi-GPUs, lfb ' + 'will be loaded multiple times on RAM. In this case, ' + "'lmdb' is recommended.", UserWarning) + self.load_lfb('cpu') + elif self.device == 'lmdb': + assert lmdb_imported, ( + 'Please install `lmdb` to load lfb on lmdb!') + self.lmdb_map_size = lmdb_map_size + self.construct_lmdb = construct_lmdb + self.lfb_lmdb_path = osp.normpath( + osp.join(self.lfb_prefix_path, 'lmdb')) + + if rank == 0 and self.construct_lmdb: + print('Constructing LFB lmdb...') + self.load_lfb_on_lmdb() + + # Synchronizes all processes to make sure lfb lmdb exist. + if world_size > 1: + dist.barrier() + self.lmdb_env = lmdb.open(self.lfb_lmdb_path, readonly=True) + else: + raise ValueError("Device must be 'gpu', 'cpu' or 'lmdb', ", + f'but get {self.device}.') + + def load_lfb(self, map_location): + self.lfb = {} + for dataset_mode in self.dataset_modes: + lfb_path = osp.normpath( + osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl')) + print(f'Loading LFB from {lfb_path}...') + self.lfb.update(torch.load(lfb_path, map_location=map_location)) + + for video_id in self.lfb: + video_features = self.lfb[video_id] + for sec in video_features: + if isinstance(video_features[sec], (list, tuple)): + video_features[sec] = torch.stack(video_features[sec]) + self.lfb[video_id] = video_features + print(f'LFB has been loaded on {map_location}.') + + def load_lfb_on_lmdb(self): + lfb = {} + for dataset_mode in self.dataset_modes: + lfb_path = osp.normpath( + osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl')) + lfb.update(torch.load(lfb_path, map_location='cpu')) + + lmdb_env = lmdb.open(self.lfb_lmdb_path, map_size=self.lmdb_map_size) + for key, value in lfb.items(): + txn = lmdb_env.begin(write=True) + buff = io.BytesIO() + torch.save(value, buff) + buff.seek(0) + txn.put(key.encode(), buff.read()) + txn.commit() + buff.close() + + print(f'LFB lmdb has been constructed on {self.lfb_lmdb_path}!') + + def sample_long_term_features(self, video_id, timestamp): + if self.device == 'lmdb': + with self.lmdb_env.begin(write=False) as txn: + buf = txn.get(video_id.encode()) + video_features = torch.load(io.BytesIO(buf)) + else: + video_features = self.lfb[video_id] + + # Sample long term features. 
+ window_size, K = self.window_size, self.max_num_sampled_feat + start = timestamp - (window_size // 2) + lt_feats = torch.zeros(window_size, K, self.lfb_channels) + + for idx, sec in enumerate(range(start, start + window_size)): + if sec in video_features: + # `num_feat` is the number of roi features in this second. + feat = video_features[sec] + num_feat = feat.shape[0] + + # Sample some roi features randomly. + random_lfb_indices = torch.randperm(num_feat)[:K] + lt_feats[idx, :num_feat] = feat[random_lfb_indices] + + # [window_size * max_num_sampled_feat, lfb_channels] + return lt_feats.reshape(-1, self.lfb_channels) + + def __getitem__(self, img_key): + """Sample long term features like `lfb['0f39OWEqJ24,0902']` where `lfb` + is a instance of class LFB.""" + video_id, timestamp = img_key.split(',') + return self.sample_long_term_features(video_id, int(timestamp)) + + def __len__(self): + """The number of videos whose ROI features are stored in LFB.""" + return len(self.lfb) diff --git a/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py b/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6801e455b1a43800fd643fc9ca9476d472adc756 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine +import torch +import torch.distributed as dist +import torch.nn as nn + +# Note: All these heads take 5D Tensors as input (N, C, T, H, W) + + +class LFBInferHead(nn.Module): + """Long-Term Feature Bank Infer Head. + + This head is used to derive and save the LFB without affecting the input. + Args: + lfb_prefix_path (str): The prefix path to store the lfb. + dataset_mode (str, optional): Which dataset to be inferred. Choices are + 'train', 'val' or 'test'. Default: 'train'. + use_half_precision (bool, optional): Whether to store the + half-precision roi features. Default: True. + temporal_pool_type (str): The temporal pool type. Choices are 'avg' or + 'max'. Default: 'avg'. + spatial_pool_type (str): The spatial pool type. Choices are 'avg' or + 'max'. Default: 'max'. + """ + + def __init__(self, + lfb_prefix_path, + dataset_mode='train', + use_half_precision=True, + temporal_pool_type='avg', + spatial_pool_type='max'): + super().__init__() + rank, _ = mmengine.dist.get_dist_info() + if rank == 0: + if not osp.exists(lfb_prefix_path): + print(f'lfb prefix path {lfb_prefix_path} does not exist. ' + f'Creating the folder...') + mmengine.mkdir_or_exist(lfb_prefix_path) + print('\nInferring LFB...') + + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.lfb_prefix_path = lfb_prefix_path + self.dataset_mode = dataset_mode + self.use_half_precision = use_half_precision + + # Pool by default + if temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3d((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3d((1, None, None)) + if spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3d((None, 1, 1)) + + self.all_features = [] + self.all_metadata = [] + + def init_weights(self, pretrained=None): + """LFBInferHead has no parameters to be initialized.""" + pass + + def forward(self, x, rois, img_metas, **kwargs): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The extracted RoI feature. 
+ rois (torch.Tensor): The regions of interest. + img_metas (List[dict]): The meta information of the data. + + Returns: + torch.Tensor: The RoI features that have interacted with context + """ + # [N, C, 1, 1, 1] + features = self.temporal_pool(x) + features = self.spatial_pool(features) + if self.use_half_precision: + features = features.half() + + inds = rois[:, 0].type(torch.int64) + for ind in inds: + self.all_metadata.append(img_metas[ind]['img_key']) + self.all_features += list(features) + + # Return the input directly and doesn't affect the input. + return x + + def __del__(self): + assert len(self.all_features) == len(self.all_metadata), ( + 'features and metadata are not equal in length!') + + rank, world_size = mmengine.dist.get_dist_info() + if world_size > 1: + dist.barrier() + + _lfb = {} + for feature, metadata in zip(self.all_features, self.all_metadata): + video_id, timestamp = metadata.split(',') + timestamp = int(timestamp) + + if video_id not in _lfb: + _lfb[video_id] = {} + if timestamp not in _lfb[video_id]: + _lfb[video_id][timestamp] = [] + + _lfb[video_id][timestamp].append(torch.squeeze(feature)) + + _lfb_file_path = osp.normpath( + osp.join(self.lfb_prefix_path, + f'_lfb_{self.dataset_mode}_{rank}.pkl')) + torch.save(_lfb, _lfb_file_path) + print(f'{len(self.all_features)} features from {len(_lfb)} videos ' + f'on GPU {rank} have been stored in {_lfb_file_path}.') + + # Synchronizes all processes to make sure all gpus have stored their + # roi features + if world_size > 1: + dist.barrier() + if rank > 0: + return + + print('Gathering all the roi features...') + + lfb = {} + for rank_id in range(world_size): + _lfb_file_path = osp.normpath( + osp.join(self.lfb_prefix_path, + f'_lfb_{self.dataset_mode}_{rank_id}.pkl')) + + # Since each frame will only be distributed to one GPU, + # the roi features on the same timestamp of the same video are all + # on the same GPU + _lfb = torch.load(_lfb_file_path) + for video_id in _lfb: + if video_id not in lfb: + lfb[video_id] = _lfb[video_id] + else: + lfb[video_id].update(_lfb[video_id]) + + osp.os.remove(_lfb_file_path) + + lfb_file_path = osp.normpath( + osp.join(self.lfb_prefix_path, f'lfb_{self.dataset_mode}.pkl')) + torch.save(lfb, lfb_file_path) + print(f'LFB has been constructed in {lfb_file_path}!') diff --git a/mmaction/models/similarity/__init__.py b/mmaction/models/similarity/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2c15d0fd8130d01d0258b34a396463f336e7be49 --- /dev/null +++ b/mmaction/models/similarity/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .adapters import SimpleMeanAdapter, TransformerAdapter +from .clip_similarity import CLIPSimilarity + +__all__ = ['CLIPSimilarity', 'TransformerAdapter', 'SimpleMeanAdapter'] diff --git a/mmaction/models/similarity/adapters.py b/mmaction/models/similarity/adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..d0cba26c41883d8e56d9c3d6cf288a493bcb0048 --- /dev/null +++ b/mmaction/models/similarity/adapters.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
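To make the `LFB.__getitem__` sampling contract above concrete, a small sketch (illustrative only, not part of this patch; the prefix path and img_key are placeholders and the inferred pickle files are assumed to exist):

from mmaction.models.roi_heads.shared_heads.lfb import LFB

# Load a previously inferred feature bank on CPU (one pickle per dataset mode).
lfb = LFB('/path/to/lfb_prefix', dataset_modes=('val', ), device='cpu')

# Keys follow the AVA img_key convention: '<video_id>,<timestamp>'.
lt_feat = lfb['0f39OWEqJ24,0902']
# With the defaults (window_size=60, max_num_sampled_feat=5, lfb_channels=2048)
# this is a (60 * 5, 2048) = (300, 2048) tensor; seconds without stored RoI
# features contribute all-zero rows.
print(lt_feat.shape)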
+from collections import OrderedDict
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmaction.registry import MODELS
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Defines the computation performed at every call."""
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+    """Quick GELU activation: ``x * sigmoid(1.702 * x)``."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Perform quick gelu."""
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+    """ResidualAttentionBlock.
+
+    Args:
+        d_model (int): The dimension of the model.
+        n_head (int): The number of heads.
+        attn_mask (torch.Tensor, optional): The attention mask.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: Optional[torch.Tensor] = None) -> None:
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+    def attention(self, x: torch.Tensor) -> torch.Tensor:
+        """Perform attention."""
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Defines the computation performed at every call."""
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class Transformer(nn.Module):
+    """Transformer: a stack of ``ResidualAttentionBlock``.
+
+    Args:
+        width (int): The width of transformer.
+        heads (int): The number of heads of transformer.
+        layers (int): The number of layers of transformer.
+        attn_mask (torch.Tensor, optional): The attention mask.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 attn_mask: Optional[torch.Tensor] = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[
+            ResidualAttentionBlock(width, heads, attn_mask)
+            for _ in range(layers)
+        ])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Defines the computation performed at every call."""
+        return self.resblocks(x)
+
+
+@MODELS.register_module()
+class TransformerAdapter(BaseModule):
+    """Transformer adapter, modified from github.com/openai/CLIP.
+
+    Args:
+        num_segs (int): The number of segments.
+        transformer_width (int): The width of transformer.
+        transformer_heads (int): The number of heads of transformer.
+        transformer_layers (int): The number of layers of transformer.
+ """ + + def __init__(self, num_segs: int, transformer_width: int, + transformer_heads: int, transformer_layers: int) -> None: + super(TransformerAdapter, self).__init__() + self.num_segs = num_segs + + self.positional_embedding = nn.Parameter( + torch.empty(num_segs, transformer_width)) + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads) + + def init_weights(self) -> None: + """Initialize the weights.""" + + nn.init.normal_(self.positional_embedding, std=0.01) + + proj_std = (self.transformer.width**-0.5) * ( + (2 * self.transformer.layers)**-0.5) + attn_std = self.transformer.width**-0.5 + fc_std = (2 * self.transformer.width)**-0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + b, seq_length, c = x.size() + + x_original = x + x = x + self.positional_embedding + x = x.transpose(0, 1) # NLD -> LND + x = self.transformer(x) + x = x.transpose(0, 1) # LND -> NLD + x = x.type(x_original.dtype) + x_original + return x.mean(dim=1) + + +@MODELS.register_module() +class SimpleMeanAdapter(BaseModule): + """Average features adapter. + + Args: + dim (int): The dimension to perform averaging. Defaults to 1. + """ + + def __init__(self, dim: Union[int, Tuple[int]] = 1) -> None: + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + return x.mean(dim=self.dim) diff --git a/mmaction/models/similarity/clip_similarity.py b/mmaction/models/similarity/clip_similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..c7427c6d30f6ea36b2836f3ec6be13848c558650 --- /dev/null +++ b/mmaction/models/similarity/clip_similarity.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, List, Tuple + +import torch +from mmengine.dist import all_gather, get_rank +from mmengine.model import BaseModel +from mmengine.structures import InstanceData + +from mmaction.registry import MODELS +from mmaction.utils import ForwardResults, OptSampleList + + +class GatherLayer(torch.autograd.Function): + """Gather tensors from all process, supporting backward propagation.""" + + @staticmethod + def forward(ctx: Any, input: torch.Tensor) -> Tuple[List]: + ctx.save_for_backward(input) + output = all_gather(input) + return tuple(output) + + @staticmethod + def backward(ctx: Any, *grads: torch.Tensor) -> torch.Tensor: + input, = ctx.saved_tensors + grad_out = torch.zeros_like(input) + grad_out[:] = grads[get_rank()] + return grad_out + + +@MODELS.register_module() +class CLIPSimilarity(BaseModel): + """CLIP-based similarity model. + + Args: + clip_arch (str): The architecture of the clip model. + Supported choices are `'ViT-B/32'`, `'ViT-B/16'`, + `'ViT-L/14'` and `'ViT-L/14@336px'`. + data_preprocessor (dict): The pre-process config. + adapter (dict): The 3D adapter config. + to_float32 (bool): Whether to convert the dtype of params of clip + model to float32. + frozen_layers: Layers to be frozen (all params fixed). -1 means + not freezing any parameters. Defaults to -1. + loss (dict): The config of loss. Defaults to + `dict(type='CrossEntropyLoss', loss_weight=0.5)`. 
+ """ + + def __init__( + self, + clip_arch: str, + data_preprocessor: Dict[str, Dict], + adapter: Dict, + to_float32: bool = False, + frozen_layers: int = -1, + loss: Dict = dict(type='CrossEntropyLoss', loss_weight=0.5) + ) -> None: + super(CLIPSimilarity, + self).__init__(data_preprocessor=data_preprocessor) + + try: + import clip + except ImportError: + raise ImportError('Please run `pip install ' + 'git+https://github.com/openai/CLIP.git` ' + 'to install clip first. ') + + self.clip = clip.load(clip_arch, device='cpu')[0] + if to_float32: + self.clip.float() + self.loss = MODELS.build(loss) + self.adapter = MODELS.build(adapter) + self.frozen_layers = frozen_layers + self._freeze_stages() + + def encode_video(self, video: torch.Tensor) -> torch.Tensor: + """Encode video.""" + b, n, c, h, w = video.shape + video = video.view(-1, c, h, w) + frames_features = self.encode_image(video) + frames_features = frames_features.view(b, n, -1) + video_features = self.adapter(frames_features) + return video_features + + def encode_image(self, image: torch.Tensor) -> torch.Tensor: + """Encode image.""" + return self.clip.encode_image(image) + + def encode_text(self, text: torch.Tensor) -> torch.Tensor: + """Encode text.""" + return self.clip.encode_text(text) + + def extract_feat(self, + inputs: Dict[str, torch.Tensor], + norm: bool = True) -> Tuple: + """Extract features.""" + text_inputs = inputs['text'] + video_inputs = inputs['imgs'] + text_features = self.encode_text(text_inputs) + video_features = self.encode_video(video_inputs) + + if norm: + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + video_features = video_features / video_features.norm( + dim=-1, keepdim=True) + + return video_features, text_features + + def forward(self, + inputs: Dict[str, torch.Tensor], + data_samples: OptSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """Forward function.""" + + if mode == 'tensor': + return self.extract_feat(inputs, norm=False) + + elif mode == 'loss': + video_features, text_features = self.extract_feat(inputs) + video_features = torch.cat( + GatherLayer.apply(video_features), dim=0) + text_features = torch.cat(GatherLayer.apply(text_features), dim=0) + + logit_scale = self.clip.logit_scale.exp() + logits_per_video = logit_scale * video_features @ text_features.t() + logits_per_text = logits_per_video.t() + + labels = torch.arange(logits_per_video.shape[0]).to( + logit_scale.device) + + sim_loss_v2t = self.loss(logits_per_video, labels) + sim_loss_t2v = self.loss(logits_per_text, labels) + + losses = dict() + losses['sim_loss_v2t'] = sim_loss_v2t + losses['sim_loss_t2v'] = sim_loss_t2v + return losses + + elif mode == 'predict': + video_features, text_features = self.extract_feat(inputs) + for ds, vf, tf in zip(data_samples, video_features, text_features): + features = InstanceData(video_feature=vf, text_feature=tf) + ds.features = features + return data_samples + + else: + raise RuntimeError(f'Invalid mode "{mode}". 
' + 'Only supports loss, predict and tensor mode') + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_layers``.""" + + if self.frozen_layers >= 0: + top_layers = [ + 'ln_final', 'text_projection', 'logit_scale', 'visual.ln_post', + 'visual.proj' + ] + mid_layers = [ + 'visual.transformer.resblocks', 'transformer.resblocks' + ] + + for name, param in self.clip.named_parameters(): + if any(name.find(n) == 0 for n in top_layers): + continue + elif any(name.find(n) == 0 for n in mid_layers): + layer_n = int(name.split('.resblocks.')[1].split('.')[0]) + if layer_n >= self.frozen_layers: + continue + param.requires_grad = False diff --git a/mmaction/models/task_modules/__init__.py b/mmaction/models/task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..583fc6b73523f71252b26c354510bff4ff52b826 --- /dev/null +++ b/mmaction/models/task_modules/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +try: + from mmdet.registry import TASK_UTILS as MMDET_TASK_UTILS + + from .assigners import MaxIoUAssignerAVA + + MMDET_TASK_UTILS.register_module()(MaxIoUAssignerAVA) + + __all__ = ['MaxIoUAssignerAVA'] + +except (ImportError, ModuleNotFoundError): + pass diff --git a/mmaction/models/task_modules/assigners/__init__.py b/mmaction/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68447e5fbcb7fca754f3b742b3a965a124705d1b --- /dev/null +++ b/mmaction/models/task_modules/assigners/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .max_iou_assigner_ava import MaxIoUAssignerAVA + +__all__ = ['MaxIoUAssignerAVA'] diff --git a/mmaction/models/task_modules/assigners/max_iou_assigner_ava.py b/mmaction/models/task_modules/assigners/max_iou_assigner_ava.py new file mode 100644 index 0000000000000000000000000000000000000000..50e7861dd12303340c570e7843866e7b281df06f --- /dev/null +++ b/mmaction/models/task_modules/assigners/max_iou_assigner_ava.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdet.models.task_modules import AssignResult, MaxIoUAssigner +from torch import Tensor + + +class MaxIoUAssignerAVA(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float | tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each + gt). Defaults to 0. + gt_max_assign_all (bool): Whether to assign all bboxes with the + same highest overlap with some gt to that gt. Defaults to True. + """ + + # The function is overridden, to handle the case that gt_label is not + # int + def assign_wrt_overlaps(self, overlaps: Tensor, + gt_labels: Tensor) -> AssignResult: + """Assign w.r.t. the overlaps of bboxes with gts. + + Args: + overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes, + shape(k, n). 
+ gt_labels (Tensor): Labels of k gt_bboxes, shape + (k, num_classes). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_gts, num_bboxes = overlaps.size(0), overlaps.size(1) + + # 1. assign -1 by default + assigned_gt_inds = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_bboxes, )) + assigned_labels = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + + # 2. assign negative: below + # the negative inds are set to be 0 + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps < self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, tuple): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0]) + & (max_overlaps < self.neg_iou_thr[1])] = 0 + + # 3. assign positive: above positive IoU threshold + pos_inds = max_overlaps >= self.pos_iou_thr + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + if self.match_low_quality: + # Low-quality matching will overwrite the assigned_gt_inds + # assigned in Step 3. Thus, the assigned gt might not be the + # best one for prediction. + # For example, if bbox A has 0.9 and 0.8 iou with GT bbox + # 1 & 2, bbox 1 will be assigned as the best target for bbox A + # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A, + # bbox A's assigned_gt_inds will be overwritten to be bbox B. + # This might be the reason that it is not used in ROI Heads. + for i in range(num_gts): + if gt_max_overlaps[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = overlaps[i, :] == gt_max_overlaps[i] + assigned_gt_inds[max_iou_inds] = i + 1 + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + # consider multi-class case (AVA) + assert len(gt_labels[0]) > 1 + assigned_labels = assigned_gt_inds.new_zeros( + (num_bboxes, len(gt_labels[0])), dtype=torch.float32) + + # If not assigned, labels will be all 0 + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) diff --git a/mmaction/models/utils/__init__.py b/mmaction/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c2643ace9853e306471ae11afa772ae89df3c332 --- /dev/null +++ b/mmaction/models/utils/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
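To illustrate how the `assign_wrt_overlaps` override above handles multi-hot AVA labels, a small hand-worked sketch (illustrative only, not part of this patch; it assumes mmdet is installed, and the thresholds, overlaps and label vectors are made up):

import torch

from mmaction.models.task_modules.assigners import MaxIoUAssignerAVA

assigner = MaxIoUAssignerAVA(pos_iou_thr=0.9, neg_iou_thr=0.9)
# 2 gt boxes vs. 3 proposals; each gt carries an 81-dim multi-hot AVA label.
overlaps = torch.tensor([[0.95, 0.10, 0.30],
                         [0.05, 0.92, 0.20]])
gt_labels = torch.zeros(2, 81)
gt_labels[0, [12, 17]] = 1.
gt_labels[1, 4] = 1.

result = assigner.assign_wrt_overlaps(overlaps, gt_labels)
print(result.gt_inds)       # tensor([1, 2, 0]): 1-based gt index, 0 = background
print(result.labels.shape)  # torch.Size([3, 81]): multi-hot label per proposal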
+from .blending_utils import (BaseMiniBatchBlending, CutmixBlending, + MixupBlending, RandomBatchAugment) +from .gcn_utils import * # noqa: F401,F403 +from .graph import Graph + +__all__ = [ + 'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'Graph', + 'RandomBatchAugment' +] diff --git a/mmaction/models/utils/__pycache__/__init__.cpython-310.pyc b/mmaction/models/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b00f20a87fe996f1a19417ebb1c256a33f6f1f5c Binary files /dev/null and b/mmaction/models/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/models/utils/__pycache__/blending_utils.cpython-310.pyc b/mmaction/models/utils/__pycache__/blending_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..195f2bcc42bf6cb679cab4c41f256cc576e4abe8 Binary files /dev/null and b/mmaction/models/utils/__pycache__/blending_utils.cpython-310.pyc differ diff --git a/mmaction/models/utils/__pycache__/embed.cpython-310.pyc b/mmaction/models/utils/__pycache__/embed.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b395333189beb0d592f94d32e3a05eb6a88da314 Binary files /dev/null and b/mmaction/models/utils/__pycache__/embed.cpython-310.pyc differ diff --git a/mmaction/models/utils/__pycache__/gcn_utils.cpython-310.pyc b/mmaction/models/utils/__pycache__/gcn_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cadeadbd86addd67ae174c2035d756fda1f2ee Binary files /dev/null and b/mmaction/models/utils/__pycache__/gcn_utils.cpython-310.pyc differ diff --git a/mmaction/models/utils/__pycache__/graph.cpython-310.pyc b/mmaction/models/utils/__pycache__/graph.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e7455a3852e3f3ead95679505a6a6db95d31486 Binary files /dev/null and b/mmaction/models/utils/__pycache__/graph.cpython-310.pyc differ diff --git a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ae9afb6665f38409a9304d82acae6c7ae73ca8 --- /dev/null +++ b/mmaction/models/utils/blending_utils.py @@ -0,0 +1,263 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from functools import partial +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.utils import digit_version +from torch.distributions.beta import Beta + +from mmaction.registry import MODELS +from mmaction.utils import SampleList + +if digit_version(torch.__version__) < digit_version('1.8.0'): + floor_div = torch.floor_divide +else: + floor_div = partial(torch.div, rounding_mode='floor') + +__all__ = ['BaseMiniBatchBlending', 'MixupBlending', 'CutmixBlending'] + + +class BaseMiniBatchBlending(metaclass=ABCMeta): + """Base class for Image Aliasing. + + Args: + num_classes (int): Number of classes. + """ + + def __init__(self, num_classes: int) -> None: + self.num_classes = num_classes + + @abstractmethod + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Blending images process.""" + raise NotImplementedError + + def __call__(self, imgs: torch.Tensor, batch_data_samples: SampleList, + **kwargs) -> Tuple: + """Blending data in a mini-batch. 
+ + Images are float tensors with the shape of (B, N, C, H, W) for 2D + recognizers or (B, N, C, T, H, W) for 3D recognizers. + + Besides, labels are converted from hard labels to soft labels. + Hard labels are integer tensors with the shape of (B, ) and all of the + elements are in the range [0, num_classes - 1]. + Soft labels (probability distribution over classes) are float tensors + with the shape of (B, num_classes) and all of the elements are in + the range [0, 1]. + + Args: + imgs (torch.Tensor): Model input images, float tensor with the + shape of (B, N, C, H, W) or (B, N, C, T, H, W). + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as `gt_label`. + + Returns: + mixed_imgs (torch.Tensor): Blending images, float tensor with the + same shape of the input imgs. + batch_data_samples (List[:obj:`ActionDataSample`]): The modified + batch data samples. ``gt_label`` in each data sample are + converted from a hard label to a blended soft label, float + tensor with the shape of (num_classes, ) and all elements are + in range [0, 1]. + """ + label = [x.gt_label for x in batch_data_samples] + # single-label classification + if label[0].size(0) == 1: + label = torch.tensor(label, dtype=torch.long).to(imgs.device) + one_hot_label = F.one_hot(label, num_classes=self.num_classes) + # multi-label classification + else: + one_hot_label = torch.stack(label) + + mixed_imgs, mixed_label = self.do_blending(imgs, one_hot_label, + **kwargs) + + for label_item, sample in zip(mixed_label, batch_data_samples): + sample.set_gt_label(label_item) + + return mixed_imgs, batch_data_samples + + +@MODELS.register_module() +class MixupBlending(BaseMiniBatchBlending): + """Implementing Mixup in a mini-batch. + + This module is proposed in `mixup: Beyond Empirical Risk Minimization + `_. + Code Reference https://github.com/open-mmlab/mmclassification/blob/master/mmcls/models/utils/mixup.py # noqa + + Args: + num_classes (int): The number of classes. + alpha (float): Parameters for Beta distribution. + """ + + def __init__(self, num_classes: int, alpha: float = .2) -> None: + super().__init__(num_classes=num_classes) + self.beta = Beta(alpha, alpha) + + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Blending images with mixup. + + Args: + imgs (torch.Tensor): Model input images, float tensor with the + shape of (B, N, C, H, W) or (B, N, C, T, H, W). + label (torch.Tensor): One hot labels, integer tensor with the shape + of (B, num_classes). + + Returns: + tuple: A tuple of blended images and labels. + """ + assert len(kwargs) == 0, f'unexpected kwargs for mixup {kwargs}' + + lam = self.beta.sample() + batch_size = imgs.size(0) + rand_index = torch.randperm(batch_size) + + mixed_imgs = lam * imgs + (1 - lam) * imgs[rand_index, :] + mixed_label = lam * label + (1 - lam) * label[rand_index, :] + + return mixed_imgs, mixed_label + + +@MODELS.register_module() +class CutmixBlending(BaseMiniBatchBlending): + """Implementing Cutmix in a mini-batch. + + This module is proposed in `CutMix: Regularization Strategy to Train Strong + Classifiers with Localizable Features `_. + Code Reference https://github.com/clovaai/CutMix-PyTorch + + Args: + num_classes (int): The number of classes. + alpha (float): Parameters for Beta distribution. 
+ """ + + def __init__(self, num_classes: int, alpha: float = .2) -> None: + super().__init__(num_classes=num_classes) + self.beta = Beta(alpha, alpha) + + @staticmethod + def rand_bbox(img_size: torch.Size, lam: torch.Tensor) -> Tuple: + """Generate a random boudning box.""" + w = img_size[-1] + h = img_size[-2] + cut_rat = torch.sqrt(1. - lam) + cut_w = torch.tensor(int(w * cut_rat)) + cut_h = torch.tensor(int(h * cut_rat)) + + # uniform + cx = torch.randint(w, (1, ))[0] + cy = torch.randint(h, (1, ))[0] + + bbx1 = torch.clamp(cx - floor_div(cut_w, 2), 0, w) + bby1 = torch.clamp(cy - floor_div(cut_h, 2), 0, h) + bbx2 = torch.clamp(cx + floor_div(cut_w, 2), 0, w) + bby2 = torch.clamp(cy + floor_div(cut_h, 2), 0, h) + + return bbx1, bby1, bbx2, bby2 + + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Blending images with cutmix. + + Args: + imgs (torch.Tensor): Model input images, float tensor with the + shape of (B, N, C, H, W) or (B, N, C, T, H, W). + label (torch.Tensor): One hot labels, integer tensor with the shape + of (B, num_classes). + + Returns: + tuple: A tuple of blended images and labels. + """ + + assert len(kwargs) == 0, f'unexpected kwargs for cutmix {kwargs}' + + batch_size = imgs.size(0) + rand_index = torch.randperm(batch_size) + lam = self.beta.sample() + + bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.size(), lam) + imgs[:, ..., bby1:bby2, bbx1:bbx2] = imgs[rand_index, ..., bby1:bby2, + bbx1:bbx2] + lam = 1 - (1.0 * (bbx2 - bbx1) * (bby2 - bby1) / + (imgs.size()[-1] * imgs.size()[-2])) + + label = lam * label + (1 - lam) * label[rand_index, :] + + return imgs, label + + +@MODELS.register_module() +class RandomBatchAugment(BaseMiniBatchBlending): + """Randomly choose one batch augmentation to apply. + + Args: + augments (dict | list): configs of batch + augmentations. + probs (float | List[float] | None): The probabilities of each batch + augmentations. If None, choose evenly. Defaults to None. + + Example: + >>> augments_cfg = [ + ... dict(type='CutmixBlending', alpha=1., num_classes=10), + ... dict(type='MixupBlending', alpha=1., num_classes=10) + ... ] + >>> batch_augment = RandomBatchAugment(augments_cfg, probs=[0.5, 0.3]) + >>> imgs = torch.randn(16, 3, 8, 32, 32) + >>> label = torch.randint(0, 10, (16, )) + >>> imgs, label = batch_augment(imgs, label) + + .. note :: + + To decide which batch augmentation will be used, it picks one of + ``augments`` based on the probabilities. In the example above, the + probability to use CutmixBlending is 0.5, to use MixupBlending is 0.3, + and to do nothing is 0.2. + """ + + def __init__(self, + augments: Union[dict, list], + probs: Optional[Union[float, List[float]]] = None) -> None: + if not isinstance(augments, (tuple, list)): + augments = [augments] + + self.augments = [] + for aug in augments: + assert isinstance(aug, dict), \ + f'blending augment config must be a dict. Got {type(aug)}' + self.augments.append(MODELS.build(aug)) + + self.num_classes = augments[0].get('num_classes') + + if isinstance(probs, float): + probs = [probs] + + if probs is not None: + assert len(augments) == len(probs), \ + '``augments`` and ``probs`` must have same lengths. ' \ + f'Got {len(augments)} vs {len(probs)}.' + assert sum(probs) <= 1, \ + 'The total probability of batch augments exceeds 1.' 
+ self.augments.append(None) + probs.append(1 - sum(probs)) + + self.probs = probs + + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Randomly apply batch augmentations to the batch inputs and batch + data samples.""" + aug_index = np.random.choice(len(self.augments), p=self.probs) + aug = self.augments[aug_index] + + if aug is not None: + return aug.do_blending(imgs, label, **kwargs) + else: + return imgs, label diff --git a/mmaction/models/utils/embed.py b/mmaction/models/utils/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..9e8aa44fb58e7520e9293a7d338503f151cc5eef --- /dev/null +++ b/mmaction/models/utils/embed.py @@ -0,0 +1,234 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils import to_3tuple + + +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". + + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_t, input_h, input_w = input_shape + kernel_d, kernel_h, kernel_w = self.kernel_size + stride_d, stride_h, stride_w = self.stride + output_d = math.ceil(input_t / stride_d) + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_d = max((output_d - 1) * stride_d + + (kernel_d - 1) * self.dilation[0] + 1 - input_t, 0) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[1] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[2] + 1 - input_w, 0) + return pad_d, pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). 
+ + Returns: + Tensor: The tensor with adaptive padding + """ + pad_d, pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_d > 0 or pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h, 0, pad_d]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, + pad_w - pad_w // 2, + pad_h // 2, + pad_h - pad_h // 2, + pad_d // 2, + pad_d - pad_d // 2, + ]) + return x + + +class PatchEmbed3D(BaseModule): + """Video to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv3d". + kernel_size (int): The kernel_size of embedding conv. + Default: (2, 4, 4). + stride (int): The slide stride of embedding conv. + Default: (2, 4, 4). + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv3d', + kernel_size=(2, 4, 4), + stride=(2, 4, 4), + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_3tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_3tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. 
when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_d, pad_h, pad_w = self.adaptive_padding.get_pad_shape( + input_size) + input_t, input_h, input_w = input_size + input_t = input_t + pad_d + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_t, input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv3d.html + t_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + h_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + w_out = (input_size[2] + 2 * padding[2] - dilation[2] * + (kernel_size[2] - 1) - 1) // stride[2] + 1 + self.init_out_size = (t_out, h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, T, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_t * out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_t, out_h, out_w). + """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3], x.shape[4]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size diff --git a/mmaction/models/utils/gcn_utils.py b/mmaction/models/utils/gcn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5c423a3e3def5f417e8c5f403df15fe9901e8227 --- /dev/null +++ b/mmaction/models/utils/gcn_utils.py @@ -0,0 +1,421 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmengine.model import BaseModule, ModuleList, Sequential + + +class unit_gcn(BaseModule): + """The basic unit of graph convolutional network. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + A (torch.Tensor): The adjacency matrix defined in the graph + with shape of `(num_subsets, num_nodes, num_nodes)`. + adaptive (str): The strategy for adapting the weights of the + adjacency matrix. Defaults to ``'importance'``. + conv_pos (str): The position of the 1x1 2D conv. + Defaults to ``'pre'``. + with_res (bool): Whether to use residual connection. + Defaults to False. + norm (str): The name of norm layer. Defaults to ``'BN'``. + act (str): The name of activation layer. Defaults to ``'Relu'``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
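# --- Editorial note -------------------------------------------------------
# Sketch of how PatchEmbed3D above turns a clip into tokens: the output
# spatio-temporal size follows the standard Conv3d formula, and the tokens
# are the flattened t_out * h_out * w_out positions (toy sizes, illustrative
# only). Aside: the forward above calls get_pad_shape(x.size()[-2:]) although
# get_pad_shape unpacks three values; x.shape[-3:] is presumably intended.
import torch
import torch.nn as nn

in_clip = torch.randn(1, 3, 16, 224, 224)           # (B, C, T, H, W)
kernel, stride, pad, dil = (2, 4, 4), (2, 4, 4), (0, 0, 0), (1, 1, 1)
proj = nn.Conv3d(3, 768, kernel_size=kernel, stride=stride, padding=pad)

out_size = tuple(
    (in_clip.shape[2 + i] + 2 * pad[i] - dil[i] * (kernel[i] - 1) - 1)
    // stride[i] + 1 for i in range(3))              # (8, 56, 56)

x = proj(in_clip)                                    # (1, 768, 8, 56, 56)
tokens = x.flatten(2).transpose(1, 2)                # (1, 8*56*56, 768)
assert tokens.shape[1] == out_size[0] * out_size[1] * out_size[2]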
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + A: torch.Tensor, + adaptive: str = 'importance', + conv_pos: str = 'pre', + with_res: bool = False, + norm: str = 'BN', + act: str = 'ReLU', + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_subsets = A.size(0) + + assert adaptive in [None, 'init', 'offset', 'importance'] + self.adaptive = adaptive + assert conv_pos in ['pre', 'post'] + self.conv_pos = conv_pos + self.with_res = with_res + + self.norm_cfg = norm if isinstance(norm, dict) else dict(type=norm) + self.act_cfg = act if isinstance(act, dict) else dict(type=act) + self.bn = build_norm_layer(self.norm_cfg, out_channels)[1] + self.act = build_activation_layer(self.act_cfg) + + if self.adaptive == 'init': + self.A = nn.Parameter(A.clone()) + else: + self.register_buffer('A', A) + + if self.adaptive in ['offset', 'importance']: + self.PA = nn.Parameter(A.clone()) + if self.adaptive == 'offset': + nn.init.uniform_(self.PA, -1e-6, 1e-6) + elif self.adaptive == 'importance': + nn.init.constant_(self.PA, 1) + + if self.conv_pos == 'pre': + self.conv = nn.Conv2d(in_channels, out_channels * A.size(0), 1) + elif self.conv_pos == 'post': + self.conv = nn.Conv2d(A.size(0) * in_channels, out_channels, 1) + + if self.with_res: + if in_channels != out_channels: + self.down = Sequential( + nn.Conv2d(in_channels, out_channels, 1), + build_norm_layer(self.norm_cfg, out_channels)[1]) + else: + self.down = lambda x: x + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + n, c, t, v = x.shape + res = self.down(x) if self.with_res else 0 + + A_switch = {None: self.A, 'init': self.A} + if hasattr(self, 'PA'): + A_switch.update({ + 'offset': self.A + self.PA, + 'importance': self.A * self.PA + }) + A = A_switch[self.adaptive] + + if self.conv_pos == 'pre': + x = self.conv(x) + x = x.view(n, self.num_subsets, -1, t, v) + x = torch.einsum('nkctv,kvw->nctw', (x, A)).contiguous() + elif self.conv_pos == 'post': + x = torch.einsum('nctv,kvw->nkctw', (x, A)).contiguous() + x = x.view(n, -1, t, v) + x = self.conv(x) + + return self.act(self.bn(x) + res) + + +class unit_aagcn(BaseModule): + """The graph convolution unit of AAGCN. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + A (torch.Tensor): The adjacency matrix defined in the graph + with shape of `(num_subsets, num_joints, num_joints)`. + coff_embedding (int): The coefficient for downscaling the embedding + dimension. Defaults to 4. + adaptive (bool): Whether to use adaptive graph convolutional layer. + Defaults to True. + attention (bool): Whether to use the STC-attention module. + Defaults to True. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Constant', layer='BatchNorm2d', val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out'), + dict(type='ConvBranch', name='conv_d') + ]``. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + A: torch.Tensor, + coff_embedding: int = 4, + adaptive: bool = True, + attention: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict( + type='Constant', + layer='BatchNorm2d', + val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out'), + dict(type='ConvBranch', name='conv_d') + ] + ) -> None: + + if attention: + attention_init_cfg = [ + dict( + type='Constant', + layer='Conv1d', + val=0, + override=dict(type='Xavier', name='conv_sa')), + dict( + type='Kaiming', + layer='Linear', + mode='fan_in', + override=dict(type='Constant', val=0, name='fc2c')) + ] + init_cfg = cp.copy(init_cfg) + init_cfg.extend(attention_init_cfg) + + super(unit_aagcn, self).__init__(init_cfg=init_cfg) + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + self.num_subset = A.shape[0] + self.adaptive = adaptive + self.attention = attention + + num_joints = A.shape[-1] + + self.conv_d = ModuleList() + for i in range(self.num_subset): + self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1)) + + if self.adaptive: + self.A = nn.Parameter(A) + + self.alpha = nn.Parameter(torch.zeros(1)) + self.conv_a = ModuleList() + self.conv_b = ModuleList() + for i in range(self.num_subset): + self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1)) + self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1)) + else: + self.register_buffer('A', A) + + if self.attention: + self.conv_ta = nn.Conv1d(out_channels, 1, 9, padding=4) + # s attention + ker_joint = num_joints if num_joints % 2 else num_joints - 1 + pad = (ker_joint - 1) // 2 + self.conv_sa = nn.Conv1d(out_channels, 1, ker_joint, padding=pad) + # channel attention + rr = 2 + self.fc1c = nn.Linear(out_channels, out_channels // rr) + self.fc2c = nn.Linear(out_channels // rr, out_channels) + + self.down = lambda x: x + if in_channels != out_channels: + self.down = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels)) + + self.bn = nn.BatchNorm2d(out_channels) + self.tan = nn.Tanh() + self.sigmoid = nn.Sigmoid() + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + N, C, T, V = x.size() + + y = None + if self.adaptive: + for i in range(self.num_subset): + A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view( + N, V, self.inter_c * T) + A2 = self.conv_b[i](x).view(N, self.inter_c * T, V) + A1 = self.tan(torch.matmul(A1, A2) / A1.size(-1)) # N V V + A1 = self.A[i] + A1 * self.alpha + A2 = x.view(N, C * T, V) + z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V)) + y = z + y if y is not None else z + else: + for i in range(self.num_subset): + A1 = self.A[i] + A2 = x.view(N, C * T, V) + z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V)) + y = z + y if y is not None else z + + y = self.relu(self.bn(y) + self.down(x)) + + if self.attention: + # spatial attention first + se = y.mean(-2) # N C V + se1 = self.sigmoid(self.conv_sa(se)) # N 1 V + y = y * se1.unsqueeze(-2) + y + # then temporal attention + se = y.mean(-1) # N C T + se1 = self.sigmoid(self.conv_ta(se)) # N 1 T + y = y * se1.unsqueeze(-1) + y + # then spatial temporal attention ?? 
+ se = y.mean(-1).mean(-1) # N C + se1 = self.relu(self.fc1c(se)) + se2 = self.sigmoid(self.fc2c(se1)) # N C + y = y * se2.unsqueeze(-1).unsqueeze(-1) + y + # A little bit weird + return y + + +class unit_tcn(BaseModule): + """The basic unit of temporal convolutional network. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the temporal convolution kernel. + Defaults to 9. + stride (int): Stride of the temporal convolution. Defaults to 1. + dilation (int): Spacing between temporal kernel elements. + Defaults to 1. + norm (str): The name of norm layer. Defaults to ``'BN'``. + dropout (float): Dropout probability. Defaults to 0. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]``. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 9, + stride: int = 1, + dilation: int = 1, + norm: str = 'BN', + dropout: float = 0, + init_cfg: Union[Dict, List[Dict]] = [ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.norm_cfg = norm if isinstance(norm, dict) else dict(type=norm) + pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2 + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1), + dilation=(dilation, 1)) + self.bn = build_norm_layer(self.norm_cfg, out_channels)[1] \ + if norm is not None else nn.Identity() + + self.drop = nn.Dropout(dropout, inplace=True) + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + return self.drop(self.bn(self.conv(x))) + + +class mstcn(BaseModule): + """The multi-scale temporal convolutional network. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + mid_channels (int): Number of middle channels. Defaults to None. + dropout (float): Dropout probability. Defaults to 0. + ms_cfg (list): The config of multi-scale branches. Defaults to + ``[(3, 1), (3, 2), (3, 3), (3, 4), ('max', 3), '1x1']``. + stride (int): Stride of the temporal convolution. Defaults to 1. + init_cfg (dict or list[dict]): Initialization config dict. + Defaults to None. 
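# --- Editorial note -------------------------------------------------------
# Sketch of the "same"-length temporal padding used by unit_tcn above: with
# an effective kernel k_eff = k + (k - 1) * (d - 1), a padding of
# (k_eff - 1) // 2 keeps T unchanged when stride == 1 (toy tensor,
# illustrative only).
import torch
import torch.nn as nn

k, d = 9, 2
pad = (k + (k - 1) * (d - 1) - 1) // 2         # = 8 for k=9, d=2
conv = nn.Conv2d(16, 16, kernel_size=(k, 1), padding=(pad, 0),
                 stride=(1, 1), dilation=(d, 1))

x = torch.randn(2, 16, 100, 17)                # (N, C, T, V)
assert conv(x).shape == x.shape                # temporal length preserved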
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + mid_channels: int = None, + dropout: float = 0., + ms_cfg: List = [(3, 1), (3, 2), (3, 3), (3, 4), ('max', 3), + '1x1'], + stride: int = 1, + init_cfg: Union[Dict, List[Dict]] = None) -> None: + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.ms_cfg = ms_cfg + num_branches = len(ms_cfg) + self.num_branches = num_branches + self.in_channels = in_channels + self.out_channels = out_channels + self.act = nn.ReLU() + + if mid_channels is None: + mid_channels = out_channels // num_branches + rem_mid_channels = out_channels - mid_channels * (num_branches - 1) + else: + assert isinstance(mid_channels, float) and mid_channels > 0 + mid_channels = int(out_channels * mid_channels) + rem_mid_channels = mid_channels + + self.mid_channels = mid_channels + self.rem_mid_channels = rem_mid_channels + + branches = [] + for i, cfg in enumerate(ms_cfg): + branch_c = rem_mid_channels if i == 0 else mid_channels + if cfg == '1x1': + branches.append( + nn.Conv2d( + in_channels, + branch_c, + kernel_size=1, + stride=(stride, 1))) + continue + assert isinstance(cfg, tuple) + if cfg[0] == 'max': + branches.append( + Sequential( + nn.Conv2d(in_channels, branch_c, kernel_size=1), + nn.BatchNorm2d(branch_c), self.act, + nn.MaxPool2d( + kernel_size=(cfg[1], 1), + stride=(stride, 1), + padding=(1, 0)))) + continue + assert isinstance(cfg[0], int) and isinstance(cfg[1], int) + branch = Sequential( + nn.Conv2d(in_channels, branch_c, kernel_size=1), + nn.BatchNorm2d(branch_c), self.act, + unit_tcn( + branch_c, + branch_c, + kernel_size=cfg[0], + stride=stride, + dilation=cfg[1], + norm=None)) + branches.append(branch) + + self.branches = ModuleList(branches) + tin_channels = mid_channels * (num_branches - 1) + rem_mid_channels + + self.transform = Sequential( + nn.BatchNorm2d(tin_channels), self.act, + nn.Conv2d(tin_channels, out_channels, kernel_size=1)) + + self.bn = nn.BatchNorm2d(out_channels) + self.drop = nn.Dropout(dropout, inplace=True) + + def inner_forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + N, C, T, V = x.shape + + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + feat = torch.cat(branch_outs, dim=1) + feat = self.transform(feat) + return feat + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + out = self.inner_forward(x) + out = self.bn(out) + return self.drop(out) diff --git a/mmaction/models/utils/graph.py b/mmaction/models/utils/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..d7f506f5fc2de86d32b91c13d0f971e85c6aa5bc --- /dev/null +++ b/mmaction/models/utils/graph.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch + + +def k_adjacency(A: Union[torch.Tensor, np.ndarray], + k: int, + with_self: bool = False, + self_factor: float = 1) -> np.ndarray: + """Construct k-adjacency matrix. + + Args: + A (torch.Tensor or np.ndarray): The adjacency matrix. + k (int): The number of hops. + with_self (bool): Whether to add self-loops to the + k-adjacency matrix. The self-loops is critical + for learning the relationships between the current + joint and its k-hop neighbors. Defaults to False. + self_factor (float): The scale factor to the added + identity matrix. Defaults to 1. 
+ + Returns: + np.ndarray: The k-adjacency matrix. + """ + # A is a 2D square array + if isinstance(A, torch.Tensor): + A = A.data.cpu().numpy() + assert isinstance(A, np.ndarray) + Iden = np.eye(len(A), dtype=A.dtype) + if k == 0: + return Iden + Ak = np.minimum(np.linalg.matrix_power(A + Iden, k), 1) - np.minimum( + np.linalg.matrix_power(A + Iden, k - 1), 1) + if with_self: + Ak += (self_factor * Iden) + return Ak + + +def edge2mat(edges: List[Tuple[int, int]], num_node: int) -> np.ndarray: + """Get adjacency matrix from edges. + + Args: + edges (list[tuple[int, int]]): The edges of the graph. + num_node (int): The number of nodes of the graph. + + Returns: + np.ndarray: The adjacency matrix. + """ + A = np.zeros((num_node, num_node)) + for i, j in edges: + A[j, i] = 1 + return A + + +def normalize_digraph(A: np.ndarray, dim: int = 0) -> np.ndarray: + """Normalize the digraph according to the given dimension. + + Args: + A (np.ndarray): The adjacency matrix. + dim (int): The dimension to perform normalization. + Defaults to 0. + + Returns: + np.ndarray: The normalized adjacency matrix. + """ + # A is a 2D square array + Dl = np.sum(A, dim) + h, w = A.shape + Dn = np.zeros((w, w)) + + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + + AD = np.dot(A, Dn) + return AD + + +def get_hop_distance(num_node: int, + edges: List[Tuple[int, int]], + max_hop: int = 1) -> np.ndarray: + """Get n-hop distance matrix by edges. + + Args: + num_node (int): The number of nodes of the graph. + edges (list[tuple[int, int]]): The edges of the graph. + max_hop (int): The maximal distance between two connected nodes. + Defaults to 1. + + Returns: + np.ndarray: The n-hop distance matrix. + """ + A = np.eye(num_node) + + for i, j in edges: + A[i, j] = 1 + A[j, i] = 1 + + # compute hop steps + hop_dis = np.zeros((num_node, num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + +class Graph: + """The Graph to model the skeletons. + + Args: + layout (str or dict): must be one of the following candidates: + 'openpose', 'nturgb+d', 'coco', or a dict with the following + keys: 'num_node', 'inward', and 'center'. + Defaults to ``'coco'``. + mode (str): must be one of the following candidates: + 'stgcn_spatial', 'spatial'. Defaults to ``'spatial'``. + max_hop (int): the maximal distance between two connected + nodes. Defaults to 1. 
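# --- Editorial note -------------------------------------------------------
# Worked toy example for edge2mat / normalize_digraph above on a 3-node
# chain 0-1-2 (illustrative only; edge2mat is restated here so the snippet
# is self-contained). normalize_digraph divides every column of A by its
# sum, so each non-empty column of the result sums to 1.
import numpy as np

def edge2mat(edges, num_node):
    A = np.zeros((num_node, num_node))
    for i, j in edges:
        A[j, i] = 1
    return A

inward = [(1, 0), (2, 1)]          # edges (i, j) -> A[j, i] = 1
A = edge2mat(inward, 3)            # [[0,1,0],[0,0,1],[0,0,0]]
Dl = A.sum(0)                      # column sums: [0., 1., 1.]
Dn = np.diag([1 / d if d > 0 else 0 for d in Dl])
AD = A @ Dn                        # what normalize_digraph(A) returns
print(AD.sum(0))                   # [0. 1. 1.]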
+ """ + + def __init__(self, + layout: Union[str, dict] = 'coco', + mode: str = 'spatial', + max_hop: int = 1) -> None: + + self.max_hop = max_hop + self.layout = layout + self.mode = mode + + if isinstance(layout, dict): + assert 'num_node' in layout + assert 'inward' in layout + assert 'center' in layout + else: + assert layout in ['openpose', 'nturgb+d', 'coco'] + + self.set_layout(layout) + self.hop_dis = get_hop_distance(self.num_node, self.inward, max_hop) + + assert hasattr(self, mode), f'Do Not Exist This Mode: {mode}' + self.A = getattr(self, mode)() + + def __str__(self): + return self.A + + def set_layout(self, layout: str) -> None: + """Initialize the layout of candidates.""" + + if layout == 'openpose': + self.num_node = 18 + self.inward = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), + (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1), + (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)] + self.center = 1 + elif layout == 'nturgb+d': + self.num_node = 25 + neighbor_base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 8), + (23, 8), (24, 12), (25, 12)] + self.inward = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.center = 21 - 1 + elif layout == 'coco': + self.num_node = 17 + self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] + self.center = 0 + elif isinstance(layout, dict): + self.num_node = layout['num_node'] + self.inward = layout['inward'] + self.center = layout['center'] + else: + raise ValueError(f'Do Not Exist This Layout: {layout}') + self.self_link = [(i, i) for i in range(self.num_node)] + self.outward = [(j, i) for (i, j) in self.inward] + self.neighbor = self.inward + self.outward + + def stgcn_spatial(self) -> np.ndarray: + """ST-GCN spatial mode.""" + adj = np.zeros((self.num_node, self.num_node)) + adj[self.hop_dis <= self.max_hop] = 1 + normalize_adj = normalize_digraph(adj) + hop_dis = self.hop_dis + center = self.center + + A = [] + for hop in range(self.max_hop + 1): + a_close = np.zeros((self.num_node, self.num_node)) + a_further = np.zeros((self.num_node, self.num_node)) + for i in range(self.num_node): + for j in range(self.num_node): + if hop_dis[j, i] == hop: + if hop_dis[j, center] >= hop_dis[i, center]: + a_close[j, i] = normalize_adj[j, i] + else: + a_further[j, i] = normalize_adj[j, i] + A.append(a_close) + if hop > 0: + A.append(a_further) + return np.stack(A) + + def spatial(self) -> np.ndarray: + """Standard spatial mode.""" + Iden = edge2mat(self.self_link, self.num_node) + In = normalize_digraph(edge2mat(self.inward, self.num_node)) + Out = normalize_digraph(edge2mat(self.outward, self.num_node)) + A = np.stack((Iden, In, Out)) + return A + + def binary_adj(self) -> np.ndarray: + """Construct an adjacency matrix for an undirected graph.""" + A = edge2mat(self.neighbor, self.num_node) + return A[None] diff --git a/mmaction/registry.py b/mmaction/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..539627c4c740fba8a32aeb2959b4d0fe31d40672 --- /dev/null +++ b/mmaction/registry.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMAction provides 20 registry nodes to support using modules across +projects. Each node is a child of the root registry in MMEngine. 
+ +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. +""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import FUNCTIONS as MMENGINE_FUNCTION +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmaction.engine.runner']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmaction.engine.runner']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry( + 'loop', parent=MMENGINE_LOOPS, locations=['mmaction.engine.runner']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmaction.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmaction.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmaction.datasets']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmaction.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry( + 'model', parent=MMENGINE_MODELS, locations=['mmaction.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmaction.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmaction.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmaction.engine.optimizers']) +# manage optimizer wrapper +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmaction.engine.optimizers']) +# manage constructors that 
customize the optimization hyperparameters. +OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer wrapper constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmaction.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmaction.engine']) + +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmaction.evaluation']) +# manage evaluator +EVALUATOR = Registry( + 'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmaction.evaluation']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmaction.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', + parent=MMENGINE_VISUALIZERS, + locations=['mmaction.visualization']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', + parent=MMENGINE_VISBACKENDS, + locations=['mmaction.visualization']) + +# manage logprocessor +LOG_PROCESSORS = Registry( + 'log_processor', + parent=MMENGINE_LOG_PROCESSORS, + locations=['mmaction.engine']) + +# manage inferencer +INFERENCERS = Registry( + 'inferencer', + parent=MMENGINE_INFERENCERS, + locations=['mmaction.apis.inferencers']) + +# manage function +FUNCTION = Registry( + 'function', parent=MMENGINE_FUNCTION, locations=['mmaction.mmengine']) + +# Tokenizer to encode sequence +TOKENIZER = Registry( + 'tokenizer', + locations=['mmaction.models'], +) diff --git a/mmaction/structures/__init__.py b/mmaction/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d977d76d5a4afafd8a9a0ec5723242548489371 --- /dev/null +++ b/mmaction/structures/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .action_data_sample import ActionDataSample +from .bbox import bbox2result, bbox_target + +__all__ = [ + 'ActionDataSample', + 'bbox2result', + 'bbox_target', +] diff --git a/mmaction/structures/__pycache__/__init__.cpython-310.pyc b/mmaction/structures/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b161f23ba6a14cf3cf9a781015b6076dfd9b92e Binary files /dev/null and b/mmaction/structures/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/structures/__pycache__/action_data_sample.cpython-310.pyc b/mmaction/structures/__pycache__/action_data_sample.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13460eb7f8c26d367f893f58c6057bd874987daa Binary files /dev/null and b/mmaction/structures/__pycache__/action_data_sample.cpython-310.pyc differ diff --git a/mmaction/structures/action_data_sample.py b/mmaction/structures/action_data_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..de61ae78cc655320330a22bc78a15305ae5cccbe --- /dev/null +++ b/mmaction/structures/action_data_sample.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Sequence, Union + +import numpy as np +import torch +from mmengine.structures import BaseDataElement, InstanceData +from mmengine.utils import is_str + +LABEL_TYPE = Union[torch.Tensor, np.ndarray, Sequence, int] +SCORE_TYPE = Union[torch.Tensor, np.ndarray, Sequence, Dict] + + +def format_label(value: LABEL_TYPE) -> torch.Tensor: + """Convert various python types to label-format tensor. 
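# --- Editorial note -------------------------------------------------------
# Usage sketch for the registries above (assumes the mmaction package from
# this patch is importable; TinyHead is a hypothetical example module). Any
# class registered in MODELS can be instantiated from a plain config dict,
# which is how the blending utils and recognizers in this patch are built.
import torch.nn as nn
from mmaction.registry import MODELS

@MODELS.register_module()
class TinyHead(nn.Module):
    """Hypothetical classification head used only for illustration."""

    def __init__(self, in_channels: int, num_classes: int) -> None:
        super().__init__()
        self.fc = nn.Linear(in_channels, num_classes)

head = MODELS.build(dict(type='TinyHead', in_channels=32, num_classes=10))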
+ + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | int): Label value. + + Returns: + :obj:`torch.Tensor`: The formatted label tensor. + """ + + # Handle single number + if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0: + value = int(value.item()) + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value).to(torch.long) + elif isinstance(value, Sequence) and not is_str(value): + value = torch.tensor(value).to(torch.long) + elif isinstance(value, int): + value = torch.LongTensor([value]) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + + return value + + +def format_score(value: SCORE_TYPE) -> Union[torch.Tensor, Dict]: + """Convert various python types to score-format tensor. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | dict): + Score values or dict of scores values. + + Returns: + :obj:`torch.Tensor` | dict: The formatted scores. + """ + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value).float() + elif isinstance(value, Sequence) and not is_str(value): + value = torch.tensor(value).float() + elif isinstance(value, dict): + for k, v in value.items(): + value[k] = format_score(v) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + + return value + + +class ActionDataSample(BaseDataElement): + + def set_gt_label(self, value: LABEL_TYPE) -> 'ActionDataSample': + """Set `gt_label``.""" + self.set_field(format_label(value), 'gt_label', dtype=torch.Tensor) + return self + + def set_pred_label(self, value: LABEL_TYPE) -> 'ActionDataSample': + """Set ``pred_label``.""" + self.set_field(format_label(value), 'pred_label', dtype=torch.Tensor) + return self + + def set_pred_score(self, value: SCORE_TYPE) -> 'ActionDataSample': + """Set score of ``pred_label``.""" + score = format_score(value) + self.set_field(score, 'pred_score') + if hasattr(self, 'num_classes'): + assert len(score) == self.num_classes, \ + f'The length of score {len(score)} should be '\ + f'equal to the num_classes {self.num_classes}.' 
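# --- Editorial note -------------------------------------------------------
# Behaviour sketch for format_label above (assumes the mmaction package from
# this patch is importable): ints, sequences and arrays all end up as 1-D
# long tensors, which is what ActionDataSample.set_gt_label stores.
import numpy as np
import torch
from mmaction.structures import ActionDataSample
from mmaction.structures.action_data_sample import format_label

assert format_label(3).tolist() == [3]                      # int -> LongTensor([3])
assert format_label(np.array([1, 2])).dtype == torch.long   # ndarray -> long tensor
sample = ActionDataSample().set_gt_label([0, 5])            # multi-label case
print(sample.gt_label)                                      # tensor([0, 5])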
+ else: + self.set_field( + name='num_classes', value=len(score), field_type='metainfo') + return self + + @property + def proposals(self): + """Property of `proposals`""" + return self._proposals + + @proposals.setter + def proposals(self, value): + """Setter of `proposals`""" + self.set_field(value, '_proposals', dtype=InstanceData) + + @proposals.deleter + def proposals(self): + """Deleter of `proposals`""" + del self._proposals + + @property + def gt_instances(self): + """Property of `gt_instances`""" + return self._gt_instances + + @gt_instances.setter + def gt_instances(self, value): + """Setter of `gt_instances`""" + self.set_field(value, '_gt_instances', dtype=InstanceData) + + @gt_instances.deleter + def gt_instances(self): + """Deleter of `gt_instances`""" + del self._gt_instances + + @property + def features(self): + """Setter of `features`""" + return self._features + + @features.setter + def features(self, value): + """Setter of `features`""" + self.set_field(value, '_features', dtype=InstanceData) + + @features.deleter + def features(self): + """Deleter of `features`""" + del self._features diff --git a/mmaction/structures/bbox/__init__.py b/mmaction/structures/bbox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1e23c58af3e3901644ceac89770374eadbf725 --- /dev/null +++ b/mmaction/structures/bbox/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_target import bbox_target +from .transforms import bbox2result + +__all__ = ['bbox_target', 'bbox2result'] diff --git a/mmaction/structures/bbox/__pycache__/__init__.cpython-310.pyc b/mmaction/structures/bbox/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a96900d3513579ff0d61ecc51c0f39d20971be2e Binary files /dev/null and b/mmaction/structures/bbox/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/structures/bbox/__pycache__/bbox_target.cpython-310.pyc b/mmaction/structures/bbox/__pycache__/bbox_target.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be1ed0b8029acf288ce65e6455739fbe1bf7b455 Binary files /dev/null and b/mmaction/structures/bbox/__pycache__/bbox_target.cpython-310.pyc differ diff --git a/mmaction/structures/bbox/__pycache__/transforms.cpython-310.pyc b/mmaction/structures/bbox/__pycache__/transforms.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08800b8d3ddcc24507ddbdc2067b8afca3667b97 Binary files /dev/null and b/mmaction/structures/bbox/__pycache__/transforms.cpython-310.pyc differ diff --git a/mmaction/structures/bbox/bbox_target.py b/mmaction/structures/bbox/bbox_target.py new file mode 100644 index 0000000000000000000000000000000000000000..649724bb11ca63b56b90bd9d18f61d14325cce84 --- /dev/null +++ b/mmaction/structures/bbox/bbox_target.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import mmengine +import torch +import torch.nn.functional as F + + +def bbox_target(pos_bboxes_list: List[torch.Tensor], + neg_bboxes_list: List[torch.Tensor], + gt_labels: List[torch.Tensor], + cfg: Union[dict, mmengine.ConfigDict]) -> tuple: + """Generate classification targets for bboxes. + + Args: + pos_bboxes_list (List[torch.Tensor]): Positive bboxes list. + neg_bboxes_list (List[torch.Tensor]): Negative bboxes list. + gt_labels (List[torch.Tensor]): Groundtruth classification label list. + cfg (dict | mmengine.ConfigDict): RCNN config. 
+ + Returns: + tuple: Label and label_weight for bboxes. + """ + labels, label_weights = [], [] + pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight + + assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels) + length = len(pos_bboxes_list) + + for i in range(length): + pos_bboxes = pos_bboxes_list[i] + neg_bboxes = neg_bboxes_list[i] + gt_label = gt_labels[i] + + num_pos = pos_bboxes.size(0) + num_neg = neg_bboxes.size(0) + num_samples = num_pos + num_neg + label = F.pad(gt_label, (0, 0, 0, num_neg)) + label_weight = pos_bboxes.new_zeros(num_samples) + label_weight[:num_pos] = pos_weight + label_weight[-num_neg:] = 1. + + labels.append(label) + label_weights.append(label_weight) + + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + return labels, label_weights diff --git a/mmaction/structures/bbox/transforms.py b/mmaction/structures/bbox/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe4d3a0a6335721c3ed12093de8c20fb3d90147 --- /dev/null +++ b/mmaction/structures/bbox/transforms.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +def bbox2result(bboxes: torch.Tensor, + labels: torch.Tensor, + num_classes: int, + thr: float = 0.01) -> list: + """Convert detection results to a list of numpy arrays. + + This identifies single-label classification (as opposed to multi-label) + through the thr parameter which is set to a negative value. + + ToDo: The ideal way would be for this to be automatically set when the + Currently, the way to set this is to set ``test_cfg.rcnn.action_thr=-1.0`` + model cfg uses multilabel=False, however this could be a breaking change + and is left as a future exercise. + NB - this should not interfere with the evaluation in any case. + + Args: + bboxes (torch.Tensor): shape ``(n, 4)``. + labels (torch.Tensor): shape ``(n, num_classes)``. + num_classes (int): class number, including background class. + thr (float): The score threshold used when converting predictions to + detection results. If a single negative value, uses single-label + classification. + Returns: + List(ndarray): bbox results of each class. + """ + if bboxes.shape[0] == 0: + return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32)) + + bboxes = bboxes.cpu().numpy() + scores = labels.cpu().numpy() # rename for clarification + + # Although we can handle single-label classification, we still want scores + assert scores.shape[-1] > 1 + + # Robustly check for multi/single-label: + if not hasattr(thr, '__len__'): + multilabel = thr >= 0 + thr = (thr, ) * num_classes + else: + multilabel = True + + # Check Shape + assert scores.shape[1] == num_classes + assert len(thr) == num_classes + + result = [] + for i in range(num_classes - 1): + if multilabel: + where = (scores[:, i + 1] > thr[i + 1]) + else: + where = (scores[:, 1:].argmax(axis=1) == i) + result.append( + np.concatenate((bboxes[where, :4], scores[where, i + 1:i + 2]), + axis=1)) + return result diff --git a/mmaction/testing/__init__.py b/mmaction/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2be44ffe3f2cf0a528f5c69fd4db116b3f8cc202 --- /dev/null +++ b/mmaction/testing/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
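# --- Editorial note -------------------------------------------------------
# Sketch of the label padding used in bbox_target above (toy shapes,
# illustrative only): positives keep their multi-hot labels, negatives get
# appended all-zero rows, and the label weights mark positives with
# pos_weight and negatives with 1.
import torch
import torch.nn.functional as F

num_pos, num_neg, num_classes = 2, 3, 5
gt_label = torch.tensor([[0., 1., 0., 0., 1.],
                         [1., 0., 0., 1., 0.]])
label = F.pad(gt_label, (0, 0, 0, num_neg))    # (num_pos + num_neg, num_classes)
weight = torch.zeros(num_pos + num_neg)
weight[:num_pos] = 1.0                         # pos_weight
weight[-num_neg:] = 1.0
print(label.shape, weight)                     # torch.Size([5, 5]) tensor([1., 1., 1., 1., 1.])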
+from ._utils import (check_norm_state, generate_backbone_demo_inputs, + generate_detector_demo_inputs, get_audio_recognizer_cfg, + get_cfg, get_detector_cfg, get_localizer_cfg, + get_recognizer_cfg, get_similarity_cfg, + get_skeletongcn_cfg) + +__all__ = [ + 'check_norm_state', 'generate_backbone_demo_inputs', 'get_cfg', + 'get_recognizer_cfg', 'get_audio_recognizer_cfg', 'get_localizer_cfg', + 'get_detector_cfg', 'generate_detector_demo_inputs', 'get_skeletongcn_cfg', + 'get_similarity_cfg' +] diff --git a/mmaction/testing/_utils.py b/mmaction/testing/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f452f6eaf542824626daf4e0fd7837a0bdc786f4 --- /dev/null +++ b/mmaction/testing/_utils.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine +import numpy as np +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +def generate_backbone_demo_inputs(input_shape=(1, 3, 64, 64)): + """Create a superset of inputs needed to run backbone. + + Args: + input_shape (tuple): input batch dimensions. + Defaults to ``(1, 3, 64, 64)``. + """ + imgs = np.random.random(input_shape) + imgs = torch.FloatTensor(imgs) + + return imgs + + +# TODO Remove this API +def generate_recognizer_demo_inputs( + input_shape=(1, 3, 3, 224, 224), model_type='2D'): + """Create a superset of inputs needed to run test or train batches. + + Args: + input_shape (tuple): input batch dimensions. + Default: (1, 250, 3, 224, 224). + model_type (str): Model type for data generation, from {'2D', '3D'}. 
+ Default:'2D' + """ + if len(input_shape) == 5: + (N, L, _, _, _) = input_shape + elif len(input_shape) == 6: + (N, M, _, L, _, _) = input_shape + + imgs = np.random.random(input_shape) + + if model_type == '2D' or model_type == 'skeleton': + gt_labels = torch.LongTensor([2] * N) + elif model_type == '3D': + gt_labels = torch.LongTensor([2] * M) + elif model_type == 'audio': + gt_labels = torch.LongTensor([2] * L) + else: + raise ValueError(f'Data type {model_type} is not available') + + inputs = {'imgs': torch.FloatTensor(imgs), 'gt_labels': gt_labels} + return inputs + + +def generate_detector_demo_inputs( + input_shape=(1, 3, 4, 224, 224), num_classes=81, train=True, + device='cpu'): + num_samples = input_shape[0] + if not train: + assert num_samples == 1 + + def random_box(n): + box = torch.rand(n, 4) * 0.5 + box[:, 2:] += 0.5 + box[:, 0::2] *= input_shape[3] + box[:, 1::2] *= input_shape[4] + if device == 'cuda': + box = box.cuda() + return box + + def random_label(n): + label = torch.randn(n, num_classes) + label = (label > 0.8).type(torch.float32) + label[:, 0] = 0 + if device == 'cuda': + label = label.cuda() + return label + + img = torch.FloatTensor(np.random.random(input_shape)) + if device == 'cuda': + img = img.cuda() + + proposals = [random_box(2) for i in range(num_samples)] + gt_bboxes = [random_box(2) for i in range(num_samples)] + gt_labels = [random_label(2) for i in range(num_samples)] + img_metas = [dict(img_shape=input_shape[-2:]) for i in range(num_samples)] + + if train: + return dict( + img=img, + proposals=proposals, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + img_metas=img_metas) + + return dict(img=[img], proposals=[proposals], img_metas=[img_metas]) + + +def get_cfg(config_type, fname): + """Grab configs necessary to create a recognizer. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config_types = ('recognition', 'recognition_audio', 'localization', + 'detection', 'skeleton', 'retrieval') + assert config_type in config_types + + repo_dpath = osp.dirname(osp.dirname(osp.dirname(__file__))) + config_dpath = osp.join(repo_dpath, 'configs/' + config_type) + config_fpath = osp.join(config_dpath, fname) + if not osp.exists(config_dpath): + raise Exception('Cannot find config path') + config = mmengine.Config.fromfile(config_fpath) + return config + + +def get_recognizer_cfg(fname): + return get_cfg('recognition', fname) + + +def get_audio_recognizer_cfg(fname): + return get_cfg('recognition_audio', fname) + + +def get_localizer_cfg(fname): + return get_cfg('localization', fname) + + +def get_detector_cfg(fname): + return get_cfg('detection', fname) + + +def get_skeletongcn_cfg(fname): + return get_cfg('skeleton', fname) + + +def get_similarity_cfg(fname): + return get_cfg('retrieval', fname) diff --git a/mmaction/utils/__init__.py b/mmaction/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2736dd102a20ac1c411e82922c687cee6d71dc6f --- /dev/null +++ b/mmaction/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
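# --- Editorial note -------------------------------------------------------
# Usage sketch for the testing helpers above (assumes the mmaction package
# from this patch is importable): demo inputs are just random float tensors,
# and check_norm_state verifies whether BatchNorm layers are in the expected
# train/eval state.
import torch.nn as nn
from mmaction.testing import check_norm_state, generate_backbone_demo_inputs

imgs = generate_backbone_demo_inputs((1, 3, 8, 64, 64))     # (N, C, T, H, W)
model = nn.Sequential(nn.Conv3d(3, 8, 3, padding=1), nn.BatchNorm3d(8))
model.eval()
assert check_norm_state(model.modules(), train_state=False)
assert model(imgs).shape == (1, 8, 8, 64, 64)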
+from .collect_env import collect_env +from .gradcam_utils import GradCAM +from .misc import (VideoWriter, frame_extract, get_random_string, get_shm_dir, + get_str_type, get_thread_id) +from .progress import track, track_on_main_process +from .setup_env import register_all_modules +from .typing_utils import * # noqa: F401,F403 + +__all__ = [ + 'collect_env', 'get_random_string', 'get_thread_id', 'get_shm_dir', + 'frame_extract', 'GradCAM', 'register_all_modules', 'VideoWriter', + 'get_str_type', 'track', 'track_on_main_process' +] diff --git a/mmaction/utils/__pycache__/__init__.cpython-310.pyc b/mmaction/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..157c1a823018079d21099f9910a081a30fa04e07 Binary files /dev/null and b/mmaction/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/utils/__pycache__/collect_env.cpython-310.pyc b/mmaction/utils/__pycache__/collect_env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aee8dd66304035492f0f58381f1a394cfe295584 Binary files /dev/null and b/mmaction/utils/__pycache__/collect_env.cpython-310.pyc differ diff --git a/mmaction/utils/__pycache__/gradcam_utils.cpython-310.pyc b/mmaction/utils/__pycache__/gradcam_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1dfffcccdd7e96aa590af1c48a8c42ef7a1f9b4 Binary files /dev/null and b/mmaction/utils/__pycache__/gradcam_utils.cpython-310.pyc differ diff --git a/mmaction/utils/__pycache__/misc.cpython-310.pyc b/mmaction/utils/__pycache__/misc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65e0d13fe9d10a6155c3acfa8d015472fb39cdeb Binary files /dev/null and b/mmaction/utils/__pycache__/misc.cpython-310.pyc differ diff --git a/mmaction/utils/__pycache__/progress.cpython-310.pyc b/mmaction/utils/__pycache__/progress.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6236f12755ceaf068be9f9026916a67e2d86edb Binary files /dev/null and b/mmaction/utils/__pycache__/progress.cpython-310.pyc differ diff --git a/mmaction/utils/__pycache__/setup_env.cpython-310.pyc b/mmaction/utils/__pycache__/setup_env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6894c5312789c2cfdce3b02beb9ebceea3c5c56 Binary files /dev/null and b/mmaction/utils/__pycache__/setup_env.cpython-310.pyc differ diff --git a/mmaction/utils/__pycache__/typing_utils.cpython-310.pyc b/mmaction/utils/__pycache__/typing_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6b081aa3d8eade82f01f076eceb72aa57e8ca46 Binary files /dev/null and b/mmaction/utils/__pycache__/typing_utils.cpython-310.pyc differ diff --git a/mmaction/utils/collect_env.py b/mmaction/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..d2e4314b7f1bb00e6b3c29a93aff39d9e284a88f --- /dev/null +++ b/mmaction/utils/collect_env.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmcv +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_basic_env + +import mmaction + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_basic_env() + env_info['MMAction2'] = ( + mmaction.__version__ + '+' + get_git_hash(digits=7)) + env_info['MMCV'] = (mmcv.__version__) + + try: + import mmdet + env_info['MMDetection'] = (mmdet.__version__) + except ImportError: + pass + + try: + import mmpose + env_info['MMPose'] = (mmpose.__version__) + except ImportError: + pass + + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmaction/utils/dependency.py b/mmaction/utils/dependency.py new file mode 100644 index 0000000000000000000000000000000000000000..61a045ede5c01d9c9bfc299da1a41238fef3720c --- /dev/null +++ b/mmaction/utils/dependency.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import re +from functools import wraps +from inspect import isfunction + +from importlib_metadata import PackageNotFoundError, distribution +from mmengine.utils import digit_version + + +def satisfy_requirement(dep): + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, dep, maxsplit=1) + parts = [p.strip() for p in parts] + package = parts[0] + if len(parts) > 1: + op, version = parts[1:] + op = { + '>=': '__ge__', + '==': '__eq__', + '>': '__gt__', + '<': '__lt__', + '<=': '__le__' + }[op] + else: + op, version = None, None + + try: + dist = distribution(package) + if op is None or getattr(digit_version(dist.version), op)( + digit_version(version)): + return True + except PackageNotFoundError: + pass + + return False + + +def require(dep, install=None): + """A wrapper of function for extra package requirements. + + Args: + dep (str): The dependency package name, like ``transformers`` + or ``transformers>=4.28.0``. + install (str, optional): The installation command hint. Defaults + to None, which means to use "pip install dep". + """ + + def wrapper(fn): + assert isfunction(fn) + + @wraps(fn) + def ask_install(*args, **kwargs): + name = fn.__qualname__.replace('.__init__', '') + ins = install or f'pip install "{dep}"' + raise ImportError( + f'{name} requires {dep}, please install it by `{ins}`.') + + if satisfy_requirement(dep): + fn._verify_require = getattr(fn, '_verify_require', lambda: None) + return fn + + ask_install._verify_require = ask_install + return ask_install + + return wrapper + + +WITH_MULTIMODAL = all( + satisfy_requirement(item) for item in ['transformers>=4.28.0']) + + +def register_multimodal_placeholder(names, registry): + for name in names: + + def ask_install(*args, **kwargs): + raise ImportError( + f'{name} requires extra multi-modal dependencies, please ' + 'install it by `pip install "mmaction2[multimodal]"` ' + 'or `pip install -e ".[multimodal]"`.') + + registry.register_module(name=name, module=ask_install) diff --git a/mmaction/utils/gradcam_utils.py b/mmaction/utils/gradcam_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7841485cb665fd7a83d46eb2081847dea97c99de --- /dev/null +++ b/mmaction/utils/gradcam_utils.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GradCAM: + """GradCAM class helps create visualization results. + + Visualization results are blended by heatmaps and input images. 
+ This class is modified from + https://github.com/facebookresearch/SlowFast/blob/master/slowfast/visualization/gradcam_utils.py # noqa + For more information about GradCAM, please visit: + https://arxiv.org/pdf/1610.02391.pdf + + Args: + model (nn.Module): the recognizer model to be used. + target_layer_name (str): name of convolutional layer to + be used to get gradients and feature maps from for creating + localization maps. + colormap (str): matplotlib colormap used to create + heatmap. Defaults to 'viridis'. For more information, please visit + https://matplotlib.org/3.3.0/tutorials/colors/colormaps.html + """ + + def __init__(self, + model: nn.Module, + target_layer_name: str, + colormap: str = 'viridis') -> None: + from ..models.recognizers import Recognizer2D, Recognizer3D + if isinstance(model, Recognizer2D): + self.is_recognizer2d = True + elif isinstance(model, Recognizer3D): + self.is_recognizer2d = False + else: + raise ValueError( + 'GradCAM utils only support Recognizer2D & Recognizer3D.') + + self.model = model + self.model.eval() + self.target_gradients = None + self.target_activations = None + + import matplotlib.pyplot as plt + self.colormap = plt.get_cmap(colormap) + self._register_hooks(target_layer_name) + + def _register_hooks(self, layer_name: str) -> None: + """Register forward and backward hook to a layer, given layer_name, to + obtain gradients and activations. + + Args: + layer_name (str): name of the layer. + """ + + def get_gradients(module, grad_input, grad_output): + self.target_gradients = grad_output[0].detach() + + def get_activations(module, input, output): + self.target_activations = output.clone().detach() + + layer_ls = layer_name.split('/') + prev_module = self.model + for layer in layer_ls: + prev_module = prev_module._modules[layer] + + target_layer = prev_module + target_layer.register_forward_hook(get_activations) + target_layer.register_backward_hook(get_gradients) + + def _calculate_localization_map(self, + data: dict, + use_labels: bool, + delta=1e-20) -> tuple: + """Calculate localization map for all inputs with Grad-CAM. + + Args: + data (dict): model inputs, generated by test pipeline, + use_labels (bool): Whether to use given labels to generate + localization map. + delta (float): used in localization map normalization, + must be small enough. Please make sure + `localization_map_max - localization_map_min >> delta` + + Returns: + localization_map (torch.Tensor): the localization map for + input imgs. + preds (torch.Tensor): Model predictions with shape + (batch_size, num_classes). 
+ """ + inputs = data['inputs'] + + # use score before softmax + self.model.cls_head.average_clips = 'score' + # model forward & backward + results = self.model.test_step(data) + preds = [result.pred_score for result in results] + preds = torch.stack(preds) + + if use_labels: + labels = [result.gt_label for result in results] + labels = torch.stack(labels) + score = torch.gather(preds, dim=1, index=labels) + else: + score = torch.max(preds, dim=-1)[0] + self.model.zero_grad() + score = torch.sum(score) + score.backward() + + imgs = torch.stack(inputs) + if self.is_recognizer2d: + # [batch_size, num_segments, 3, H, W] + b, t, _, h, w = imgs.size() + else: + # [batch_size, num_crops*num_clips, 3, clip_len, H, W] + b1, b2, _, t, h, w = imgs.size() + b = b1 * b2 + + gradients = self.target_gradients + activations = self.target_activations + if self.is_recognizer2d: + # [B*Tg, C', H', W'] + b_tg, c, _, _ = gradients.size() + tg = b_tg // b + else: + # source shape: [B, C', Tg, H', W'] + _, c, tg, _, _ = gradients.size() + # target shape: [B, Tg, C', H', W'] + gradients = gradients.permute(0, 2, 1, 3, 4) + activations = activations.permute(0, 2, 1, 3, 4) + + # calculate & resize to [B, 1, T, H, W] + weights = torch.mean(gradients.view(b, tg, c, -1), dim=3) + weights = weights.view(b, tg, c, 1, 1) + activations = activations.view([b, tg, c] + + list(activations.size()[-2:])) + localization_map = torch.sum( + weights * activations, dim=2, keepdim=True) + localization_map = F.relu(localization_map) + localization_map = localization_map.permute(0, 2, 1, 3, 4) + localization_map = F.interpolate( + localization_map, + size=(t, h, w), + mode='trilinear', + align_corners=False) + + # Normalize the localization map. + localization_map_min, localization_map_max = ( + torch.min(localization_map.view(b, -1), dim=-1, keepdim=True)[0], + torch.max(localization_map.view(b, -1), dim=-1, keepdim=True)[0]) + localization_map_min = torch.reshape( + localization_map_min, shape=(b, 1, 1, 1, 1)) + localization_map_max = torch.reshape( + localization_map_max, shape=(b, 1, 1, 1, 1)) + localization_map = (localization_map - localization_map_min) / ( + localization_map_max - localization_map_min + delta) + localization_map = localization_map.data + + return localization_map.squeeze(dim=1), preds + + def _alpha_blending(self, localization_map: torch.Tensor, + input_imgs: torch.Tensor, + alpha: float) -> torch.Tensor: + """Blend heatmaps and model input images and get visulization results. + + Args: + localization_map (torch.Tensor): localization map for all inputs, + generated with Grad-CAM. + input_imgs (torch.Tensor): model inputs, raw images. + alpha (float): transparency level of the heatmap, + in the range [0, 1]. + + Returns: + torch.Tensor: blending results for localization map and input + images, with shape [B, T, H, W, 3] and pixel values in + RGB order within range [0, 1]. 
+ """ + # localization_map shape [B, T, H, W] + localization_map = localization_map.cpu() + + # heatmap shape [B, T, H, W, 3] in RGB order + heatmap = self.colormap(localization_map.detach().numpy()) + heatmap = heatmap[..., :3] + heatmap = torch.from_numpy(heatmap) + input_imgs = torch.stack(input_imgs) + # Permute input imgs to [B, T, H, W, 3], like heatmap + if self.is_recognizer2d: + # Recognizer2D input (B, T, C, H, W) + curr_inp = input_imgs.permute(0, 1, 3, 4, 2) + else: + # Recognizer3D input (B', num_clips*num_crops, C, T, H, W) + # B = B' * num_clips * num_crops + curr_inp = input_imgs.view([-1] + list(input_imgs.size()[2:])) + curr_inp = curr_inp.permute(0, 2, 3, 4, 1) + + # renormalize input imgs to [0, 1] + curr_inp = curr_inp.cpu().float() + curr_inp /= 255. + + # alpha blending + blended_imgs = alpha * heatmap + (1 - alpha) * curr_inp + + return blended_imgs + + def __call__(self, + data: dict, + use_labels: bool = False, + alpha: float = 0.5) -> tuple: + """Visualize the localization maps on their corresponding inputs as + heatmap, using Grad-CAM. + + Generate visualization results for **ALL CROPS**. + For example, for I3D model, if `clip_len=32, num_clips=10` and + use `ThreeCrop` in test pipeline, then for every model inputs, + there are 960(32*10*3) images generated. + + Args: + data (dict): model inputs, generated by test pipeline. + use_labels (bool): Whether to use given labels to generate + localization map. + alpha (float): transparency level of the heatmap, + in the range [0, 1]. + + Returns: + blended_imgs (torch.Tensor): Visualization results, blended by + localization maps and model inputs. + preds (torch.Tensor): Model predictions for inputs. + """ + + # localization_map shape [B, T, H, W] + # preds shape [batch_size, num_classes] + localization_map, preds = self._calculate_localization_map( + data, use_labels=use_labels) + + # blended_imgs shape [B, T, H, W, 3] + blended_imgs = self._alpha_blending(localization_map, data['inputs'], + alpha) + + # blended_imgs shape [B, T, H, W, 3] + # preds shape [batch_size, num_classes] + # Recognizer2D: B = batch_size, T = num_segments + # Recognizer3D: B = batch_size * num_crops * num_clips, T = clip_len + return blended_imgs, preds diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..c58963df4e973a6ca49567f743d61b0a5d14bbc7 --- /dev/null +++ b/mmaction/utils/misc.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import ctypes +import inspect +import os +import os.path as osp +import random +import string +from types import FunctionType, ModuleType +from typing import Optional, Union + +import cv2 +import mmcv +import numpy as np + + +def get_random_string(length: int = 15) -> str: + """Get random string with letters and digits. + + Args: + length (int): Length of random string. Defaults to 15. + """ + return ''.join( + random.choice(string.ascii_letters + string.digits) + for _ in range(length)) + + +def get_thread_id() -> int: + """Get current thread id.""" + # use ctype to find thread id + thread_id = ctypes.CDLL('libc.so.6').syscall(186) + return thread_id + + +def get_shm_dir() -> str: + """Get shm dir for temporary usage.""" + return '/dev/shm' + + +def frame_extract(video_path: str, + short_side: Optional[int] = None, + out_dir: str = './tmp'): + """Extract frames given video_path. + + Args: + video_path (str): The video path. + short_side (int): Target short-side of the output image. 
+ Defaults to None, means keeping original shape. + out_dir (str): The output directory. Defaults to ``'./tmp'``. + """ + # Load the video, extract frames into OUT_DIR/video_name + target_dir = osp.join(out_dir, osp.basename(osp.splitext(video_path)[0])) + os.makedirs(target_dir, exist_ok=True) + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') + assert osp.exists(video_path), f'file not exit {video_path}' + vid = cv2.VideoCapture(video_path) + frames = [] + frame_paths = [] + flag, frame = vid.read() + cnt = 0 + new_h, new_w = None, None + while flag: + if short_side is not None: + if new_h is None: + h, w, _ = frame.shape + new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) + frame = mmcv.imresize(frame, (new_w, new_h)) + + frames.append(frame) + frame_path = frame_tmpl.format(cnt + 1) + frame_paths.append(frame_path) + + cv2.imwrite(frame_path, frame) + cnt += 1 + flag, frame = vid.read() + + return frame_paths, frames + + +class VideoWriter(): + + def __init__(self, video_file, fps): + self.video_file = video_file + self.fps = fps + if video_file.endswith('.mp4'): + self.fourcc = 'mp4v' + elif video_file.endswith('.avi'): + self.fourcc = 'XVID' + + out_dir = osp.dirname(osp.abspath(self.video_file)) + if not osp.exists(out_dir): + os.makedirs(out_dir, exist_ok=True) + + def _init_cv2_writer(self, frame): + from cv2 import VideoWriter, VideoWriter_fourcc + height, width = frame.shape[:2] + resolution = (width, height) + self.writer = VideoWriter(self.video_file, + VideoWriter_fourcc(*self.fourcc), self.fps, + resolution) + + def write_frame(self, frame): + if not getattr(self, 'writer', None): + self._init_cv2_writer(frame) + self.writer.write(frame) + + def release(self): + self.writer.release() + + def __enter__(self): + return self + + def __exit__(self, type, value, trace): + self.release() + + +def get_str_type(module: Union[str, ModuleType, FunctionType]) -> str: + """Return the string type name of module. + + Args: + module (str | ModuleType | FunctionType): + The target module class + + Returns: + Class name of the module + """ + if isinstance(module, str): + str_type = module + elif inspect.isclass(module) or inspect.isfunction(module): + str_type = module.__name__ + else: + return None + + return str_type diff --git a/mmaction/utils/progress.py b/mmaction/utils/progress.py new file mode 100644 index 0000000000000000000000000000000000000000..04e403befd45bf6571ffed5913fac53b2d04ca26 --- /dev/null +++ b/mmaction/utils/progress.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
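+# Usage sketch (illustrative only, not part of the original module): `track`
+# wraps an iterable with a rich progress bar and can be disabled globally via
+# the module-level `disable_progress_bar` flag, e.g.
+#
+#     from mmaction.utils import track
+#     for sample in track(range(100), description='processing'):
+#         pass  # do per-item work here
+#
+# `track_on_main_process` behaves the same way, but only renders the bar on
+# the main process of a distributed run.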
+from typing import Optional + +import mmengine.dist as dist +import rich.progress as progress +from rich.live import Live + +disable_progress_bar = False +global_progress = progress.Progress( + '{task.description}', + progress.BarColumn(), + progress.TaskProgressColumn(show_speed=True), + progress.TimeRemainingColumn(), +) +global_live = Live(global_progress, refresh_per_second=10) + + +def track(sequence, description: str = '', total: Optional[float] = None): + if disable_progress_bar: + yield from sequence + else: + global_live.start() + task_id = global_progress.add_task(description, total=total) + task = global_progress._tasks[task_id] + try: + yield from global_progress.track(sequence, task_id=task_id) + finally: + if task.total is None: + global_progress.update(task_id, total=task.completed) + if all(task.finished for task in global_progress.tasks): + global_live.stop() + for task_id in global_progress.task_ids: + global_progress.remove_task(task_id) + + +def track_on_main_process(sequence, description='', total=None): + if not dist.is_main_process() or disable_progress_bar: + yield from sequence + else: + yield from track(sequence, total=total, description=description) diff --git a/mmaction/utils/setup_env.py b/mmaction/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..1153469265914249bb547258bf99e0b1cf5492b2 --- /dev/null +++ b/mmaction/utils/setup_env.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True) -> None: + """Register all modules in mmaction into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmaction default + scope. If True, the global default scope will be set to `mmaction`, + and all registries will build modules from mmaction's registry + node. To understand more about the registry, please refer to + https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ + import mmaction.datasets # noqa: F401,F403 + import mmaction.engine # noqa: F401,F403 + import mmaction.evaluation # noqa: F401,F403 + import mmaction.models # noqa: F401,F403 + import mmaction.structures # noqa: F401,F403 + import mmaction.visualization # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmaction') + if never_created: + DefaultScope.get_instance('mmaction', scope_name='mmaction') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmaction': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmaction", ' + '`register_all_modules` will force set the current' + 'default scope to "mmaction". If this is not as ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmaction-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmaction') diff --git a/mmaction/utils/typing_utils.py b/mmaction/utils/typing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a2c1cb71f23a4c743a999dba3ad40c84df38dca0 --- /dev/null +++ b/mmaction/utils/typing_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
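+# Illustrative example (an assumption, not taken from the original file): the
+# aliases below are intended for annotations such as
+#     def predict(self, inputs: torch.Tensor,
+#                 data_samples: SampleList) -> SampleList: ...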
+"""Collecting some commonly used type hint in mmaction.""" +from typing import Dict, List, Optional, Tuple, Union + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData, LabelData + +from mmaction.structures import ActionDataSample + +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +InstanceList = List[InstanceData] +OptInstanceList = Optional[InstanceList] + +LabelList = List[LabelData] +OptLabelList = Optional[LabelList] + +SampleList = List[ActionDataSample] +OptSampleList = Optional[SampleList] + +ForwardResults = Union[Dict[str, torch.Tensor], List[ActionDataSample], + Tuple[torch.Tensor], torch.Tensor] + + +class SamplingResult: + """Dummy :class:`SamplingResult` in mmdet.""" + + def __init__(self, *args, **kwargs): + pass diff --git a/mmaction/version.py b/mmaction/version.py new file mode 100644 index 0000000000000000000000000000000000000000..3cff4e8e6893591a60b966c85d3aec353b9bfd45 --- /dev/null +++ b/mmaction/version.py @@ -0,0 +1,26 @@ +# Copyright (c) Open-MMLab. All rights reserved. + +__version__ = '1.2.0' + + +def parse_version_info(version_str: str): + """Parse a version string into a tuple. + + Args: + version_str (str): The version string. + Returns: + tuple[int or str]: The version info, e.g., "1.3.0" is parsed into + (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). + """ + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/mmaction/visualization/__init__.py b/mmaction/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a4614aa285e08ede042a84f4f99b6845cf276769 --- /dev/null +++ b/mmaction/visualization/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .action_visualizer import ActionVisualizer +from .video_backend import (LocalVisBackend, TensorboardVisBackend, + WandbVisBackend) + +__all__ = [ + 'ActionVisualizer', 'LocalVisBackend', 'WandbVisBackend', + 'TensorboardVisBackend' +] diff --git a/mmaction/visualization/__pycache__/__init__.cpython-310.pyc b/mmaction/visualization/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f375ac56c33becae0f98d8c1520d33c9a07db33e Binary files /dev/null and b/mmaction/visualization/__pycache__/__init__.cpython-310.pyc differ diff --git a/mmaction/visualization/__pycache__/action_visualizer.cpython-310.pyc b/mmaction/visualization/__pycache__/action_visualizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31a78b1e24ffaec5426c2eaa9444056a75a13643 Binary files /dev/null and b/mmaction/visualization/__pycache__/action_visualizer.cpython-310.pyc differ diff --git a/mmaction/visualization/__pycache__/video_backend.cpython-310.pyc b/mmaction/visualization/__pycache__/video_backend.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7995b90a18e09ad1ce1939fe8c8a786d2b116b1c Binary files /dev/null and b/mmaction/visualization/__pycache__/video_backend.cpython-310.pyc differ diff --git a/mmaction/visualization/action_visualizer.py b/mmaction/visualization/action_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..3a695afa9ee0db3709dfec0085f66aea9588eb97 --- /dev/null +++ b/mmaction/visualization/action_visualizer.py @@ -0,0 +1,315 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmengine.dist import master_only +from mmengine.fileio.io import isdir, isfile, join_path, list_dir_or_file +from mmengine.visualization import Visualizer + +from mmaction.registry import VISBACKENDS, VISUALIZERS +from mmaction.structures import ActionDataSample + + +def _get_adaptive_scale(img_shape: Tuple[int, int], + min_scale: float = 0.3, + max_scale: float = 3.0) -> float: + """Get adaptive scale according to frame shape. + + The target scale depends on the the short edge length of the frame. If the + short edge length equals 224, the output is 1.0. And output linear scales + according the short edge length. + + You can also specify the minimum scale and the maximum scale to limit the + linear scale. + + Args: + img_shape (Tuple[int, int]): The shape of the canvas frame. + min_size (int): The minimum scale. Defaults to 0.3. + max_size (int): The maximum scale. Defaults to 3.0. + + Returns: + int: The adaptive scale. + """ + short_edge_length = min(img_shape) + scale = short_edge_length / 224. + return min(max(scale, min_scale), max_scale) + + +@VISUALIZERS.register_module() +class ActionVisualizer(Visualizer): + """Universal Visualizer for classification task. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + fig_save_cfg (dict): Keyword parameters of figure for saving. + Defaults to empty dict. + fig_show_cfg (dict): Keyword parameters of figure for showing. + Defaults to empty dict. 
+ + Examples: + >>> import torch + >>> import decord + >>> from pathlib import Path + >>> from mmaction.structures import ActionDataSample, ActionVisualizer + >>> from mmengine.structures import LabelData + >>> # Example frame + >>> video = decord.VideoReader('./demo/demo.mp4') + >>> video = video.get_batch(range(32)).asnumpy() + >>> # Example annotation + >>> data_sample = ActionDataSample() + >>> data_sample.gt_label = LabelData(item=torch.tensor([2])) + >>> # Setup the visualizer + >>> vis = ActionVisualizer( + ... save_dir="./outputs", + ... vis_backends=[dict(type='LocalVisBackend')]) + >>> # Set classes names + >>> vis.dataset_meta = {'classes': ['running', 'standing', 'sitting']} + >>> # Save the visualization result by the specified storage backends. + >>> vis.add_datasample('demo', video) + >>> assert Path('outputs/vis_data/demo/frames_0/1.png').exists() + >>> assert Path('outputs/vis_data/demo/frames_0/2.png').exists() + >>> # Save another visualization result with the same name. + >>> vis.add_datasample('demo', video, step=1) + >>> assert Path('outputs/vis_data/demo/frames_1/2.png').exists() + """ + + def __init__( + self, + name='visualizer', + vis_backends: Optional[List[Dict]] = None, + save_dir: Optional[str] = None, + fig_save_cfg=dict(frameon=False), + fig_show_cfg=dict(frameon=False) + ) -> None: + super().__init__( + name=name, + image=None, + vis_backends=vis_backends, + save_dir=save_dir, + fig_save_cfg=fig_save_cfg, + fig_show_cfg=fig_show_cfg) + + def _load_video(self, + video: Union[np.ndarray, Sequence[np.ndarray], str], + target_resolution: Optional[Tuple[int]] = None): + """Load video from multiple source and convert to target resolution. + + Args: + video (np.ndarray, str): The video to draw. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. + """ + if isinstance(video, np.ndarray) or isinstance(video, list): + frames = video + elif isinstance(video, str): + # video file path + if isfile(video): + try: + import decord + except ImportError: + raise ImportError( + 'Please install decord to load video file.') + video = decord.VideoReader(video) + frames = [x.asnumpy()[..., ::-1] for x in video] + # rawframes folder path + elif isdir(video): + frame_list = sorted(list_dir_or_file(video, list_dir=False)) + frames = [mmcv.imread(join_path(video, x)) for x in frame_list] + else: + raise TypeError(f'type of video {type(video)} not supported') + + if target_resolution is not None: + w, h = target_resolution + frame_h, frame_w, _ = frames[0].shape + if w == -1: + w = int(h / frame_h * frame_w) + if h == -1: + h = int(w / frame_w * frame_h) + frames = [mmcv.imresize(f, (w, h)) for f in frames] + + return frames + + @master_only + def add_datasample(self, + name: str, + video: Union[np.ndarray, Sequence[np.ndarray], str], + data_sample: Optional[ActionDataSample] = None, + draw_gt: bool = True, + draw_pred: bool = True, + draw_score: bool = True, + rescale_factor: Optional[float] = None, + show_frames: bool = False, + text_cfg: dict = dict(), + wait_time: float = 0.1, + out_path: Optional[str] = None, + out_type: str = 'img', + target_resolution: Optional[Tuple[int]] = None, + step: int = 0, + fps: int = 4) -> None: + """Draw datasample and save to all backends. + + - If ``out_path`` is specified, all storage backends are ignored + and save the videos to the ``out_path``. 
+ - If ``show_frames`` is True, plot the frames in a window sequentially, + please confirm you are able to access the graphical interface. + + Args: + name (str): The frame identifier. + video (np.ndarray, str): The video to draw. supports decoded + np.ndarray, video file path, rawframes folder path. + data_sample (:obj:`ActionDataSample`, optional): The annotation of + the frame. Defaults to None. + draw_gt (bool): Whether to draw ground truth labels. + Defaults to True. + draw_pred (bool): Whether to draw prediction labels. + Defaults to True. + draw_score (bool): Whether to draw the prediction scores + of prediction categories. Defaults to True. + rescale_factor (float, optional): Rescale the frame by the rescale + factor before visualization. Defaults to None. + show_frames (bool): Whether to display the frames of the video. + Defaults to False. + text_cfg (dict): Extra text setting, which accepts + arguments of :attr:`mmengine.Visualizer.draw_texts`. + Defaults to an empty dict. + wait_time (float): Delay in seconds. 0 is the special + value that means "forever". Defaults to 0.1. + out_path (str, optional): Extra folder to save the visualization + result. If specified, the visualizer will only save the result + frame to the out_path and ignore its storage backends. + Defaults to None. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. + step (int): Global step value to record. Defaults to 0. + fps (int): Frames per second for saving video. Defaults to 4. + """ + classes = None + video = self._load_video(video, target_resolution) + tol_video = len(video) + + if self.dataset_meta is not None: + classes = self.dataset_meta.get('classes', None) + + if data_sample is None: + data_sample = ActionDataSample() + + resulted_video = [] + for frame_idx, frame in enumerate(video): + frame_name = 'frame %d of %s' % (frame_idx + 1, name) + if rescale_factor is not None: + frame = mmcv.imrescale(frame, rescale_factor) + + texts = ['Frame %d of total %d frames' % (frame_idx, tol_video)] + self.set_image(frame) + + if draw_gt and 'gt_labels' in data_sample: + gt_labels = data_sample.gt_label + idx = gt_labels.tolist() + class_labels = [''] * len(idx) + if classes is not None: + class_labels = [f' ({classes[i]})' for i in idx] + labels = [ + str(idx[i]) + class_labels[i] for i in range(len(idx)) + ] + prefix = 'Ground truth: ' + texts.append(prefix + ('\n' + ' ' * len(prefix)).join(labels)) + + if draw_pred and 'pred_labels' in data_sample: + pred_labels = data_sample.pred_labels + idx = pred_labels.item.tolist() + score_labels = [''] * len(idx) + class_labels = [''] * len(idx) + if draw_score and 'score' in pred_labels: + score_labels = [ + f', {pred_labels.score[i].item():.2f}' for i in idx + ] + + if classes is not None: + class_labels = [f' ({classes[i]})' for i in idx] + + labels = [ + str(idx[i]) + score_labels[i] + class_labels[i] + for i in range(len(idx)) + ] + prefix = 'Prediction: ' + texts.append(prefix + ('\n' + ' ' * len(prefix)).join(labels)) + + img_scale = _get_adaptive_scale(frame.shape[:2]) + _text_cfg = { + 'positions': + np.array([(img_scale * 5, ) * 2]).astype(np.int32), + 'font_sizes': int(img_scale * 7), + 'font_families': 'monospace', + 'colors': 'white', + 'bboxes': dict(facecolor='black', alpha=0.5, 
boxstyle='Round'), + } + _text_cfg.update(text_cfg) + self.draw_texts('\n'.join(texts), **_text_cfg) + drawn_img = self.get_image() + resulted_video.append(drawn_img) + + if show_frames: + frame_wait_time = 1. / fps + for frame_idx, drawn_img in enumerate(resulted_video): + frame_name = 'frame %d of %s' % (frame_idx + 1, name) + if frame_idx < len(resulted_video) - 1: + wait_time = frame_wait_time + else: + wait_time = wait_time + self.show( + drawn_img[:, :, ::-1], + win_name=frame_name, + wait_time=wait_time) + + resulted_video = np.array(resulted_video) + if out_path is not None: + save_dir, save_name = osp.split(out_path) + vis_backend_cfg = dict(type='LocalVisBackend', save_dir=save_dir) + tmp_local_vis_backend = VISBACKENDS.build(vis_backend_cfg) + tmp_local_vis_backend.add_video( + save_name, + resulted_video, + step=step, + fps=fps, + out_type=out_type) + else: + self.add_video( + name, resulted_video, step=step, fps=fps, out_type=out_type) + return resulted_video + + @master_only + def add_video( + self, + name: str, + image: np.ndarray, + step: int = 0, + fps: int = 4, + out_type: str = 'img', + ) -> None: + """Record the image. + + Args: + name (str): The image identifier. + image (np.ndarray, optional): The image to be saved. The format + should be RGB. Default to None. + step (int): Global step value to record. Default to 0. + fps (int): Frames per second for saving video. Defaults to 4. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_video( + name, image, step=step, fps=fps, + out_type=out_type) # type: ignore diff --git a/mmaction/visualization/video_backend.py b/mmaction/visualization/video_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..9cc549d1a99be377fbcf66730b74b310ed94816f --- /dev/null +++ b/mmaction/visualization/video_backend.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +from typing import Optional + +import cv2 +import numpy as np +from mmengine.visualization import (LocalVisBackend, TensorboardVisBackend, + WandbVisBackend) +from mmengine.visualization.vis_backend import force_init_env + +from mmaction.registry import VISBACKENDS + +try: + import wandb +except ImportError: + pass + + +@VISBACKENDS.register_module() +class LocalVisBackend(LocalVisBackend): + """Local visualization backend class with video support. + + See mmengine.visualization.LocalVisBackend for more details. + """ + + @force_init_env + def add_video(self, + name: str, + frames: np.ndarray, + step: int = 0, + fps: Optional[int] = 4, + out_type: Optional[int] = 'img', + **kwargs) -> None: + """Record the frames of a video to disk. + + Args: + name (str): The video identifier (frame folder). + frames (np.ndarray): The frames to be saved. The format + should be RGB. The shape should be (T, H, W, C). + step (int): Global step value to record. Defaults to 0. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + fps (int): Frames per second for saving video. Defaults to 4. 
+ """ + assert frames.dtype == np.uint8 + + if out_type == 'img': + frames_dir = osp.join(self._save_dir, name, f'frames_{step}') + os.makedirs(frames_dir, exist_ok=True) + for idx, frame in enumerate(frames): + drawn_image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + save_file_name = f'{idx}.png' + cv2.imwrite(osp.join(frames_dir, save_file_name), drawn_image) + else: + try: + from moviepy.editor import ImageSequenceClip + except ImportError: + raise ImportError('Please install moviepy to enable ' + 'output file.') + + frames = [x[..., ::-1] for x in frames] + video_clips = ImageSequenceClip(frames, fps=fps) + name = osp.splitext(name)[0] + if out_type == 'gif': + out_path = osp.join(self._save_dir, name + '.gif') + video_clips.write_gif(out_path, logger=None) + elif out_type == 'video': + out_path = osp.join(self._save_dir, name + '.mp4') + video_clips.write_videofile( + out_path, remove_temp=True, logger=None) + + +@VISBACKENDS.register_module() +class WandbVisBackend(WandbVisBackend): + """Wandb visualization backend class with video support. See + mmengine.visualization.WandbVisBackend for more details. + + Note that this requires the ``wandb`` and ``moviepy`` package. A wandb + account login is also required at ``https://wandb.ai/authorize``. + """ + + @force_init_env + def add_video(self, + name: str, + frames: np.ndarray, + fps: int = 4, + **kwargs) -> None: + """Record the frames of a video to wandb. + + Note that this requires the ``moviepy`` package. + + Args: + name (str): The video identifier (frame folder). + frames (np.ndarray): The frames to be saved. The format + should be RGB. The shape should be (T, H, W, C). + step is a useless parameter that Wandb does not need. + fps (int): Frames per second. Defaults to 4. + """ + frames = frames.transpose(0, 3, 1, 2) + self._wandb.log({'video': wandb.Video(frames, fps=fps, format='gif')}) + + +@VISBACKENDS.register_module() +class TensorboardVisBackend(TensorboardVisBackend): + """Tensorboard visualization backend class with video support. See + mmengine.visualization.TensorboardVisBackend for more details. + + Note that this requires the ``future`` and ``tensorboard`` package. + """ + + @force_init_env + def add_video(self, + name: str, + frames: np.ndarray, + step: int = 0, + fps: int = 4, + **kwargs) -> None: + """Record the frames of a video to tensorboard. + + Note that this requires the ``moviepy`` package. + + Args: + name (str): The video identifier (frame folder). + frames (np.ndarray): The frames to be saved. The format + should be RGB. The shape should be (T, H, W, C). + step (int): Global step value to record. Defaults to 0. + fps (int): Frames per second. Defaults to 4. 
+ """ + frames = frames.transpose(0, 3, 1, 2) + frames = frames.reshape(1, *frames.shape) + self._tensorboard.add_video(name, frames, global_step=step, fps=fps) diff --git a/model-index.yml b/model-index.yml new file mode 100644 index 0000000000000000000000000000000000000000..d35cdf44f9fcc9d4d41940f49d720651f4035ec7 --- /dev/null +++ b/model-index.yml @@ -0,0 +1,36 @@ +Import: + - configs/detection/acrn/metafile.yml + - configs/detection/lfb/metafile.yml + - configs/detection/slowfast/metafile.yml + - configs/detection/slowonly/metafile.yml + - configs/detection/videomae/metafile.yml + - configs/recognition/c2d/metafile.yml + - configs/recognition/c3d/metafile.yml + - configs/recognition/csn/metafile.yml + - configs/recognition/i3d/metafile.yml + - configs/recognition/mvit/metafile.yml + - configs/recognition/omnisource/metafile.yml + - configs/recognition/r2plus1d/metafile.yml + - configs/recognition/slowfast/metafile.yml + - configs/recognition/slowonly/metafile.yml + - configs/recognition/swin/metafile.yml + - configs/recognition/tanet/metafile.yml + - configs/recognition/timesformer/metafile.yml + - configs/recognition/tin/metafile.yml + - configs/recognition/tpn/metafile.yml + - configs/recognition/trn/metafile.yml + - configs/recognition/tsm/metafile.yml + - configs/recognition/tsn/metafile.yml + - configs/recognition/uniformer/metafile.yml + - configs/recognition/uniformerv2/metafile.yml + - configs/recognition/videomae/metafile.yml + - configs/recognition/videomaev2/metafile.yml + - configs/recognition/x3d/metafile.yml + - configs/recognition_audio/resnet/metafile.yml + - configs/localization/bmn/metafile.yml + - configs/localization/bsn/metafile.yml + - configs/retrieval/clip4clip/metafile.yml + - configs/skeleton/2s-agcn/metafile.yml + - configs/skeleton/posec3d/metafile.yml + - configs/skeleton/stgcn/metafile.yml + - configs/skeleton/stgcnpp/metafile.yml diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8fcf90170057548748c9ed4e093881ae0c97d09a --- /dev/null +++ b/projects/README.md @@ -0,0 +1,17 @@ +# Welcome to Projects of MMAction2 + +In this folder, we welcome all contributions of deep-learning video understanding models from the community. + +Here, these requirements, e.g., code standards, are not as strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from the community to make MMAction2 greater. + +Here is an [example project](./example_project) about how to add your algorithms easily. + +We also provide some documentation listed below: + +- [Contribution Guide](https://mmaction2.readthedocs.io/en/latest/get_started/contribution_guide.html) + + The guides for new contributors about how to add your projects to MMAction2. + +- [Discussions](https://github.com/open-mmlab/mmaction2/discussions) + + Welcome to start a discussion! diff --git a/projects/actionclip/README.md b/projects/actionclip/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7470963de970181e959581744ae6180f7a964880 --- /dev/null +++ b/projects/actionclip/README.md @@ -0,0 +1,186 @@ +# ActionCLIP Project + +[ActionCLIP: A New Paradigm for Video Action Recognition](https://arxiv.org/abs/2109.08472) + + + +## Abstract + + + +The canonical approach to video action recognition dictates a neural model to do a classic and standard 1-of-N majority vote task. 
They are trained to predict a fixed set of predefined categories, limiting their transferable ability on new datasets with unseen concepts. In this paper, we provide a new perspective on action recognition by attaching importance to the semantic information of label texts rather than simply mapping them into numbers. Specifically, we model this task as a video-text matching problem within a multimodal learning framework, which strengthens the video representation with more semantic language supervision and enables our model to do zero-shot action recognition without any further labeled data or parameters requirements. Moreover, to handle the deficiency of label texts and make use of tremendous web data, we propose a new paradigm based on this multimodal learning framework for action recognition, which we dub "pre-train, prompt and fine-tune". This paradigm first learns powerful representations from pre-training on a large amount of web image-text or video-text data. Then it makes the action recognition task to act more like pre-training problems via prompt engineering. Finally, it end-to-end fine-tunes on target datasets to obtain strong performance. We give an instantiation of the new paradigm, ActionCLIP, which not only has superior and flexible zero-shot/few-shot transfer ability but also reaches a top performance on general action recognition task, achieving 83.8% top-1 accuracy on Kinetics-400 with a ViT-B/16 as the backbone. + + + +
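+To make the video-text matching formulation above concrete, the snippet below is a minimal, hypothetical sketch: the embeddings are random placeholders rather than outputs of the ActionCLIP encoders. Runnable end-to-end examples are given in the Zero-Shot Prediction section below.
+
+```python
+import torch
+
+
+def match_video_to_labels(video_emb: torch.Tensor,
+                          text_embs: torch.Tensor) -> torch.Tensor:
+    """Score one video embedding against one text embedding per label."""
+    video_emb = video_emb / video_emb.norm(dim=-1, keepdim=True)
+    text_embs = text_embs / text_embs.norm(dim=-1, keepdim=True)
+    # scaled cosine similarity followed by softmax over the label texts
+    return (100 * video_emb @ text_embs.T).softmax(dim=-1)
+
+
+# placeholder embeddings for one video and three prompted label texts,
+# e.g. 'The woman is dancing'
+video_emb = torch.randn(1, 512)
+text_embs = torch.randn(3, 512)
+print(match_video_to_labels(video_emb, text_embs))  # shape (1, 3), sums to 1
+```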
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. Run the following command to install `clip`. + +```shell +pip install git+https://github.com/openai/CLIP.git +``` + +Assume that you are located at `$MMACTION2/projects/actionclip`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### Kinetics400 + +| frame sampling strategy | backbone | top1 acc | top5 acc | testing protocol | config | ckpt | +| :---------------------: | :------: | :------: | :------: | :----------------: | :------------------------------------------------------------------: | :-----------------------------------------------------------------: | +| 1x1x8 | ViT-B/32 | 77.6 | 93.8 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth)\[1\] | +| 1x1x8 | ViT-B/16 | 80.3 | 95.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb/vit-b-16-8f.pth)\[1\] | +| 1x1x16 | ViT-B/16 | 81.1 | 95.6 | 16 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb/vit-b-16-16f.pth)\[1\] | +| 1x1x32 | ViT-B/16 | 81.3 | 95.8 | 32 clips x 1 crop | 
[config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb/vit-b-16-32f.pth)\[1\] | + +\[1\] The models are ported from the repo [ActionCLIP](https://github.com/sallymmx/ActionCLIP) and tested on our data. Currently, we only support the testing of ActionCLIP models. Due to the variation in testing data, our reported test accuracy differs from that of the original repository (on average, it is lower by one point). Please refer to this [issue](https://github.com/sallymmx/ActionCLIP/issues/14) for more details. + +### Kinetics400 (Trained on Our K400 dataset) + +| frame sampling strategy | gpus | backbone | top1 acc | top5 acc | testing protocol | config | ckpt | log | +| :---------------------: | :--: | :------: | :------: | :------: | :---------------: | :-------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| 1x1x8 | 8 | ViT-B/32 | 77.5 | 93.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb_20230801-8535b794.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.log) | +| 1x1x8 | 8 | ViT-B/16 | 81.3 | 95.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb_20230801-b307a0cd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.log) | + +## Zero-Shot Prediction + +We offer two methods for zero-shot prediction as follows. The `test.mp4` can be downloaded from [here](https://github-production-user-asset-6210df.s3.amazonaws.com/58767402/237333525-89ebee9a-573e-4e27-9047-0ad6422fa82f.mp4). 
+ +### Using Naive Pytorch + +```python +import torch +import clip +from models.load import init_actionclip +from mmaction.utils import register_all_modules + +register_all_modules(True) + +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = init_actionclip('ViT-B/32-8', device=device) + +video_anno = dict(filename='test.mp4', start_index=0) +video = preprocess(video_anno).unsqueeze(0).to(device) + +template = 'The woman is {}' +labels = ['singing', 'dancing', 'performing'] +text = clip.tokenize([template.format(label) for label in labels]).to(device) + +with torch.no_grad(): + video_features = model.encode_video(video) + text_features = model.encode_text(text) + +video_features /= video_features.norm(dim=-1, keepdim=True) +text_features /= text_features.norm(dim=-1, keepdim=True) +similarity = (100 * video_features @ text_features.T).softmax(dim=-1) +probs = similarity.cpu().numpy() + +print("Label probs:", probs) # [[9.995e-01 5.364e-07 6.666e-04]] +``` + +### Using MMAction2 APIs + +```python +import mmengine +import torch +from mmaction.utils import register_all_modules +from mmaction.apis import inference_recognizer, init_recognizer + +register_all_modules(True) + +config_path = 'configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py' +checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth' +template = 'The woman is {}' +labels = ['singing', 'dancing', 'performing'] + +# Update the labels, the default is the label list of K400. +config = mmengine.Config.fromfile(config_path) +config.model.labels_or_label_file = labels +config.model.template = template + +device = "cuda" if torch.cuda.is_available() else "cpu" +model = init_recognizer(config=config, checkpoint=checkpoint_path, device=device) + +pred_result = inference_recognizer(model, 'test.mp4') +probs = pred_result.pred_score.cpu().numpy() +print("Label probs:", probs) # [9.995e-01 5.364e-07 6.666e-04] +``` + +## Citation + + + +```bibtex +@article{wang2021actionclip, + title={Actionclip: A new paradigm for video action recognition}, + author={Wang, Mengmeng and Xing, Jiazheng and Liu, Yong}, + journal={arXiv preprint arXiv:2109.08472}, + year={2021} +} +``` diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..39b8d1100d8f32236d4d33f1124b651307832899 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 16 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', 
scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..9b78fd201167251ce27a2fc9699e52e53f0a11f0 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 32 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d29281b9694b7891f1bd7f0d5a77d37c9392bc84 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', 
scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a585b44e3112a6aa00ae3efaacac22dfa4f520ec --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py @@ -0,0 +1,162 @@ +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + to_float32=True, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400/': 's3://openmmlab/datasets/action/Kinetics400/'})) + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', clip_len=1, frame_interval=1, num_clips=num_segs), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, .875, .75, .66), + random_crop=False, + num_fixed_crops=13, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + 
dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-6, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.2), + paramwise_cfg=dict(custom_keys=dict(adapter=dict(lr_mult=10)))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=45, + eta_min=0, + by_epoch=True, + begin=5, + end=50, + convert_to_iter_based=True) +] + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=100, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5e127429427fa37ea7144c20d35c404f7fb55ea2 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/32', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..bb38ebfb2ae413d78004aa1739ee7ea949e7816e --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py @@ -0,0 +1,162 @@ +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/32', + num_adapter_segs=num_segs, + num_adapter_layers=6, + to_float32=True, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400/': 's3://openmmlab/datasets/action/Kinetics400/'})) + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', clip_len=1, frame_interval=1, num_clips=num_segs), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, .875, .75, .66), + random_crop=False, + num_fixed_crops=13, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') 
+test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-6, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.2), + paramwise_cfg=dict(custom_keys=dict(adapter=dict(lr_mult=10)))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=45, + eta_min=0, + by_epoch=True, + begin=5, + end=50, + convert_to_iter_based=True) +] + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=100, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/actionclip/configs/label_map_k400.txt b/projects/actionclip/configs/label_map_k400.txt new file mode 100644 index 0000000000000000000000000000000000000000..9193a07c6bda30b85b591da52e5e4cb375c31c06 --- /dev/null +++ b/projects/actionclip/configs/label_map_k400.txt @@ -0,0 +1,400 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +balloon blowing +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +drinking +drinking beer +drinking shots +driving car +driving tractor +drop 
kicking +drumming fingers +dunking basketball +dying hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +garbage collecting +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping into pool +jumpstyle dancing +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks +long jump +lunge +making a cake +making a sandwich +making bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking fruit +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing (not slalom or crosscountry) +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +sniffing +snorkeling +snowboarding +snowkiting +snowmobiling 
+somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow tie +tying knot (not on a tie) +tying tie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping present +wrestling +writing +yawning +yoga +zumba diff --git a/projects/actionclip/models/__init__.py b/projects/actionclip/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a4bdfcb32b4b2d12767cffc92713800cef905d5 --- /dev/null +++ b/projects/actionclip/models/__init__.py @@ -0,0 +1,4 @@ +from .actionclip import ActionClip +from .load import init_actionclip + +__all__ = ['ActionClip', 'init_actionclip'] diff --git a/projects/actionclip/models/actionclip.py b/projects/actionclip/models/actionclip.py new file mode 100644 index 0000000000000000000000000000000000000000..89975b82ea886dcfb20a49d366f05c9bc5f7f8e4 --- /dev/null +++ b/projects/actionclip/models/actionclip.py @@ -0,0 +1,176 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import clip +import mmengine +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.dist import all_gather, get_rank +from mmengine.model import BaseModel +from mmengine.structures import LabelData + +from mmaction.registry import MODELS +from .adapter import TransformerAdapter + + +class GatherLayer(torch.autograd.Function): + + @staticmethod + def forward(ctx: Any, input: torch.Tensor) -> Tuple[List]: + ctx.save_for_backward(input) + output = all_gather(input) + return tuple(output) + + @staticmethod + def backward(ctx: Any, *grads: torch.Tensor) -> torch.Tensor: + input, = ctx.saved_tensors + grad_out = torch.zeros_like(input) + grad_out[:] = grads[get_rank()] + return grad_out + + +def text_prompt(labels_or_label_file, templates_or_template_file=None): + if isinstance(labels_or_label_file, str): + labels = mmengine.list_from_file(labels_or_label_file) + elif isinstance(labels_or_label_file, list): + labels = labels_or_label_file + else: + raise ValueError(f'`labels_or_label_file` must be `list` or `str`, ' + f'but got {type(labels_or_label_file)}') + + if templates_or_template_file is None: + templates = [ + 'a photo of action {}', 'a picture of action {}', + 'Human action of {}', '{}, an action', '{} this is an action', + '{}, a video of action', 'Playing action of {}', '{}', + 'Playing a kind of action, {}', 'Doing a kind of action, {}', + 'Look, the human is {}', 'Can you recognize the action of {}?', + 'Video classification of {}', 'A video of {}', 'The man is {}', + 'The woman is {}' + ] + elif isinstance(templates_or_template_file, str): + templates = mmengine.list_from_file(templates_or_template_file) + elif not 
mmengine.is_seq_of(templates_or_template_file, str):
+        raise ValueError(f'`template` must be list of `str`, `str` or `None`, '
+                         f'but got {type(templates_or_template_file)}')
+    else:
+        templates = templates_or_template_file
+
+    num_prompt = len(templates)
+    prompt = torch.cat(
+        [clip.tokenize(t.format(c)) for t in templates for c in labels])
+    return prompt, num_prompt
+
+
+@MODELS.register_module()
+class ActionClip(BaseModel):
+    """ActionCLIP: a CLIP image/text encoder pair with a temporal
+    transformer adapter over per-frame features."""
+
+    def __init__(self,
+                 clip_arch: str,
+                 num_adapter_segs: int,
+                 num_adapter_layers: int = 6,
+                 to_float32: bool = False,
+                 labels_or_label_file: Optional[Union[List[str], str]] = None,
+                 templates_or_template_file: Optional[Union[List[str],
+                                                             str]] = None,
+                 data_preprocessor: Optional[Dict] = None,
+                 loss: Dict = dict(type='CrossEntropyLoss', loss_weight=0.5)):
+        super(ActionClip, self).__init__(data_preprocessor=data_preprocessor)
+        self.clip = clip.load(clip_arch, device='cpu')[0]
+        if to_float32:
+            self.clip.float()
+
+        self.adapter = TransformerAdapter(self.clip, num_adapter_segs,
+                                          num_adapter_layers)
+
+        self.loss = MODELS.build(loss)
+
+        if labels_or_label_file is not None:
+            self.prompt, self.num_prompt = text_prompt(
+                labels_or_label_file, templates_or_template_file)
+
+    def encode_video(self, video):
+        b, n, c, h, w = video.shape
+        video = video.view(-1, c, h, w)
+        frames_features = self.encode_image(video)
+        frames_features = frames_features.view(b, n, -1)
+        video_features = self.adapter(frames_features)
+        return video_features
+
+    def encode_image(self, image):
+        return self.clip.encode_image(image)
+
+    def encode_text(self, text):
+        return self.clip.encode_text(text)
+
+    def forward(self,
+                inputs: torch.Tensor,
+                data_samples: Optional[List] = None,
+                mode: str = 'tensor'):
+
+        if mode == 'tensor':
+            return self.encode_video(inputs)
+
+        elif mode == 'predict':
+            assert hasattr(self, 'prompt'),\
+                '`labels_or_label_file` is required to perform prediction. '
+
+            video_features = self.encode_video(inputs)
+            video_features = video_features / video_features.norm(
+                dim=-1, keepdim=True)
+
+            bsz = len(data_samples)
+            num_views = video_features.shape[0] // bsz
+
+            text_features = self.encode_text(self.prompt.to(inputs.device))
+            text_features = text_features / text_features.norm(
+                dim=-1, keepdim=True)
+
+            # (bsz*num_views, num_prompt, num_classes) ->
+            # (bsz, num_views*num_prompt, num_classes)
+            similarity = (100.0 * video_features @ text_features.T). 
\ + view(bsz, num_views * self.num_prompt, -1) + + cls_scores = F.softmax(similarity, dim=2).mean(dim=1) + + for data_sample, score in zip(data_samples, cls_scores): + data_sample.pred_scores = LabelData(item=score) + + return data_samples + + elif mode == 'loss': + video_features = self.encode_video(inputs) + video_features = video_features / video_features.norm( + dim=-1, keepdim=True) + + text_id = np.random.randint( + self.num_prompt, size=len(data_samples)) + real_labels = [x.gt_labels.item.item() for x in data_samples] + selected_prompt = self.prompt.view( + self.num_prompt, -1, + self.prompt.shape[-1])[text_id, real_labels].to(inputs.device) + + text_features = self.encode_text(selected_prompt) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + video_features = torch.cat( + GatherLayer.apply(video_features), dim=0) + text_features = torch.cat(GatherLayer.apply(text_features), dim=0) + + logit_scale = self.clip.logit_scale.exp() + logits_per_video = logit_scale * video_features @ text_features.t() + logits_per_text = logits_per_video.t() + labels = torch.arange(logits_per_video.shape[0]).to( + logit_scale.device) + + sim_loss_v2t = self.loss(logits_per_video, labels) + sim_loss_t2v = self.loss(logits_per_text, labels) + + losses = dict() + losses['sim_loss_v2t'] = sim_loss_v2t + losses['sim_loss_t2v'] = sim_loss_t2v + return losses + + else: + raise RuntimeError( + f'Invalid mode "{mode}". ' + 'Only supports `predict`, `loss` and `tensor` mode. ') diff --git a/projects/actionclip/models/adapter.py b/projects/actionclip/models/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..b86cffa8c64d6c964287fcfe5cf8f51a650a6f2c --- /dev/null +++ b/projects/actionclip/models/adapter.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +from clip.model import Transformer +from mmengine.model import BaseModule + + +class TransformerAdapter(BaseModule): + + def __init__(self, + clip_model: nn.Module, + num_segs: int, + num_layers: int = 6): + super(TransformerAdapter, self).__init__() + self.num_segs = num_segs + + embed_dim = clip_model.text_projection.shape[1] + transformer_width = clip_model.ln_final.weight.shape[0] + transformer_heads = transformer_width // 64 + + self.frame_position_embeddings = nn.Embedding(self.num_segs, embed_dim) + self.transformer = Transformer( + width=embed_dim, layers=num_layers, heads=transformer_heads) + + def init_weights(self): + for module in self.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def forward(self, x: torch.Tensor): + b, seq_length, c = x.size() + + x_original = x + position_ids = torch.arange( + seq_length, dtype=torch.long, device=x.device) + embeddings = self.frame_position_embeddings(position_ids) + x = x + embeddings.unsqueeze(0) + x = x.transpose(0, 1) # NLD -> LND + x = self.transformer(x) + x = x.transpose(0, 1) # LND -> NLD + x = x.type(x_original.dtype) + x_original + return x.mean(dim=1) diff --git a/projects/actionclip/models/load.py b/projects/actionclip/models/load.py new file mode 100644 index 0000000000000000000000000000000000000000..baf773c68f9e8d75ebd46fe2c270cc3142c57723 --- /dev/null +++ b/projects/actionclip/models/load.py @@ -0,0 +1,72 @@ +import torch +from mmengine.dataset import Compose +from 
mmengine.runner.checkpoint import _load_checkpoint +from torchvision.transforms import Normalize + +from .actionclip import ActionClip + +_MODELS = { + 'ViT-B/32-8': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth', # noqa: E501 + 'ViT-B/16-8': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb/vit-b-16-8f.pth', # noqa: E501 + 'ViT-B/16-16': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb/vit-b-16-16f.pth', # noqa: E501 + 'ViT-B/16-32': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb/vit-b-16-32f.pth', # noqa: E501 +} + + +def available_models(): + """Returns the names of available ActionCLIP models.""" + return list(_MODELS.keys()) + + +def _transform(num_segs): + pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + lambda x: torch.tensor(x['imgs']).div(255), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ] + return Compose(pipeline) + + +def init_actionclip(name, device): + assert name in _MODELS, \ + f'Model {name} not found; available models = {available_models()}' + model_path = _MODELS[name] + + checkpoint = _load_checkpoint(model_path, map_location='cpu') + state_dict = checkpoint['state_dict'] + + clip_arch = name.split('-')[0] + '-' + name.split('-')[1] + + num_adapter_segs = int(name.split('-')[2]) + assert num_adapter_segs == \ + state_dict['adapter.frame_position_embeddings.weight'].shape[0] + num_adapter_layers = len([ + k for k in state_dict.keys() + if k.startswith('adapter.') and k.endswith('.attn.in_proj_weight') + ]) + + model = ActionClip( + clip_arch=clip_arch, + num_adapter_segs=num_adapter_segs, + num_adapter_layers=num_adapter_layers) + + model.load_state_dict(state_dict) + model.to(device) + model.eval() + + return model, _transform(num_adapter_segs) diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c7736849ea900fc0c40d0b7d6b9d64874554c80 --- /dev/null +++ b/projects/ctrgcn/README.md @@ -0,0 +1,113 @@ +# CTRGCN Project + +[Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213) + + + +## Abstract + + + +Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. In GCNs, graph topology dominates feature aggregation and therefore is the key to extracting representative features. In this work, we propose a novel Channel-wise Topology Refinement Graph Convolution (CTR-GC) to dynamically learn different topologies and effectively aggregate joint features in different channels for skeleton-based action recognition. The proposed CTR-GC models channel-wise topologies through learning a shared topology as a generic prior for all channels and refining it with channel-specific correlations for each channel. Our refinement method introduces few extra parameters and significantly reduces the difficulty of modeling channel-wise topologies. 
Furthermore, via reformulating graph convolutions into a unified form, we find that CTR-GC relaxes strict constraints of graph convolutions, leading to stronger representation capability. Combining CTR-GC with temporal modeling modules, we develop a powerful graph convolutional network named CTR-GCN which notably outperforms state-of-the-art methods on the NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets. + + + +
+ +
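+In code, the channel-wise refinement boils down to a few tensor operations. The snippet below is a simplified, illustrative sketch of what the `CTRGC` module in `models/ctrgcn_utils.py` of this project does (the class name, the toy shapes and the identity topology are ours; weight initialization and the multi-branch temporal convolution are omitted): it derives pairwise joint correlations from the input features, turns them into a per-channel correction of the shared topology `A`, and aggregates joint features along the refined topologies.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class TinyCTRGC(nn.Module):
+    """Minimal sketch of channel-wise topology refinement (illustration only)."""
+
+    def __init__(self, in_channels, out_channels, rel_channels=8):
+        super().__init__()
+        self.theta = nn.Conv2d(in_channels, rel_channels, kernel_size=1)
+        self.phi = nn.Conv2d(in_channels, rel_channels, kernel_size=1)
+        self.proj = nn.Conv2d(in_channels, out_channels, kernel_size=1)
+        self.expand = nn.Conv2d(rel_channels, out_channels, kernel_size=1)
+        self.tanh = nn.Tanh()
+
+    def forward(self, x, A, alpha=1.0):
+        # x: (N, C, T, V) joint features; A: (V, V) shared topology prior
+        q = self.theta(x).mean(-2)          # (N, R, V), pooled over time
+        k = self.phi(x).mean(-2)            # (N, R, V)
+        v = self.proj(x)                    # (N, C_out, T, V)
+        # pairwise joint correlations -> one refinement map per output channel
+        corr = self.tanh(q.unsqueeze(-1) - k.unsqueeze(-2))   # (N, R, V, V)
+        refined_a = self.expand(corr) * alpha + A             # (N, C_out, V, V)
+        # aggregate joint features along the refined, channel-wise topologies
+        return torch.einsum('ncuv,nctu->nctv', refined_a, v)
+
+
+x = torch.randn(2, 64, 16, 25)   # 2 samples, 64 channels, 16 frames, 25 joints
+adj = torch.eye(25)              # placeholder shared topology
+print(TinyCTRGC(64, 64)(x, adj).shape)   # torch.Size([2, 64, 16, 25])
+```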
+
+## Usage
+
+### Setup Environment
+
+Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+
+Assume that you are located at `$MMACTION2/projects/ctrgcn`.
+
+Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run it every time after you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/skeleton/README.md).
+
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
+### Training commands
+
+**To train with single GPU:**
+
+```bash
+mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs by slurm:**
+
+```bash
+mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \
+    --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+**To test with single GPU:**
+
+```bash
+mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT
+```
+
+**To test with multiple GPUs:**
+
+```bash
+mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8
+```
+
+**To test with multiple GPUs by slurm:**
+
+```bash
+mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \
+    --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+## Results
+
+### NTU60_XSub_2D
+
+| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| uniform 100 | joint | 8 | CTRGCN | 89.6 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230308-7aba454e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) |
+
+### NTU60_XSub_3D
+
+| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| uniform 100 | joint | 8 | CTRGCN | 89.0 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-950dca0a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{chen2021channel, + title={Channel-wise topology refinement graph convolution for skeleton-based action recognition}, + author={Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming}, + booktitle={CVPR}, + pages={13359--13368}, + year={2021} +} +``` diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..3835d11fdd0e438f48751165eabfa00a36c47b68 --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='coco', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + 
type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..03a1c302f53ff9136d9c0c49937b774c6cb4340f --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='nturgb+d', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/models/__init__.py b/projects/ctrgcn/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0cca6c4bd105b327a22cbd00a04b30c289ce2fb2 --- /dev/null +++ b/projects/ctrgcn/models/__init__.py @@ -0,0 +1,3 @@ +from .ctrgcn import CTRGCN + +__all__ = ['CTRGCN'] diff --git a/projects/ctrgcn/models/ctrgcn.py b/projects/ctrgcn/models/ctrgcn.py new file mode 100644 index 0000000000000000000000000000000000000000..73e884d5702df870eea9790f8bc4912aa243a8c3 --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn.py @@ -0,0 +1,104 @@ +import torch +import torch.nn as nn +from mmengine.model import BaseModule, ModuleList + +from mmaction.models.utils import Graph, unit_tcn +from mmaction.registry import MODELS +from .ctrgcn_utils import MSTCN, unit_ctrgcn + + +class CTRGCNBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + kernel_size=5, + dilations=[1, 2], + tcn_dropout=0): + super(CTRGCNBlock, self).__init__() + self.gcn1 = unit_ctrgcn(in_channels, out_channels, A) + self.tcn1 = MSTCN( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False, + tcn_dropout=tcn_dropout) + self.relu = nn.ReLU(inplace=True) + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +@MODELS.register_module() +class CTRGCN(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=64, + num_stages=10, + inflate_stages=[5, 8], + down_stages=[5, 8], + pretrained=None, + num_person=2, + **kwargs): + super(CTRGCN, self).__init__() + + self.graph = Graph(**graph_cfg) + A = torch.tensor( + self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + self.num_person = num_person + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1)) + + kwargs0 = {k: v for k, v in kwargs.items() if k != 'tcn_dropout'} + modules = [ + CTRGCNBlock( + in_channels, + base_channels, + A.clone(), + residual=False, + **kwargs0) + ] + for i in range(2, num_stages + 1): + in_channels = base_channels + out_channels = base_channels * (1 + (i in inflate_stages)) + stride = 1 + (i in down_stages) + modules.append( + CTRGCNBlock( + base_channels, + out_channels, + A.clone(), + stride=stride, + **kwargs)) + base_channels = out_channels + self.net = ModuleList(modules) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous() + x = self.data_bn(x.view(N, M * V * C, T)) + x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, + 2).contiguous().view(N * M, C, T, V) + + for gcn in self.net: + x = gcn(x) + + x = x.reshape((N, M) + x.shape[1:]) + return x diff --git a/projects/ctrgcn/models/ctrgcn_utils.py b/projects/ctrgcn/models/ctrgcn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6fe3a8529f95c6817141cc508e3abfc33fb47872 --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn_utils.py @@ -0,0 +1,192 @@ +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from 
mmaction.models.utils import unit_tcn + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. +class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class CTRGC(BaseModule): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + super(CTRGC, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels <= 16: + self.rel_channels = 8 + else: + self.rel_channels = in_channels // rel_reduction + self.conv1 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv2 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv3 = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + self.conv4 = nn.Conv2d( + self.rel_channels, self.out_channels, kernel_size=1) + self.tanh = nn.Tanh() + + def forward(self, x, A=None, alpha=1): + # Input: N, C, T, V + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean( + -2), self.conv3(x) + # X1, X2: N, R, V + # N, R, V, 1 - N, R, 1, V + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + # N, R, V, V + x1 = self.conv4(x1) * alpha + (A[None, None] if A is not None else 0 + ) # N,C,V,V + x1 = torch.einsum('ncuv,nctu->nctv', x1, x3) + return x1 + + 
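+# `unit_ctrgcn` below is the spatial graph-convolution unit of CTR-GCN: it keeps
+# one `CTRGC` branch per adjacency subset (e.g. the identity/inward/outward
+# partitions of the spatial graph), refines the learnable topology `self.A[i]`
+# in each branch gated by the shared learnable scalar `alpha`, sums the branch
+# outputs, then applies BatchNorm, a (possibly projected) residual and ReLU.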
+class unit_ctrgcn(BaseModule):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 A,
+                 init_cfg=[
+                     dict(
+                         type='Constant',
+                         layer='BatchNorm2d',
+                         val=1,
+                         override=dict(type='Constant', name='bn', val=1e-6)),
+                     dict(type='Kaiming', layer='Conv2d', mode='fan_out')
+                 ]):
+
+        super(unit_ctrgcn, self).__init__(init_cfg=init_cfg)
+        inter_channels = out_channels // 4
+        self.inter_c = inter_channels
+        self.out_c = out_channels
+        self.in_c = in_channels
+
+        self.num_subset = A.shape[0]
+        self.convs = ModuleList()
+
+        for i in range(self.num_subset):
+            self.convs.append(CTRGC(in_channels, out_channels))
+
+        if in_channels != out_channels:
+            self.down = Sequential(
+                nn.Conv2d(in_channels, out_channels, 1),
+                nn.BatchNorm2d(out_channels))
+        else:
+            self.down = lambda x: x
+
+        self.A = nn.Parameter(A.clone())
+
+        self.alpha = nn.Parameter(torch.zeros(1))
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.soft = nn.Softmax(-2)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        y = None
+
+        for i in range(self.num_subset):
+            z = self.convs[i](x, self.A[i], self.alpha)
+            y = z + y if y is not None else z
+
+        y = self.bn(y)
+        y += self.down(x)
+        return self.relu(y)
diff --git a/projects/example_project/README.md b/projects/example_project/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d78eb9b099164bd6dbc763a7056b4dc51c95e7a3
--- /dev/null
+++ b/projects/example_project/README.md
@@ -0,0 +1,122 @@
+# Example Project
+
+This is an example README for community `projects/`. You can write your README in your own project. Here are
+some recommended sections of a README that help others understand and use your project; you can copy or modify
+them according to your project.
+
+## Usage
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+
+First, add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run it every time after you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md). 
+ +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-------------------------------------------: | -------------------------------------: | -----------------------------: | +| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://example/checkpoint/url) | [log](https://example/log/url) | + +## Citation + + + +```bibtex +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Converted checkpoint and results (Only for reproduction) + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..61bb5310c6aba9efe69fd3b2df29d269ada067c2 --- /dev/null +++ b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,11 @@ +# Directly inherit the entire recipe you want to use. +_base_ = 'mmaction::recognition/tsn/' \ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' + +# This line is to import your own modules. +custom_imports = dict(imports='models') + +# Modify the backbone to use your own backbone. 
+_base_['model']['backbone'] = dict(type='ExampleNet', depth=50)
+# Modify the in_channels of the classifier head to fit your backbone.
+_base_['model']['cls_head']['in_channels'] = 2048
diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..826d70dafe534369df6e5f7a36929726697bb2d9
--- /dev/null
+++ b/projects/example_project/models/__init__.py
@@ -0,0 +1,3 @@
+from .example_net import ExampleNet
+
+__all__ = ['ExampleNet']
diff --git a/projects/example_project/models/example_net.py b/projects/example_project/models/example_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..415251a7a76c75a20b03b574e8ebf64d54f0ea0d
--- /dev/null
+++ b/projects/example_project/models/example_net.py
@@ -0,0 +1,21 @@
+from mmaction.models import ResNet
+from mmaction.registry import MODELS
+
+
+# Register your model to the `MODELS` registry.
+@MODELS.register_module()
+class ExampleNet(ResNet):
+    """Implements an example backbone.
+
+    Implement the backbone network just like a normal pytorch network.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        print('#############################\n'
+              '#      Hello MMAction2!     #\n'
+              '#############################')
+        super().__init__(**kwargs)
+
+    def forward(self, x):
+        """Defines the computation performed at every call."""
+        return super().forward(x)
diff --git a/projects/gesture_recognition/README.md b/projects/gesture_recognition/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..519960dc0d7b0b3844a0bbd503d3a73428572ca5
--- /dev/null
+++ b/projects/gesture_recognition/README.md
@@ -0,0 +1,33 @@
+# Gesture Recognition
+
+
+## Introduction
+
+
+In this project, we present a skeleton-based pipeline for gesture recognition. The pipeline has three stages. The first stage consists of a hand detection module that outputs bounding boxes of human hands from video frames. Afterwards, the second stage employs a pose estimation module to generate keypoints of the detected hands. Finally, the third stage utilizes a skeleton-based gesture recognition module to classify hand actions based on the provided hand skeleton. The three-stage pipeline is lightweight and can run in real time on CPU devices. In this README, we provide the models and the inference demo for the project. Training data preparation and training scripts are described in [TRAINING.md](/projects/gesture_recognition/TRAINING.md). 
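+
+At inference time, the three stages simply feed into each other. The snippet below is a rough, illustrative sketch rather than the project's demo script: the checkpoint paths are placeholders, the 0.5 score threshold is an arbitrary choice of ours, it assumes at least one hand is detected in every frame, and it relies on the generic high-level helpers `inference_detector` (MMDetection), `inference_topdown` (MMPose) and `inference_skeleton` (MMAction2).
+
+```python
+import numpy as np
+from mmaction.apis import inference_skeleton, init_recognizer
+from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import inference_topdown, init_model
+
+det_model = init_detector(
+    'configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py',
+    'hand_det.pth', device='cpu')       # placeholder checkpoint path
+pose_model = init_model(
+    'configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py',
+    'hand_pose.pth', device='cpu')      # placeholder checkpoint path
+gesture_model = init_recognizer(
+    'configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py',
+    'gesture.pth', device='cpu')        # placeholder checkpoint path
+
+
+def recognize(frames):
+    """frames: list of HxWx3 image arrays from one video clip."""
+    pose_results = []
+    for frame in frames:
+        # Stage 1: detect hand boxes and keep the confident ones.
+        pred = inference_detector(det_model, frame).pred_instances
+        bboxes = pred.bboxes[pred.scores > 0.5].cpu().numpy()
+        # Stage 2: estimate hand keypoints inside each detected box
+        # (assumes at least one hand per frame).
+        samples = inference_topdown(pose_model, frame, bboxes)
+        pose_results.append(
+            dict(keypoints=np.concatenate(
+                     [s.pred_instances.keypoints for s in samples]),
+                 keypoint_scores=np.concatenate(
+                     [s.pred_instances.keypoint_scores for s in samples])))
+    # Stage 3: classify the gesture from the keypoint sequence.
+    h, w = frames[0].shape[:2]
+    return inference_skeleton(gesture_model, pose_results, (h, w))
+```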
+ +## Hand detection stage + +Hand detection results on OneHand10K validation dataset + +| Config | Input Size | bbox mAP | bbox mAP 50 | bbox mAP 75 | ckpt | log | +| :------------------------------------------------------ | :--------: | :------: | :---------: | :---------: | :---------------------------------------------------: | :--------------------------------------------------: | +| [rtmdet_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) | + +## Pose estimation stage + +Pose estimation results on COCO-WholeBody-Hand validation set + +| Config | Input Size | PCK@0.2 | AUC | EPE | ckpt | +| :----------------------------------------------------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :-------------------------------------: | +| [rtmpose_m](/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.815 | 0.837 | 4.51 | [ckpt](https://download.openmmlab.com/) | + +## Gesture recognition stage + +Skeleton base gesture recognition results on Jester validation + +| Config | Input Size | Top 1 accuracy | Top 5 accuracy | ckpt | log | +| :------------------------------------------------------ | :--------: | :------------: | :------------: | :----------------------------------------------------: | :---------------------------------------------------: | +| [STGCNPP](/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py) | 100x17x3 | 89.22 | 97.52 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d_20230524-fffa7ff0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.log) | diff --git a/projects/gesture_recognition/TRAINING.md b/projects/gesture_recognition/TRAINING.md new file mode 100644 index 0000000000000000000000000000000000000000..abd2feb87c4527d989511c2db52e5cdf395288e5 --- /dev/null +++ b/projects/gesture_recognition/TRAINING.md @@ -0,0 +1,89 @@ +In this document, we show how to prepare the training data and train models required for this project. + +# Hand detection + +## Data Preparation + +We use multiple hand pose estimation datasets to generate a hand detection dataset. The circumscribed rectangle of hand key points of is used as the detection bounding box of the hand. In our demo, we use 4 datasets supported from [MMPose](https://github.com/open-mmlab/mmpose): [FreiHAND Dataset](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#freihand-dataset), [OneHand10K](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#onehand10k), [RHD Dataset](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#rhd-dataset) and [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe). You can find instructions for preparing each dataset from the corresponding link. + +To train the hand detection model, you need to install [MMDet](https://github.com/open-mmlab/mmdetection) and move (or link) the above datasets to `$MMDet/data/`. 
The folder structure should look like this:
+
+```
+mmdetection
+├── mmdetection
+├── docs
+├── tests
+├── tools
+├── configs
+|── data
+    |-- freihand
+        │-- annotations
+        │-- ..
+    |-- onehand10k
+        │-- annotations
+        │-- ..
+    |-- rhd
+        │-- annotations
+        │-- ..
+    │-- halpe
+        │-- annotations
+        |-- hico_20160224_det
+            │-- images
+            |-- ..
+        │-- ..
+```
+
+We provide a [parse_pose.py](/projects/gesture_recognition/parse_pose.py) file to convert the annotation files of the above pose datasets to a COCO-style detection annotation. Suppose you are at `$MMDet/data`; run the following command and it will generate `hand_det_train.json` and `hand_det_val.json` at `$MMDet/data/hand_det/`.
+
+```
+python3 $MMAction/projects/gesture_recognition/parse_pose.py
+```
+
+The training annotation file combines the above four datasets, and the validation annotation file just uses the OneHand10K validation set for a quick verification. You can also add more hand detection datasets to improve performance. Now we are done with data preparation.
+
+## Training and inference
+
+We provide a [config](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) to train an [RTMDet](https://arxiv.org/abs/2212.07784) detection model. Suppose you are at `$MMDet`; you can run the following command to train the hand detection model with 8 GPUs:
+
+```bash
+bash tools/dist_train.sh $MMAction/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py 8
+```
+
+To see the detection result for a single image, we can use `$MMDet/demo/image_demo.py`. The following command runs inference on a single [image](/projects/gesture_recognition/demo/hand_det.jpg) (from a video in the [jester dataset](/tools/data/jester)) and the output should be similar to [this image](/projects/gesture_recognition/demo/hand_det_out.jpg).
+
+```bash
+python3 $MMDet/demo/image_demo.py $MMAction/projects/gesture_recognition/demo/hand_det.jpg PATH_TO_HAND_DET_CHECKPOINT --out-dir='.'
+```
+
+# Pose estimation
+
+We directly use the pose estimation model from MMPose. Please refer to [RTMPose](https://github.com/open-mmlab/mmpose/tree/main/configs/hand_2d_keypoint/rtmpose) for details.
+
+# Gesture recognition
+
+## Data Preparation
+
+We use the [jester dataset](/tools/data/jester) to train a skeleton-based gesture recognition model. Please follow the link to prepare this dataset (in frames).
+
+Once we have the jester dataset, we provide the [extract_keypoint.py](/projects/gesture_recognition/extract_keypoint.py) script to extract the hand keypoints for all video frames in the dataset. This step requires the hand detection model and the pose estimation model from the above two stages. Here is an example to extract the keypoints for the dataset. You may need to modify the paths to the dataset, configs, or checkpoints according to your system.
+
+```bash
+ROOT_TO_JESTER='20bn-jester-v1'
+POSE_CONFIG='rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py'
+POSE_CKPT='hand-cocktail5-4e-4-bs256-210e-b74fb594_20230320.pth'
+DET_CONFIG='rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py'
+DET_CKPT='rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth'
+python3 -u extract_keypoint.py $ROOT_TO_JESTER \
+    --pose_config $POSE_CONFIG --pose_ckpt $POSE_CKPT \
+    --det_config $DET_CONFIG --det_ckpt $DET_CKPT
+```
+
+The program will generate a `jester.pkl` file in your current directory. Then move this file to `$MMAction`.
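+
+If you want to sanity-check the generated file before training, the short snippet below (an optional check, not part of the provided scripts) loads `jester.pkl` and prints one entry. As written by `extract_keypoint.py`, the file is a pickled dict mapping each frame path to a `(keypoints, keypoint_scores, (width, height))` tuple.
+
+```python
+import pickle
+
+with open('jester.pkl', 'rb') as f:
+    lookup = pickle.load(f)
+
+print(f'number of frames: {len(lookup)}')
+frame_path, (keypoints, scores, (width, height)) = next(iter(lookup.items()))
+# keypoints / keypoint_scores are per-frame arrays, typically of shape
+# (1, 21, 2) and (1, 21) for a single detected hand with 21 keypoints.
+print(frame_path, keypoints.shape, scores.shape, (width, height))
+```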
We will use this file for skeleton based gesture recognition training. + +## Training and inference + +We provide a [config](/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py) to train a STGCN++ model. Suppose you are at `$MMAction`, you can run the follow command to train the model with 8 GPUs: + +```bash +bash tools/dist_train.sh $MMAction/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-80e_jester-keypoint-2d.py 8 +``` diff --git a/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py b/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py new file mode 100644 index 0000000000000000000000000000000000000000..f91b71f12fa314ed2d331d258c48d2cea1b862ed --- /dev/null +++ b/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py @@ -0,0 +1,123 @@ +_base_ = 'mmdet::rtmdet/rtmdet_nano_8xb32-300e_coco.py' + +input_shape = 320 + +model = dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.25, + use_depthwise=True, + ), + neck=dict( + in_channels=[64, 128, 256], + out_channels=64, + num_csp_blocks=1, + use_depthwise=True, + ), + bbox_head=dict( + in_channels=64, + feat_channels=64, + share_conv=False, + exp_on_reg=False, + use_depthwise=True, + num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +data_root = 'data/' +file_client_args = dict(backend='disk') + +train_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='CachedMosaic', + img_scale=(input_shape, input_shape), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type='RandomResize', + scale=(input_shape * 2, input_shape * 2), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(input_shape, input_shape), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='hand_det/hand_det_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + metainfo=dict(classes=('hand', )), + )) + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='hand_det/hand_det_val.json', + data_prefix=dict(img=''), + pipeline=test_pipeline, + metainfo=dict(classes=('hand', )), + )) + +test_dataloader = val_dataloader + +val_evaluator = 
dict(ann_file=data_root + 'hand_det/hand_det_val.json') +test_evaluator = val_evaluator + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000000000000000000000000000000000000..3fbecfa61e0c2dfe3becc9e1d738989d004588c0 --- /dev/null +++ b/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,339 @@ +default_scope = 'mmpose' +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', + interval=10, + save_best='AUC', + rule='greater', + max_keep_ckpts=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='PoseVisualizationHook', enable=False)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=180, + switch_pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0.0, + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') + ]) +] +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend')], + name='visualizer') +log_processor = dict( + type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) +log_level = 'INFO' +load_from = None +resume = False +file_client_args = dict(backend='disk') +train_cfg = dict(by_epoch=True, max_epochs=210, val_interval=10) +val_cfg = dict() +test_cfg = dict() +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 0.004 +randomness = dict(seed=21) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=105, + end=210, + T_max=105, + by_epoch=True, + convert_to_iter_based=True) +] +auto_scale_lr = dict(base_batch_size=256) +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) +model = dict( + 
type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint=('https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-' + 'f2f7d6f6_20230130.pth'))), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=21, + input_size=(256, 256), + in_featuremap_size=(8, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10.0, + label_softmax=True), + decoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + test_cfg=dict(flip_test=True)) +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' +train_pipeline = [ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0.0, + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') +] +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CocoWholeBodyHandDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + 
dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') + ])) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyHandDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='PackPoseInputs') + ])) +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyHandDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='PackPoseInputs') + ])) +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] diff --git a/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py b/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..be327212f7fd6eceee601818790fd6343daabba0 --- /dev/null +++ b/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py @@ -0,0 +1,113 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +hand_layout = dict( + num_node=17, + inward=[(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), (12, 6), (9, 7), + (7, 5), (10, 8), (8, 6), (5, 0), (6, 0), (1, 0), (3, 1), (2, 0), + (4, 2)], + center=0) + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', + gcn_adaptive='init', + gcn_with_res=True, + tcn_type='mstcn', + graph_cfg=dict(layout=hand_layout, mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=27, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'jester.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=1), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + 
type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=1), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=1), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/gesture_recognition/demo/hand_det.jpg b/projects/gesture_recognition/demo/hand_det.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c12616fc493b050782bdf5602b40c876e23fa877 Binary files /dev/null and b/projects/gesture_recognition/demo/hand_det.jpg differ diff --git a/projects/gesture_recognition/demo/hand_det_out.jpg b/projects/gesture_recognition/demo/hand_det_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2f3c1bed417f63bf0692a63cf4d4b722bca33590 Binary files /dev/null and b/projects/gesture_recognition/demo/hand_det_out.jpg differ diff --git a/projects/gesture_recognition/extract_keypoint.py b/projects/gesture_recognition/extract_keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1476fbb6595801c76cf5aadba309de5a93edae9c --- /dev/null +++ b/projects/gesture_recognition/extract_keypoint.py @@ -0,0 +1,115 @@ +import copy +import os +import pickle +import time +from argparse import ArgumentParser + +import cv2 +import numpy as np +import torch +from mmdet.apis import init_detector +from mmengine.dataset import Compose, pseudo_collate +from mmengine.registry import init_default_scope +from mmpose.apis import init_model +from PIL import Image + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('root', help='Video folder root') + parser.add_argument('--pose_config', help='Pose config file') + parser.add_argument('--pose_ckpt', help='Pose checkpoint file') + parser.add_argument('--det_config', help='Hand detection config file') + parser.add_argument('--det_ckpt', help='Hand detection checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + args = parser.parse_args() + return args + + +@torch.no_grad() +def inference_topdown(model, pose_pipeline, det_model, det_pipeline, folder): + + img_paths = [f'{folder}/{img}' for img in os.listdir(folder)] + + w, h = Image.open(img_paths[0]).size + bbox0 = np.array([[0, 0, w, h]], dtype=np.float32) + + imgs = [cv2.imread(img_path) for img_path in img_paths] + + data_list = [ + dict(img=copy.deepcopy(img), img_id=idx) + for idx, img in enumerate(imgs) + ] + data_list = [det_pipeline(data_info) for data_info in data_list] + batch = pseudo_collate(data_list) + bbox_results = det_model.test_step(batch) + bboxes = [i.pred_instances.bboxes[:1].cpu().numpy() for i in bbox_results] + scores = [] + for i in bbox_results: + try: + score = i.pred_instances.scores[0].item() + except Exception as ex: + print(ex) + score = 0 + scores.append(score) + data_list = [] + for img, bbox, score in zip(imgs, bboxes, scores): + data_info = dict(img=img) + if bbox.shape == bbox0.shape and score > 0.3: + if score > 0.5: + data_info['bbox'] = bbox + else: + w = (score - 0.1) / 0.4 + data_info['bbox'] = w * bbox + (1 - w) * bbox0 + else: + data_info['bbox'] = bbox0 + data_info['bbox_score'] = np.ones(1, dtype=np.float32) # shape (1,) + data_info.update(model.dataset_meta) + data_list.append(pose_pipeline(data_info)) + + batch = pseudo_collate(data_list) + results = model.test_step(batch) + + lookup = {} + for img_path, result in zip(img_paths, results): + keypoints = result.pred_instances.keypoints + scores = result.pred_instances.keypoint_scores + lookup[img_path] = (keypoints, scores, (w, h)) + return lookup + + +def main(): + args = parse_args() + + det_model = init_detector( + args.det_config, args.det_ckpt, device=args.device) + 
det_model.cfg.test_dataloader.dataset.pipeline[ + 0].type = 'mmdet.LoadImageFromNDArray' + det_pipeline = Compose(det_model.cfg.test_dataloader.dataset.pipeline) + + model = init_model( + args.pose_config, args.pose_checkpoint, device=args.device) + init_default_scope(model.cfg.get('default_scope', 'mmpose')) + + folders = [f'{args.root}/{folder}' for folder in os.listdir(args.root)] + + pose_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + # inference a single image + lookup = {} + L = len(folders) + t = time.time() + for idx, folder in enumerate(folders): + results = inference_topdown(model, pose_pipeline, det_model, + det_pipeline, folder) + lookup.update(results) + if idx % 100 == 99: + eta = (time.time() - t) / (idx + 1) * (L - idx) / 3600 + print('Require %.2f hours' % eta) + + with open('jester.pkl', 'wb') as f: + pickle.dump(lookup, f) + + +if __name__ == '__main__': + main() diff --git a/projects/gesture_recognition/parse_pose.py b/projects/gesture_recognition/parse_pose.py new file mode 100644 index 0000000000000000000000000000000000000000..161be9261fe8f3f4e11beb49e0a9b764c3e18508 --- /dev/null +++ b/projects/gesture_recognition/parse_pose.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import os + +import numpy as np + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert and merge hand pose dataset to COCO style') + parser.add_argument( + '--data_root', + type=str, + default='./data/', + help='the root to all involved datasets') + parser.add_argument( + '--out_anno_prefix', + type=str, + default='hand_det', + help='the prefix of output annotation files') + + args = parser.parse_args() + return args + + +def get_data_root(path): + path = path.split('/') + index = path.index('annotations') - 1 + root = path[index] + if root == 'halpe': + root = 'halpe/hico_20160224_det/images/train2015/' + return root + + +def parse_coco_style(file_path, anno_idx=0): + with open(file_path) as f: + contents = json.load(f) + + data_root = get_data_root(file_path) + '/' + images = contents['images'] + annos = contents['annotations'] + images_out, annos_out = [], [] + for img, anno in zip(images, annos): + assert img['id'] == anno['image_id'] + img_out = dict( + file_name=data_root + img['file_name'], + height=img['height'], + width=img['width'], + id=anno_idx) + anno_out = dict( + area=anno['area'], + iscrowd=anno['iscrowd'], + image_id=anno_idx, + bbox=anno['bbox'], + category_id=0, + id=anno_idx) + anno_idx += 1 + images_out.append(img_out) + annos_out.append(anno_out) + return images_out, annos_out, anno_idx + + +def parse_halpe(file_path, anno_idx): + + def get_bbox(keypoints): + """Get bbox from keypoints.""" + if len(keypoints) == 0: + return [0, 0, 0, 0] + x1, y1, _ = np.amin(keypoints, axis=0) + x2, y2, _ = np.amax(keypoints, axis=0) + w, h = x2 - x1, y2 - y1 + return [x1, y1, w, h] + + with open(file_path) as f: + contents = json.load(f) + + data_root = get_data_root(file_path) + '/' + images = contents['images'] + annos = contents['annotations'] + images_out, annos_out = [], [] + for img, anno in zip(images, annos): + assert img['id'] == anno['image_id'] + keypoints = np.array(anno['keypoints']).reshape(-1, 3) + lefthand_kpts = keypoints[-42:-21, :] + righthand_kpts = keypoints[-21:, :] + + left_mask = lefthand_kpts[:, 2] > 0 + right_mask = righthand_kpts[:, 2] > 0 + lefthand_box = get_bbox(lefthand_kpts[left_mask]) + righthand_box = get_bbox(righthand_kpts[right_mask]) + + if max(lefthand_box) > 
0: + img_out = dict( + file_name=data_root + img['file_name'], + height=img['height'], + width=img['width'], + id=anno_idx) + anno_out = dict( + area=lefthand_box[2] * lefthand_box[3], + iscrowd=anno['iscrowd'], + image_id=anno_idx, + bbox=lefthand_box, + category_id=0, + id=anno_idx) + anno_idx += 1 + images_out.append(img_out) + annos_out.append(anno_out) + + if max(righthand_box) > 0: + img_out = dict( + file_name=data_root + img['file_name'], + height=img['height'], + width=img['width'], + id=anno_idx) + anno_out = dict( + area=righthand_box[2] * righthand_box[3], + iscrowd=anno['iscrowd'], + image_id=anno_idx, + bbox=righthand_box, + category_id=0, + id=anno_idx) + anno_idx += 1 + images_out.append(img_out) + annos_out.append(anno_out) + return images_out, annos_out, anno_idx + + +train_files = [ + 'freihand/annotations/freihand_train.json', + 'halpe/annotations/halpe_train_v1.json', + 'onehand10k/annotations/onehand10k_train.json', + '/rhd/annotations/rhd_train.json' +] + +val_files = ['onehand10k/annotations/onehand10k_test.json'] + + +def convert2dict(data_root, anno_files): + anno_files = [data_root + _ for _ in anno_files] + + images, annos, anno_idx = [], [], 0 + for anno_file in anno_files: + if 'freihand' in anno_file or 'onehand10k' in anno_file \ + or 'rhd' in anno_file: + images_out, annos_out, anno_idx = parse_coco_style( + anno_file, anno_idx) + images += images_out + annos += annos_out + elif 'halpe' in anno_file: + images_out, annos_out, anno_idx = parse_halpe(anno_file, anno_idx) + images += images_out + annos += annos_out + else: + print(f'{anno_file} not supported') + + result = dict( + images=images, + annotations=annos, + categories=[{ + 'id': 0, + 'name': 'hand' + }]) + return result + + +if __name__ == '__main__': + args = parse_args() + data_root = args.data_root + '/' + prefix = args.out_anno_prefix + os.makedirs('hand_det', exist_ok=True) + + result = convert2dict(data_root, train_files) + with open(f'hand_det/{prefix}_train.json', 'w') as f: + json.dump(result, f) + + result = convert2dict(data_root, val_files) + with open(f'hand_det/{prefix}_val.json', 'w') as f: + json.dump(result, f) diff --git a/projects/knowledge_distillation/README.md b/projects/knowledge_distillation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be34cfd80432a5239ccf16a951d77de7ee491ad3 --- /dev/null +++ b/projects/knowledge_distillation/README.md @@ -0,0 +1,132 @@ +# Knowledge Distillation Based on MMRazor + +Knowledge Distillation is a classic model compression method. The core idea is to "imitate" a teacher model (or multi-model ensemble) with better performance and more complex structure by guiding a lightweight student model, improving the performance of the student model without changing its structure. [MMRazor](https://github.com/open-mmlab/mmrazor) is a model compression toolkit for model slimming and AutoML, which supports several KD algorithms. In this project, we take TSM-MobileNetV2 as an example to show how to use MMRazor to perform knowledge distillation on action recognition models. You could refer to more [MMRazor](https://github.com/open-mmlab/mmrazor) for more model compression algorithms. + +## Description + +This is an implementation of MMRazor Knowledge Distillation Application, we provide action recognition configs and models for MMRazor. + +## Usage + +### Prerequisites + +- [MMRazor v1.0.0](https://github.com/open-mmlab/mmrazor/tree/v1.0.0) or higher + +There are two install modes: + +Option (a). 
Install as a Python package
+
+```shell
+mim install "mmrazor>=1.0.0"
+```
+
+Option (b). Install from source
+
+```shell
+git clone https://github.com/open-mmlab/mmrazor.git
+cd mmrazor
+pip install -v -e .
+```
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+
+At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run it every time you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md).
+
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
+### Training commands
+
+**To train with single GPU:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs by slurm:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py --launcher slurm \
+    --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+Please convert the knowledge distillation checkpoint to a student-only checkpoint with the following command. You will get a checkpoint with a '\_student.pth' suffix under the same directory as the original checkpoint. Then use the student-only checkpoint for testing.
+ +```bash +mim run mmrazor convert_kd_ckpt_to_student $CHECKPOINT +``` + +**To test with single GPU:** + +```bash +mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results and models + +| Location | Dataset | Teacher | Student | Acc | Acc(T) | Acc(S) | Config | Download | +| :------: | :----------: | :------------: | :---------------: | :---------: | :----: | :----: | :-------------------: | :---------------------------------------------------------------------------- | +| logits | Kinetics-400 | [TSM-ResNet50] | [TSM-MobileNetV2] | 69.60(+0.9) | 73.22 | 68.71 | [config][distill_tsm] | [teacher][tsm_r50_pth] \| [model][distill_pth_tsm] \| [log][distill_log_tsm] | +| logits | Kinetics-400 | [TSN-Swin] | [TSN-ResNet50] | 75.54(+1.4) | 79.22 | 74.12 | [config][distill_tsn] | [teacher][tsn_swin_pth] \| [model][distill_pth_tsn] \| [log][distill_log_tsn] | + +## Citation + +```latex +@article{huang2022knowledge, + title={Knowledge Distillation from A Stronger Teacher}, + author={Huang, Tao and You, Shan and Wang, Fei and Qian, Chen and Xu, Chang}, + journal={arXiv preprint arXiv:2205.10536}, + year={2022} +} +``` + +[distill_log_tsm]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.log +[distill_log_tsn]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsn-swin_tsn-r50_1x1x8_k400/kd_logits_tsn-swin_tsn-r50_1x1x8_k400.log +[distill_pth_tsm]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400_20230517-c3e8aa0d.pth +[distill_pth_tsn]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsn-swin_tsn-r50_1x1x8_k400/kd_logits_tsn-swin_tsn-r50_1x1x8_k400_student_20230530-f938d404.pth +[distill_tsm]: configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py +[distill_tsn]: configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py +[tsm-mobilenetv2]: ../../configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py +[tsm-resnet50]: ../../configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py +[tsm_r50_pth]: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb_20220831-a6db1e5d.pth +[tsn-resnet50]: ../../configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py +[tsn-swin]: ../../configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py +[tsn_swin_pth]: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth diff --git 
a/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py b/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py new file mode 100644 index 0000000000000000000000000000000000000000..3232c4bb5e6bbb064d19bb37e8fbad7ee56843a1 --- /dev/null +++ b/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py @@ -0,0 +1,36 @@ +_base_ = 'mmaction::recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 + +teacher_ckpt = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth' # noqa: E501 +model = dict( + _delete_=True, + _scope_='mmrazor', + type='SingleTeacherDistill', + architecture=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py', # noqa: E501 + pretrained=False), + teacher=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py', # noqa: E501 + pretrained=False), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + student_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + teacher_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + distill_losses=dict( + loss_dist=dict( + type='DISTLoss', + inter_loss_weight=1.0, + intra_loss_weight=1.0, + tau=1, + loss_weight=1, + )), + loss_forward_mappings=dict( + loss_dist=dict( + logits_S=dict(from_student=True, recorder='logits'), + logits_T=dict(from_student=False, recorder='logits'))))) + +val_cfg = dict(_delete_=True, type='mmrazor.SingleTeacherDistillValLoop') diff --git a/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py b/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py new file mode 100644 index 0000000000000000000000000000000000000000..924c1f84a98af8e26deabb13160cfaf30b06c6a8 --- /dev/null +++ b/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py @@ -0,0 +1,38 @@ +_base_ = 'mmaction::recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 + +teacher_ckpt = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth' # noqa: E501 + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='SingleTeacherDistill', + architecture=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py', # noqa: E501 + backbone=dict(pretrained=False), + pretrained=False), + teacher=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py', # noqa: E501 + pretrained=False), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + student_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + teacher_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + distill_losses=dict( + loss_dist=dict( + type='DISTLoss', + inter_loss_weight=1.0, + intra_loss_weight=1.0, + tau=1, + loss_weight=4, + )), + 
loss_forward_mappings=dict( + loss_dist=dict( + logits_S=dict(from_student=True, recorder='logits'), + logits_T=dict(from_student=False, recorder='logits'))))) + +val_cfg = dict(_delete_=True, type='mmrazor.SingleTeacherDistillValLoop') diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..56b9b08b1ff971ab9f46e630e26568631ba3427d --- /dev/null +++ b/projects/msg3d/README.md @@ -0,0 +1,117 @@ +# MSG3D Project + +[Disentangling and Unifying Graph Convolutions for Skeleton-Based Action Recognition](https://arxiv.org/abs/2003.14111) + + + +## Abstract + + + +Spatial-temporal graphs have been widely used by skeleton-based action recognition algorithms to model human action dynamics. To capture robust movement patterns from these graphs, long-range and multi-scale context aggregation and spatial-temporal dependency modeling are critical aspects of a powerful feature extractor. However, existing methods have limitations in achieving (1) unbiased long-range joint relationship modeling under multi-scale operators and (2) unobstructed cross-spacetime information flow for capturing complex spatial-temporal dependencies. In this work, we present (1) a simple method to disentangle multi-scale graph convolutions and (2) a unified spatial-temporal graph convolutional operator named G3D. The proposed multi-scale aggregation scheme disentangles the importance of nodes in different neighborhoods for effective long-range modeling. The proposed G3D module leverages dense cross-spacetime edges as skip connections for direct information propagation across the spatial-temporal graph. By coupling these proposals, we develop a powerful feature extractor named MS-G3D based on which our model outperforms previous state-of-the-art methods on three large-scale datasets: NTU RGB+D 60, NTU RGB+D 120, and Kinetics Skeleton 400. + + + +
+ +
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. + +Assume that you are located at `$MMACTION2/projects/msg3d`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/skeleton/README.md). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md). + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 92.3 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230309-73b97296.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 89.6 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-c325d222.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{liu2020disentangling, + title={Disentangling and unifying graph convolutions for skeleton-based action recognition}, + author={Liu, Ziyu and Zhang, Hongwen and Chen, Zhenghao and Wang, Zhiyong and Ouyang, Wanli}, + booktitle={CVPR}, + pages={143--152}, + year={2020} +} +``` diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa483e0e9dcc3e2b62a9fa8c2ce5d9e35e26bd5 --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='coco', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, 
momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..07a135edf36e1f6cc2ceb6d85db568eb106e75a5 --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='nturgb+d', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/models/__init__.py b/projects/msg3d/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f37df2570baec9615a97e89a31d75a9e299f389e --- /dev/null +++ b/projects/msg3d/models/__init__.py @@ -0,0 +1,3 @@ +from .msg3d import MSG3D + +__all__ = ['MSG3D'] diff --git a/projects/msg3d/models/msg3d.py b/projects/msg3d/models/msg3d.py new file mode 100644 index 0000000000000000000000000000000000000000..421529378a9b6ac0a86b4171daf3422239f45f90 --- /dev/null +++ b/projects/msg3d/models/msg3d.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule, Sequential + +from mmaction.models.utils import Graph +from mmaction.registry import MODELS +from .msg3d_utils import MSGCN, MSTCN, MW_MSG3DBlock + + +@MODELS.register_module() +class MSG3D(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=96, + num_gcn_scales=13, + num_g3d_scales=6, + num_person=2, + tcn_dropout=0): + super().__init__() + + self.graph = Graph(**graph_cfg) + # Note that A is a 2D tensor + A = torch.tensor( + self.graph.A[0], dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + self.num_point = A.shape[-1] + self.in_channels = in_channels + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(self.num_point * in_channels * + num_person) + c1, c2, c3 = base_channels, base_channels * 2, base_channels * 4 + + # r=3 STGC blocks + self.gcn3d1 = MW_MSG3DBlock(3, c1, A, num_g3d_scales, window_stride=1) + self.sgcn1 = Sequential( + MSGCN(num_gcn_scales, 3, c1, A), MSTCN(c1, c1), MSTCN(c1, c1)) + self.sgcn1[-1].act = nn.Identity() + self.tcn1 = MSTCN(c1, c1, tcn_dropout=tcn_dropout) + + self.gcn3d2 = MW_MSG3DBlock(c1, c2, A, num_g3d_scales, window_stride=2) + self.sgcn2 = Sequential( + MSGCN(num_gcn_scales, c1, c1, A), MSTCN(c1, c2, stride=2), + MSTCN(c2, c2)) + self.sgcn2[-1].act = nn.Identity() + self.tcn2 = MSTCN(c2, c2, tcn_dropout=tcn_dropout) + + self.gcn3d3 = MW_MSG3DBlock(c2, c3, A, num_g3d_scales, window_stride=2) + self.sgcn3 = Sequential( + MSGCN(num_gcn_scales, c2, c2, A), MSTCN(c2, c3, stride=2), + MSTCN(c3, c3)) + self.sgcn3[-1].act = nn.Identity() + self.tcn3 = MSTCN(c3, c3, tcn_dropout=tcn_dropout) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous().reshape(N, M * V * C, T) + x = self.data_bn(x) + x = x.reshape(N * M, V, C, T).permute(0, 2, 3, 1).contiguous() + + # Apply activation to the sum of the pathways + x = F.relu(self.sgcn1(x) + self.gcn3d1(x), inplace=True) + x = self.tcn1(x) + + x = F.relu(self.sgcn2(x) + self.gcn3d2(x), inplace=True) + x = self.tcn2(x) + + x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True) + x = self.tcn3(x) + + # N * M, C, T, V + return x.reshape((N, M) + x.shape[1:]) diff --git a/projects/msg3d/models/msg3d_utils.py b/projects/msg3d/models/msg3d_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c9aac57ad612fae3ea01578cff18bf554aa48f9d --- /dev/null +++ b/projects/msg3d/models/msg3d_utils.py @@ -0,0 +1,342 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn +from mmaction.models.utils.graph import k_adjacency, normalize_digraph + + +class MLP(BaseModule): + 
+ def __init__(self, + in_channels, + out_channels, + act_cfg=dict(type='ReLU'), + dropout=0): + super().__init__() + channels = [in_channels] + out_channels + self.layers = ModuleList() + for i in range(1, len(channels)): + if dropout > 1e-3: + self.layers.append(nn.Dropout(p=dropout)) + self.layers.append( + nn.Conv2d(channels[i - 1], channels[i], kernel_size=1)) + self.layers.append(nn.BatchNorm2d(channels[i])) + if act_cfg: + self.layers.append(build_activation_layer(act_cfg)) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class MSGCN(BaseModule): + + def __init__(self, + num_scales, + in_channels, + out_channels, + A, + dropout=0, + act_cfg=dict(type='ReLU')): + super().__init__() + self.num_scales = num_scales + + A_powers = [ + k_adjacency(A, k, with_self=True) for k in range(num_scales) + ] + A_powers = np.stack([normalize_digraph(g) for g in A_powers]) + + # K, V, V + self.register_buffer('A', torch.Tensor(A_powers)) + self.PA = nn.Parameter(self.A.clone()) + nn.init.uniform_(self.PA, -1e-6, 1e-6) + + self.mlp = MLP( + in_channels * num_scales, [out_channels], + dropout=dropout, + act_cfg=act_cfg) + + def forward(self, x): + N, C, T, V = x.shape + A = self.A + A = A + self.PA + + support = torch.einsum('kvu,nctv->nkctu', A, x) + support = support.reshape(N, self.num_scales * C, T, V) + out = self.mlp(support) + return out + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. +class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = 
tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class UnfoldTemporalWindows(BaseModule): + + def __init__(self, window_size, window_stride, window_dilation=1): + super().__init__() + self.window_size = window_size + self.window_stride = window_stride + self.window_dilation = window_dilation + + self.padding = (window_size + (window_size - 1) * + (window_dilation - 1) - 1) // 2 + self.unfold = nn.Unfold( + kernel_size=(self.window_size, 1), + dilation=(self.window_dilation, 1), + stride=(self.window_stride, 1), + padding=(self.padding, 0)) + + def forward(self, x): + # Input shape: (N,C,T,V), out: (N,C,T,V*window_size) + N, C, T, V = x.shape + x = self.unfold(x) + # Permute extra channels from window size to the graph dimension; + # -1 for number of windows + x = x.reshape(N, C, self.window_size, -1, V).permute(0, 1, 3, 2, + 4).contiguous() + x = x.reshape(N, C, -1, self.window_size * V) + return x + + +class ST_MSGCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_size, + residual=False, + dropout=0, + act_cfg=dict(type='ReLU')): + + super().__init__() + self.num_scales = num_scales + self.window_size = window_size + A = self.build_st_graph(A, window_size) + + A_scales = [ + k_adjacency(A, k, with_self=True) for k in range(num_scales) + ] + A_scales = np.stack([normalize_digraph(g) for g in A_scales]) + + self.register_buffer('A', torch.Tensor(A_scales)) + self.V = len(A) + + self.PA = nn.Parameter(self.A.clone()) + nn.init.uniform_(self.PA, -1e-6, 1e-6) + + self.mlp = MLP( + in_channels * num_scales, [out_channels], + dropout=dropout, + act_cfg=act_cfg) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels): + self.residual = lambda x: x + else: + self.residual = MLP(in_channels, [out_channels], act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + def build_st_graph(self, A, window_size): + if not isinstance(A, np.ndarray): + A = A.data.cpu().numpy() + + assert len(A.shape) == 2 and A.shape[0] == A.shape[1] + V = len(A) + A_with_I = A + np.eye(V, dtype=A.dtype) + + A_large = np.tile(A_with_I, (window_size, window_size)).copy() + return A_large + + def forward(self, x): + N, C, T, V = x.shape # T = number of windows, V = self.V * window_size + A = self.A + self.PA + + # Perform Graph Convolution + res = self.residual(x) + agg = torch.einsum('kvu,nctv->nkctu', A, x) + agg = agg.reshape(N, self.num_scales * C, T, V) + out = self.mlp(agg) + if res == 0: + return self.act(out) + else: + return self.act(out + res) + + +class MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_size, + window_stride, + window_dilation, + embed_factor=1, + activation='relu'): + + super().__init__() + self.window_size = window_size + self.out_channels = out_channels + self.embed_channels_in = out_channels // embed_factor + self.embed_channels_out = out_channels // embed_factor + if embed_factor == 1: + self.in1x1 = nn.Identity() + self.embed_channels_in = self.embed_channels_out = in_channels + # The first STGC block changes channels right away; + # others change at collapse + if in_channels == 3: + self.embed_channels_out = out_channels + else: + self.in1x1 = MLP(in_channels, [self.embed_channels_in]) + + self.gcn3d = Sequential( + UnfoldTemporalWindows(window_size, window_stride, window_dilation), + ST_MSGCN( + 
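`UnfoldTemporalWindows` above is the step that turns a skeleton sequence into sliding space-time windows. The sketch below (plain PyTorch, illustrative sizes, dilation fixed to 1 for simplicity) reproduces only its shape transformation: each temporal window of `window_size` frames is flattened into the joint axis, giving one graph of `window_size * V` nodes per window.

```python
# Shape sketch of UnfoldTemporalWindows.forward() above (dilation = 1).
import torch
import torch.nn as nn

N, C, T, V = 2, 64, 100, 17
window_size, window_stride = 3, 1
padding = (window_size - 1) // 2

unfold = nn.Unfold(kernel_size=(window_size, 1),
                   stride=(window_stride, 1),
                   padding=(padding, 0))

x = torch.randn(N, C, T, V)
x = unfold(x)                                 # (N, C*window_size, T_out*V)
x = x.reshape(N, C, window_size, -1, V)       # split window / window-count axes
x = x.permute(0, 1, 3, 2, 4).reshape(N, C, -1, window_size * V)
print(x.shape)                                # torch.Size([2, 64, 100, 51])
```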
in_channels=self.embed_channels_in, + out_channels=self.embed_channels_out, + A=A, + num_scales=num_scales, + window_size=window_size)) + + self.out_conv = nn.Conv3d( + self.embed_channels_out, + out_channels, + kernel_size=(1, self.window_size, 1)) + self.out_bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + N, _, T, V = x.shape + x = self.in1x1(x) + # Construct temporal windows and apply MS-GCN + x = self.gcn3d(x) + + # Collapse the window dimension + x = x.reshape(N, self.embed_channels_out, -1, self.window_size, V) + x = self.out_conv(x).squeeze(dim=3) + x = self.out_bn(x) + # no activation + return x + + +class MW_MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_sizes=[3, 5], + window_stride=1, + window_dilations=[1, 1]): + + super().__init__() + self.gcn3d = ModuleList([ + MSG3DBlock(in_channels, out_channels, A, num_scales, window_size, + window_stride, window_dilation) for window_size, + window_dilation in zip(window_sizes, window_dilations) + ]) + + def forward(self, x): + out_sum = 0 + for gcn3d in self.gcn3d: + out_sum += gcn3d(x) + return out_sum diff --git a/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..cfe7de00b7d0782930caf96c529ecb63c481647f --- /dev/null +++ b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +model = dict( + type='FasterRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + 
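The companion step in `MSG3DBlock.forward` collapses that window axis again: the ST-MSGCN output is reshaped to re-expose the window dimension, and a 3-D convolution with kernel `(1, window_size, 1)` squeezes it away. A shape sketch with illustrative sizes:

```python
# Shape sketch of the window-collapse step in MSG3DBlock.forward() above.
import torch
import torch.nn as nn

N, C_embed, C_out, T, V, ws = 2, 96, 96, 100, 17, 3
x = torch.randn(N, C_embed, T, ws * V)   # ST_MSGCN output on the windowed graph
x = x.reshape(N, C_embed, T, ws, V)      # re-expose the window axis
out_conv = nn.Conv3d(C_embed, C_out, kernel_size=(1, ws, 1))
x = out_conv(x).squeeze(dim=3)           # collapse the window -> (N, C_out, T, V)
print(x.shape)                           # torch.Size([2, 96, 100, 17])
```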
pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +backend_args = None +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(type='LoadImageFromFile', backend_args=None), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') + ], + backend_args=None)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImageFromFile', backend_args=None), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ], + backend_args=None)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImageFromFile', backend_args=None), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ], + backend_args=None)) +val_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox', + 
format_only=False, + backend_args=None) +test_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=None) +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) +auto_scale_lr = dict(enable=False, base_batch_size=16) +default_scope = 'mmdet' +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend')], + name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py new file mode 100644 index 0000000000000000000000000000000000000000..39a0da1fefc8a2f3438ae2afc9c264ab9612da07 --- /dev/null +++ b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
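Before moving on to the MultiSports-specific override that follows, it can help to load the base detector config above and confirm the fields that will be changed. A short sketch, assuming `mmengine` and `mmdet` are installed; the path is relative to the repository root, so adjust it if you run from `projects/stad_tutorial` as the tutorial notebook does.

```python
# Inspect the base Faster R-CNN config before overriding it for MultiSports.
from mmengine.config import Config

cfg = Config.fromfile(
    'projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py')
print(cfg.model.roi_head.bbox_head.num_classes)  # 80: the COCO setting overridden below
print(cfg.train_dataloader.dataset.ann_file)     # COCO paths, replaced for MultiSports
```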
+_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py' +model = dict(roi_head=dict(bbox_head=dict(num_classes=1))) + +# take 2 epochs as an example +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1) + +# learning rate +param_scheduler = [ + dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001)) + +dataset_type = 'CocoDataset' +# modify metainfo +metainfo = { + 'classes': ('person', ), + 'palette': [ + (220, 20, 60), + ] +} + +# specify metainfo, dataset path +data_root = 'data/multisports/' + +train_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/multisports_det_anno_train.json', + data_prefix=dict(img='rawframes/'), + metainfo=metainfo)) + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/multisports_det_anno_val.json', + data_prefix=dict(img='rawframes/'), + metainfo=metainfo)) + +test_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/ms_infer_anno.json', + data_prefix=dict(img='rawframes/'), + metainfo=metainfo)) + +# specify annotaition file path, modify metric items +val_evaluator = dict( + ann_file='data/multisports/annotations/multisports_det_anno_val.json', + metric_items=['mAP_50', 'AR@100'], + iou_thrs=[0.5], +) + +test_evaluator = dict( + ann_file='data/multisports/annotations/ms_infer_anno.json', + metric_items=['mAP_50', 'AR@100'], + iou_thrs=[0.5], +) + +# specify pretrain checkpoint +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 diff --git a/projects/stad_tutorial/configs/slowonly_k400_multisports.py b/projects/stad_tutorial/configs/slowonly_k400_multisports.py new file mode 100644 index 0000000000000000000000000000000000000000..af33b683a67b9ace07059d2ac11257408f8ee8d2 --- /dev/null +++ b/projects/stad_tutorial/configs/slowonly_k400_multisports.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
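Because `faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py` above only re-declares a handful of keys, everything else is inherited from the base file through `_base_`. A quick way to verify the merged result (a sketch under the same assumptions as before, with the path relative to `projects/stad_tutorial`):

```python
# Verify that the `_base_` override merged as intended.
from mmengine.config import Config

cfg = Config.fromfile('configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py')
print(cfg.model.roi_head.bbox_head.num_classes)          # 1 (overridden)
print(cfg.model.backbone.depth)                          # 50 (inherited from the base)
print(cfg.train_dataloader.dataset.metainfo['classes'])  # ('person',)
print(cfg.load_from)                                     # the person-detector checkpoint
```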
+_base_ = [ + 'mmaction::detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py' # noqa: E501 +] + +proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl' # noqa: E501 +proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl' # noqa: E501 + +train_dataloader = dict( + batch_size=2, + num_workers=2, + dataset=dict(proposal_file=proposal_file_train)) + +val_dataloader = dict( + num_workers=2, dataset=dict(proposal_file=proposal_file_val)) + +optim_wrapper = dict(optimizer=dict(type='SGD', lr=0.01)) + +load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth' # noqa: E501 diff --git a/projects/stad_tutorial/demo_stad.ipynb b/projects/stad_tutorial/demo_stad.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..feb4e0f8a10a074ed4d3b85a721d021543667368 --- /dev/null +++ b/projects/stad_tutorial/demo_stad.ipynb @@ -0,0 +1,4096 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MxFBtHQ4ooZh" + }, + "source": [ + "\"Open" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ff6iCPqqooZp" + }, + "source": [ + "# Spatio-temporal action detection with MMAction2\n", + "Welcome to MMAction2! This is a tutorial on how to use MMAction2 for spatio-temporal action detection. In this tutorial, we will use the MultiSports dataset as an example, and provide a complete step-by-step guide for spatio-temporal action detection, including\n", + "- Prepare spatio-temporal action detection dataset\n", + "- Train detection model\n", + "- Prepare AVA format dataset\n", + "- Train spatio-temporal action detection model\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "xQlffdn7ooZq" + }, + "source": [ + "## 0. 
Install MMAction2 and MMDetection" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4vWjBJI-ooZr", + "outputId": "1c852c24-eb40-407d-e1c4-72d4b43385a3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openmim\n", + " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m51.3/51.3 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n", + "Collecting colorama (from openmim)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting model-index (from openmim)\n", + " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n", + "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n", + "Collecting ordered-set (from model-index->openmim)\n", + " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n", + 
"Installing collected packages: ordered-set, colorama, model-index, openmim\n", + "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmengine\n", + " Downloading mmengine-0.7.4-py3-none-any.whl (374 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m374.3/374.3 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting addict (from mmengine)\n", + " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n", + "Collecting yapf (from mmengine)\n", + " Downloading yapf-0.40.0-py3-none-any.whl (250 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m250.3/250.3 kB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n", + "Collecting importlib-metadata>=6.6.0 (from yapf->mmengine)\n", + " Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)\n", + "Collecting platformdirs>=3.5.1 (from yapf->mmengine)\n", + " Downloading platformdirs-3.5.3-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: tomli>=2.0.1 in 
/usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmengine) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n", + "Installing collected packages: addict, platformdirs, importlib-metadata, yapf, mmengine\n", + " Attempting uninstall: platformdirs\n", + " Found existing installation: platformdirs 3.3.0\n", + " Uninstalling platformdirs-3.3.0:\n", + " Successfully uninstalled platformdirs-3.3.0\n", + "Successfully installed addict-2.4.0 importlib-metadata-6.6.0 mmengine-0.7.4 platformdirs-3.5.3 yapf-0.40.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmcv\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m74.4/74.4 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv) (2.4.0)\n", + "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.7.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv) (1.22.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv) (4.7.0.72)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (3.7.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (2.3.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv) (3.15.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv) (1.16.0)\n", + "Installing collected packages: mmcv\n", + "Successfully installed mmcv-2.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmdet\n", + " Downloading mmdet-3.0.0-py3-none-any.whl (1.7 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmdet) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.22.4)\n", + "Requirement already satisfied: pycocotools in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.6)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.10.1)\n", + "Requirement already satisfied: shapely in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.16.0)\n", + "Collecting terminaltables (from mmdet)\n", + " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: mmcv<2.1.0,>=2.0.0rc4 in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.0)\n", + "Requirement already satisfied: mmengine<1.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from mmdet) (0.7.4)\n", + "Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.4.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.0)\n", + "Requirement already satisfied: yapf in 
/usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (4.7.0.72)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (2.3.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.14.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine<1.0.0,>=0.7.1->mmdet) (0.1.2)\n", + "Installing collected packages: terminaltables, mmdet\n", + "Successfully installed mmdet-3.0.0 terminaltables-3.1.10\n", + "Cloning into 'mmaction2'...\n", + "remote: Enumerating objects: 22869, done.\u001b[K\n", + "remote: Counting objects: 100% (1491/1491), done.\u001b[K\n", + "remote: Compressing objects: 100% (800/800), done.\u001b[K\n", + "remote: Total 22869 (delta 855), reused 1176 (delta 686), pack-reused 21378\u001b[K\n", + "Receiving objects: 100% (22869/22869), 82.81 MiB | 15.42 MiB/s, done.\n", + "Resolving deltas: 100% (15954/15954), done.\n", + "/content/mmaction2\n", + "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Obtaining file:///content/mmaction2\n", + " Running command python setup.py egg_info\n", + " running egg_info\n", + " creating /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info\n", + " writing /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/dependency_links.txt\n", 
+ " writing requirements to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/requires.txt\n", + " writing top-level names to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/top_level.txt\n", + " writing manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " warning: no files found matching 'mmaction/.mim/model-index.yml'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.yml' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.sh' under directory 'mmaction/.mim/tools'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/tools'\n", + " adding license file 'LICENSE'\n", + " writing manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n", + " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m71.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting einops (from mmaction2==1.0.0)\n", + " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n", + "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n", + "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.1+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n", + "Requirement already 
satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.5)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n", + "Installing collected packages: einops, decord, mmaction2\n", + " Running setup.py develop for mmaction2\n", + " Running command python setup.py develop\n", + " running develop\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/command/develop.py:40: EasyInstallDeprecationWarning: easy_install command is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` and ``easy_install``.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://github.com/pypa/setuptools/issues/917 for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " easy_install.initialize_options(self)\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` directly.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " self.initialize_options()\n", + " running egg_info\n", + " creating mmaction2.egg-info\n", + " writing mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to mmaction2.egg-info/dependency_links.txt\n", + " writing requirements to mmaction2.egg-info/requires.txt\n", + " writing top-level names to mmaction2.egg-info/top_level.txt\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " adding 
license file 'LICENSE'\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " running build_ext\n", + " Creating /usr/local/lib/python3.10/dist-packages/mmaction2.egg-link (link to .)\n", + " Adding mmaction2 1.0.0 to easy-install.pth file\n", + "\n", + " Installed /content/mmaction2\n", + "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n", + "/content/mmaction2/projects/stad_tutorial\n" + ] + } + ], + "source": [ + "%pip install -U openmim\n", + "!mim install mmengine\n", + "!mim install mmcv\n", + "!mim install mmdet\n", + "\n", + "!git clone https://github.com/open-mmlab/mmaction2.git\n", + "\n", + "%cd mmaction2\n", + "%pip install -v -e .\n", + "%cd projects/stad_tutorial" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ox0TM64FooZt" + }, + "source": [ + "## 1. Prepare spatio-temporal action detection dataset\n", + "\n", + "Similar to detection tasks that require bounding box annotations, spatio-temporal action detection tasks require temporal and spatial localization, so more complex tube annotations are required. Taking the MultiSports dataset as an example, the `gttubes` field provides all the target action annotations in the video, and the following is an annotation fragment:\n", + "\n", + "```\n", + " 'gttubes': {\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001': # video_key\n", + " {\n", + " 10: # label index\n", + " [\n", + " array([[ 377., 904., 316., 1016., 584.], # 1st tube of class 10\n", + " [ 378., 882., 315., 1016., 579.], # shape (n, 5): n frames๏ผŒeach annotation includes (frame idx๏ผŒx1๏ผŒy1, x2, y2)\n", + " ...\n", + " [ 398., 861., 304., 954., 549.]], dtype=float32)๏ผŒ\n", + "\n", + " array([[ 399., 881., 308., 955., 542.], # 2nd tube of class 10\n", + " [ 400., 862., 303., 988., 539.],\n", + " [ 401., 853., 292., 1000., 535.],\n", + " ...])\n", + " ...\n", + "\n", + " ] ,\n", + " 9: # label index\n", + " [\n", + " array(...), # 1st tube of class 9\n", + " array(...), # 2nd tube of class 9\n", + " ...\n", + " ]\n", + " ...\n", + " }\n", + " }\n", + "```\n", + "\n", + "The annotation file also needs to provide other field information, and the complete ground truth file includes the following information:\n", + "\n", + "```\n", + "{\n", + " 'labels': # label list\n", + " ['aerobic push up', 'aerobic explosive push up', ...],\n", + " 'train_videos': # training video list\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001',\n", + " 'aerobic_gymnastics/v_yaKOumdXwbU_c019',\n", + " ...\n", + " ]\n", + " ]\n", + " 'test_videos': # test video list\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004',\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005',\n", + " ...\n", + " ]\n", + " ]\n", + " 'n_frames': # dict provides frame number of each video\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': 725,\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': 750,\n", + " ...\n", + " }\n", + " 'resolution': # dict provides resolution of each video\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': (720, 1280),\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': (720, 1280),\n", + " ...\n", + " }\n", + " 'gt_tubes': # dict provides bouding boxes of each tube\n", + " {\n", + " ... # refer to above description\n", + " }\n", + "}\n", + "```\n", + "\n", + "The subsequent experiments are based on MultiSports-tiny, we extracted a small number of videos from MultiSports for demonstration purposes." 
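As a complement to the structure listed above, this is how the ground-truth pickle can be loaded and walked in Python. A sketch only: the key names follow the first excerpt above (adjust `gttubes` if your file uses `gt_tubes`), and the path matches the tutorial's data layout.

```python
# Walk the MultiSports GT pickle described above.
import pickle

with open('data/multisports/annotations/multisports_GT.pkl', 'rb') as f:
    gt = pickle.load(f)

print(gt['labels'][:3])               # first few action names
video_key = gt['train_videos'][0][0]  # e.g. 'aerobic_gymnastics/v_...'
print(gt['n_frames'][video_key], gt['resolution'][video_key])

for label, tubes in gt['gttubes'][video_key].items():
    for tube in tubes:                        # tube: (n_frames, 5) array
        frame_idx, x1, y1, x2, y2 = tube[0]   # (frame idx, x1, y1, x2, y2)
        print(gt['labels'][int(label)], tube.shape, (x1, y1, x2, y2))
        break
    break
```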
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n5AzsRvdooZv", + "outputId": "a6cad83b-4613-43cc-8c09-86ac79242656" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-06-15 06:00:15-- https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.215, 163.181.82.216, 163.181.82.218, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.215|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 82780160 (79M) [application/x-tar]\n", + "Saving to: โ€˜data/multisports-tiny.tarโ€™\n", + "\n", + "multisports-tiny.ta 100%[===================>] 78.95M 13.3MB/s in 44s \n", + "\n", + "2023-06-15 06:01:00 (1.78 MB/s) - โ€˜data/multisports-tiny.tarโ€™ saved [82780160/82780160]\n", + "\n", + "multisports-tiny/multisports/\n", + "multisports-tiny/multisports/test/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4\n", + "multisports-tiny/multisports/annotations/\n", + "multisports-tiny/multisports/annotations/multisports_GT.pkl\n", + "multisports-tiny/multisports/trainval/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c001.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c003.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c002.mp4\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.\n", + "Need to get 43.0 kB of archives.\n", + "After this operation, 115 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n", + "Fetched 43.0 kB in 1s (43.0 kB/s)\n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 
122541 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n", + "Unpacking tree (1.8.0-1) ...\n", + "Setting up tree (1.8.0-1) ...\n", + "Processing triggers for man-db (2.9.1-1) ...\n", + "\u001b[01;34mdata\u001b[00m\n", + "โ”œโ”€โ”€ \u001b[01;34mmultisports\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;34mannotations\u001b[00m\n", + "โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;34mtest\u001b[00m\n", + "โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ \u001b[01;32mv_7G_IpU0FxLU_c001.mp4\u001b[00m\n", + "โ”‚ย ย  โ””โ”€โ”€ \u001b[01;34mtrainval\u001b[00m\n", + "โ”‚ย ย  โ””โ”€โ”€ \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;32mv__wAgwttPYaQ_c001.mp4\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;32mv__wAgwttPYaQ_c002.mp4\u001b[00m\n", + "โ”‚ย ย  โ””โ”€โ”€ \u001b[01;32mv__wAgwttPYaQ_c003.mp4\u001b[00m\n", + "โ””โ”€โ”€ \u001b[01;31mmultisports-tiny.tar\u001b[00m\n", + "\n", + "6 directories, 6 files\n" + ] + } + ], + "source": [ + "# Download dataset\n", + "!wget -P data -c https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "!tar -xvf data/multisports-tiny.tar --strip 1 -C data\n", + "!apt-get -q install tree\n", + "!tree data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_u69LHscooZw" + }, + "source": [ + "## 2. Train detection model\n", + "\n", + "In the SlowOnly + Det paradigm, we need to train a human detector first, and then predict actions based on the detection results. In this section, we train a detection model based on the annotation format in the previous section and the MMDetection algorithm library.\n", + "\n", + "### 2.1 Build detection dataset annotation (COCO format)\n", + "\n", + "Based on the annotation information of the spatio-temporal action detection dataset, we can build a COCO format detection dataset for training the detection model. We provide a script to convert the MultiSports format annotation, if you need to convert from other formats, you can refer to the [custom dataset](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html) document provided by MMDetection." 
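The conversion performed by `tools/generate_mmdet_anno.py` in the next cell boils down to flattening every annotated tube frame into COCO `images`/`annotations` entries with a single `person` category. The sketch below is a simplified, hypothetical version of that logic, not the script's exact behaviour: the frame-naming scheme, the `(height, width)` interpretation of `resolution`, and the train/val split handling are all illustrative.

```python
# Hypothetical, simplified tube-to-COCO conversion (see tools/generate_mmdet_anno.py
# for the real implementation used by this tutorial).
import json
import pickle


def tubes_to_coco(gt_pkl, out_json, video_list):
    """Flatten tube annotations into a COCO-style person-detection file."""
    with open(gt_pkl, 'rb') as f:
        gt = pickle.load(f)
    images, annotations, img_ids = [], [], {}
    for video in video_list:
        height, width = gt['resolution'][video]  # assuming (height, width)
        for tubes in gt['gttubes'][video].values():
            for tube in tubes:
                for frame_idx, x1, y1, x2, y2 in tube:
                    key = (video, int(frame_idx))
                    if key not in img_ids:
                        img_ids[key] = len(img_ids) + 1
                        images.append(dict(
                            id=img_ids[key], width=width, height=height,
                            # illustrative naming; the real script defines its own
                            file_name=f'{video}/{int(frame_idx):05d}.jpg'))
                    annotations.append(dict(
                        id=len(annotations) + 1,
                        image_id=img_ids[key],
                        category_id=1,
                        iscrowd=0,
                        bbox=[float(x1), float(y1),
                              float(x2 - x1), float(y2 - y1)],
                        area=float((x2 - x1) * (y2 - y1))))
    coco = dict(images=images, annotations=annotations,
                categories=[dict(id=1, name='person')])
    with open(out_json, 'w') as f:
        json.dump(coco, f)
```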
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e8fu9VtRooZw", + "outputId": "3e7a7053-a08d-4c32-9d66-a362b3de164d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "โ”œโ”€โ”€ multisports_det_anno_train.json\n", + "โ”œโ”€โ”€ multisports_det_anno_val.json\n", + "โ””โ”€โ”€ \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "\n", + "0 directories, 3 files\n" + ] + } + ], + "source": [ + "!python tools/generate_mmdet_anno.py data/multisports/annotations/multisports_GT.pkl data/multisports/annotations/multisports_det_anno.json\n", + "!tree data/multisports/annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HJAb8EwwooZx", + "outputId": "1c82387c-c731-484c-a4cc-8c255b3f2e62" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will generate 3 rgb dir for aerobic_gymnastics.\n", + "Generate v__wAgwttPYaQ_c003 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c002 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c001 rgb dir successfully.\n" + ] + } + ], + "source": [ + "!python tools/generate_rgb.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "9xIOk_XkooZx" + }, + "source": [ + "### 2.2 Modify config file\n", + "\n", + "We use faster-rcnn_x101-64x4d_fpn_1x_coco as the base configuration, and make the following modifications to train on the MultiSports dataset. The following parts need to be modified:\n", + "- Number of model categories\n", + "- Learning rate adjustment strategy\n", + "- Optimizer configuration\n", + "- Dataset/annotation file path\n", + "- Evaluator configuration\n", + "- Pre-trained model\n", + "\n", + "For more detailed tutorials, please refer to the [prepare configuration file](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id9) document provided by MMDetection." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ad1QLNM8ooZy", + "outputId": "55f95e91-8fdf-40fa-dd08-5fa980444b6f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Copyright (c) OpenMMLab. 
All rights reserved.\n", + "_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'\n", + "model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))\n", + "\n", + "# take 2 epochs as an example\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "\n", + "# learning rate\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "\n", + "# optimizer\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))\n", + "\n", + "dataset_type = 'CocoDataset'\n", + "# modify metainfo\n", + "metainfo = {\n", + " 'classes': ('person', ),\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "\n", + "# specify metainfo, dataset path\n", + "data_root = 'data/multisports/'\n", + "\n", + "train_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "test_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "# specify annotaition file path, modify metric items\n", + "val_evaluator = dict(\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "test_evaluator = dict(\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "# specify pretrain checkpoint\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501\n" + ] + } + ], + "source": [ + "!cat configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "W40JO80nooZ0" + }, + "source": [ + "### 2.3 Train detection model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Oc1LWr4AooZ0" + }, + "source": [ + "By using MIM, you can directly train MMDetection models in the current directory. Here is the simplest example of training on a single GPU. For more training commands, please refer to the MIM [tutorial](https://github.com/open-mmlab/mim#command)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QpxCbvr2ooZ0", + "outputId": "ffe7b420-c359-4e5a-a1b1-3a75e923046d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/train.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py --launcher none --work-dir work_dirs/det_model. 
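As the log above shows, the `mim train` wrapper ends up calling mmdet's `tools/train.py` with the chosen config and work directory. If you prefer to stay in Python, the sketch below mirrors what that script does: build an mmengine `Runner` from the merged config and start training. It assumes mmdet 3.x is installed and that you run it from `projects/stad_tutorial`, like the notebook.

```python
# Programmatic equivalent (sketch) of the single-GPU mim training run above.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py')
cfg.work_dir = 'work_dirs/det_model'  # same work dir as the mim command above

runner = Runner.from_cfg(cfg)
runner.train()
```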
\n", + "06/15 06:02:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 503128501\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 503128501\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:02:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " 
pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " 
dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " 
format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/det_model'\n", + "\n", + "06/15 06:02:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:02:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook 
\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.01s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:02:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: open-mmlab://detectron2/resnet50_caffe\n", + "06/15 06:02:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by openmmlab backend from path: open-mmlab://detectron2/resnet50_caffe\n", + "Downloading: \"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_msra-5891d200.pth\n", + "100% 89.9M/89.9M [00:02<00:00, 34.8MB/s]\n", + "06/15 06:02:21 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: conv1.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "100% 158M/158M [00:04<00:00, 37.4MB/s]\n", + "06/15 06:02:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "06/15 06:02:26 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:02:26 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:02:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/det_model.\n", + "06/15 06:02:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 50/118] lr: 5.0000e-03 eta: 0:01:56 time: 0.6273 data_time: 0.0111 memory: 3414 loss: 0.5456 loss_rpn_cls: 0.0070 loss_rpn_bbox: 0.0167 loss_cls: 0.1887 acc: 93.2617 loss_bbox: 0.3332\n", + "06/15 06:03:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 5.0000e-03 eta: 0:01:16 time: 0.5041 data_time: 0.0078 memory: 3414 loss: 0.4017 loss_rpn_cls: 0.0027 loss_rpn_bbox: 0.0130 loss_cls: 0.1313 acc: 94.8242 loss_bbox: 0.2547\n", + "06/15 06:03:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_060208\n", + "06/15 06:03:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:03:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 50/120] eta: 0:00:08 time: 0.1196 data_time: 0.0059 memory: 3414 \n", + "06/15 06:03:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1234 data_time: 0.0082 memory: 679 \n", + "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.05s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.872\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.709\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.886\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.964\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.964\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.964\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.963\n", + "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.872 -1.000 -1.000 -1.000 0.709 0.886\n", + "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9640 data_time: 0.0067 time: 0.1212\n", + "06/15 06:04:14 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 50/118] lr: 5.0000e-03 eta: 0:00:37 time: 0.5316 data_time: 0.0094 memory: 3414 loss: 0.3385 loss_rpn_cls: 0.0012 loss_rpn_bbox: 0.0111 loss_cls: 0.1119 acc: 95.4102 loss_bbox: 0.2143\n", + "06/15 06:04:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) 
[2][100/118] lr: 5.0000e-03 eta: 0:00:09 time: 0.5152 data_time: 0.0078 memory: 3414 loss: 0.3152 loss_rpn_cls: 0.0017 loss_rpn_bbox: 0.0109 loss_cls: 0.1050 acc: 94.7266 loss_bbox: 0.1977\n", + "06/15 06:04:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_060208\n", + "06/15 06:04:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:04:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 50/120] eta: 0:00:08 time: 0.1237 data_time: 0.0080 memory: 3414 \n", + "06/15 06:05:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1202 data_time: 0.0062 memory: 679 \n", + "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.04s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.907\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.762\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.910\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.960\n", + "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.907 -1.000 -1.000 -1.000 0.762 0.910\n", + "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9600 data_time: 0.0066 time: 0.1214\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "!mim train mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --work-dir work_dirs/det_model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "IxlO927KooZ1" + }, + "source": [ + "### 2.4 Generating Proposal BBoxes\n", + "\n", + "During the training of the spatiotemporal action detection model, we need to rely on proposals generated by the detection model, rather than annotated detection boxes. Therefore, we need to use a trained detection model to perform inference on the entire dataset and convert the resulting proposals into the required format for subsequent training.\n", + "\n", + "#### 2.4.1 Converting the Dataset to Coco Format\n", + "\n", + "We provide a script to convert the MultiSports dataset into an annotation format without ground truth, which is used for inference." 
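+ ,
+ "\n",
+ "\n",
+ "Conceptually, the conversion only has to list every extracted frame with its image size and leave the COCO `annotations` field empty, so the detector can run over all frames without any labels. The snippet below is a simplified, illustrative sketch of that idea, not the actual `tools/images2coco.py` (details such as image-id assignment and progress reporting may differ), and the `images_to_coco` helper is not part of the repository; the next code cell runs the real script.\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "import os.path as osp\n",
+ "from glob import glob\n",
+ "\n",
+ "from PIL import Image\n",
+ "\n",
+ "\n",
+ "def images_to_coco(img_root, label_map_file, out_file):\n",
+ "    # read the class names, one per line, from the label map\n",
+ "    with open(label_map_file) as f:\n",
+ "        classes = [line.strip() for line in f if line.strip()]\n",
+ "    categories = [dict(id=i + 1, name=name) for i, name in enumerate(classes)]\n",
+ "\n",
+ "    # one COCO image entry per extracted frame\n",
+ "    images = []\n",
+ "    frames = sorted(glob(osp.join(img_root, '**', '*.jpg'), recursive=True))\n",
+ "    for img_id, path in enumerate(frames):\n",
+ "        width, height = Image.open(path).size\n",
+ "        images.append(dict(\n",
+ "            id=img_id,\n",
+ "            file_name=osp.relpath(path, img_root),\n",
+ "            width=width,\n",
+ "            height=height))\n",
+ "\n",
+ "    # 'annotations' stays empty: the file only drives inference\n",
+ "    coco = dict(images=images, annotations=[], categories=categories)\n",
+ "    with open(out_file, 'w') as f:\n",
+ "        json.dump(coco, f)\n",
+ "```"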
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e6C7D2DSooZ1",
+ "outputId": "878015d1-0fc7-4eb6-af77-4f61aefcf2b2"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[>>] 2350/2350, 2053.0 task/s, elapsed: 1s, ETA: 0s\n",
+ "save json file: data/multisports/rawframes/../annotations/ms_infer_anno.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "!echo 'person' > data/multisports/annotations/label_map.txt\n",
+ "!python tools/images2coco.py \\\n",
+ " data/multisports/rawframes \\\n",
+ " data/multisports/annotations/label_map.txt \\\n",
+ " ms_infer_anno.json"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fGL3t4MEooZ1"
+ },
+ "source": [
+ "#### 2.4.2 Inference for Generating Proposal Files\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gerYk6q6ooZ1"
+ },
+ "source": [
+ "The inference of MMDetection models is also based on MIM. For more testing commands, please refer to the MIM [tutorial](https://github.com/open-mmlab/mim#command).\n",
+ "\n",
+ "After the inference is completed, the results will be saved to 'data/multisports/annotations/ms_det_proposals.pkl'."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lutiaqzpooZ1",
+ "outputId": "b05db6e8-04de-4e1e-8d99-32f4c952d633"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Testing command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/test.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py work_dirs/det_model/epoch_2.pth --launcher none --out data/multisports/annotations/ms_det_proposals.pkl. \n",
+ "06/15 06:05:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 1289054678\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1289054678\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:05:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 
1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", 
+ " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " 
visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'work_dirs/det_model/epoch_2.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = './work_dirs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person'\n", + "\n", + "06/15 06:05:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:05:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:05:20 - mmengine - 
\u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The prefix is not set in metric class DumpDetResults.\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "06/15 06:05:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from work_dirs/det_model/epoch_2.pth\n", + "06/15 06:05:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 50/2350] eta: 0:05:50 time: 0.1523 data_time: 0.0084 memory: 512 \n", + "06/15 06:05:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 100/2350] eta: 0:05:05 time: 0.1191 data_time: 0.0042 memory: 512 \n", + "06/15 06:05:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 150/2350] eta: 0:04:45 time: 0.1178 data_time: 0.0023 memory: 512 \n", + "06/15 06:05:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 200/2350] eta: 0:04:36 time: 0.1255 data_time: 0.0074 memory: 512 \n", + "06/15 06:05:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 250/2350] eta: 0:04:26 time: 0.1205 data_time: 0.0031 memory: 512 \n", + "06/15 06:05:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 300/2350] eta: 0:04:19 time: 0.1238 data_time: 0.0063 memory: 512 \n", + "06/15 06:06:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 350/2350] eta: 0:04:11 time: 0.1206 data_time: 0.0046 memory: 512 \n", + "06/15 06:06:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 400/2350] eta: 0:04:03 time: 0.1178 data_time: 0.0030 memory: 512 \n", + "06/15 06:06:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 450/2350] eta: 0:03:56 time: 0.1212 data_time: 0.0058 memory: 512 \n", + "06/15 06:06:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 500/2350] eta: 0:03:48 time: 0.1165 data_time: 0.0031 memory: 512 \n", + "06/15 06:06:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 550/2350] eta: 0:03:41 time: 0.1202 data_time: 0.0061 memory: 512 \n", + "06/15 06:06:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 600/2350] eta: 0:03:34 time: 0.1179 data_time: 0.0044 memory: 512 \n", + "06/15 06:06:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 650/2350] eta: 0:03:27 time: 0.1156 data_time: 0.0024 memory: 512 \n", + "06/15 06:06:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 700/2350] eta: 0:03:21 time: 0.1212 data_time: 0.0058 memory: 512 \n", + "06/15 06:06:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 750/2350] eta: 0:03:14 time: 0.1161 data_time: 0.0025 memory: 512 \n", + "06/15 06:06:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 800/2350] eta: 0:03:08 time: 0.1200 data_time: 0.0058 memory: 512 \n", + "06/15 06:07:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 850/2350] eta: 0:03:02 time: 0.1203 data_time: 0.0053 memory: 512 \n", + "06/15 06:07:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 900/2350] eta: 0:02:55 time: 0.1177 data_time: 0.0030 memory: 512 \n", + "06/15 06:07:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 950/2350] eta: 0:02:50 time: 0.1233 data_time: 0.0076 memory: 512 \n", + "06/15 06:07:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1000/2350] eta: 0:02:43 time: 0.1172 data_time: 0.0025 memory: 512 \n", + "06/15 06:07:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1050/2350] eta: 0:02:37 time: 0.1202 data_time: 0.0053 memory: 512 \n", + "06/15 06:07:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 
Epoch(test) [1100/2350] eta: 0:02:31 time: 0.1208 data_time: 0.0059 memory: 512 \n", + "06/15 06:07:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1150/2350] eta: 0:02:25 time: 0.1167 data_time: 0.0030 memory: 512 \n", + "06/15 06:07:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1200/2350] eta: 0:02:19 time: 0.1212 data_time: 0.0053 memory: 512 \n", + "06/15 06:07:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1250/2350] eta: 0:02:12 time: 0.1163 data_time: 0.0027 memory: 512 \n", + "06/15 06:07:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1300/2350] eta: 0:02:06 time: 0.1188 data_time: 0.0046 memory: 512 \n", + "06/15 06:08:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1350/2350] eta: 0:02:00 time: 0.1201 data_time: 0.0056 memory: 512 \n", + "06/15 06:08:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1400/2350] eta: 0:01:54 time: 0.1161 data_time: 0.0024 memory: 512 \n", + "06/15 06:08:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1450/2350] eta: 0:01:48 time: 0.1234 data_time: 0.0079 memory: 512 \n", + "06/15 06:08:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1500/2350] eta: 0:01:42 time: 0.1165 data_time: 0.0024 memory: 512 \n", + "06/15 06:08:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1550/2350] eta: 0:01:36 time: 0.1191 data_time: 0.0043 memory: 512 \n", + "06/15 06:08:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1600/2350] eta: 0:01:30 time: 0.1219 data_time: 0.0071 memory: 512 \n", + "06/15 06:08:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1650/2350] eta: 0:01:24 time: 0.1166 data_time: 0.0026 memory: 512 \n", + "06/15 06:08:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1700/2350] eta: 0:01:18 time: 0.1224 data_time: 0.0067 memory: 512 \n", + "06/15 06:08:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1750/2350] eta: 0:01:12 time: 0.1175 data_time: 0.0032 memory: 512 \n", + "06/15 06:08:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1800/2350] eta: 0:01:06 time: 0.1186 data_time: 0.0041 memory: 512 \n", + "06/15 06:09:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1850/2350] eta: 0:01:00 time: 0.1227 data_time: 0.0067 memory: 512 \n", + "06/15 06:09:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1900/2350] eta: 0:00:54 time: 0.1220 data_time: 0.0070 memory: 512 \n", + "06/15 06:09:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1950/2350] eta: 0:00:48 time: 0.1229 data_time: 0.0081 memory: 512 \n", + "06/15 06:09:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2000/2350] eta: 0:00:42 time: 0.1173 data_time: 0.0029 memory: 512 \n", + "06/15 06:09:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2050/2350] eta: 0:00:36 time: 0.1184 data_time: 0.0037 memory: 512 \n", + "06/15 06:09:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2100/2350] eta: 0:00:30 time: 0.1216 data_time: 0.0066 memory: 512 \n", + "06/15 06:09:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2150/2350] eta: 0:00:24 time: 0.1166 data_time: 0.0026 memory: 512 \n", + "06/15 06:09:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2200/2350] eta: 0:00:18 time: 0.1213 data_time: 0.0052 memory: 512 \n", + "06/15 06:09:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2250/2350] eta: 0:00:12 time: 0.1180 data_time: 0.0033 memory: 512 
\n", + "06/15 06:09:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2300/2350] eta: 0:00:06 time: 0.1173 data_time: 0.0032 memory: 512 \n", + "06/15 06:10:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] eta: 0:00:00 time: 0.1203 data_time: 0.0048 memory: 512 \n", + "06/15 06:10:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.01s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.36s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.28s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: -1.000 -1.000 -1.000 -1.000 -1.000 -1.000\n", + "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Results has been saved to data/multisports/annotations/ms_det_proposals.pkl.\n", + "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: -1.0000 data_time: 0.0047 time: 0.1202\n", + "\u001b[32mTesting finished successfully.\u001b[0m\n" + ] + } + ], + "source": [ + "!mim test mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --out data/multisports/annotations/ms_det_proposals.pkl" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jzWhc7ClooZ1" + }, + "source": [ + "## 3. Training the Spatio-temporal Action Detection Model\n", + "The provided annotation files and the proposal files generated by MMDetection need to be converted to the required format for training the spatiotemporal action detection model. We have provided relevant script to generate the specified format." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "W3slJsWHooZ2",
+ "outputId": "42a4b7be-91f8-4443-b693-ab40b743a14f"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading test result...\n",
+ "[>>] 2350/2350, 3799.7 task/s, elapsed: 1s, ETA: 0s\n",
+ "\u001b[01;34mdata/multisports/annotations\u001b[00m\n",
+ "├── label_map.txt\n",
+ "├── ms_det_proposals.pkl\n",
+ "├── ms_infer_anno.json\n",
+ "├── multisports_det_anno_train.json\n",
+ "├── multisports_det_anno_val.json\n",
+ "├── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "├── multisports_proposals_train.pkl\n",
+ "├── multisports_proposals_val.pkl\n",
+ "├── multisports_train.csv\n",
+ "└── multisports_val.csv\n",
+ "\n",
+ "0 directories, 10 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Convert annotation files\n",
+ "!python ../../tools/data/multisports/parse_anno.py\n",
+ "\n",
+ "# Convert proposal files\n",
+ "!python tools/convert_proposals.py\n",
+ "\n",
+ "!tree data/multisports/annotations"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yRSSHmw0ooZ2"
+ },
+ "source": [
+ "### 3.2 Training the Spatio-temporal Action Detection Model\n",
+ "\n",
+ "MMAction2 already supports training on the MultiSports dataset. You just need to modify the path to the proposal file. For detailed configurations, please refer to the [config](configs/slowonly_k400_multisports.py) file. Since the training data is limited, the configuration uses a pre-trained model trained on the complete MultiSports dataset. When training with a custom dataset, you don't need to specify the `load_from` configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vwaay7NvooZ2",
+ "outputId": "add60ddd-2a40-4356-b120-1e7940043778"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training command is /usr/bin/python3 /content/mmaction2/mmaction/.mim/tools/train.py configs/slowonly_k400_multisports.py --launcher none --work-dir work_dirs/stad_model/. \n",
+ "06/15 06:10:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 1735696538\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1735696538\n", + " diff_rank_seed: False\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:10:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook', _scope_='mmaction'),\n", + " timer=dict(type='IterTimerHook', _scope_='mmaction'),\n", + " logger=dict(\n", + " type='LoggerHook', interval=20, ignore_last=False, _scope_='mmaction'),\n", + " param_scheduler=dict(type='ParamSchedulerHook', _scope_='mmaction'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook',\n", + " interval=1,\n", + " save_best='auto',\n", + " _scope_='mmaction'),\n", + " sampler_seed=dict(type='DistSamplerSeedHook', _scope_='mmaction'),\n", + " sync_buffers=dict(type='SyncBuffersHook', _scope_='mmaction'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(\n", + " type='LogProcessor', window_size=20, by_epoch=True, _scope_='mmaction')\n", + "vis_backends = [dict(type='LocalVisBackend', _scope_='mmaction')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer',\n", + " 
vis_backends=[dict(type='LocalVisBackend')],\n", + " _scope_='mmaction')\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'\n", + "resume = False\n", + "url = 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + "num_classes = 66\n", + "model = dict(\n", + " type='FastRCNN',\n", + " _scope_='mmdet',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + " ),\n", + " backbone=dict(\n", + " type='mmaction.ResNet3dSlowOnly',\n", + " depth=50,\n", + " pretrained=None,\n", + " pretrained2d=False,\n", + " lateral=False,\n", + " num_stages=4,\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_t=1,\n", + " pool1_stride_t=1,\n", + " spatial_strides=(1, 2, 2, 1)),\n", + " roi_head=dict(\n", + " type='AVARoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor3D',\n", + " roi_layer_type='RoIAlign',\n", + " output_size=8,\n", + " with_temporal_pool=True),\n", + " bbox_head=dict(\n", + " type='BBoxHeadAVA',\n", + " in_channels=2048,\n", + " num_classes=66,\n", + " multilabel=False,\n", + " dropout_ratio=0.5)),\n", + " data_preprocessor=dict(\n", + " type='mmaction.ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCTHW'),\n", + " train_cfg=dict(\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssignerAVA',\n", + " pos_iou_thr=0.9,\n", + " neg_iou_thr=0.9,\n", + " min_pos_iou=0.9),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=32,\n", + " pos_fraction=1,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=1.0)),\n", + " test_cfg=dict(rcnn=None))\n", + "dataset_type = 'AVADataset'\n", + "data_root = 'data/multisports/trainval'\n", + "anno_root = 'data/multisports/annotations'\n", + "ann_file_train = 'data/multisports/annotations/multisports_train.csv'\n", + "ann_file_val = 'data/multisports/annotations/multisports_val.csv'\n", + "gt_file = 'data/multisports/annotations/multisports_GT.pkl'\n", + "proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'\n", + "proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='RandomRescale', scale_range=(256, 320), _scope_='mmaction'),\n", + " dict(type='RandomCrop', size=256, _scope_='mmaction'),\n", + " dict(type='Flip', flip_ratio=0.5, _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "val_pipeline 
= [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='Resize', scale=(-1, 256), _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_train.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),\n", + " dict(type='DecordDecode'),\n", + " dict(type='RandomRescale', scale_range=(256, 320)),\n", + " dict(type='RandomCrop', size=256),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_train.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_val.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_dense_proposals_val.recall_96.13.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " 
start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "test_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop',\n", + " max_epochs=8,\n", + " val_begin=1,\n", + " val_interval=1,\n", + " _scope_='mmaction')\n", + "val_cfg = dict(type='ValLoop', _scope_='mmaction')\n", + "test_cfg = dict(type='TestLoop', _scope_='mmaction')\n", + "param_scheduler = [\n", + " dict(\n", + " type='LinearLR',\n", + " start_factor=0.1,\n", + " by_epoch=True,\n", + " begin=0,\n", + " end=5,\n", + " _scope_='mmaction'),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=8,\n", + " by_epoch=True,\n", + " milestones=[6, 7],\n", + " gamma=0.1,\n", + " _scope_='mmaction')\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD',\n", + " lr=0.01,\n", + " momentum=0.9,\n", + " weight_decay=1e-05,\n", + " _scope_='mmaction'),\n", + " clip_grad=dict(max_norm=5, norm_type=2))\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/stad_model/'\n", + "randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)\n", + "\n", + "06/15 06:10:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:10:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + 
"after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "06/15 06:10:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 236 out of 236 frames are valid.\n", + "06/15 06:10:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 120 out of 120 frames are valid.\n", + "06/15 06:10:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "06/15 06:10:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "100% 124M/124M [00:01<00:00, 103MB/s]\n", + "06/15 06:10:28 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: cls_head.fc_cls.weight, cls_head.fc_cls.bias\n", + "\n", + "missing keys in source state_dict: roi_head.bbox_head.fc_cls.weight, roi_head.bbox_head.fc_cls.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "100% 122M/122M [00:03<00:00, 36.1MB/s]\n", + "06/15 06:10:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "06/15 06:10:32 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:10:32 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:10:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/stad_model.\n", + "06/15 06:10:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 20/118] lr: 1.0000e-03 eta: 0:06:07 time: 0.3982 data_time: 0.0431 memory: 1383 grad_norm: 13.0844 loss: 1.3834 recall@thr=0.5: 0.5385 prec@thr=0.5: 0.5385 recall@top3: 0.8462 prec@top3: 0.2821 recall@top5: 0.8462 prec@top5: 0.1692 loss_action_cls: 1.3834\n", + "06/15 06:10:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 40/118] lr: 1.0000e-03 eta: 0:05:32 time: 0.3383 data_time: 0.0732 memory: 1383 grad_norm: 4.6786 loss: 0.6001 recall@thr=0.5: 0.9444 prec@thr=0.5: 0.9444 recall@top3: 0.9444 prec@top3: 0.3148 recall@top5: 0.9444 prec@top5: 0.1889 loss_action_cls: 0.6001\n", + "06/15 06:10:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 60/118] lr: 1.0000e-03 eta: 0:04:59 time: 0.2784 data_time: 0.0300 memory: 1383 grad_norm: 2.9446 loss: 0.5144 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5144\n", + "06/15 06:10:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 80/118] lr: 1.0000e-03 eta: 0:04:36 time: 0.2646 data_time: 0.0144 memory: 1383 grad_norm: 1.7695 loss: 0.4988 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 0.6923 prec@top3: 0.2308 recall@top5: 0.6923 prec@top5: 0.1385 loss_action_cls: 0.4988\n", + "06/15 06:11:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 1.0000e-03 eta: 0:04:35 time: 0.3502 data_time: 0.0839 memory: 1383 grad_norm: 2.4095 loss: 0.3218 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3218\n", + "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][118/118] lr: 1.0000e-03 eta: 0:04:20 time: 0.2563 data_time: 0.0102 memory: 1383 grad_norm: 1.8156 loss: 0.3895 recall@thr=0.5: 0.8125 prec@thr=0.5: 0.8125 recall@top3: 0.9375 prec@top3: 0.3125 recall@top5: 0.9375 prec@top5: 0.1875 loss_action_cls: 0.3895\n", + "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:11:14 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 20/120] eta: 0:00:16 time: 0.1669 data_time: 0.1073 memory: 466 \n", + "06/15 06:11:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 40/120] eta: 0:00:13 time: 0.1698 data_time: 0.1145 memory: 466 \n", + "06/15 06:11:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 60/120] eta: 0:00:09 time: 0.1428 data_time: 0.0896 memory: 466 \n", + "06/15 06:11:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 80/120] eta: 0:00:05 time: 0.0998 data_time: 0.0504 memory: 466 \n", + "06/15 06:11:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1122 data_time: 0.0612 memory: 466 \n", + "06/15 06:11:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) 
[1][120/120] eta: 0:00:00 time: 0.1031 data_time: 0.0528 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 59.66\n", + "aerobic split jump 30.80\n", + "aerobic scissors leap 88.34\n", + "aerobic turn 98.48\n", + "mAP 69.32\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 25.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 100.00\n", + "mAP 56.25\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 25.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 50.00\n", + "aerobic turn 100.00\n", + "mAP 43.75\n", + "06/15 06:11:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] mAP/frameAP: 69.3181 mAP/v_map@0.2: 56.2500 mAP/v_map@0.5: 43.7500 mAP/v_map_0.05:0.45: 55.1389 mAP/v_map_0.10:0.90: 41.2500 mAP/v_map_0.50:0.95: 28.1750 data_time: 0.0793 time: 0.1324\n", + "06/15 06:11:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - The best checkpoint with 69.3181 mAP/frameAP at 1 epoch is saved to best_mAP_frameAP_epoch_1.pth.\n", + "06/15 06:11:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 20/118] lr: 3.2500e-03 eta: 0:04:10 time: 0.2884 data_time: 0.0401 memory: 1383 grad_norm: 1.3823 loss: 0.3596 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3596\n", + "06/15 06:11:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 40/118] lr: 3.2500e-03 eta: 0:04:00 time: 0.2728 data_time: 0.0204 memory: 1383 grad_norm: 1.2185 loss: 0.5274 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5274\n", + "06/15 
06:11:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 60/118] lr: 3.2500e-03 eta: 0:03:56 time: 0.3296 data_time: 0.0699 memory: 1383 grad_norm: 1.7120 loss: 0.3599 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3599\n", + "06/15 06:11:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 80/118] lr: 3.2500e-03 eta: 0:03:46 time: 0.2584 data_time: 0.0120 memory: 1383 grad_norm: 1.7462 loss: 0.2598 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2598\n", + "06/15 06:12:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 3.2500e-03 eta: 0:03:39 time: 0.2858 data_time: 0.0263 memory: 1383 grad_norm: 0.8975 loss: 0.3959 recall@thr=0.5: 0.7692 prec@thr=0.5: 0.7692 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.3959\n", + "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][118/118] lr: 3.2500e-03 eta: 0:03:35 time: 0.3381 data_time: 0.0807 memory: 1383 grad_norm: 0.5466 loss: 0.4871 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4871\n", + "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:12:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 20/120] eta: 0:00:12 time: 0.1230 data_time: 0.0693 memory: 466 \n", + "06/15 06:12:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 40/120] eta: 0:00:09 time: 0.1138 data_time: 0.0632 memory: 466 \n", + "06/15 06:12:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 60/120] eta: 0:00:07 time: 0.1214 data_time: 0.0672 memory: 466 \n", + "06/15 06:12:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 80/120] eta: 0:00:05 time: 0.1539 data_time: 0.1001 memory: 466 \n", + "06/15 06:12:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1488 data_time: 0.0936 memory: 466 \n", + "06/15 06:12:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] eta: 0:00:00 time: 0.1030 data_time: 0.0539 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 
volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 39.91\n", + "aerobic split jump 29.66\n", + "aerobic scissors leap 90.70\n", + "aerobic turn 96.92\n", + "mAP 64.30\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 55.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:12:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] mAP/frameAP: 64.2982 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 53.8889 mAP/v_map_0.10:0.90: 34.5833 mAP/v_map_0.50:0.95: 19.1250 data_time: 0.0744 time: 0.1270\n", + "06/15 06:12:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 20/118] lr: 5.5000e-03 eta: 0:03:28 time: 0.2786 data_time: 0.0358 memory: 1383 grad_norm: 1.0935 loss: 0.3780 recall@thr=0.5: 0.8667 prec@thr=0.5: 0.8667 recall@top3: 0.8667 prec@top3: 0.2889 recall@top5: 0.8667 prec@top5: 0.1733 loss_action_cls: 0.3780\n", + "06/15 06:12:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 40/118] lr: 5.5000e-03 eta: 0:03:22 time: 0.3217 data_time: 0.0573 memory: 1383 grad_norm: 1.4278 loss: 0.3261 recall@thr=0.5: 0.8750 prec@thr=0.5: 0.8750 recall@top3: 0.9375 prec@top3: 0.3125 recall@top5: 0.9375 prec@top5: 0.1875 loss_action_cls: 0.3261\n", + "06/15 06:12:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 60/118] lr: 5.5000e-03 eta: 0:03:15 time: 0.2823 data_time: 0.0358 memory: 1383 grad_norm: 
0.6230 loss: 0.4514 recall@thr=0.5: 0.9286 prec@thr=0.5: 0.9286 recall@top3: 0.9286 prec@top3: 0.3095 recall@top5: 0.9286 prec@top5: 0.1857 loss_action_cls: 0.4514\n", + "06/15 06:12:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 80/118] lr: 5.5000e-03 eta: 0:03:08 time: 0.2561 data_time: 0.0115 memory: 1383 grad_norm: 0.1768 loss: 0.3241 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3241\n", + "06/15 06:12:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][100/118] lr: 5.5000e-03 eta: 0:03:02 time: 0.3094 data_time: 0.0422 memory: 1383 grad_norm: 0.4979 loss: 0.4081 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4081\n", + "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][118/118] lr: 5.5000e-03 eta: 0:02:56 time: 0.2776 data_time: 0.0266 memory: 1383 grad_norm: 0.7488 loss: 0.4131 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 0.6667 prec@top3: 0.2222 recall@top5: 0.6667 prec@top5: 0.1333 loss_action_cls: 0.4131\n", + "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 3 epochs\n", + "06/15 06:13:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 20/120] eta: 0:00:11 time: 0.1182 data_time: 0.0691 memory: 466 \n", + "06/15 06:13:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 40/120] eta: 0:00:09 time: 0.1132 data_time: 0.0628 memory: 466 \n", + "06/15 06:13:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 60/120] eta: 0:00:07 time: 0.1542 data_time: 0.0996 memory: 466 \n", + "06/15 06:13:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 80/120] eta: 0:00:05 time: 0.1479 data_time: 0.0937 memory: 466 \n", + "06/15 06:13:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][100/120] eta: 0:00:02 time: 0.1232 data_time: 0.0726 memory: 466 \n", + "06/15 06:13:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] eta: 0:00:00 time: 0.1029 data_time: 0.0529 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such 
label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 29.65\n", + "aerobic split jump 20.83\n", + "aerobic scissors leap 90.63\n", + "aerobic turn 97.10\n", + "mAP 59.55\n", + "\u001b[2Klinking tubes... \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 
football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 50.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass 
steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:13:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] mAP/frameAP: 59.5538 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 50.0000 mAP/v_map_0.10:0.90: 32.9167 mAP/v_map_0.50:0.95: 19.1250 data_time: 0.0750 time: 0.1264\n", + "06/15 06:13:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 20/118] lr: 7.7500e-03 eta: 0:02:50 time: 0.3089 data_time: 0.0514 memory: 1383 grad_norm: 0.2046 loss: 0.3238 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 0.9091 prec@top3: 0.3030 recall@top5: 0.9091 prec@top5: 0.1818 loss_action_cls: 0.3238\n", + "06/15 06:13:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 40/118] lr: 7.7500e-03 eta: 0:02:46 time: 0.3790 data_time: 0.0937 memory: 1383 grad_norm: 0.7468 loss: 0.4123 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4123\n", + "06/15 06:13:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 60/118] lr: 7.7500e-03 eta: 0:02:39 time: 0.2685 data_time: 0.0171 memory: 1383 grad_norm: 0.1904 loss: 0.4407 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 0.6667 prec@top3: 0.2222 recall@top5: 0.6667 prec@top5: 0.1333 loss_action_cls: 0.4407\n", + "06/15 06:13:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 80/118] lr: 7.7500e-03 eta: 0:02:32 time: 0.2546 data_time: 0.0100 memory: 1383 grad_norm: 0.1966 loss: 0.4266 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4266\n", + "06/15 06:13:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][100/118] lr: 7.7500e-03 eta: 0:02:27 time: 0.3283 data_time: 0.0548 memory: 1383 grad_norm: 0.3165 loss: 0.3308 recall@thr=0.5: 0.8000 prec@thr=0.5: 0.8000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3308\n", + "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][118/118] lr: 7.7500e-03 eta: 0:02:21 time: 0.2671 data_time: 0.0151 memory: 1383 grad_norm: 0.1487 loss: 0.3003 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3003\n", + "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 4 epochs\n", + "06/15 06:13:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 20/120] eta: 0:00:12 time: 0.1273 data_time: 0.0729 memory: 466 \n", + "06/15 06:14:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 40/120] eta: 0:00:10 time: 0.1306 data_time: 0.0797 memory: 466 \n", + "06/15 06:14:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 60/120] eta: 0:00:08 time: 0.1539 data_time: 0.0979 memory: 
466 \n", + "06/15 06:14:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 80/120] eta: 0:00:05 time: 0.1355 data_time: 0.0815 memory: 466 \n", + "06/15 06:14:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][100/120] eta: 0:00:02 time: 0.1132 data_time: 0.0646 memory: 466 \n", + "06/15 06:14:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] eta: 0:00:00 time: 0.1050 data_time: 0.0553 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 23.92\n", + "aerobic split jump 19.60\n", + "aerobic scissors leap 91.02\n", + "aerobic turn 96.05\n", + "mAP 57.64\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 50.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:14:11 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] mAP/frameAP: 57.6444 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 50.0000 mAP/v_map_0.10:0.90: 32.9167 mAP/v_map_0.50:0.95: 18.3250 data_time: 0.0753 time: 0.1274\n", + "06/15 06:14:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 20/118] lr: 1.0000e-02 eta: 0:02:14 time: 0.2810 data_time: 0.0329 memory: 1383 grad_norm: 0.6113 loss: 0.4312 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.4312\n", + "06/15 06:14:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 40/118] lr: 1.0000e-02 eta: 0:02:09 time: 0.3316 data_time: 0.0732 memory: 1383 grad_norm: 0.2282 loss: 0.3932 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.3932\n", + "06/15 06:14:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 60/118] lr: 1.0000e-02 eta: 0:02:03 time: 0.2738 data_time: 0.0286 memory: 1383 grad_norm: 
0.2938 loss: 0.3828 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 0.8571 prec@top5: 0.1714 loss_action_cls: 0.3828\n", + "06/15 06:14:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 80/118] lr: 1.0000e-02 eta: 0:01:56 time: 0.2756 data_time: 0.0192 memory: 1383 grad_norm: 0.1112 loss: 0.3722 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3722\n", + "06/15 06:14:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][100/118] lr: 1.0000e-02 eta: 0:01:51 time: 0.3193 data_time: 0.0573 memory: 1383 grad_norm: 0.6399 loss: 0.4427 recall@thr=0.5: 0.8000 prec@thr=0.5: 0.8000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4427\n", + "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][118/118] lr: 1.0000e-02 eta: 0:01:45 time: 0.2535 data_time: 0.0093 memory: 1383 grad_norm: 0.0985 loss: 0.2719 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2719\n", + "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 5 epochs\n", + "06/15 06:14:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 20/120] eta: 0:00:13 time: 0.1329 data_time: 0.0774 memory: 466 \n", + "06/15 06:14:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 40/120] eta: 0:00:12 time: 0.1787 data_time: 0.1259 memory: 466 \n", + "06/15 06:14:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 60/120] eta: 0:00:08 time: 0.1363 data_time: 0.0829 memory: 466 \n", + "06/15 06:14:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 80/120] eta: 0:00:05 time: 0.1012 data_time: 0.0513 memory: 466 \n", + "06/15 06:15:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][100/120] eta: 0:00:02 time: 0.1095 data_time: 0.0593 memory: 466 \n", + "06/15 06:15:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] eta: 0:00:00 time: 0.1033 data_time: 0.0536 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such 
label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 14.21\n", + "aerobic split jump 15.37\n", + "aerobic scissors leap 91.25\n", + "aerobic turn 91.43\n", + "mAP 53.06\n", + "\u001b[2Klinking tubes... \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 
football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 80.00\n", + "mAP 45.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass 
steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 20.00\n", + "mAP 14.00\n", + "06/15 06:15:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] mAP/frameAP: 53.0627 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 14.0000 mAP/v_map_0.05:0.45: 40.0000 mAP/v_map_0.10:0.90: 22.4444 mAP/v_map_0.50:0.95: 7.0250 data_time: 0.0749 time: 0.1267\n", + "06/15 06:15:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 20/118] lr: 1.0000e-02 eta: 0:01:39 time: 0.3193 data_time: 0.0634 memory: 1383 grad_norm: 0.5229 loss: 0.3929 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3929\n", + "06/15 06:15:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 40/118] lr: 1.0000e-02 eta: 0:01:33 time: 0.2972 data_time: 0.0439 memory: 1383 grad_norm: 0.4621 loss: 0.2891 recall@thr=0.5: 0.7692 prec@thr=0.5: 0.7692 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.2891\n", + "06/15 06:15:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 60/118] lr: 1.0000e-02 eta: 0:01:27 time: 0.2567 data_time: 0.0127 memory: 1383 grad_norm: 0.2534 loss: 0.3438 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3438\n", + "06/15 06:15:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 80/118] lr: 1.0000e-02 eta: 0:01:21 time: 0.3277 data_time: 0.0645 memory: 1383 grad_norm: 0.0856 loss: 0.1859 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 0.8571 prec@top5: 0.1714 loss_action_cls: 0.1859\n", + "06/15 06:15:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][100/118] lr: 1.0000e-02 eta: 0:01:15 time: 0.2995 data_time: 0.0503 memory: 1383 grad_norm: 0.3619 loss: 0.3205 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3205\n", + "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][118/118] lr: 1.0000e-02 eta: 0:01:10 time: 0.2619 data_time: 0.0190 memory: 1383 grad_norm: 0.3812 loss: 0.3911 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3911\n", + "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 6 epochs\n", + "06/15 06:15:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 20/120] eta: 0:00:17 time: 0.1739 data_time: 0.1178 memory: 466 \n", + "06/15 06:15:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 40/120] eta: 0:00:13 time: 0.1519 data_time: 0.1032 memory: 466 \n", + "06/15 06:15:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 60/120] eta: 0:00:08 time: 0.1031 data_time: 0.0536 memory: 466 
\n", + "06/15 06:15:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 80/120] eta: 0:00:05 time: 0.0998 data_time: 0.0505 memory: 466 \n", + "06/15 06:15:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][100/120] eta: 0:00:02 time: 0.1126 data_time: 0.0620 memory: 466 \n", + "06/15 06:15:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] eta: 0:00:00 time: 0.0995 data_time: 0.0506 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 10.49\n", + "aerobic split jump 14.53\n", + "aerobic scissors leap 90.24\n", + "aerobic turn 87.53\n", + "mAP 50.70\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 40.00\n", + "mAP 35.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 40.00\n", + "mAP 19.00\n", + "06/15 06:15:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] mAP/frameAP: 50.6970 mAP/v_map@0.2: 35.0000 mAP/v_map@0.5: 19.0000 mAP/v_map_0.05:0.45: 35.0000 mAP/v_map_0.10:0.90: 20.7778 mAP/v_map_0.50:0.95: 8.4000 data_time: 0.0724 time: 0.1229\n", + "06/15 06:16:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 20/118] lr: 1.0000e-03 eta: 0:01:04 time: 0.3578 data_time: 0.0847 memory: 1383 grad_norm: 0.5369 loss: 0.3628 recall@thr=0.5: 0.9167 prec@thr=0.5: 0.9167 recall@top3: 0.9167 prec@top3: 0.3056 recall@top5: 0.9167 prec@top5: 0.1833 loss_action_cls: 0.3628\n", + "06/15 06:16:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 40/118] lr: 1.0000e-03 eta: 0:00:58 time: 0.2652 data_time: 0.0202 memory: 1383 grad_norm: 0.1603 loss: 0.2293 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2293\n", + "06/15 06:16:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 60/118] lr: 1.0000e-03 eta: 0:00:52 time: 0.2710 data_time: 0.0178 memory: 1383 grad_norm: 0.3857 
loss: 0.2737 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2737\n", + "06/15 06:16:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 80/118] lr: 1.0000e-03 eta: 0:00:46 time: 0.3420 data_time: 0.0698 memory: 1383 grad_norm: 0.1271 loss: 0.2149 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2149\n", + "06/15 06:16:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][100/118] lr: 1.0000e-03 eta: 0:00:40 time: 0.2673 data_time: 0.0232 memory: 1383 grad_norm: 0.0990 loss: 0.2749 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2749\n", + "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][118/118] lr: 1.0000e-03 eta: 0:00:34 time: 0.2612 data_time: 0.0156 memory: 1383 grad_norm: 0.1387 loss: 0.3211 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3211\n", + "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 7 epochs\n", + "06/15 06:16:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 20/120] eta: 0:00:16 time: 0.1657 data_time: 0.1063 memory: 466 \n", + "06/15 06:16:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 40/120] eta: 0:00:11 time: 0.1164 data_time: 0.0654 memory: 466 \n", + "06/15 06:16:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 60/120] eta: 0:00:07 time: 0.1053 data_time: 0.0546 memory: 466 \n", + "06/15 06:16:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 80/120] eta: 0:00:04 time: 0.1005 data_time: 0.0511 memory: 466 \n", + "06/15 06:16:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][100/120] eta: 0:00:02 time: 0.1035 data_time: 0.0533 memory: 466 \n", + "06/15 06:16:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] eta: 0:00:00 time: 0.1382 data_time: 0.0850 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 
volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 11.65\n", + "aerobic split jump 15.62\n", + "aerobic scissors leap 89.83\n", + "aerobic turn 93.96\n", + "mAP 52.77\n", + "\u001b[2Klinking tubes... \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football 
tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 80.00\n", + "mAP 45.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", 
+ "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 38.67\n", + "aerobic turn 20.00\n", + "mAP 14.67\n", + "06/15 06:16:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] mAP/frameAP: 52.7652 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 14.6667 mAP/v_map_0.05:0.45: 40.6944 mAP/v_map_0.10:0.90: 22.6389 mAP/v_map_0.50:0.95: 6.6833 data_time: 0.0691 time: 0.1213\n", + "06/15 06:16:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 20/118] lr: 1.0000e-04 eta: 0:00:29 time: 0.3243 data_time: 0.0649 memory: 1383 grad_norm: 0.1808 loss: 0.3648 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3648\n", + "06/15 06:16:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 40/118] lr: 1.0000e-04 eta: 0:00:23 time: 0.2578 data_time: 0.0117 memory: 1383 grad_norm: 0.0784 loss: 0.2355 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2355\n", + "06/15 06:17:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 60/118] lr: 1.0000e-04 eta: 0:00:17 time: 0.3075 data_time: 0.0490 memory: 1383 grad_norm: 0.1707 loss: 0.3776 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3776\n", + "06/15 06:17:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 80/118] lr: 1.0000e-04 eta: 0:00:11 time: 0.3092 data_time: 0.0576 memory: 1383 grad_norm: 0.1387 loss: 0.3873 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3873\n", + "06/15 06:17:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][100/118] lr: 1.0000e-04 eta: 0:00:05 time: 0.2578 data_time: 0.0100 memory: 1383 grad_norm: 0.2137 loss: 0.3337 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3337\n", + "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][118/118] lr: 1.0000e-04 eta: 0:00:00 time: 0.2755 data_time: 0.0148 memory: 1383 grad_norm: 0.0712 loss: 0.2038 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2038\n", + "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 8 epochs\n", + "06/15 06:17:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 20/120] eta: 0:00:11 time: 0.1180 data_time: 0.0649 memory: 466 \n", + "06/15 06:17:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 40/120] eta: 0:00:09 time: 0.1168 data_time: 0.0667 memory: 466 \n", + "06/15 06:17:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 60/120] eta: 0:00:06 time: 0.1026 data_time: 0.0535 memory: 466 \n", + 
"06/15 06:17:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 80/120] eta: 0:00:04 time: 0.1017 data_time: 0.0533 memory: 466 \n", + "06/15 06:17:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][100/120] eta: 0:00:02 time: 0.1444 data_time: 0.0915 memory: 466 \n", + "06/15 06:17:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] eta: 0:00:00 time: 0.1496 data_time: 0.0962 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 11.34\n", + "aerobic split jump 12.82\n", + "aerobic scissors leap 90.68\n", + "aerobic turn 90.47\n", + "mAP 51.33\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 80.00\n", + "mAP 45.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 72.00\n", + "aerobic turn 20.00\n", + "mAP 23.00\n", + "06/15 06:17:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] mAP/frameAP: 51.3281 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 23.0000 mAP/v_map_0.05:0.45: 40.0000 mAP/v_map_0.10:0.90: 24.4444 mAP/v_map_0.50:0.95: 9.7250 data_time: 0.0704 time: 0.1216\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "# Train the model using MIM\n", + "!mim train mmaction2 configs/slowonly_k400_multisports.py \\\n", + " --work-dir work_dirs/stad_model/" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "yVjHqupPooZ2" + }, + "source": [ + "## 4. Inferring the Spatiotemporal Action Detection Model\n", + "\n", + "After training the detection model and the spatiotemporal action detection model, we can use the spatiotemporal action detection demo for inference and visualize the model's performance.\n", + "\n", + "Since the tutorial uses a limited training dataset, the model's performance is not optimal, so a pre-trained model is used for visualization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NQF1yrEhooZ3", + "outputId": "5331fbb6-7075-415c-f6f0-ec41c4b584a4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "Performing Human Detection for each frame\n", + "[>>] 99/99, 7.0 task/s, elapsed: 14s, ETA: 0s\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Performing SpatioTemporal Action Detection for each clip\n", + "[>>] 99/99, 17.1 task/s, elapsed: 6s, ETA: 0sPerforming visualization\n", + "Moviepy - Building video data/demo_spatiotemporal_det.mp4.\n", + "Moviepy - Writing video data/demo_spatiotemporal_det.mp4\n", + "\n", + "Moviepy - Done !\n", + "Moviepy - video ready data/demo_spatiotemporal_det.mp4\n" + ] + } + ], + "source": [ + "!python ../../demo/demo_spatiotemporal_det.py \\\n", + " data/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4 \\\n", + " data/demo_spatiotemporal_det.mp4 \\\n", + " --config configs/slowonly_k400_multisports.py \\\n", + " --checkpoint https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth \\\n", + " --det-config configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --det-checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --det-score-thr 0.85 \\\n", + " --action-score-thr 0.8 \\\n", + " --label-map ../../tools/data/multisports/label_map.txt \\\n", + " --predict-stepsize 8 \\\n", + " --output-stepsize 1 \\\n", + " --output-fps 24" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": 
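The demo command above writes its visualization to `data/demo_spatiotemporal_det.mp4` at `--output-fps 24`. As an optional sanity check before displaying it (a sketch, not part of the original notebook), you can open the file with MoviePy, which the next cell already uses, and confirm the duration and frame rate look right:

```python
# Optional sanity check on the rendered demo video (path and fps taken from the command above).
from moviepy.editor import VideoFileClip

clip = VideoFileClip('data/demo_spatiotemporal_det.mp4')
print(f'duration: {clip.duration:.1f}s, fps: {clip.fps}, size: {clip.size}')
assert abs(clip.fps - 24) < 1e-3  # --output-fps 24 was requested above
clip.close()
```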
"https://localhost:8080/", + "height": 741 + }, + "id": "9JmeIkh5ooZ3", + "outputId": "7fc38469-d8c4-4a02-81e7-ff93b88a62b2" + }, + "outputs": [], + "source": [ + "# Show Video\n", + "import moviepy.editor\n", + "moviepy.editor.ipython_display(\"data/demo_spatiotemporal_det.mp4\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "ipy_stad", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/projects/stad_tutorial/demo_stad_zh_CN.ipynb b/projects/stad_tutorial/demo_stad_zh_CN.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e095ebaa9cc3d606ef473755349b90e265bac85c --- /dev/null +++ b/projects/stad_tutorial/demo_stad_zh_CN.ipynb @@ -0,0 +1,4107 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "B74HkZjCxQ_6" + }, + "source": [ + "\"Open" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MwmrGv9exRAH" + }, + "source": [ + "# ๅŸบไบŽ MMAction2 ่ฟ›่กŒๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ไปปๅŠก\n", + "ๆฌข่ฟŽไฝฟ็”จ MMAction2! ่ฟ™ๆ˜ฏไธ€็ฏ‡ๅ…ณไบŽๅฆ‚ไฝ•ไฝฟ็”จ MMAction2 ่ฟ›่กŒๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹็š„ๆ•™็จ‹ใ€‚ๅœจๆญคๆ•™็จ‹ไธญ๏ผŒๆˆ‘ไปฌไผšไปฅ MultiSports ๆ•ฐๆฎ้›†ไธบไพ‹๏ผŒๆไพ›ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹็š„ๅฎŒๆ•ดๆญฅ้ชคๆ•™็จ‹๏ผŒๅŒ…ๆ‹ฌ\n", + "- ๅ‡†ๅค‡ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๆ•ฐๆฎ้›†\n", + "- ่ฎญ็ปƒๆฃ€ๆต‹ๆจกๅž‹\n", + "- ๅ‡†ๅค‡ AVA ๆ ผๅผ็š„ๆ•ฐๆฎ้›†\n", + "- ่ฎญ็ปƒๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๆจกๅž‹\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "em5lgDTUxRAI" + }, + "source": [ + "## 0. 
ๅฎ‰่ฃ… MMAction2 ๅ’Œ MMDetection" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bBM9DCrsxRAJ", + "outputId": "b310311f-f05e-4a5c-b6e5-8e6ee7e0dfae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openmim\n", + " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m51.3/51.3 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n", + "Collecting colorama (from openmim)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting model-index (from openmim)\n", + " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n", + "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n", + "Collecting ordered-set (from model-index->openmim)\n", + " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n", + 
"Installing collected packages: ordered-set, colorama, model-index, openmim\n", + "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmengine\n", + " Downloading mmengine-0.7.4-py3-none-any.whl (374 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m374.3/374.3 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting addict (from mmengine)\n", + " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n", + "Collecting yapf (from mmengine)\n", + " Downloading yapf-0.40.0-py3-none-any.whl (250 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m250.3/250.3 kB\u001b[0m \u001b[31m29.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n", + "Collecting importlib-metadata>=6.6.0 (from yapf->mmengine)\n", + " Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)\n", + "Collecting platformdirs>=3.5.1 (from yapf->mmengine)\n", + " Downloading platformdirs-3.5.3-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: tomli>=2.0.1 in 
/usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmengine) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n", + "Installing collected packages: addict, platformdirs, importlib-metadata, yapf, mmengine\n", + " Attempting uninstall: platformdirs\n", + " Found existing installation: platformdirs 3.3.0\n", + " Uninstalling platformdirs-3.3.0:\n", + " Successfully uninstalled platformdirs-3.3.0\n", + "Successfully installed addict-2.4.0 importlib-metadata-6.6.0 mmengine-0.7.4 platformdirs-3.5.3 yapf-0.40.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmcv\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m74.4/74.4 MB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv) (2.4.0)\n", + "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.7.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv) (1.22.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv) (4.7.0.72)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (3.7.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (2.3.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv) (3.15.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv) (1.16.0)\n", + "Installing collected packages: mmcv\n", + "Successfully installed mmcv-2.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmdet\n", + " Downloading mmdet-3.0.0-py3-none-any.whl (1.7 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmdet) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.22.4)\n", + "Requirement already satisfied: pycocotools in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.6)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.10.1)\n", + "Requirement already satisfied: shapely in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.16.0)\n", + "Collecting terminaltables (from mmdet)\n", + " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: mmcv<2.1.0,>=2.0.0rc4 in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.0)\n", + "Requirement already satisfied: mmengine<1.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from mmdet) (0.7.4)\n", + "Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.4.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.0)\n", + "Requirement already satisfied: yapf in 
/usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (4.7.0.72)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (2.3.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.14.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine<1.0.0,>=0.7.1->mmdet) (0.1.2)\n", + "Installing collected packages: terminaltables, mmdet\n", + "Successfully installed mmdet-3.0.0 terminaltables-3.1.10\n", + "Cloning into 'mmaction2'...\n", + "remote: Enumerating objects: 22869, done.\u001b[K\n", + "remote: Counting objects: 100% (1491/1491), done.\u001b[K\n", + "remote: Compressing objects: 100% (801/801), done.\u001b[K\n", + "remote: Total 22869 (delta 854), reused 1171 (delta 685), pack-reused 21378\u001b[K\n", + "Receiving objects: 100% (22869/22869), 82.81 MiB | 27.92 MiB/s, done.\n", + "Resolving deltas: 100% (15952/15952), done.\n", + "/content/mmaction2\n", + "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Obtaining file:///content/mmaction2\n", + " Running command python setup.py egg_info\n", + " running egg_info\n", + " creating /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info\n", + " writing /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/dependency_links.txt\n", 
+ " writing requirements to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/requires.txt\n", + " writing top-level names to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/top_level.txt\n", + " writing manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " warning: no files found matching 'mmaction/.mim/model-index.yml'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.yml' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.sh' under directory 'mmaction/.mim/tools'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/tools'\n", + " adding license file 'LICENSE'\n", + " writing manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n", + " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m98.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting einops (from mmaction2==1.0.0)\n", + " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n", + "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n", + "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.1+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n", + "Requirement already 
satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.5)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n", + "Installing collected packages: einops, decord, mmaction2\n", + " Running setup.py develop for mmaction2\n", + " Running command python setup.py develop\n", + " running develop\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/command/develop.py:40: EasyInstallDeprecationWarning: easy_install command is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` and ``easy_install``.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://github.com/pypa/setuptools/issues/917 for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " easy_install.initialize_options(self)\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` directly.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " self.initialize_options()\n", + " running egg_info\n", + " creating mmaction2.egg-info\n", + " writing mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to mmaction2.egg-info/dependency_links.txt\n", + " writing requirements to mmaction2.egg-info/requires.txt\n", + " writing top-level names to mmaction2.egg-info/top_level.txt\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " adding 
license file 'LICENSE'\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " running build_ext\n", + " Creating /usr/local/lib/python3.10/dist-packages/mmaction2.egg-link (link to .)\n", + " Adding mmaction2 1.0.0 to easy-install.pth file\n", + "\n", + " Installed /content/mmaction2\n", + "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n", + "/content/mmaction2/projects/stad_tutorial\n" + ] + } + ], + "source": [ + "%pip install -U openmim\n", + "!mim install mmengine\n", + "!mim install mmcv\n", + "!mim install mmdet\n", + "\n", + "!git clone https://github.com/open-mmlab/mmaction2.git\n", + "\n", + "%cd mmaction2\n", + "%pip install -v -e .\n", + "%cd projects/stad_tutorial" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "4M1PQASJxRAM" + }, + "source": [ + "## 1. ๅ‡†ๅค‡ๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๆ•ฐๆฎ้›†\n", + "\n", + "็ฑปไผผไบŽๆฃ€ๆต‹ไปปๅŠก้œ€่ฆๆไพ›ๆฃ€ๆต‹ๆก†ๆ ‡ๆณจ๏ผŒๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ไปปๅŠก้œ€่ฆๅฏนๆ—ถ้—ดๅ’Œ็ฉบ้—ดๅŒๆ—ถๅฎšไฝ๏ผŒๆ‰€ไปฅ้œ€่ฆๆไพ›ๆ›ดๅคๆ‚็š„ tube ๆ ‡ๆณจใ€‚ไปฅ MultiSports ๆ•ฐๆฎ้›†็š„ๆ ‡ๆณจไธบไพ‹๏ผŒ`gttubes` ๅญ—ๆฎตๆไพ›ไบ†่ง†้ข‘ไธญๆ‰€ๆœ‰็š„็›ฎๆ ‡ๅŠจไฝœๆ ‡ๆณจ๏ผŒไปฅไธ‹ไธบไธ€ไธชๆ ‡ๆณจ็‰‡ๆฎต๏ผš\n", + "\n", + "```\n", + " 'gttubes': {\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001': # video_key\n", + " {\n", + " 10: # ็ฑปๅˆซๆ ‡ๅท\n", + " [\n", + " array([[ 377., 904., 316., 1016., 584.], # ็ฑปๅˆซ 10 ็š„็ฌฌ 1 ไธช tube,\n", + " [ 378., 882., 315., 1016., 579.], # shape (n, 5): ่กจ็คบ n ๅธง๏ผŒๆฏๅธงๆ ‡ๆณจไธญๅŒ…ๆ‹ฌ (ๅธงๅท๏ผŒx1๏ผŒy1, x2, y2)\n", + " ...\n", + " [ 398., 861., 304., 954., 549.]], dtype=float32)๏ผŒ\n", + "\n", + " array([[ 399., 881., 308., 955., 542.], # ็ฑปๅˆซ 10 ็š„็ฌฌ 2 ไธช tube\n", + " [ 400., 862., 303., 988., 539.],\n", + " [ 401., 853., 292., 1000., 535.],\n", + " ...])\n", + " ...\n", + "\n", + " ] ,\n", + " 9: # ็ฑปๅˆซๆ ‡ๅท\n", + " [\n", + " array(...), # ็ฑปๅˆซ 9 ็š„็ฌฌ 1 ไธช tube\n", + " array(...), # ็ฑปๅˆซ 9 ็š„็ฌฌ 2 ไธช tube\n", + " ...\n", + " ]\n", + " ...\n", + " }\n", + " }\n", + "```\n", + "\n", + "ๆ ‡ๆณจๆ–‡ไปถไธญ่ฟ˜้œ€่ฆๆไพ›ๅ…ถไป–ๅญ—ๆฎต็š„ไฟกๆฏ๏ผŒๅฎŒๆ•ด็š„็œŸๅ€ผๆ–‡ไปถๅŒ…ๆ‹ฌไปฅไธ‹ไฟกๆฏ๏ผš\n", + "```\n", + "{\n", + " 'labels': # ๆ ‡็ญพๅˆ—่กจ\n", + " ['aerobic push up', 'aerobic explosive push up', ...],\n", + " 'train_videos': # ่ฎญ็ปƒ่ง†้ข‘ๅˆ—่กจ\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001',\n", + " 'aerobic_gymnastics/v_yaKOumdXwbU_c019',\n", + " ...\n", + " ]\n", + " ]\n", + " 'test_videos': # ๆต‹่ฏ•่ง†้ข‘ๅˆ—่กจ\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004',\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005',\n", + " ...\n", + " ]\n", + " ]\n", + " 'n_frames': # dict ๆ–‡ไปถ๏ผŒๆไพ›ๅ„ไธช่ง†้ข‘็š„ๅธงๆ•ฐไฟกๆฏ\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': 725,\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': 750,\n", + " ...\n", + " }\n", + " 'resolution': # dict ๆ–‡ไปถ๏ผŒๆไพ›ๅ„ไธช่ง†้ข‘็š„ๅˆ†่พจ็އไฟกๆฏ\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': (720, 1280),\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': (720, 1280),\n", + " ...\n", + " }\n", + " 'gt_tubes': # dict ๆ–‡ไปถ๏ผŒๆไพ› tube ็š„ๆฃ€ๆต‹ๆก†ไฟกๆฏ\n", + " {\n", + " ... 
# ๆ ผๅผๅ‚่€ƒไธŠ่ฟฐ่ฏดๆ˜Ž\n", + " }\n", + "}\n", + "```\n", + "ๅŽ็ปญ็š„ๅฎž้ชŒๅŸบไบŽ MultiSports-tiny ่ฟ›่กŒ๏ผŒๆˆ‘ไปฌไปŽ MultiSports ไธญๆŠฝๅ–ไบ†ๅฐ‘้‡่ง†้ข‘๏ผŒ็”จไบŽๆผ”็คบๆ•ดไธชๆต็จ‹ใ€‚" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fiJPDuR9xRAQ", + "outputId": "8b3d8719-a9c0-4a59-d220-a3626fa34d3b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-06-15 06:41:29-- https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 8.48.85.214, 8.48.85.207, 8.48.85.208, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|8.48.85.214|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 82780160 (79M) [application/x-tar]\n", + "Saving to: โ€˜data/multisports-tiny.tarโ€™\n", + "\n", + "multisports-tiny.ta 100%[===================>] 78.95M 27.9MB/s in 2.8s \n", + "\n", + "2023-06-15 06:41:32 (27.9 MB/s) - โ€˜data/multisports-tiny.tarโ€™ saved [82780160/82780160]\n", + "\n", + "multisports-tiny/multisports/\n", + "multisports-tiny/multisports/test/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4\n", + "multisports-tiny/multisports/annotations/\n", + "multisports-tiny/multisports/annotations/multisports_GT.pkl\n", + "multisports-tiny/multisports/trainval/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c001.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c003.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c002.mp4\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.\n", + "Need to get 43.0 kB of archives.\n", + "After this operation, 115 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n", + "Fetched 43.0 kB in 0s (253 kB/s)\n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 
122541 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n", + "Unpacking tree (1.8.0-1) ...\n", + "Setting up tree (1.8.0-1) ...\n", + "Processing triggers for man-db (2.9.1-1) ...\n", + "\u001b[01;34mdata\u001b[00m\n", + "โ”œโ”€โ”€ \u001b[01;34mmultisports\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;34mannotations\u001b[00m\n", + "โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;34mtest\u001b[00m\n", + "โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ \u001b[01;32mv_7G_IpU0FxLU_c001.mp4\u001b[00m\n", + "โ”‚ย ย  โ””โ”€โ”€ \u001b[01;34mtrainval\u001b[00m\n", + "โ”‚ย ย  โ””โ”€โ”€ \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;32mv__wAgwttPYaQ_c001.mp4\u001b[00m\n", + "โ”‚ย ย  โ”œโ”€โ”€ \u001b[01;32mv__wAgwttPYaQ_c002.mp4\u001b[00m\n", + "โ”‚ย ย  โ””โ”€โ”€ \u001b[01;32mv__wAgwttPYaQ_c003.mp4\u001b[00m\n", + "โ””โ”€โ”€ \u001b[01;31mmultisports-tiny.tar\u001b[00m\n", + "\n", + "6 directories, 6 files\n" + ] + } + ], + "source": [ + "# ไธ‹่ฝฝๆ•ฐๆฎ้›†\n", + "!wget -P data -c https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "!tar -xvf data/multisports-tiny.tar --strip 1 -C data\n", + "!apt-get -q install tree\n", + "!tree data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "XjG0dEE8xRAS" + }, + "source": [ + "## 2. ่ฎญ็ปƒๆฃ€ๆต‹ๆจกๅž‹\n", + "\n", + "ๅœจ SlowOnly + Det ็š„่Œƒๅผไธญ๏ผŒ้œ€่ฆๅ…ˆ่ฎญ็ปƒไบบไฝ“ๆฃ€ๆต‹ๅ™จ๏ผŒๅ†ๅŸบไบŽๆฃ€ๆต‹็ป“ๆžœๆฅ้ข„ๆต‹่กŒไธบใ€‚่ฟ™ไธ€่Š‚ไธญ๏ผŒๆˆ‘ไปฌๅŸบไบŽไธŠไธ€่Š‚ไธญ็š„ๆ ‡ๆณจๆ ผๅผๅ’Œ MMDetection ็ฎ—ๆณ•ๅบ“่ฎญ็ปƒๆฃ€ๆต‹ๆจกๅž‹ใ€‚\n", + "\n", + "### 2.1 ๆž„ๅปบๆฃ€ๆต‹ๆ•ฐๆฎ้›†ๆ ‡ๆณจ๏ผˆCOCO ๆ ผๅผ๏ผ‰\n", + "\n", + "ๅŸบไบŽๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๆ•ฐๆฎ้›†็š„ๆ ‡ๆณจไฟกๆฏ๏ผŒๆˆ‘ไปฌๅฏไปฅๆž„ๅปบไธ€ไธช COCO ๆ ผๅผ็š„ๆฃ€ๆต‹ๆ•ฐๆฎ้›†๏ผŒ็”จไบŽ่ฎญ็ปƒๆฃ€ๆต‹ๆจกๅž‹ใ€‚ๆˆ‘ไปฌๆไพ›ไบ†ไธ€ไธชๅทฅๅ…ท่„šๆœฌๅฏน MultiSports ๆ ผๅผ็š„ๆ ‡ๆณจ่ฟ›่กŒ่ฝฌๆข๏ผŒๅฆ‚ๆžœ้œ€่ฆๅŸบไบŽๅ…ถไป–ๆ ผๅผ่ฝฌๆข๏ผŒๅฏไปฅๅ‚่€ƒ MMDetection ๆไพ›็š„[่‡ชๅฎšไน‰ๆ•ฐๆฎ้›†](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html)ๆ–‡ๆกฃใ€‚" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "inBtClMIxRAV", + "outputId": "3ac5199b-562f-48c4-da27-819d34069213" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "โ”œโ”€โ”€ multisports_det_anno_train.json\n", + "โ”œโ”€โ”€ multisports_det_anno_val.json\n", + "โ””โ”€โ”€ \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "\n", + "0 directories, 3 files\n" + ] + } + ], + "source": [ + "!python tools/generate_mmdet_anno.py data/multisports/annotations/multisports_GT.pkl data/multisports/annotations/multisports_det_anno.json\n", + "!tree data/multisports/annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TkPONRezxRAZ", + "outputId": "0f8075a1-47fb-490d-9c88-4904f45363fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will generate 3 rgb dir for aerobic_gymnastics.\n", + "Generate v__wAgwttPYaQ_c003 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c002 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c001 rgb dir successfully.\n" + ] + } + ], + "source": [ + "!python 
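The conversion script writes COCO-style JSON, so a quick check that the train/val splits look sane only needs the standard library. A minimal sketch, assuming the usual COCO keys (`images`, `annotations`, `categories`) in the files generated above:

```python
# Minimal sketch: verify the generated COCO-style detection annotations.
import json

with open('data/multisports/annotations/multisports_det_anno_train.json') as f:
    coco = json.load(f)

print('images:     ', len(coco['images']))
print('annotations:', len(coco['annotations']))
print('categories: ', coco['categories'])                 # expected: a single "person" class
print('sample bbox (x, y, w, h):', coco['annotations'][0]['bbox'])
```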
tools/generate_rgb.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MP-umqqnxRAa" + }, + "source": [ + "### 2.2 ไฟฎๆ”น config ๆ–‡ไปถ\n", + "\n", + "ๆˆ‘ไปฌไปฅ faster-rcnn_x101-64x4d_fpn_1x_coco ไธบๅŸบ็ก€้…็ฝฎ๏ผŒๅšๅฆ‚ไธ‹ไฟฎๆ”น๏ผŒๅœจ MultiSports ๆ•ฐๆฎ้›†ไธŠ่ฟ›่กŒ่ฎญ็ปƒใ€‚้œ€่ฆไฟฎๆ”นไปฅไธ‹ๅ‡ ไธช้ƒจๅˆ†๏ผš\n", + "- ๆจกๅž‹็š„็ฑปๅˆซๆ•ฐ้‡\n", + "- ๅญฆไน ็އ่ฐƒๆ•ด็ญ–็•ฅ\n", + "- ไผ˜ๅŒ–ๅ™จ้…็ฝฎ\n", + "- ๆ•ฐๆฎ้›†/ๆ ‡ๆณจๆ–‡ไปถ่ทฏๅพ„\n", + "- ่ฏ„ๆต‹ๅ™จ้…็ฝฎ\n", + "- ้ข„่ฎญ็ปƒๆจกๅž‹\n", + "\n", + "ๆ›ด่ฏฆ็ป†็š„ๆ•™็จ‹ๅฏไปฅๅ‚่€ƒ MMDetection ๆไพ›็š„[ๅ‡†ๅค‡้…็ฝฎๆ–‡ไปถ](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id9)ๆ–‡ๆกฃใ€‚" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yMw9MrI0xRAc", + "outputId": "1f5ee99a-d4cb-45b0-df71-f0209a9b6275" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Copyright (c) OpenMMLab. All rights reserved.\n", + "_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'\n", + "model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))\n", + "\n", + "# take 2 epochs as an example\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "\n", + "# learning rate\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "\n", + "# optimizer\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))\n", + "\n", + "dataset_type = 'CocoDataset'\n", + "# modify metainfo\n", + "metainfo = {\n", + " 'classes': ('person', ),\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "\n", + "# specify metainfo, dataset path\n", + "data_root = 'data/multisports/'\n", + "\n", + "train_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "test_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "# specify annotaition file path, modify metric items\n", + "val_evaluator = dict(\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "test_evaluator = dict(\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "# specify pretrain checkpoint\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501\n" + ] + } + ], + "source": [ + "!cat configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "S3Ux8echxRAe" + }, + "source": [ + "### 2.3 ่ฎญ็ปƒๆฃ€ๆต‹ๆจกๅž‹" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": 
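Since the config printed above inherits from a base file, it is easy to double-check that the intended overrides are actually picked up before launching training. A minimal sketch using `mmengine.config.Config`; the expected values are the ones shown in the config listing above:

```python
# Minimal sketch: load the modified config and confirm the key overrides.
from mmengine.config import Config

cfg = Config.fromfile('configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py')
print(cfg.model.roi_head.bbox_head.num_classes)   # 1 (person only)
print(cfg.train_cfg.max_epochs)                   # 2 epochs for this tutorial
print(cfg.train_dataloader.dataset.ann_file)      # annotations/multisports_det_anno_train.json
print(cfg.load_from)                              # COCO-person pretrained Faster R-CNN checkpoint
```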
"MYtjYFU5xRAf" + }, + "source": [ + "ๅˆฉ็”จ MIM ๅฏไปฅๅœจๅฝ“ๅ‰่ทฏๅพ„็›ดๆŽฅ่ฎญ็ปƒ MMDetection ๆจกๅž‹๏ผŒ่ฟ™้‡Œๆไพ›ๆœ€็ฎ€ๅ•็š„ๅ•ๅก่ฎญ็ปƒ็คบไพ‹๏ผŒๆ›ดๅคš่ฎญ็ปƒๅ‘ฝไปคๅฏไปฅๅ‚่€ƒ MIM [ๆ•™็จ‹](https://github.com/open-mmlab/mim#command)ใ€‚" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "56m--2T8xRAg", + "outputId": "d47ceca0-e930-4063-e25d-739a44410b86" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/train.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py --launcher none --work-dir work_dirs/det_model. \n", + "06/15 06:42:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1318688827\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " 
TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1318688827\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:42:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " 
debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " 
dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/det_model'\n", + "\n", + "06/15 06:42:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:42:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) 
ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:42:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: open-mmlab://detectron2/resnet50_caffe\n", + "06/15 06:42:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by openmmlab backend from path: open-mmlab://detectron2/resnet50_caffe\n", + "Downloading: \"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_msra-5891d200.pth\n", + "100% 89.9M/89.9M [00:03<00:00, 31.4MB/s]\n", + "06/15 06:42:53 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: conv1.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "100% 158M/158M [00:06<00:00, 24.4MB/s]\n", + "06/15 06:43:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "06/15 06:43:00 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:43:00 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:43:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/det_model.\n", + "06/15 06:43:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 50/118] lr: 5.0000e-03 eta: 0:02:00 time: 0.6468 data_time: 0.0127 memory: 3419 loss: 0.4823 loss_rpn_cls: 0.0063 loss_rpn_bbox: 0.0151 loss_cls: 0.1676 acc: 95.0195 loss_bbox: 0.2933\n", + "06/15 06:43:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 5.0000e-03 eta: 0:01:17 time: 0.4922 data_time: 0.0077 memory: 3419 loss: 0.4234 loss_rpn_cls: 0.0031 loss_rpn_bbox: 0.0134 loss_cls: 0.1394 acc: 91.9922 loss_bbox: 0.2676\n", + "06/15 06:44:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_064239\n", + "06/15 06:44:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:44:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 50/120] eta: 0:00:08 time: 0.1269 data_time: 0.0112 memory: 3419 \n", + "06/15 06:44:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1159 data_time: 0.0032 memory: 682 \n", + "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.04s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.913\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.817\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.908\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.960\n", + "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.913 -1.000 -1.000 -1.000 0.817 0.908\n", + "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9600 data_time: 0.0065 time: 0.1205\n", + "06/15 06:44:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 50/118] lr: 5.0000e-03 eta: 0:00:37 time: 0.5233 data_time: 0.0099 memory: 3419 loss: 0.3250 loss_rpn_cls: 0.0025 loss_rpn_bbox: 0.0107 loss_cls: 0.1116 acc: 95.2148 loss_bbox: 0.2002\n", + "06/15 06:45:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) 
[2][100/118] lr: 5.0000e-03 eta: 0:00:09 time: 0.5354 data_time: 0.0083 memory: 3419 loss: 0.3042 loss_rpn_cls: 0.0013 loss_rpn_bbox: 0.0105 loss_cls: 0.0946 acc: 94.9219 loss_bbox: 0.1978\n", + "06/15 06:45:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_064239\n", + "06/15 06:45:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:45:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 50/120] eta: 0:00:08 time: 0.1237 data_time: 0.0050 memory: 3419 \n", + "06/15 06:45:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1225 data_time: 0.0058 memory: 682 \n", + "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.07s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.912\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.747\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.916\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.955\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.955\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.955\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.954\n", + "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.912 -1.000 -1.000 -1.000 0.747 0.916\n", + "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9550 data_time: 0.0052 time: 0.1228\n", + "\u001b[32mTraining finished successfully. 
\u001b[0m\n" + ] + } + ], + "source": [ + "!mim train mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --work-dir work_dirs/det_model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "-pf9MnuUxRAh" + }, + "source": [ + "### 2.4 ็”Ÿๆˆ proposal bboxes\n", + "\n", + "ๅœจๆ—ถ็ฉบ่กŒไธบๆฃ€ๆต‹ๆจกๅž‹่ฎญ็ปƒๆ—ถ๏ผŒ้œ€่ฆๅŸบไบŽๆฃ€ๆต‹ๆจกๅž‹ๆŽจ็†ๅพ—ๅˆฐ็š„ proposal๏ผŒ่€Œไธ่ƒฝๅŸบไบŽๆ ‡ๆณจ็š„ๆฃ€ๆต‹ๆก†ใ€‚ๅ› ๆญคๆˆ‘ไปฌ้œ€่ฆๅˆฉ็”จ่ฎญ็ปƒๅฅฝ็š„ๆฃ€ๆต‹ๆจกๅž‹ๅฏนๆ•ดไธชๆ•ฐๆฎ้›†่ฟ›่กŒๆŽจ็†๏ผŒๅพ—ๅˆฐ proposal ๅŽ่ฝฌๆขไธบ้œ€่ฆ็š„ๆ ผๅผ๏ผŒ็”จไบŽๅŽ็ปญ่ฎญ็ปƒใ€‚\n", + "\n", + "#### 2.4.1 ๅฐ†ๆ•ฐๆฎ้›†่ฝฌๆขไธบ Coco ๆ ผๅผ\n", + "\n", + "ๆˆ‘ไปฌๆไพ›ไบ†่„šๆœฌๅฐ† MultiSports ๆ•ฐๆฎ้›†่ฝฌๆขๆˆๆฒกๆœ‰็œŸๅ€ผ็š„ๆ ‡ๆณจๆ ผๅผ๏ผŒ็”จไบŽๆŽจ็†ใ€‚" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nL2n0AKJxRAi", + "outputId": "51907af1-7da3-4713-8e90-a61b052000aa" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[>>] 2350/2350, 1606.7 task/s, elapsed: 1s, ETA: 0s\n", + "save json file: data/multisports/rawframes/../annotations/ms_infer_anno.json\n" + ] + } + ], + "source": [ + "!echo 'person' > data/multisports/annotations/label_map.txt\n", + "!python tools/images2coco.py \\\n", + " data/multisports/rawframes \\\n", + " data/multisports/annotations/label_map.txt \\\n", + " ms_infer_anno.json" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_REQniysxRAj" + }, + "source": [ + "#### 2.4.2 ๆŽจ็†็”Ÿๆˆ proposal file" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ShnTsjs1xRAk" + }, + "source": [ + "MMDetection ๆจกๅž‹็š„ๆŽจ็†ๅŒๆ ทๅŸบไบŽ MIM๏ผŒๆ›ดๅคšๆต‹่ฏ•ๅ‘ฝไปค่ฏทๅ‚่€ƒ MIM [ๆ•™็จ‹](https://github.com/open-mmlab/mim#command)ใ€‚\n", + "\n", + "ๆŽจ็†ๅฎŒๆˆๅŽ๏ผŒไผšๅฐ†ๆŽจ็†็ป“ๆžœไฟๅญ˜ๅœจ 'data/multisports/ms_proposals.pkl'ใ€‚" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DXnT4aArxRAm", + "outputId": "565faf02-4b7f-49ab-f30f-b20e7eb09085" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/test.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py work_dirs/det_model/epoch_2.pth --launcher none --out data/multisports/annotations/ms_det_proposals.pkl. \n", + "06/15 06:45:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1403639615\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1403639615\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:45:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 
1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", 
+ " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " 
visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'work_dirs/det_model/epoch_2.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = './work_dirs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person'\n", + "\n", + "06/15 06:45:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:45:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:45:56 - mmengine - 
\u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The prefix is not set in metric class DumpDetResults.\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "06/15 06:45:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from work_dirs/det_model/epoch_2.pth\n", + "06/15 06:46:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 50/2350] eta: 0:05:46 time: 0.1507 data_time: 0.0046 memory: 512 \n", + "06/15 06:46:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 100/2350] eta: 0:05:06 time: 0.1217 data_time: 0.0059 memory: 512 \n", + "06/15 06:46:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 150/2350] eta: 0:04:47 time: 0.1193 data_time: 0.0022 memory: 512 \n", + "06/15 06:46:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 200/2350] eta: 0:04:34 time: 0.1197 data_time: 0.0023 memory: 512 \n", + "06/15 06:46:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 250/2350] eta: 0:04:27 time: 0.1258 data_time: 0.0073 memory: 512 \n", + "06/15 06:46:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 300/2350] eta: 0:04:19 time: 0.1215 data_time: 0.0026 memory: 512 \n", + "06/15 06:46:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 350/2350] eta: 0:04:12 time: 0.1242 data_time: 0.0046 memory: 512 \n", + "06/15 06:46:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 400/2350] eta: 0:04:04 time: 0.1218 data_time: 0.0029 memory: 512 \n", + "06/15 06:46:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 450/2350] eta: 0:03:58 time: 0.1229 data_time: 0.0042 memory: 512 \n", + "06/15 06:46:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 500/2350] eta: 0:03:51 time: 0.1229 data_time: 0.0048 memory: 512 \n", + "06/15 06:47:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 550/2350] eta: 0:03:44 time: 0.1193 data_time: 0.0020 memory: 512 \n", + "06/15 06:47:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 600/2350] eta: 0:03:37 time: 0.1234 data_time: 0.0060 memory: 512 \n", + "06/15 06:47:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 650/2350] eta: 0:03:30 time: 0.1184 data_time: 0.0025 memory: 512 \n", + "06/15 06:47:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 700/2350] eta: 0:03:24 time: 0.1200 data_time: 0.0041 memory: 512 \n", + "06/15 06:47:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 750/2350] eta: 0:03:17 time: 0.1216 data_time: 0.0046 memory: 512 \n", + "06/15 06:47:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 800/2350] eta: 0:03:11 time: 0.1184 data_time: 0.0024 memory: 512 \n", + "06/15 06:47:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 850/2350] eta: 0:03:04 time: 0.1234 data_time: 0.0064 memory: 512 \n", + "06/15 06:47:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 900/2350] eta: 0:02:58 time: 0.1196 data_time: 0.0028 memory: 512 \n", + "06/15 06:47:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 950/2350] eta: 0:02:52 time: 0.1217 data_time: 0.0046 memory: 512 \n", + "06/15 06:48:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1000/2350] eta: 0:02:45 time: 0.1220 data_time: 0.0046 memory: 512 \n", + "06/15 06:48:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1050/2350] eta: 0:02:39 time: 0.1203 data_time: 0.0028 memory: 512 \n", + "06/15 06:48:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 
Epoch(test) [1100/2350] eta: 0:02:33 time: 0.1231 data_time: 0.0055 memory: 512 \n", + "06/15 06:48:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1150/2350] eta: 0:02:27 time: 0.1207 data_time: 0.0033 memory: 512 \n", + "06/15 06:48:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1200/2350] eta: 0:02:21 time: 0.1217 data_time: 0.0049 memory: 512 \n", + "06/15 06:48:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1250/2350] eta: 0:02:14 time: 0.1211 data_time: 0.0038 memory: 512 \n", + "06/15 06:48:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1300/2350] eta: 0:02:08 time: 0.1242 data_time: 0.0070 memory: 512 \n", + "06/15 06:48:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1350/2350] eta: 0:02:02 time: 0.1249 data_time: 0.0077 memory: 512 \n", + "06/15 06:48:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1400/2350] eta: 0:01:56 time: 0.1181 data_time: 0.0022 memory: 512 \n", + "06/15 06:48:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1450/2350] eta: 0:01:50 time: 0.1219 data_time: 0.0055 memory: 512 \n", + "06/15 06:49:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1500/2350] eta: 0:01:44 time: 0.1198 data_time: 0.0034 memory: 512 \n", + "06/15 06:49:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1550/2350] eta: 0:01:37 time: 0.1194 data_time: 0.0028 memory: 512 \n", + "06/15 06:49:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1600/2350] eta: 0:01:31 time: 0.1228 data_time: 0.0059 memory: 512 \n", + "06/15 06:49:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1650/2350] eta: 0:01:25 time: 0.1193 data_time: 0.0026 memory: 512 \n", + "06/15 06:49:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1700/2350] eta: 0:01:19 time: 0.1232 data_time: 0.0060 memory: 512 \n", + "06/15 06:49:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1750/2350] eta: 0:01:13 time: 0.1199 data_time: 0.0028 memory: 512 \n", + "06/15 06:49:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1800/2350] eta: 0:01:07 time: 0.1205 data_time: 0.0035 memory: 512 \n", + "06/15 06:49:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1850/2350] eta: 0:01:01 time: 0.1237 data_time: 0.0067 memory: 512 \n", + "06/15 06:49:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1900/2350] eta: 0:00:54 time: 0.1190 data_time: 0.0024 memory: 512 \n", + "06/15 06:49:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1950/2350] eta: 0:00:48 time: 0.1238 data_time: 0.0069 memory: 512 \n", + "06/15 06:50:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2000/2350] eta: 0:00:42 time: 0.1183 data_time: 0.0020 memory: 512 \n", + "06/15 06:50:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2050/2350] eta: 0:00:36 time: 0.1212 data_time: 0.0049 memory: 512 \n", + "06/15 06:50:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2100/2350] eta: 0:00:30 time: 0.1212 data_time: 0.0044 memory: 512 \n", + "06/15 06:50:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2150/2350] eta: 0:00:24 time: 0.1180 data_time: 0.0019 memory: 512 \n", + "06/15 06:50:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2200/2350] eta: 0:00:18 time: 0.1233 data_time: 0.0062 memory: 512 \n", + "06/15 06:50:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2250/2350] eta: 0:00:12 time: 0.1186 data_time: 0.0021 memory: 512 
\n", + "06/15 06:50:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2300/2350] eta: 0:00:06 time: 0.1227 data_time: 0.0064 memory: 512 \n", + "06/15 06:50:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] eta: 0:00:00 time: 0.1196 data_time: 0.0033 memory: 512 \n", + "06/15 06:50:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.01s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.37s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.28s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: -1.000 -1.000 -1.000 -1.000 -1.000 -1.000\n", + "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Results has been saved to data/multisports/annotations/ms_det_proposals.pkl.\n", + "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: -1.0000 data_time: 0.0042 time: 0.1219\n", + "\u001b[32mTesting finished successfully.\u001b[0m\n" + ] + } + ], + "source": [ + "!mim test mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --out data/multisports/annotations/ms_det_proposals.pkl" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "1zErF-nsxRAo" + }, + "source": [ + "## 3. 
Train the spatiotemporal action detection model\n", + "\n", + "### 3.1 Convert the annotation and proposal files\n", + "\n", + "Both the annotation files provided by the MultiSports dataset and the proposals generated by MMDetection inference need to be converted into the required format before they can be used to train the spatiotemporal action detection model. We provide the corresponding scripts; running them produces the files in the expected format." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "naAfcO4QxRAo", + "outputId": "2a309bef-241f-44fc-8276-b2ea4735e37d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading test result...\n", + "[>>] 2350/2350, 3582.6 task/s, elapsed: 1s, ETA: 0s\n", + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "├── label_map.txt\n", + "├── ms_det_proposals.pkl\n", + "├── ms_infer_anno.json\n", + "├── multisports_det_anno_train.json\n", + "├── multisports_det_anno_val.json\n", + "├── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "├── multisports_proposals_train.pkl\n", + "├── multisports_proposals_val.pkl\n", + "├── multisports_train.csv\n", + "└── multisports_val.csv\n", + "\n", + "0 directories, 10 files\n" + ] + } + ], + "source": [ + "# Convert the annotation files\n", + "!python ../../tools/data/multisports/parse_anno.py\n", + "\n", + "# Convert the proposal file\n", + "!python tools/convert_proposals.py\n", + "\n", + "!tree data/multisports/annotations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "balpcJEbxRAp" + }, + "source": [ + "### 3.2 Train the spatiotemporal action detection model\n", + "\n", + "MMAction2 already supports training on the MultiSports dataset, so here we only need to modify the paths of the proposal files; see the [config](configs/slowonly_k400_multisports.py) file for the detailed configuration. Since the training data is small, the config uses the model trained on the full MultiSports dataset as the pre-trained model; when training on your own custom dataset, the `load_from` setting is not required." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cIuQTmnuxRAq", + "outputId": "253d7f08-3c89-4e31-c5f4-3880aed5d817" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /content/mmaction2/mmaction/.mim/tools/train.py configs/slowonly_k400_multisports.py --launcher none --work-dir work_dirs/stad_model/. \n", + "06/15 06:50:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + "    sys.platform: linux\n", + "    Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + "    CUDA available: True\n", + "    numpy_random_seed: 546414243\n", + "    GPU 0: Tesla T4\n", + "    CUDA_HOME: /usr/local/cuda\n", + "    NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + "    GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "    PyTorch: 2.0.1+cu118\n", + "    PyTorch compiling details: PyTorch built with:\n", + "  - GCC 9.3\n", + "  - C++ Version: 201703\n", + "  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + "  - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + "  - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 546414243\n", + " diff_rank_seed: False\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:50:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook', _scope_='mmaction'),\n", + " timer=dict(type='IterTimerHook', _scope_='mmaction'),\n", + " logger=dict(\n", + " type='LoggerHook', interval=20, ignore_last=False, _scope_='mmaction'),\n", + " param_scheduler=dict(type='ParamSchedulerHook', _scope_='mmaction'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook',\n", + " interval=1,\n", + " save_best='auto',\n", + " _scope_='mmaction'),\n", + " sampler_seed=dict(type='DistSamplerSeedHook', _scope_='mmaction'),\n", + " sync_buffers=dict(type='SyncBuffersHook', _scope_='mmaction'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(\n", + " type='LogProcessor', window_size=20, by_epoch=True, _scope_='mmaction')\n", + "vis_backends = [dict(type='LocalVisBackend', _scope_='mmaction')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer',\n", + " 
vis_backends=[dict(type='LocalVisBackend')],\n", + " _scope_='mmaction')\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'\n", + "resume = False\n", + "url = 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + "num_classes = 66\n", + "model = dict(\n", + " type='FastRCNN',\n", + " _scope_='mmdet',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + " ),\n", + " backbone=dict(\n", + " type='mmaction.ResNet3dSlowOnly',\n", + " depth=50,\n", + " pretrained=None,\n", + " pretrained2d=False,\n", + " lateral=False,\n", + " num_stages=4,\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_t=1,\n", + " pool1_stride_t=1,\n", + " spatial_strides=(1, 2, 2, 1)),\n", + " roi_head=dict(\n", + " type='AVARoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor3D',\n", + " roi_layer_type='RoIAlign',\n", + " output_size=8,\n", + " with_temporal_pool=True),\n", + " bbox_head=dict(\n", + " type='BBoxHeadAVA',\n", + " in_channels=2048,\n", + " num_classes=66,\n", + " multilabel=False,\n", + " dropout_ratio=0.5)),\n", + " data_preprocessor=dict(\n", + " type='mmaction.ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCTHW'),\n", + " train_cfg=dict(\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssignerAVA',\n", + " pos_iou_thr=0.9,\n", + " neg_iou_thr=0.9,\n", + " min_pos_iou=0.9),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=32,\n", + " pos_fraction=1,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=1.0)),\n", + " test_cfg=dict(rcnn=None))\n", + "dataset_type = 'AVADataset'\n", + "data_root = 'data/multisports/trainval'\n", + "anno_root = 'data/multisports/annotations'\n", + "ann_file_train = 'data/multisports/annotations/multisports_train.csv'\n", + "ann_file_val = 'data/multisports/annotations/multisports_val.csv'\n", + "gt_file = 'data/multisports/annotations/multisports_GT.pkl'\n", + "proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'\n", + "proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='RandomRescale', scale_range=(256, 320), _scope_='mmaction'),\n", + " dict(type='RandomCrop', size=256, _scope_='mmaction'),\n", + " dict(type='Flip', flip_ratio=0.5, _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "val_pipeline 
= [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='Resize', scale=(-1, 256), _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_train.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),\n", + " dict(type='DecordDecode'),\n", + " dict(type='RandomRescale', scale_range=(256, 320)),\n", + " dict(type='RandomCrop', size=256),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_train.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_val.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_dense_proposals_val.recall_96.13.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " 
start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "test_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop',\n", + " max_epochs=8,\n", + " val_begin=1,\n", + " val_interval=1,\n", + " _scope_='mmaction')\n", + "val_cfg = dict(type='ValLoop', _scope_='mmaction')\n", + "test_cfg = dict(type='TestLoop', _scope_='mmaction')\n", + "param_scheduler = [\n", + " dict(\n", + " type='LinearLR',\n", + " start_factor=0.1,\n", + " by_epoch=True,\n", + " begin=0,\n", + " end=5,\n", + " _scope_='mmaction'),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=8,\n", + " by_epoch=True,\n", + " milestones=[6, 7],\n", + " gamma=0.1,\n", + " _scope_='mmaction')\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD',\n", + " lr=0.01,\n", + " momentum=0.9,\n", + " weight_decay=1e-05,\n", + " _scope_='mmaction'),\n", + " clip_grad=dict(max_norm=5, norm_type=2))\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/stad_model/'\n", + "randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)\n", + "\n", + "06/15 06:51:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:51:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + 
"after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "06/15 06:51:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 236 out of 236 frames are valid.\n", + "06/15 06:51:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 120 out of 120 frames are valid.\n", + "06/15 06:51:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "06/15 06:51:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "100% 124M/124M [00:05<00:00, 25.9MB/s]\n", + "06/15 06:51:12 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: cls_head.fc_cls.weight, cls_head.fc_cls.bias\n", + "\n", + "missing keys in source state_dict: roi_head.bbox_head.fc_cls.weight, roi_head.bbox_head.fc_cls.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "100% 122M/122M [00:04<00:00, 29.7MB/s]\n", + "06/15 06:51:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "06/15 06:51:17 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:51:17 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:51:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/stad_model.\n", + "06/15 06:51:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 20/118] lr: 1.0000e-03 eta: 0:07:06 time: 0.4613 data_time: 0.0472 memory: 1381 grad_norm: 17.8613 loss: 1.1505 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 1.1505\n", + "06/15 06:51:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 40/118] lr: 1.0000e-03 eta: 0:05:28 time: 0.2655 data_time: 0.0204 memory: 1381 grad_norm: 6.8642 loss: 0.5417 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5417\n", + "06/15 06:51:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 60/118] lr: 1.0000e-03 eta: 0:05:06 time: 0.3121 data_time: 0.0505 memory: 1381 grad_norm: 5.3190 loss: 0.6625 recall@thr=0.5: 0.9000 prec@thr=0.5: 0.9000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.6625\n", + "06/15 06:51:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 80/118] lr: 1.0000e-03 eta: 0:04:44 time: 0.2771 data_time: 0.0255 memory: 1381 grad_norm: 3.0057 loss: 0.6646 recall@thr=0.5: 0.9231 prec@thr=0.5: 0.9231 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.6646\n", + "06/15 06:51:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 1.0000e-03 eta: 0:04:26 time: 0.2625 data_time: 0.0130 memory: 1381 grad_norm: 1.8442 loss: 0.5711 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5711\n", + "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][118/118] lr: 1.0000e-03 eta: 0:04:18 time: 0.2930 data_time: 0.0322 memory: 1381 grad_norm: 2.5183 loss: 0.6887 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 0.6923 prec@top3: 0.2308 recall@top5: 0.6923 prec@top5: 0.1385 loss_action_cls: 0.6887\n", + "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:51:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 20/120] eta: 0:00:14 time: 0.1446 data_time: 0.0853 memory: 466 \n", + "06/15 06:52:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 40/120] eta: 0:00:10 time: 0.1124 data_time: 0.0612 memory: 466 \n", + "06/15 06:52:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 60/120] eta: 0:00:07 time: 0.1016 data_time: 0.0505 memory: 466 \n", + "06/15 06:52:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 80/120] eta: 0:00:04 time: 0.1083 data_time: 0.0581 memory: 466 \n", + "06/15 06:52:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1650 data_time: 0.1102 memory: 466 \n", + "06/15 06:52:11 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) 
[1][120/120] eta: 0:00:00 time: 0.1410 data_time: 0.0866 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 47.41\n", + "aerobic split jump 30.01\n", + "aerobic scissors leap 88.94\n", + "aerobic turn 98.43\n", + "mAP 66.20\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 25.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 100.00\n", + "mAP 56.25\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 100.00\n", + "mAP 36.25\n", + "06/15 06:52:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] mAP/frameAP: 66.1965 mAP/v_map@0.2: 56.2500 mAP/v_map@0.5: 36.2500 mAP/v_map_0.05:0.45: 50.4167 mAP/v_map_0.10:0.90: 37.7963 mAP/v_map_0.50:0.95: 26.8167 data_time: 0.0753 time: 0.1288\n", + "06/15 06:52:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - The best checkpoint with 66.1965 mAP/frameAP at 1 epoch is saved to best_mAP_frameAP_epoch_1.pth.\n", + "06/15 06:52:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 20/118] lr: 3.2500e-03 eta: 0:04:11 time: 0.3098 data_time: 0.0484 memory: 1381 grad_norm: 1.1745 loss: 0.4384 recall@thr=0.5: 0.7857 prec@thr=0.5: 0.7857 recall@top3: 0.9286 prec@top3: 0.3095 recall@top5: 0.9286 prec@top5: 0.1857 loss_action_cls: 0.4384\n", + "06/15 06:52:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 40/118] lr: 3.2500e-03 eta: 0:04:06 time: 0.3245 data_time: 0.0667 memory: 1381 grad_norm: 1.0271 loss: 0.3960 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3960\n", + "06/15 
06:52:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 60/118] lr: 3.2500e-03 eta: 0:03:55 time: 0.2572 data_time: 0.0111 memory: 1381 grad_norm: 0.8150 loss: 0.3958 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3958\n", + "06/15 06:52:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 80/118] lr: 3.2500e-03 eta: 0:03:47 time: 0.2843 data_time: 0.0167 memory: 1381 grad_norm: 1.4691 loss: 0.4575 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4575\n", + "06/15 06:52:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 3.2500e-03 eta: 0:03:41 time: 0.3118 data_time: 0.0559 memory: 1381 grad_norm: 1.9420 loss: 0.5529 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5529\n", + "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][118/118] lr: 3.2500e-03 eta: 0:03:33 time: 0.2532 data_time: 0.0082 memory: 1381 grad_norm: 1.6790 loss: 0.4253 recall@thr=0.5: 0.7500 prec@thr=0.5: 0.7500 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4253\n", + "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:52:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 20/120] eta: 0:00:15 time: 0.1515 data_time: 0.0968 memory: 466 \n", + "06/15 06:53:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 40/120] eta: 0:00:12 time: 0.1679 data_time: 0.1143 memory: 466 \n", + "06/15 06:53:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 60/120] eta: 0:00:08 time: 0.1134 data_time: 0.0631 memory: 466 \n", + "06/15 06:53:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 80/120] eta: 0:00:05 time: 0.0961 data_time: 0.0459 memory: 466 \n", + "06/15 06:53:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1063 data_time: 0.0549 memory: 466 \n", + "06/15 06:53:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] eta: 0:00:00 time: 0.1017 data_time: 0.0522 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 
volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 42.09\n", + "aerobic split jump 27.71\n", + "aerobic scissors leap 90.02\n", + "aerobic turn 95.76\n", + "mAP 63.89\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 55.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:53:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] mAP/frameAP: 63.8934 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 51.8889 mAP/v_map_0.10:0.90: 34.0278 mAP/v_map_0.50:0.95: 18.7250 data_time: 0.0710 time: 0.1226\n", + "06/15 06:53:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 20/118] lr: 5.5000e-03 eta: 0:03:34 time: 0.4330 data_time: 0.1493 memory: 1381 grad_norm: 0.4795 loss: 0.5049 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 0.8462 prec@top3: 0.2821 recall@top5: 0.8462 prec@top5: 0.1692 loss_action_cls: 0.5049\n", + "06/15 06:53:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 40/118] lr: 5.5000e-03 eta: 0:03:27 time: 0.2948 data_time: 0.0370 memory: 1381 grad_norm: 0.8584 loss: 0.4820 recall@thr=0.5: 0.6154 prec@thr=0.5: 0.6154 recall@top3: 0.6154 prec@top3: 0.2051 recall@top5: 0.6154 prec@top5: 0.1231 loss_action_cls: 0.4820\n", + "06/15 06:53:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 60/118] lr: 5.5000e-03 eta: 0:03:19 time: 0.2622 data_time: 0.0118 memory: 1381 grad_norm: 
1.1041 loss: 0.2944 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2944\n", + "06/15 06:53:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 80/118] lr: 5.5000e-03 eta: 0:03:13 time: 0.3111 data_time: 0.0470 memory: 1381 grad_norm: 0.8394 loss: 0.3393 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3393\n", + "06/15 06:53:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][100/118] lr: 5.5000e-03 eta: 0:03:06 time: 0.2989 data_time: 0.0417 memory: 1381 grad_norm: 0.2155 loss: 0.4345 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.4345\n", + "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][118/118] lr: 5.5000e-03 eta: 0:02:59 time: 0.2576 data_time: 0.0112 memory: 1381 grad_norm: 0.2509 loss: 0.4634 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4634\n", + "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 3 epochs\n", + "06/15 06:53:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 20/120] eta: 0:00:18 time: 0.1815 data_time: 0.1180 memory: 466 \n", + "06/15 06:53:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 40/120] eta: 0:00:13 time: 0.1451 data_time: 0.0905 memory: 466 \n", + "06/15 06:53:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 60/120] eta: 0:00:08 time: 0.1020 data_time: 0.0510 memory: 466 \n", + "06/15 06:53:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 80/120] eta: 0:00:05 time: 0.1008 data_time: 0.0528 memory: 466 \n", + "06/15 06:54:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][100/120] eta: 0:00:02 time: 0.1072 data_time: 0.0569 memory: 466 \n", + "06/15 06:54:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] eta: 0:00:00 time: 0.1018 data_time: 0.0536 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such 
label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 37.09\n", + "aerobic split jump 27.98\n", + "aerobic scissors leap 89.41\n", + "aerobic turn 95.67\n", + "mAP 62.54\n", + "\u001b[2Klinking tubes... \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 
football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 55.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass 
steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:54:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] mAP/frameAP: 62.5361 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 51.2222 mAP/v_map_0.10:0.90: 34.1389 mAP/v_map_0.50:0.95: 18.7250 data_time: 0.0704 time: 0.1229\n", + "06/15 06:54:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 20/118] lr: 7.7500e-03 eta: 0:02:55 time: 0.3717 data_time: 0.0993 memory: 1381 grad_norm: 0.2139 loss: 0.3119 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3119\n", + "06/15 06:54:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 40/118] lr: 7.7500e-03 eta: 0:02:48 time: 0.2730 data_time: 0.0230 memory: 1381 grad_norm: 0.6102 loss: 0.4782 recall@thr=0.5: 0.9375 prec@thr=0.5: 0.9375 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4782\n", + "06/15 06:54:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 60/118] lr: 7.7500e-03 eta: 0:02:41 time: 0.2895 data_time: 0.0311 memory: 1381 grad_norm: 0.4057 loss: 0.3422 recall@thr=0.5: 0.9474 prec@thr=0.5: 0.9474 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3422\n", + "06/15 06:54:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 80/118] lr: 7.7500e-03 eta: 0:02:36 time: 0.3170 data_time: 0.0490 memory: 1381 grad_norm: 0.3051 loss: 0.3628 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3628\n", + "06/15 06:54:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][100/118] lr: 7.7500e-03 eta: 0:02:29 time: 0.2633 data_time: 0.0131 memory: 1381 grad_norm: 0.1671 loss: 0.3691 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3691\n", + "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][118/118] lr: 7.7500e-03 eta: 0:02:23 time: 0.2721 data_time: 0.0181 memory: 1381 grad_norm: 0.1954 loss: 0.3076 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3076\n", + "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 4 epochs\n", + "06/15 06:54:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 20/120] eta: 0:00:14 time: 0.1431 data_time: 0.0854 memory: 466 \n", + "06/15 06:54:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 40/120] eta: 0:00:10 time: 0.1086 data_time: 0.0584 memory: 466 \n", + "06/15 06:54:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 60/120] eta: 0:00:07 time: 0.1056 data_time: 0.0552 memory: 
466 \n", + "06/15 06:54:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 80/120] eta: 0:00:04 time: 0.0922 data_time: 0.0399 memory: 466 \n", + "06/15 06:54:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][100/120] eta: 0:00:02 time: 0.1166 data_time: 0.0671 memory: 466 \n", + "06/15 06:54:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] eta: 0:00:00 time: 0.1468 data_time: 0.0927 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 25.62\n", + "aerobic split jump 28.75\n", + "aerobic scissors leap 89.02\n", + "aerobic turn 93.30\n", + "mAP 59.17\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 100.00\n", + "mAP 50.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 100.00\n", + "mAP 36.25\n", + "06/15 06:54:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] mAP/frameAP: 59.1749 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 36.2500 mAP/v_map_0.05:0.45: 46.9444 mAP/v_map_0.10:0.90: 28.9352 mAP/v_map_0.50:0.95: 14.6667 data_time: 0.0663 time: 0.1186\n", + "06/15 06:55:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 20/118] lr: 1.0000e-02 eta: 0:02:17 time: 0.3090 data_time: 0.0513 memory: 1381 grad_norm: 0.2988 loss: 0.3067 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3067\n", + "06/15 06:55:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 40/118] lr: 1.0000e-02 eta: 0:02:10 time: 0.2584 data_time: 0.0142 memory: 1381 grad_norm: 0.6702 loss: 0.3996 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3996\n", + "06/15 06:55:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 60/118] lr: 1.0000e-02 eta: 0:02:04 time: 0.3286 data_time: 0.0617 memory: 1381 grad_norm: 
0.4347 loss: 0.4374 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4374\n", + "06/15 06:55:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 80/118] lr: 1.0000e-02 eta: 0:01:58 time: 0.2774 data_time: 0.0247 memory: 1381 grad_norm: 0.4373 loss: 0.3679 recall@thr=0.5: 0.7500 prec@thr=0.5: 0.7500 recall@top3: 0.8750 prec@top3: 0.2917 recall@top5: 0.8750 prec@top5: 0.1750 loss_action_cls: 0.3679\n", + "06/15 06:55:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][100/118] lr: 1.0000e-02 eta: 0:01:51 time: 0.2603 data_time: 0.0108 memory: 1381 grad_norm: 0.2507 loss: 0.3226 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3226\n", + "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][118/118] lr: 1.0000e-02 eta: 0:01:46 time: 0.3256 data_time: 0.0497 memory: 1381 grad_norm: 0.0940 loss: 0.2914 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2914\n", + "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 5 epochs\n", + "06/15 06:55:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 20/120] eta: 0:00:11 time: 0.1166 data_time: 0.0625 memory: 466 \n", + "06/15 06:55:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 40/120] eta: 0:00:09 time: 0.1119 data_time: 0.0618 memory: 466 \n", + "06/15 06:55:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 60/120] eta: 0:00:06 time: 0.1012 data_time: 0.0504 memory: 466 \n", + "06/15 06:55:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 80/120] eta: 0:00:04 time: 0.1017 data_time: 0.0537 memory: 466 \n", + "06/15 06:55:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][100/120] eta: 0:00:02 time: 0.1766 data_time: 0.1239 memory: 466 \n", + "06/15 06:55:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] eta: 0:00:00 time: 0.1421 data_time: 0.0884 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such 
label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 17.82\n", + "aerobic split jump 20.05\n", + "aerobic scissors leap 89.00\n", + "aerobic turn 91.20\n", + "mAP 54.52\n", + "\u001b[2Klinking tubes... \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 
football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 60.00\n", + "mAP 35.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass 
steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 26.67\n", + "mAP 17.92\n", + "06/15 06:55:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] mAP/frameAP: 54.5189 mAP/v_map@0.2: 35.0000 mAP/v_map@0.5: 17.9167 mAP/v_map_0.05:0.45: 31.2037 mAP/v_map_0.10:0.90: 19.0741 mAP/v_map_0.50:0.95: 9.5833 data_time: 0.0733 time: 0.1249\n", + "06/15 06:55:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 20/118] lr: 1.0000e-02 eta: 0:01:40 time: 0.2867 data_time: 0.0385 memory: 1381 grad_norm: 0.1572 loss: 0.3008 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3008\n", + "06/15 06:55:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 40/118] lr: 1.0000e-02 eta: 0:01:34 time: 0.2720 data_time: 0.0167 memory: 1381 grad_norm: 0.0803 loss: 0.2377 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2377\n", + "06/15 06:56:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 60/118] lr: 1.0000e-02 eta: 0:01:28 time: 0.3423 data_time: 0.0840 memory: 1381 grad_norm: 0.3120 loss: 0.2442 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2442\n", + "06/15 06:56:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 80/118] lr: 1.0000e-02 eta: 0:01:22 time: 0.2580 data_time: 0.0112 memory: 1381 grad_norm: 0.5726 loss: 0.3794 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3794\n", + "06/15 06:56:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][100/118] lr: 1.0000e-02 eta: 0:01:16 time: 0.2949 data_time: 0.0347 memory: 1381 grad_norm: 0.1732 loss: 0.3004 recall@thr=0.5: 0.8750 prec@thr=0.5: 0.8750 recall@top3: 0.8750 prec@top3: 0.2917 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3004\n", + "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][118/118] lr: 1.0000e-02 eta: 0:01:10 time: 0.3258 data_time: 0.0625 memory: 1381 grad_norm: 0.3709 loss: 0.3439 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3439\n", + "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 6 epochs\n", + "06/15 06:56:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 20/120] eta: 0:00:11 time: 0.1169 data_time: 0.0624 memory: 466 \n", + "06/15 06:56:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 40/120] eta: 0:00:09 time: 0.1131 data_time: 0.0631 memory: 466 \n", + "06/15 06:56:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 60/120] eta: 0:00:06 time: 0.1064 data_time: 0.0553 memory: 466 
\n", + "06/15 06:56:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 80/120] eta: 0:00:04 time: 0.1401 data_time: 0.0862 memory: 466 \n", + "06/15 06:56:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][100/120] eta: 0:00:02 time: 0.1519 data_time: 0.0982 memory: 466 \n", + "06/15 06:56:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] eta: 0:00:00 time: 0.0986 data_time: 0.0486 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 19.05\n", + "aerobic split jump 22.20\n", + "aerobic scissors leap 85.83\n", + "aerobic turn 79.04\n", + "mAP 51.53\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 0.00\n", + "mAP 20.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike 
jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 0.00\n", + "mAP 11.25\n", + "06/15 06:56:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] mAP/frameAP: 51.5300 mAP/v_map@0.2: 20.0000 mAP/v_map@0.5: 11.2500 mAP/v_map_0.05:0.45: 18.0556 mAP/v_map_0.10:0.90: 11.8519 mAP/v_map_0.50:0.95: 6.9167 data_time: 0.0688 time: 0.1209\n", + "06/15 06:56:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 20/118] lr: 1.0000e-03 eta: 0:01:04 time: 0.2819 data_time: 0.0331 memory: 1381 grad_norm: 0.2811 loss: 0.2776 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2776\n", + "06/15 06:56:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 40/118] lr: 1.0000e-03 eta: 0:00:58 time: 0.3114 data_time: 0.0473 memory: 1381 grad_norm: 0.1573 loss: 0.2043 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2043\n", + "06/15 06:56:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 60/118] lr: 1.0000e-03 eta: 0:00:52 time: 0.2903 data_time: 0.0342 memory: 1381 grad_norm: 0.1343 loss: 
0.3411 recall@thr=0.5: 0.8667 prec@thr=0.5: 0.8667 recall@top3: 0.8667 prec@top3: 0.2889 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3411\n", + "06/15 06:57:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 80/118] lr: 1.0000e-03 eta: 0:00:46 time: 0.2623 data_time: 0.0128 memory: 1381 grad_norm: 0.1026 loss: 0.2895 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2895\n", + "06/15 06:57:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][100/118] lr: 1.0000e-03 eta: 0:00:40 time: 0.3206 data_time: 0.0503 memory: 1381 grad_norm: 0.1911 loss: 0.3552 recall@thr=0.5: 0.7333 prec@thr=0.5: 0.7333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3552\n", + "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][118/118] lr: 1.0000e-03 eta: 0:00:35 time: 0.2884 data_time: 0.0335 memory: 1381 grad_norm: 0.1274 loss: 0.4391 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4391\n", + "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 7 epochs\n", + "06/15 06:57:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 20/120] eta: 0:00:11 time: 0.1193 data_time: 0.0693 memory: 466 \n", + "06/15 06:57:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 40/120] eta: 0:00:09 time: 0.1188 data_time: 0.0670 memory: 466 \n", + "06/15 06:57:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 60/120] eta: 0:00:08 time: 0.1645 data_time: 0.1114 memory: 466 \n", + "06/15 06:57:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 80/120] eta: 0:00:05 time: 0.1391 data_time: 0.0850 memory: 466 \n", + "06/15 06:57:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][100/120] eta: 0:00:02 time: 0.1104 data_time: 0.0585 memory: 466 \n", + "06/15 06:57:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] eta: 0:00:00 time: 0.1025 data_time: 0.0512 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 
volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 20.79\n", + "aerobic split jump 20.11\n", + "aerobic scissors leap 84.84\n", + "aerobic turn 78.58\n", + "mAP 51.08\n", + "\u001b[2Klinking tubes... \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football 
tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 20.00\n", + "mAP 25.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + 
"no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 0.00\n", + "mAP 11.25\n", + "06/15 06:57:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] mAP/frameAP: 51.0794 mAP/v_map@0.2: 25.0000 mAP/v_map@0.5: 11.2500 mAP/v_map_0.05:0.45: 22.5000 mAP/v_map_0.10:0.90: 14.0741 mAP/v_map_0.50:0.95: 6.9167 data_time: 0.0735 time: 0.1255\n", + "06/15 06:57:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 20/118] lr: 1.0000e-04 eta: 0:00:29 time: 0.2894 data_time: 0.0322 memory: 1381 grad_norm: 0.1227 loss: 0.3286 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3286\n", + "06/15 06:57:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 40/118] lr: 1.0000e-04 eta: 0:00:23 time: 0.4105 data_time: 0.1257 memory: 1381 grad_norm: 0.1948 loss: 0.3202 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3202\n", + "06/15 06:57:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 60/118] lr: 1.0000e-04 eta: 0:00:17 time: 0.3095 data_time: 0.0537 memory: 1381 grad_norm: 0.7997 loss: 0.2428 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2428\n", + "06/15 06:57:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 80/118] lr: 1.0000e-04 eta: 0:00:11 time: 0.2918 data_time: 0.0330 memory: 1381 grad_norm: 0.8157 loss: 0.3045 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3045\n", + "06/15 06:58:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][100/118] lr: 1.0000e-04 eta: 0:00:05 time: 0.3443 data_time: 0.0786 memory: 1381 grad_norm: 0.0966 loss: 0.2605 recall@thr=0.5: 0.9375 prec@thr=0.5: 0.9375 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2605\n", + "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][118/118] lr: 1.0000e-04 eta: 0:00:00 time: 0.2611 data_time: 0.0148 memory: 1381 grad_norm: 0.3034 loss: 0.2694 recall@thr=0.5: 0.9231 prec@thr=0.5: 0.9231 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.2694\n", + "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 8 epochs\n", + "06/15 06:58:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 20/120] eta: 0:00:14 time: 0.1433 data_time: 0.0869 memory: 466 \n", + "06/15 06:58:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 40/120] eta: 0:00:12 time: 0.1664 data_time: 0.1160 memory: 466 \n", + "06/15 06:58:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 60/120] eta: 0:00:08 time: 0.1269 data_time: 0.0772 memory: 466 \n", + "06/15 
06:58:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 80/120] eta: 0:00:05 time: 0.0951 data_time: 0.0455 memory: 466 \n", + "06/15 06:58:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][100/120] eta: 0:00:02 time: 0.1144 data_time: 0.0630 memory: 466 \n", + "06/15 06:58:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] eta: 0:00:00 time: 0.1028 data_time: 0.0530 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 15.29\n", + "aerobic split jump 20.74\n", + "aerobic scissors leap 86.38\n", + "aerobic turn 80.98\n", + "mAP 50.85\n", + "\u001b[2Klinking tubes... 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 20.00\n", + "mAP 25.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic 
pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 20.00\n", + "mAP 16.25\n", + "06/15 06:58:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] mAP/frameAP: 50.8487 mAP/v_map@0.2: 25.0000 mAP/v_map@0.5: 16.2500 mAP/v_map_0.05:0.45: 23.0556 mAP/v_map_0.10:0.90: 15.1852 mAP/v_map_0.50:0.95: 8.4167 data_time: 0.0732 time: 0.1244\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "# ไฝฟ็”จ MIM ่ฎญ็ปƒๆจกๅž‹\n", + "!mim train mmaction2 configs/slowonly_k400_multisports.py \\\n", + " --work-dir work_dirs/stad_model/" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "HCg6C9HYxRAt" + }, + "source": [ + "## 4. 
Spatiotemporal Action Detection Model Inference\n", + "\n", + "After training the detection model and the spatiotemporal action detection model, we can use the spatiotemporal action detection demo to run inference and visualize the model's predictions.\n", + "\n", + "Because the tutorial uses only a small amount of training data, the trained model performs poorly, so a pre-trained model is used for the visualization." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "WW5-IJ7IxRAu" + }, + "source": [ + "###" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FofW_5RoxRAu", + "outputId": "91217660-946d-48ab-f663-b0f7f2d6a6f6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "Performing Human Detection for each frame\n", + "[>>] 99/99, 6.8 task/s, elapsed: 15s, ETA: 0s\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Performing SpatioTemporal Action Detection for each clip\n", + "[>>] 99/99, 16.6 task/s, elapsed: 6s, ETA: 0sPerforming visualization\n", + "Moviepy - Building video data/demo_spatiotemporal_det.mp4.\n", + "Moviepy - Writing video data/demo_spatiotemporal_det.mp4\n", + "\n", + "Moviepy - Done !\n", + "Moviepy - video ready data/demo_spatiotemporal_det.mp4\n" + ] + } + ], + "source": [ + "!python ../../demo/demo_spatiotemporal_det.py \\\n", + " data/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4 \\\n", + " data/demo_spatiotemporal_det.mp4 \\\n", + " --config configs/slowonly_k400_multisports.py \\\n", + " --checkpoint https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth \\\n", + " --det-config 
configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --det-checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --det-score-thr 0.85 \\\n", + " --action-score-thr 0.8 \\\n", + " --label-map ../../tools/data/multisports/label_map.txt \\\n", + " --predict-stepsize 8 \\\n", + " --output-stepsize 1 \\\n", + " --output-fps 24" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 741 + }, + "id": "677FUWFRxRAv", + "outputId": "f702d544-3492-494c-af81-9e90f43d6b6c" + }, + "outputs": [], + "source": [ + "# Show Video\n", + "import moviepy.editor\n", + "moviepy.editor.ipython_display(\"data/demo_spatiotemporal_det.mp4\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/projects/stad_tutorial/tools/convert_proposals.py b/projects/stad_tutorial/tools/convert_proposals.py new file mode 100644 index 0000000000000000000000000000000000000000..1ea7af4b26cd64381c993c5e3d2942f4ccf542a0 --- /dev/null +++ b/projects/stad_tutorial/tools/convert_proposals.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import numpy as np +from mmengine import dump, load, track_iter_progress + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--det_test_result', + default='data/multisports/annotations/ms_det_proposals.pkl') + parser.add_argument( + '--stad_gt', + help='spatio-temporal action detection ground truth file', + default='data/multisports/annotations/multisports_GT.pkl') + parser.add_argument( + '--out_result', + default='data/multisports/annotations/multisports_proposals.pkl') + args = parser.parse_args() + return args + + +def dump_det_result(args): + print('loading test result...') + det_result = load(args.det_test_result) + stad_gt = load(args.stad_gt) + train_list = stad_gt['train_videos'][0] + val_list = stad_gt['test_videos'][0] + train_bbox_result = {} + val_bbox_result = {} + for sample in track_iter_progress(det_result): + bboxes = sample['pred_instances']['bboxes'] + scores = sample['pred_instances']['scores'] + h, w = sample['ori_shape'] + bboxes[:, ::2] /= w + bboxes[:, 1::2] /= h + img_path = sample['img_path'] + frm_key_list = img_path.split('.jpg')[0].split('/') + frm_key = ','.join([ + f'{frm_key_list[-3]}/{frm_key_list[-2]}.mp4', + f'{int(frm_key_list[-1]):04d}' + ]) + bbox = np.concatenate([bboxes, scores[:, None]], axis=1) + + vid_key = '/'.join(frm_key_list[-3:-1]) + if vid_key in train_list: + train_bbox_result[frm_key] = bbox + elif vid_key in val_list: + val_bbox_result[frm_key] = bbox + else: + raise KeyError(vid_key) + dump(train_bbox_result, args.out_result[:-4] + '_train.pkl') + dump(val_bbox_result, args.out_result[:-4] + '_val.pkl') + + +if __name__ == '__main__': + args = parse_args() + dump_det_result(args) diff --git a/projects/stad_tutorial/tools/generate_mmdet_anno.py b/projects/stad_tutorial/tools/generate_mmdet_anno.py new file mode 100644 index 
0000000000000000000000000000000000000000..5d26be41959f30dfb46b55cb65462e22ddcc2a98 --- /dev/null +++ b/projects/stad_tutorial/tools/generate_mmdet_anno.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from collections import defaultdict + +from mmengine import dump, load + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + 'stad_anno', help='spatiotemporal action detection anno path') + parser.add_argument('det_path', help='output detection anno path') + args = parser.parse_args() + return args + + +def generate_mmdet_coco_anno(args): + ori_anno = load(args.stad_anno) + train_videos = ori_anno['train_videos'] + val_videos = ori_anno['test_videos'] + videos = {'train': train_videos, 'val': val_videos} + for split in ['train', 'val']: + img_id = 0 + bbox_id = 0 + img_list = [] + anno_list = [] + for vid in videos[split][0]: + vid_tubes = ori_anno['gttubes'][vid] + height, width = ori_anno['resolution'][vid] + frm2bbox = defaultdict(list) + for label_idx, tube_list in vid_tubes.items(): + for tube in tube_list: + for frm_anno in tube: + frm_idx, bbox = frm_anno[0], frm_anno[1:] + frm2bbox[frm_idx].append({'label': 0, 'bbox': bbox}) + for frm_idx, frm_bboxes in frm2bbox.items(): + img_path = f'{vid}/{int(frm_idx):05d}.jpg' + img_instance = { + 'file_name': img_path, + 'height': height, + 'width': width, + 'id': img_id + } + img_list.append(img_instance) + + for bbox_info in frm_bboxes: + label = bbox_info['label'] + x1, y1, x2, y2 = bbox_info['bbox'] + bbox = [x1, y1, x2 - x1, y2 - y1] + anno_instance = { + 'area': bbox[2] * bbox[3], + 'image_id': img_id, + 'bbox': bbox, + 'category_id': label, + 'iscrowd': 0, + 'id': bbox_id + } + anno_list.append(anno_instance) + bbox_id += 1 + img_id += 1 + total_anno = { + 'images': img_list, + 'annotations': anno_list, + 'categories': [{ + 'id': 0, + 'name': 'person' + }], + } + dump(total_anno, args.det_path[:-5] + f'_{split}' + args.det_path[-5:]) + + +if __name__ == '__main__': + args = parse_args() + generate_mmdet_coco_anno(args) diff --git a/projects/stad_tutorial/tools/generate_rgb.py b/projects/stad_tutorial/tools/generate_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7520a8b4e069ebaf36de777b9a96aacddf9b8b68 --- /dev/null +++ b/projects/stad_tutorial/tools/generate_rgb.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
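+"""Extract RGB frames for the MultiSports tutorial.
+
+Decodes every clip under ``data/multisports/trainval/aerobic_gymnastics`` with
+OpenCV and writes its frames as zero-padded JPEGs (``00001.jpg``, ``00002.jpg``,
+...) to ``data/multisports/rawframes/<sport>/<clip_name>/``, the raw-frame
+layout used by the detection and action-detection steps of this tutorial.
+"""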
+import os +import os.path as osp + +import cv2 + +src_dir = 'data/multisports/trainval' +target_dir = 'data/multisports/rawframes' + +sport_list = ['aerobic_gymnastics'] +for sport in sport_list: + video_root = osp.join(src_dir, sport) + if not osp.exists(video_root): + print('No {} video dir to generate rgb images.'.format(video_root)) + continue + print('Will generate {} rgb dir for {}.'.format( + len(os.listdir(video_root)), osp.basename(sport))) + for clip_name in os.listdir(video_root): + mp4_path = osp.join(video_root, clip_name) + save_dir = osp.join(target_dir, sport, clip_name[:-4]) + if not osp.exists(save_dir): + os.makedirs(save_dir) + cap = cv2.VideoCapture(mp4_path) + fps = cap.get(cv2.CAP_PROP_FPS) + size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), + int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))) + fourcc = cv2.VideoWriter_fourcc(*'I420') + ii = 1 + while (cap.isOpened()): + ret, frame = cap.read() + aa = str(ii) + s = aa.zfill(5) + image_name = osp.join(save_dir + '/' + s + '.jpg') + if ret is True: + cv2.imwrite(image_name, frame) + else: + break + ii = ii + 1 + cap.release() + print('Generate {} rgb dir successfully.'.format(clip_name[:-4])) diff --git a/projects/stad_tutorial/tools/images2coco.py b/projects/stad_tutorial/tools/images2coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0000cebb7f6496792c88110264f554c6affd46c9 --- /dev/null +++ b/projects/stad_tutorial/tools/images2coco.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os + +from mmengine.fileio import dump, list_from_file +from mmengine.utils import mkdir_or_exist, scandir, track_parallel_progress +from PIL import Image + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert images to coco format without annotations') + parser.add_argument('img_path', help='The root path of images') + parser.add_argument( + 'classes', type=str, help='The text file name of storage class list') + parser.add_argument( + 'out', + type=str, + help='The output annotation json file name, The save dir is in the ' + 'same directory as img_path') + parser.add_argument( + '-e', + '--exclude-extensions', + type=str, + nargs='+', + help='The suffix of images to be excluded, such as "png" and "bmp"') + args = parser.parse_args() + return args + + +def get_img_info(args): + path, image_path, exclude_extensions = args + if exclude_extensions is None or ( + exclude_extensions is not None + and not image_path.lower().endswith(exclude_extensions)): + # image_path = + img_pillow = Image.open(os.path.join(path, image_path)) + img_info = { + 'filename': image_path, + 'width': img_pillow.width, + 'height': img_pillow.height, + } + return img_info + + +def collect_image_infos(path, exclude_extensions=None): + img_infos = [] + + images_generator = scandir(path, recursive=True) + + img_infos = track_parallel_progress( + get_img_info, [(path, image_path, exclude_extensions) + for image_path in images_generator], + nproc=64) + + return img_infos + + +def cvt_to_coco_json(img_infos, classes): + image_id = 0 + coco = dict() + coco['images'] = [] + coco['type'] = 'instance' + coco['categories'] = [] + coco['annotations'] = [] + image_set = set() + + for category_id, name in enumerate(classes): + category_item = dict() + category_item['supercategory'] = str('none') + category_item['id'] = int(category_id) + category_item['name'] = str(name) + coco['categories'].append(category_item) + + for img_dict in img_infos: + file_name = img_dict['filename'] + assert 
file_name not in image_set + image_item = dict() + image_item['id'] = int(image_id) + image_item['file_name'] = str(file_name) + image_item['height'] = int(img_dict['height']) + image_item['width'] = int(img_dict['width']) + coco['images'].append(image_item) + image_set.add(file_name) + + image_id += 1 + return coco + + +def main(): + args = parse_args() + assert args.out.endswith( + 'json'), 'The output file name must be json suffix' + + # 1 load image list info + img_infos = collect_image_infos(args.img_path, args.exclude_extensions) + + # 2 convert to coco format data + classes = list_from_file(args.classes) + coco_info = cvt_to_coco_json(img_infos, classes) + + # 3 dump + save_dir = os.path.join(args.img_path, '..', 'annotations') + mkdir_or_exist(save_dir) + save_path = os.path.join(save_dir, args.out) + dump(coco_info, save_path) + print(f'save json file: {save_path}') + + +if __name__ == '__main__': + main() diff --git a/projects/umt/README.md b/projects/umt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b1879d7824a036a80495b720d17e762c15f00282 --- /dev/null +++ b/projects/umt/README.md @@ -0,0 +1,93 @@ +# UMT Project + +[Unmasked Teacher: Towards Training-Efficient Video Foundation Models](https://arxiv.org/abs/2303.16058) + + + +## Abstract + + + +Video Foundation Models (VFMs) have received limited exploration due to high computational costs and data scarcity. Previous VFMs rely on Image Foundation Models (IFMs), which face challenges in transferring to the video domain. Although VideoMAE has trained a robust ViT from limited data, its low-level reconstruction poses convergence difficulties and conflicts with high-level cross-modal alignment. This paper proposes a training-efficient method for temporal-sensitive VFMs that integrates the benefits of existing methods. To increase data efficiency, we mask out most of the low-semantics video tokens, but selectively align the unmasked tokens with IFM, which serves as the UnMasked Teacher (UMT). By providing semantic guidance, our method enables faster convergence and multimodal friendliness. With a progressive pre-training framework, our model can handle various tasks including scene-related, temporal-related, and complex video-language understanding. Using only public sources for pre-training in 6 days on 32 A100 GPUs, our scratch-built ViT-L/16 achieves state-of-the-art performances on various video tasks. + + + +
+ +
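+To make the idea above concrete, the snippet below is a minimal sketch of the unmasked-teacher alignment objective. It is not the code shipped in this project (which only provides testing configs for the fine-tuned recognizers): `student` and `teacher` are hypothetical callables, where `student` is a video ViT that returns features only for the tokens it keeps and `teacher` is a frozen image foundation model (e.g. CLIP-ViT) returning per-token features, and random masking plus a cosine loss stand in for the paper's semantic masking and exact objective.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def umt_alignment_loss(student, teacher, video, patch=16, mask_ratio=0.8):
+    """Align the unmasked student tokens with a frozen image teacher (sketch)."""
+    b, c, t, h, w = video.shape
+    num_tokens = t * (h // patch) * (w // patch)
+    keep = int(num_tokens * (1 - mask_ratio))
+
+    # keep only a small subset of video tokens (random here; UMT selects them
+    # semantically so the informative tokens survive)
+    ids = torch.rand(b, num_tokens, device=video.device).argsort(dim=1)[:, :keep]
+
+    with torch.no_grad():  # the teacher is frozen during pre-training
+        target = teacher(video)  # (b, num_tokens, d), hypothetical signature
+    target = torch.gather(target, 1, ids[..., None].expand(-1, -1, target.shape[-1]))
+
+    pred = student(video, keep_ids=ids)  # (b, keep, d), hypothetical signature
+
+    # cosine-style alignment of the unmasked tokens with the teacher features
+    pred, target = F.normalize(pred, dim=-1), F.normalize(target, dim=-1)
+    return (1 - (pred * target).sum(-1)).mean()
+```
+
+Because most tokens never enter the student in this sketch, the pre-training cost drops sharply while the teacher still supplies semantic targets for the tokens that remain.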
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. + +Assume that you are located at `$MMACTION2/projects/umt`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/kinetics#readme). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### Kinetics400 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | testing protocol | config | ckpt | +| :---------------------: | :--------: | :------: | :---------: | :------: | :--------------: | :-------------------------------------------------------------: | :-----------------------------------------------------------: | +| uniform 8 | 224x224 | UMT-B | Kinetics710 | 87.33 | 4 clips x 3 crop | [config](./configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.pth) | +| uniform 8 | 224x224 | UMT-L | Kinetics710 | 90.21 | 4 clips x 3 crop | [config](./configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.pth) | + +### Kinetics700 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | testing protocol | config | ckpt | +| :---------------------: | :--------: | :------: | :---------: | :------: | :--------------: | :-------------------------------------------------------------: | :-----------------------------------------------------------: | +| uniform 8 | 224x224 | UMT-B | Kinetics710 | 77.95 | 4 clips x 3 crop | [config](./configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.pth) | +| uniform 8 | 224x224 | UMT-L | Kinetics710 | 82.79 | 4 clips x 3 crop | [config](./configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.pth) | + +## Citation + + + 
+```bibtex +@article{li2023unmasked, + title={Unmasked teacher: Towards training-efficient video foundation models}, + author={Li, Kunchang and Wang, Yali and Li, Yizhuo and Wang, Yi and He, Yinan and Wang, Limin and Qiao, Yu}, + journal={arXiv preprint arXiv:2303.16058}, + year={2023} +} +``` diff --git a/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py b/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6991f7aaa37cc6f51697491d3f692b7d7a1dbcf4 --- /dev/null +++ b/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py @@ -0,0 +1,82 @@ +custom_imports = dict(imports='models') + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UMTViT', + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + all_frames=8, + qkv_bias=True), + cls_head=dict( + type='TimeSformerHead', + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py b/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7b050d984cab630a991ea413b30d7b14e848d10a --- /dev/null +++ b/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py @@ -0,0 +1,82 @@ +custom_imports = dict(imports='models') + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UMTViT', + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + all_frames=8, + 
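+        # ViT-B/16 backbone fed 8 frames; all_frames should match the
+        # clip_len of UniformSample in the test pipeline below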
qkv_bias=True), + cls_head=dict( + type='TimeSformerHead', + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics700/videos_val' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py b/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a8afaf17c8b2a4f4528ac313ede9810aa1c6edbf --- /dev/null +++ b/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py @@ -0,0 +1,82 @@ +custom_imports = dict(imports='models') + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UMTViT', + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + all_frames=8, + qkv_bias=True), + cls_head=dict( + type='TimeSformerHead', + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=16, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py b/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..33c7793f31faed2420ec02cd08fce10ba1aeddd6 --- /dev/null +++ b/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py @@ -0,0 +1,82 @@ +custom_imports = dict(imports='models') + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UMTViT', + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + all_frames=8, + qkv_bias=True), + cls_head=dict( + type='TimeSformerHead', + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics700/videos_val' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, 
by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/umt/models/__init__.py b/projects/umt/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d822e47d27a4077e10e1d95fb4b6e8b77f0f355 --- /dev/null +++ b/projects/umt/models/__init__.py @@ -0,0 +1,3 @@ +from .vit import UMTViT + +__all__ = ['UMTViT'] diff --git a/projects/umt/models/vit.py b/projects/umt/models/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..b02e24f970bd25f0b8cedeecdc90caa619bea4ae --- /dev/null +++ b/projects/umt/models/vit.py @@ -0,0 +1,344 @@ +from functools import partial + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from mmcv.cnn.bricks import DropPath +from mmengine import to_2tuple + +from mmaction.registry import MODELS + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the original BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + attn_head_dim=attn_head_dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if init_values > 0: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + num_frames=16, + tubelet_size=2): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.tubelet_size = int(tubelet_size) + num_patches = (img_size[1] // + patch_size[1]) * (img_size[0] // patch_size[0]) * ( + num_frames // self.tubelet_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.proj = nn.Conv3d( + in_channels=in_chans, + out_channels=embed_dim, + kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]), + stride=(self.tubelet_size, patch_size[0], patch_size[1])) + + def forward(self, x): + B, C, T, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model " \ + f'({self.img_size[0]}*{self.img_size[1]}).' + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +# sin-cos position encoding +def get_sinusoid_encoding_table(n_position, + d_hid, + cur_frame=-1, + pre_n_position=1568): + """Sinusoid position encoding table.""" + + def get_position_angle_vec(position): + return [ + position / np.power(10000, 2 * (hid_j // 2) / d_hid) + for hid_j in range(d_hid) + ] + + sinusoid_table = np.array( + [get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + sinusoid_table = torch.tensor( + sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0) + print(f'n_position: {n_position}') + print(f'pre_n_position: {pre_n_position}') + if n_position // cur_frame * 8 != pre_n_position and cur_frame != -1: + T = 8 # checkpoint frame + P = 14 # checkpoint size + C = d_hid + new_P = int((n_position // cur_frame)**0.5) # testing size + print( + f'Pretraining uses 14x14, but current version is {new_P}x{new_P}') + print('Interpolate the position embedding') + sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C) + sinusoid_table = sinusoid_table.reshape(-1, P, P, + C).permute(0, 3, 1, 2) + sinusoid_table = torch.nn.functional.interpolate( + sinusoid_table, + size=(new_P, new_P), + mode='bicubic', + align_corners=False) + # BT, C, H, W -> BT, H, W, C -> B, T, H, W, C + sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape( + -1, T, new_P, new_P, C) + sinusoid_table = sinusoid_table.flatten(1, 3) + if cur_frame != -1 and cur_frame != 8: + print(f'Pretraining uses 8 frames, but current frame is {cur_frame}') + print('Interpolate the position embedding') + T = 8 # checkpoint frame + new_T = cur_frame # testing frame + # interpolate + P = int((n_position // cur_frame)**0.5) # testing size + C = d_hid + sinusoid_table 
= sinusoid_table.reshape(-1, T, P, P, C) + sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, + 1).reshape(-1, C, + T) # BHW, C, T + sinusoid_table = torch.nn.functional.interpolate( + sinusoid_table, size=new_T, mode='linear') + sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute( + 0, 4, 1, 2, 3) # B, T, H, W, C + sinusoid_table = sinusoid_table.flatten(1, 3) + if n_position == pre_n_position: + return sinusoid_table + else: + print('Use learnable position embedding') + return nn.Parameter(sinusoid_table, requires_grad=True) + + +@MODELS.register_module() +class UMTViT(nn.Module): + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=0., + use_learnable_pos_emb=False, + all_frames=16, + tubelet_size=1, + use_checkpoint=False, + checkpoint_num=0, + use_mean_pooling=True): + super().__init__() + self.num_features = self.embed_dim = embed_dim + self.tubelet_size = tubelet_size + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + num_frames=all_frames, + tubelet_size=self.tubelet_size) + num_patches = self.patch_embed.num_patches + self.use_checkpoint = use_checkpoint + self.checkpoint_num = checkpoint_num + print(f'Use checkpoint: {use_checkpoint}') + print(f'Checkpoint number: {checkpoint_num}') + + if use_learnable_pos_emb: + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches, embed_dim)) + else: + # sine-cosine positional embeddings is on the way + if patch_size == 14: + pre_n_position = 2048 + else: + pre_n_position = 1568 + self.pos_embed = get_sinusoid_encoding_table( + num_patches, + embed_dim, + all_frames // tubelet_size, + pre_n_position=pre_n_position) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values) for i in range(depth) + ]) + self.norm = nn.Identity() if use_mean_pooling else norm_layer( + embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None + + def forward_features(self, x): + x = self.patch_embed(x) + B, _, _ = x.size() + + if self.pos_embed is not None: + x = x + self.pos_embed.expand(B, -1, -1).type_as(x).to( + x.device).clone().detach() + x = self.pos_drop(x) + + for idx, blk in enumerate(self.blocks): + if self.use_checkpoint and idx < self.checkpoint_num: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + x = self.norm(x) + if self.fc_norm is not None: + return self.fc_norm(x.mean(1)) + else: + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5ad5aebeea24989b5ce86a30ae0c8621b066210 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +-r requirements/build.txt +-r requirements/optional.txt +-r requirements/tests.txt diff --git a/requirements/build.txt b/requirements/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c94c9e11d4294978873d5a28252fa577df76dba --- /dev/null +++ b/requirements/build.txt @@ -0,0 +1,8 
@@ +decord >= 0.4.1 +einops +matplotlib +numpy +opencv-contrib-python +Pillow +scipy +torch>=1.3 diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a24c45daba67d1fbff9bd192ccabf6669d10dc8 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,14 @@ +docutils==0.18.1 +einops +modelindex +myst-parser +opencv-python +-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +scipy +sphinx==6.1.3 +sphinx-notfound-page +sphinx-tabs +sphinx_copybutton +sphinx_markdown_tables +sphinxcontrib-jquery +tabulate diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ea0d12422b7cfc77acd3ad715b8fe434e17e014 --- /dev/null +++ b/requirements/mminstall.txt @@ -0,0 +1,2 @@ +mmcv>=2.0.0rc4,<2.2.0 +mmengine>=0.7.1,<1.0.0 diff --git a/requirements/multimodal.txt b/requirements/multimodal.txt new file mode 100644 index 0000000000000000000000000000000000000000..c236cced2b3f48a7b8d5522c18d34294cddb0ba6 --- /dev/null +++ b/requirements/multimodal.txt @@ -0,0 +1 @@ +transformers>=4.28.0 diff --git a/requirements/optional.txt b/requirements/optional.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a88a10cfe07a3186a8a5ce7320599fbcd806722 --- /dev/null +++ b/requirements/optional.txt @@ -0,0 +1,13 @@ +av>=9.0 +future +imgaug +librosa +lmdb +moviepy +openai-clip +packaging +pims +PyTurboJPEG +soundfile +tensorboard +wandb diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt new file mode 100644 index 0000000000000000000000000000000000000000..448ab5abed9c1a13e4b2a423ee710539e1a4eec6 --- /dev/null +++ b/requirements/readthedocs.txt @@ -0,0 +1,4 @@ +mmcv +titlecase +torch +torchvision diff --git a/requirements/tests.txt b/requirements/tests.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ca5d863987fcffdea81833565116d8e8703cd4b --- /dev/null +++ b/requirements/tests.txt @@ -0,0 +1,9 @@ +coverage +flake8 +interrogate +isort==4.3.21 +parameterized +pytest +pytest-runner +xdoctest >= 0.10.0 +yapf diff --git a/resources/acc_curve.png b/resources/acc_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..27a2f0851e7d9ee0c912f73af947b11453422988 Binary files /dev/null and b/resources/acc_curve.png differ diff --git a/resources/data_pipeline.png b/resources/data_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..1c73cd638649b27c071dc828b1341fd151de293c --- /dev/null +++ b/resources/data_pipeline.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21610750aefe62edee36272eb8321f7fcbca95c717c7b5dfb86b846428c78a54 +size 117332 diff --git a/resources/miaomiao_qrcode.jpg b/resources/miaomiao_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f5348e21ea6b398cd5a1cd621ce58f4c9a08e300 --- /dev/null +++ b/resources/miaomiao_qrcode.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f9ac900623cc0e3e68c5fee382f78901800b5c9d84493afc03418e94dce018 +size 225737 diff --git a/resources/mmaction2_logo.png b/resources/mmaction2_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c759bb78c5424b4394d18a5ba833a8c9f43add Binary files /dev/null and b/resources/mmaction2_logo.png differ diff --git a/resources/mmaction2_overview.gif b/resources/mmaction2_overview.gif new file mode 100644 index 
0000000000000000000000000000000000000000..123a65cff421325c740c0ef089b7bb659e4a555a --- /dev/null +++ b/resources/mmaction2_overview.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6b778f476ee46d3e136ad0bd596ae6b2c76be37a6edb6214f06f1edca02884 +size 1701421 diff --git a/resources/qq_group_qrcode.jpg b/resources/qq_group_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cfd399858cac8bd164cf172140a76d8c8a7b8bf2 --- /dev/null +++ b/resources/qq_group_qrcode.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7afbe414bbdfb299d0efec06baf4f21d9121897f338f8d6684592e215e9e7317 +size 204806 diff --git a/resources/spatio-temporal-det.gif b/resources/spatio-temporal-det.gif new file mode 100644 index 0000000000000000000000000000000000000000..ce134cdb949e3f0fd9b41343d3003770cc82948e --- /dev/null +++ b/resources/spatio-temporal-det.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77a76b06d3b72373da76aaa53ff8b87a77e9a021390137d3116d44b3a1bf637 +size 1302833 diff --git a/resources/zhihu_qrcode.jpg b/resources/zhihu_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f791e858c942e8d4da3098e8d18a687b7eca6f73 --- /dev/null +++ b/resources/zhihu_qrcode.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:171db0200db2735325ab96a5aa6955343852c12af90dc79c9ae36f73694611c7 +size 397245 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..f22ad55baf0ada2db8fcfbfc59dcadf2849775ea --- /dev/null +++ b/setup.cfg @@ -0,0 +1,25 @@ +[bdist_wheel] +universal=1 + +[aliases] +test=pytest + +[yapf] +based_on_style = pep8 +blank_line_before_nested_class_or_def = true +split_before_expression_after_opening_paren = true +split_penalty_import_names=0 +SPLIT_PENALTY_AFTER_OPENING_BRACKET=800 + +[isort] +line_length = 79 +multi_line_output = 0 +extra_standard_library = pkg_resources,setuptools +known_first_party = mmaction +known_third_party = cv2,decord,einops,joblib,matplotlib,mmcv,numpy,pandas,pytest,pytorch_sphinx_theme,scipy,seaborn,titlecase,torch,webcolors +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY + +[flake8] +per-file-ignores = + mmaction/configs/*:F401,F403,F405 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..625976e69de4a8a60a2bde88678aae0d3e9ab173 --- /dev/null +++ b/setup.py @@ -0,0 +1,196 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import sys +import warnings +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'mmaction/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. 
+ + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import re + import sys + from os.path import exists + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + + +def add_mim_extension(): + """Add extra files that are required to support MIM into the package. + + These files will be added by creating a symlink to the originals if the + package is installed in `editable` mode (e.g. pip install -e .), or by + copying from the originals otherwise. + """ + + # parse installment mode + if 'develop' in sys.argv: + # installed by `pip install -e .` + mode = 'symlink' + elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: + # installed by `pip install .` + # or create source distribution by `python setup.py sdist` + mode = 'copy' + else: + return + + filenames = ['tools', 'configs', 'model-index.yml', 'dataset-index.yml'] + repo_path = osp.dirname(__file__) + mim_path = osp.join(repo_path, 'mmaction', '.mim') + os.makedirs(mim_path, exist_ok=True) + + for filename in filenames: + if osp.exists(filename): + src_path = osp.join(repo_path, filename) + tar_path = osp.join(mim_path, filename) + + if osp.isfile(tar_path) or osp.islink(tar_path): + os.remove(tar_path) + elif osp.isdir(tar_path): + shutil.rmtree(tar_path) + + if mode == 'symlink': + src_relpath = osp.relpath(src_path, osp.dirname(tar_path)) + try: + os.symlink(src_relpath, tar_path) + except OSError: + # Creating a symbolic link on windows may raise an + # `OSError: [WinError 1314]` due to privilege. 
If + # the error happens, the src file will be copied + mode = 'copy' + warnings.warn( + f'Failed to create a symbolic link for {src_relpath}, ' + f'and it will be copied to {tar_path}') + else: + continue + elif mode == 'copy': + if osp.isfile(src_path): + shutil.copyfile(src_path, tar_path) + elif osp.isdir(src_path): + shutil.copytree(src_path, tar_path) + else: + warnings.warn(f'Cannot copy file {src_path}.') + else: + raise ValueError(f'Invalid mode {mode}') + + +if __name__ == '__main__': + add_mim_extension() + setup( + name='mmaction2', + version=get_version(), + description='OpenMMLab Video Understanding Toolbox and Benchmark', + long_description=readme(), + long_description_content_type='text/markdown', + author='MMAction2 Contributors', + author_email='openmmlab@gmail.com', + maintainer='MMAction2 Contributors', + maintainer_email='openmmlab@gmail.com', + packages=find_packages(exclude=('configs', 'tools', 'demo')), + keywords='computer vision, video understanding', + include_package_data=True, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + ], + url='https://github.com/open-mmlab/mmaction2', + license='Apache License 2.0', + install_requires=parse_requirements('requirements/build.txt'), + extras_require={ + 'all': parse_requirements('requirements.txt'), + 'tests': parse_requirements('requirements/tests.txt'), + 'optional': parse_requirements('requirements/optional.txt'), + 'mim': parse_requirements('requirements/mminstall.txt'), + 'multimodal': parse_requirements('requirements/multimodal.txt'), + }, + zip_safe=False) diff --git a/tests/apis/test_inference.py b/tests/apis/test_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..7ff627f694a7fb3ba7c0d339f699f83913a10a5a --- /dev/null +++ b/tests/apis/test_inference.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest import TestCase + +import torch +from mmengine.testing import assert_dict_has_keys +from parameterized import parameterized + +from mmaction.apis import (detection_inference, inference_recognizer, + init_recognizer, pose_inference) +from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract, get_str_type + + +class TestInference(TestCase): + + @parameterized.expand([(('configs/recognition/tsn/' + 'tsn_imagenet-pretrained-r50_8xb32-' + '1x1x3-100e_kinetics400-rgb.py'), ('cpu', 'cuda')) + ]) + def test_init_recognizer(self, config, devices): + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + config_file = osp.join(project_dir, config) + + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + + # test `init_recognizer` with str path + _ = init_recognizer(config_file, device=device) + + # test `init_recognizer` with :obj:`Path` + _ = init_recognizer(Path(config_file), device=device) + + # test `init_recognizer` with undesirable type + with self.assertRaisesRegex( + TypeError, 'config must be a filename or Config object'): + config_list = [config_file] + _ = init_recognizer(config_list) + + @parameterized.expand([(('configs/recognition/tsn/' + 'tsn_imagenet-pretrained-r50_8xb32-' + '1x1x3-100e_kinetics400-rgb.py'), 'demo/demo.mp4', + ('cpu', 'cuda'))]) + def test_inference_recognizer(self, config, video_path, devices): + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + config_file = osp.join(project_dir, config) + video_path = osp.join(project_dir, video_path) + + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + model = init_recognizer(config_file, device=device) + + for ops in model.cfg.test_pipeline: + if get_str_type(ops['type']) in ('TenCrop', 'ThreeCrop'): + # Use CenterCrop to reduce memory in order to pass CI + ops['type'] = 'CenterCrop' + + result = inference_recognizer(model, video_path) + + self.assertIsInstance(result, ActionDataSample) + self.assertTrue(result.pred_score.shape, (400, )) + + def test_detection_inference(self): + from mmdet.apis import init_detector + from mmdet.structures import DetDataSample + + for device in ('cpu', 'cuda'): + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + det_config = 'demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py' # noqa: E501 + det_ckpt = 'http://download.openmmlab.com/mmdetection/' \ + 'v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 + video_path = 'demo/demo_skeleton.mp4' + video_path = osp.join(project_dir, video_path) + config_file = osp.join(project_dir, det_config) + with TemporaryDirectory() as tmpdir: + frm_paths, _ = frame_extract(video_path, out_dir=tmpdir) + # skip remaining frames to speed up ut + frm_paths = frm_paths[:10] + results, data_samples = detection_inference( + config_file, det_ckpt, frm_paths, device=device) + self.assertTrue(results[0].shape, (4, )) + self.assertIsInstance(data_samples[0], DetDataSample) + # test 
with_score + results, data_samples = detection_inference( + config_file, + det_ckpt, + frm_paths, + with_score=True, + device=device) + self.assertTrue(results[0].shape, (5, )) + # test inference with model object + model = init_detector( + config=det_config, checkpoint=det_ckpt, device=device) + results, data_samples = detection_inference( + model, None, frm_paths, device=device) + self.assertTrue(results[0].shape, (4, )) + self.assertIsInstance(data_samples[0], DetDataSample) + + def test_pose_inference(self): + from mmpose.apis import init_model + from mmpose.structures import PoseDataSample + + for device in ('cpu', 'cuda'): + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + det_config = 'demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py' # noqa: E501 + det_ckpt = 'http://download.openmmlab.com/mmdetection/' \ + 'v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 + pose_config = 'demo/demo_configs/' \ + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' + pose_ckpt = 'https://download.openmmlab.com/mmpose/top_down/' \ + 'hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' + video_path = 'demo/demo_skeleton.mp4' + video_path = osp.join(project_dir, video_path) + pose_config = osp.join(project_dir, pose_config) + with TemporaryDirectory() as tmpdir: + frm_paths, _ = frame_extract(video_path, out_dir=tmpdir) + # skip remaining frames to speed up ut + frm_paths = frm_paths[:10] + det_results, _ = detection_inference( + det_config, det_ckpt, frm_paths, device=device) + + results, data_samples = pose_inference( + pose_config, + pose_ckpt, + frm_paths, + det_results, + device=device) + assert_dict_has_keys(results[0], ('keypoints', 'bbox_scores', + 'bboxes', 'keypoint_scores')) + self.assertIsInstance(data_samples[0], PoseDataSample) + + # test inference with model object + model = init_model( + config=pose_config, checkpoint=pose_ckpt, device=device) + results, data_samples = pose_inference( + model, None, frm_paths, det_results, device=device) + assert_dict_has_keys(results[0], ('keypoints', 'bbox_scores', + 'bboxes', 'keypoint_scores')) + self.assertIsInstance(data_samples[0], PoseDataSample) diff --git a/tests/apis/test_inferencer.py b/tests/apis/test_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..fabcff31f4e17dc630f6702419ba43cb9a3b38c7 --- /dev/null +++ b/tests/apis/test_inferencer.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp +from tempfile import TemporaryDirectory +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmaction.apis import MMAction2Inferencer + + +class TestMMActionInferencer(TestCase): + + def test_init_recognizer(self): + # Initialzied by alias + _ = MMAction2Inferencer(rec='tsn') + + # Initialzied by config + _ = MMAction2Inferencer( + rec='tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb' + ) # noqa: E501 + + with self.assertRaisesRegex(ValueError, + 'rec algorithm should provided.'): + _ = MMAction2Inferencer() + + @parameterized.expand([ + (('tsn'), ('tools/data/kinetics/label_map_k400.txt'), + ('demo/demo.mp4'), ('cpu', 'cuda')) + ]) + def test_infer_recognizer(self, config, label_file, video_path, devices): + with TemporaryDirectory() as tmp_dir: + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + + # test video file input and return datasample + inferencer = MMAction2Inferencer( + config, label_file=label_file, device=device) + results = inferencer(video_path, vid_out_dir=tmp_dir) + self.assertIn('predictions', results) + self.assertIn('visualization', results) + assert osp.exists(osp.join(tmp_dir, osp.basename(video_path))) + + results = inferencer( + video_path, vid_out_dir=tmp_dir, out_type='gif') + self.assertIsInstance(results['predictions'][0], dict) + assert osp.exists( + osp.join(tmp_dir, + osp.basename(video_path).replace('mp4', 'gif'))) + + # test np.ndarray input + inferencer = MMAction2Inferencer( + config, + label_file=label_file, + device=device, + input_format='array') + import decord + import numpy as np + video = decord.VideoReader(video_path) + frames = [x.asnumpy()[..., ::-1] for x in video] + frames = np.stack(frames) + inferencer(frames, vid_out_dir=tmp_dir) + assert osp.exists(osp.join(tmp_dir, '00000000.mp4')) diff --git a/tests/data/activitynet_features/v_test1.csv b/tests/data/activitynet_features/v_test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..49b38d637091e2289e932e034dc03edc76a48734 --- /dev/null +++ b/tests/data/activitynet_features/v_test1.csv @@ -0,0 +1,6 @@ 
+f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,f100,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118,f119,f120,f121,f122,f123,f124,f125,f126,f127,f128,f129,f130,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140,f141,f142,f143,f144,f145,f146,f147,f148,f149,f150,f151,f152,f153,f154,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164,f165,f166,f167,f168,f169,f170,f171,f172,f173,f174,f175,f176,f177,f178,f179,f180,f181,f182,f183,f184,f185,f186,f187,f188,f189,f190,f191,f192,f193,f194,f195,f196,f197,f198,f199,f200,f201,f202,f203,f204,f205,f206,f207,f208,f209,f210,f211,f212,f213,f214,f215,f216,f217,f218,f219,f220,f221,f222,f223,f224,f225,f226,f227,f228,f229,f230,f231,f232,f233,f234,f235,f236,f237,f238,f239,f240,f241,f242,f243,f244,f245,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290,f291,f292,f293,f294,f295,f296,f297,f298,f299,f300,f301,f302,f303,f304,f305,f306,f307,f308,f309,f310,f311,f312,f313,f314,f315,f316,f317,f318,f319,f320,f321,f322,f323,f324,f325,f326,f327,f328,f329,f330,f331,f332,f333,f334,f335,f336,f337,f338,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348,f349,f350,f351,f352,f353,f354,f355,f356,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366,f367,f368,f369,f370,f371,f372,f373,f374,f375,f376,f377,f378,f379,f380,f381,f382,f383,f384,f385,f386,f387,f388,f389,f390,f391,f392,f393,f394,f395,f396,f397,f398,f399 
+-2.52400826749,0.0481050342173,-0.727137195971,2.75537272315,3.09127621822,-1.57007092339,-0.418208286763,0.0913230466118,-0.536148328353,-0.527615223662,1.09348152733,-0.740857539139,1.03076939449,0.947990020203,-0.00932133916349,0.546988826083,-0.737920381243,0.823520260094,-1.44379751155,1.67705288164,1.85386635752,0.62453161102,1.13374109944,-0.161873651211,1.40335457467,0.267813141882,1.40327533282,0.143771933515,-0.29447495679,0.779869429758,-1.38585145822,-0.361671252653,-1.46679541523,0.0859254586217,0.266080879981,-0.680839165484,-0.774731957742,-0.618207527285,1.57201336054,0.875829197772,-0.896498123858,-2.55398872891,-0.796735937603,-0.338483746318,0.511324636391,-1.21437529424,-0.0488620607446,0.253289302886,2.71006785221,-0.573161459164,-0.341657902954,-0.854258292083,0.562081610284,-0.828878082845,2.00327134909,1.29068322546,-0.418051389774,1.14570354001,1.39098484308,-1.13415579068,-1.01751984858,-0.823485884605,0.354335798556,1.79059040272,0.609877418462,-1.01807533199,1.56390048495,1.00308338848,0.226345738051,-0.145077751076,0.0986133282503,-0.0274079232177,0.0618308794267,2.33058959297,0.0527062771437,-1.11440070055,-2.85928208684,2.15750540841,0.866524370256,-0.999664886812,0.65322760642,-1.01907039308,-0.827862563442,0.702348045951,-0.266591888881,-0.51787754913,-0.87550654118,-1.08840756221,-0.330164993751,-0.885034718769,-1.09602854198,-1.90739000514,-1.41201400125,3.55564525741,2.24864990051,1.85192671744,-0.886962869481,-0.706411036437,0.962288821262,-1.30219301658,0.0603706527015,-0.672105670826,-0.147220359933,-1.00931681574,-1.34130794644,-0.0213488208036,-0.965187689045,0.427090878957,-0.922304333641,-1.13947635577,0.637382086489,-1.706998011,0.00132625548269,0.663770250584,1.58249601114,-1.04340366269,0.375227416108,-0.0870821477482,0.551722806776,0.588611513848,-0.477017772079,-1.51536188044,0.237936462599,0.31261506067,-0.198127712396,-0.318572429209,-1.18890325315,0.035582087437,2.67528950232,-0.197889107378,1.55762961412,0.104639883842,-1.66993450781,0.702282006582,1.36717389178,0.634535223722,2.85315937821,-1.27367064913,0.483830422936,-0.869812565212,0.641265734616,-0.11914733068,1.0239396073,-3.92902142357,0.694317328488,1.34085481986,-0.135329176331,0.0261293066915,-0.303456270416,0.909167548313,-2.04735304332,-0.285427697695,-1.03457319064,-2.77420531572,0.197031497599,-0.520362589547,-1.37924786457,-0.418569629841,1.54322130788,1.83725603097,3.35605137842,-0.117215889143,-0.970470848036,-0.339063598965,1.57921290781,0.196319119013,-1.22568776573,-0.448961007657,0.609897182756,-0.168152849526,0.254480323573,-0.51589471003,-0.253088873187,-0.716572365129,-1.56268640697,-3.33835895995,-0.679914745818,0.107016925667,-1.61204098026,-0.387739681651,2.40210230323,-1.0956975287,-1.72501473746,-0.766200882827,0.752211827669,1.55532805525,0.113983938016,4.54239864121,-1.36827292666,-1.88835217549,1.40817465219,0.708602657522,1.31514883588,0.0314930005956,-0.79571607963,0.75615035674,1.14977174081,-1.72166323668,0.565034879125,-1.41448308724,-1.57710396359,-1.17078288789,1.1485206762,0.393694747107,1.20387821507,0.699366232003,1.80047030851,1.42655580688,-1.41627641805,-0.0899006423315,1.0611155262,-1.131250839,2.23898952868,-3.58230877813,-0.889216990584,1.40956827182,-1.46751403757,-0.691296854089,-1.54265676827,2.65262625498,2.19788404633,-2.01697903653,0.611521417417,0.359316692791,4.6816105414,0.862952723244,0.167491980372,2.6932665368,-3.00625465314,-0.351348050268,-0.89827277051,1.1813078626,-0.683418750015,0.612255702038,1.80744153164,0.0561640557506,-
1.55411351133,0.711329718813,-3.72017506799,0.381065155569,-0.414420442519,-1.60570235569,-0.599320146458,1.05618929973,-1.47036342112,1.14814616981,-0.245414197276,-1.86036272008,2.96957122081,-1.61679375941,-0.50189343957,3.2102935297,3.52676818145,3.37559696234,1.65133903096,1.07003903059,0.246458431642,-2.86996585644,2.9472088513,0.156860758686,2.65348488352,-1.65249707957,-1.10731408448,1.62994935577,-1.96909845304,-1.9090510476,2.51069158859,-1.65984114813,0.148115664273,1.10611308391,1.18241718985,-4.85953441229,-1.0049765752,3.88280249662,-1.75265659238,0.372608524032,-2.22002927662,1.18168715581,-2.87508345833,-0.676288569625,-2.44675108062,-1.55716385372,-1.62059798953,0.724381881496,-0.960783561886,-0.552230426264,0.121615798579,1.04462357852,0.118085120237,1.26606201262,-0.380661477003,-2.58578204132,4.03374155601,-2.25326988394,-2.88061044978,3.26819336615,1.91267201179,-0.19674664532,2.05710699236,-3.54867236793,-0.326269919106,0.752888089223,0.132116086772,-1.54644230279,-2.836589684,0.141382075407,-1.44156945706,1.19807019893,1.68431397116,0.438746488152,-2.06834516275,-0.842738093366,0.465043608979,-0.629041527666,-0.0120976683258,-3.00099798249,-1.73881566772,0.881273090875,-0.540746588847,-0.38645376593,-2.43880278615,-0.563591295604,1.477140512,-1.75295748363,1.76406287775,2.66264589914,0.484454554128,0.273973214982,-2.05206947308,-0.369256326252,-0.689306857174,1.66270560488,-0.131857610115,0.955091272134,-1.60116198558,-2.28544168464,2.11164102397,-4.18991734267,0.173959671197,-0.0354114097397,-1.4089728089,-0.311132524,1.89336391541,2.43192427419,1.01858890895,2.03606205304,1.62452822335,3.64225894583,2.28056802496,5.64531833088,-1.1566376147,2.07540663589,0.620578413989,0.750977221371,0.0162535885321,-2.16207619048,-0.105952032448,-0.117025236938,-2.50755272675,1.48142693144,-0.430885550216,2.23543980132,-0.326485130108,0.0243268507167,2.06152002688,-1.02234084951,-2.0303752323,0.561301589735,2.3433107876,-0.925805005171,2.80904484078,-1.94807647011,0.329007639042,0.397634451785,1.47111085828,-2.50084066219,1.09999789629,-2.99330297808,-0.0599839422321,-1.9690194292,0.960052060426,-2.19808352939,-2.01816409011,-5.65800942077,-0.0169289777679,1.16420775694,0.723551353918,0.643957264021,-0.140148446853,-0.056547111384,1.91572655252,-1.37543404733,0.484043939791,2.79265339713,-1.17311209973,-0.371278463653,0.469582405128,-2.31444814128,1.41635027072,-1.07100369346 
+-4.16998558362,2.12975610028,-2.56134395649,7.28089529038,5.71112143199,-1.43967841108,-2.27770995537,-0.621412650546,-1.44766437213,-2.65973161459,1.36775091092,-0.475116016803,-0.587382383942,4.81157625596,0.770176066954,0.363275742132,-0.0876347057022,-0.475521533538,-0.0547252563637,4.64327842236,3.68908154567,2.63090462903,4.96261648734,-2.3996240147,0.249490239721,1.12136919369,2.95945439398,-1.5711039712,2.68638911406,0.584886546134,-2.50314228614,-2.72285134157,0.61815967679,-1.74822253416,-0.311564020118,-2.74809125702,1.47346679886,-3.40588476142,1.47545339028,3.02455658674,-3.94506848613,-4.14376579285,-1.73336583535,-2.40840473334,-2.22219073812,-5.15251653036,0.988312865494,1.78566960146,6.54388860067,-1.45725802938,0.214708279868,-2.72405630668,2.83319289843,-1.85521226009,3.58616267999,3.34310981591,1.02165599783,3.42570413748,0.846149519881,-2.93276470105,-1.80281494916,-4.22263733625,-1.52749340316,3.2283666563,4.42827975909,-1.44139790932,1.73660321256,1.17811784268,4.59021838108,1.89355262021,0.455512814919,1.27808425168,1.62865997315,6.70429563522,-0.847455751549,-5.35004572391,-5.12095170339,6.48116056124,0.300556570692,-5.01764505545,-0.875816748044,-1.82039844963,-1.25923923691,0.632047503791,2.15801657677,-2.92180285851,0.511598025958,-2.96027669827,0.547309962512,-2.98510901829,-0.335630682309,-4.73974208434,-2.01421547413,3.362338895,5.79285810471,9.42033552887,-2.91738398632,1.82035643975,1.98379708379,-2.70420178073,-1.48058941424,1.56434452216,-0.992579338154,2.37859466165,-3.72032371362,1.26282515267,-3.50253353516,0.00376921892301,1.18962185065,-1.0557041204,0.54337829232,-1.99295026461,2.62920855999,3.76263545752,1.2841622142,-2.72069926341,-1.80479015474,1.58534218073,2.60577425917,-0.440677909057,2.20203198473,-3.39447330793,2.79975073894,2.23906295717,0.677189537287,1.39489221702,-0.518861652811,-1.19545238594,5.21395279209,2.14497482498,3.99990809123,-1.70296090881,-2.09669830044,-0.502894639969,3.01051452478,1.25882732471,-1.28701953888,-3.64675308704,0.679585470159,-3.88040889422,0.100971349178,-3.87473366777,8.57528485777,-7.33635827383,0.620873548189,4.256256452,-3.20197622975,0.181273331641,-1.08387027582,4.7040402782,-4.30957582315,-3.2032131656,-3.55255149682,-5.39665594737,-1.43142532587,-1.0020887959,0.310152183772,-4.9755616792,0.544686280489,3.23141360442,3.48532564084,-2.27912784214,-5.4400074927,-2.9422715648,5.55690115452,1.07856818487,-2.60423706293,0.296417542696,0.018438497484,-1.6693427813,-1.97826829297,-0.649584023059,1.0299335142,-1.30126957735,-1.49028243661,-7.05598390897,1.53666977635,2.47103852113,0.548410004575,-2.33345104297,1.05941242347,-2.22456861824,-0.833920312524,0.616063261429,1.08299628615,4.64962686857,-0.85913300693,8.38019424758,-3.35722782453,-5.88692650636,2.48297270139,-1.82296590428,-1.72441059232,-3.50540684352,-4.86662904103,1.4669711864,4.01910547892,-0.666310483219,1.94299481273,-1.65633018176,-0.233463008008,2.92032059917,-3.11237916489,1.65681514025,-5.82044394652,-0.84150699973,5.2420919474,1.65209466338,5.1169664971,2.8554833293,2.7991078945,1.85252228816,-1.80552712282,0.913601561388,0.441482040088,-0.160765804846,1.5659571287,-5.15831661542,1.85946914524,4.30885611724,2.5515617756,4.66296468178,6.40177754471,0.323659792742,2.79168056408,-2.54396620949,2.11927359978,3.5409553499,0.143619238635,0.247531717618,-3.67236700398,0.0737643596032,6.4369303449,-4.20339368939,1.39238156477,-0.479590680996,1.23359161367,1.11356295109,-0.530017747878,2.8127275755,1.67139578978,-0.648806054595,-3.56483347257,-0.00777
567660002,-4.97657731056,2.76010027647,2.79106523007,-2.92366722226,-0.381967118582,-8.20272569498,-1.22538543622,-0.975923561257,-1.2079847001,5.68413191756,-0.519274702668,1.34021991417,0.46834429979,-0.752738639987,-4.23064642449,-6.19847359916,1.9824349012,-5.77588344375,-6.11922142108,3.66428396702,7.66924429814,-0.776042481264,7.10654588699,-0.732527501781,2.01595049262,-0.872191261451,2.67919575771,2.4503210032,-2.90921337763,8.53517298381,0.212812230588,0.476091645162,0.748127258619,-0.886277671655,-2.89118565341,0.142637886207,-3.79416944186,1.11709731897,-1.30126662016,0.359220613638,-2.86900741637,-4.63997180067,1.53915568789,-4.55603598674,-2.03369594216,-1.81275931041,-2.69728669763,-2.77373948296,-0.780138870872,-0.710413366953,-1.87378830453,2.78039755662,-3.32990742207,3.18837203344,-1.00930721204,-4.34471332073,-2.7804454573,1.49880246004,1.22752761165,-1.44689382633,1.45333088478,4.27367163022,1.721656696,-3.6055589668,3.01899054011,7.5569880708,-2.61906720797,-2.57271003584,2.80881048858,-0.415334333976,-3.0628209281,-3.63716221015,-0.194801000356,2.79870586514,2.79689924727,0.0788984746723,-1.96187414487,-2.75171196282,-2.28218094111,0.444554739001,4.8369281887,0.373838265736,-3.15276482065,4.03460666657,-1.86244435867,0.253326237999,-0.800799566707,-1.74990467469,-2.74444140275,-5.73288337012,-4.91918236891,-0.418412837584,-2.99338801781,-1.38950726748,1.11461923277,5.90281201998,-0.707580384415,-2.67438790878,4.21448961059,0.828290172268,1.15630444248,2.80011883676,2.65575761526,0.483185992143,1.03626998862,0.131995103361,-2.91395613949,-1.43565141161,-2.69984012683,-0.626701895692,3.98586324195,-2.19652486801,-2.48867563566,-1.19348388483,2.79217995802,-0.750475711823,-0.945274029968,-0.126381392279,-3.6633948501,-1.54844618718,1.36196402073,0.468697243529,1.29018088311,0.94496485432,0.257892522415,-5.15796130657,-1.53281098127,0.595785883914,-0.833150585492,2.10806567272,5.13338648002,0.01430302143,1.24969169378,0.00611201127369,1.25787633081,-0.926280161539,2.16456234137,2.116730539,4.47622630279,2.12537882169,0.520683592956,-1.542467405,6.23520137549,-1.31958263814,0.309113717082,-1.16410690943,2.81666246732,1.45756631712,-5.58640872558,-0.689133227666,-1.21494281928,-2.40350431559,-2.07186533292,-4.34414368868,-0.898425387144,-2.84011162599 
+-2.85525532881,4.14924573302,-1.27022984872,4.43080223083,1.04979521433,-1.7563615183,-1.1571517543,0.443647010723,-0.840120493175,-0.564384366473,-0.631840480766,0.532262438599,0.584832645258,3.23352189611,3.05675490737,2.79432141225,-1.4358461082,0.0141486930853,0.928806241353,4.37966580232,2.8490308106,0.783738804857,3.78208962361,-2.80982620994,2.02718123476,0.447202665606,2.01867037753,0.748949680329,0.626896452109,-0.226885780966,-2.62637141645,-4.79518300573,0.517160896062,-0.495881884893,0.551008209387,-1.1999056525,1.58518931756,0.092337232629,-1.19481320501,2.92050409516,0.70208245794,-1.14886969738,0.497751923401,-0.698487961093,-1.87117256582,-1.65841737827,3.39620117505,3.17374242703,3.50091727654,0.480773175558,1.40684746265,-3.40429907004,0.423096078237,-1.25402658423,1.40384977142,2.23528889895,-0.70792874376,3.44265838623,-0.298643459876,-2.92092214823,-0.387096325756,-3.39548440655,-2.21305868606,4.01884763082,2.1962247467,0.178924582303,-0.175330102443,-1.81287087758,4.0013677895,0.506375047565,0.164289975565,2.65211846734,1.90428843131,4.45052925507,0.60681405703,-2.01008831143,-1.829990381,3.47248803615,-1.04316819509,-2.40825766305,-2.3010283341,1.26562317558,1.44828870733,0.254433333177,-0.294035871825,-2.39190562248,1.16849062324,-2.10750372112,-0.213768513898,-4.53380696336,-2.05353827099,-6.3679600064,-3.59502876282,-0.357480708757,3.44140817722,7.012797233,-1.16484250784,-0.17219096899,-1.65201326678,-3.91428116242,-1.39317485134,-1.78935467323,-2.13693570018,1.49206449827,-1.47030715466,0.326555347044,-2.8691151468,0.987859331371,-0.0670162276435,-1.38699082017,1.38502636115,-0.891648494402,1.63707906797,0.654039901097,0.315870566068,-1.13308484296,-1.63928325141,-0.569100450525,1.42651925405,-0.627428011101,0.216225209237,-1.25899307927,0.828946293494,0.974174125592,-0.332280605535,2.90402588169,-1.104502304,-0.644741526048,-1.07491079171,0.416999756893,1.47221087893,-3.26141314586,-2.26964950522,-0.0280790646872,3.24086038212,1.20009862085,-1.75527016382,-0.539535063108,3.23909044464,-2.99914438327,-0.492613923551,-2.91626054168,7.31597944102,-1.64774904013,-0.73017560184,0.442671738662,0.283633226553,0.714817404846,-1.79878552278,5.11262804588,-4.30506066322,-3.61411044379,-3.82477523089,-2.89008922736,-1.73692337195,-0.71265813748,0.314715143045,-3.16757190545,3.47336832523,1.5834569327,-0.637929768363,-1.56214804153,-2.64970105807,-1.12900751829,3.98810140292,1.87983502666,-2.51413838069,-0.909131198054,2.46703845749,-1.16912671606,-0.352692016586,2.6085906589,0.711290110747,1.82539761384,0.137608984311,-4.09530947288,1.01127222915,1.98808420658,0.725776154994,0.456542024016,2.36024162223,-1.51671710104,0.909857604951,1.3748901693,1.41866263221,2.22546428785,-0.842200076581,3.517446268,-1.94564609289,-2.96543750087,3.66959119841,-4.30324907561,-3.19456482887,-2.38057807227,-4.43179172357,-0.982803171277,-0.41006461223,0.280178608544,1.95349114498,0.637461675009,0.711961734593,1.80234276384,-1.78083568494,0.520603844326,-2.37248194615,0.146621232829,1.95268532594,1.55047165434,0.825010337035,2.16551250696,0.958925328056,-1.03714228699,0.654975053468,-3.01727262656,0.247705178956,-0.0690905296781,-0.235510739784,-3.40891237398,1.3884248968,1.15451488764,2.64650440057,0.807570249241,2.08921063463,0.508586264452,2.52009829918,-1.11128878554,1.39935349762,1.06951609214,-0.485668144226,-0.460008237761,-1.70877252301,0.942621914198,3.41737226328,-2.40122259855,1.40087889274,1.62360543887,-1.58665239096,1.05352225239,-1.45161462784,0.468765456079,1.15845116933,-0.2
69039389293,-1.64486767074,1.02112615665,-3.15314137697,1.83668091496,-0.21584566613,-3.70026185195,-0.418916064101,-3.95508877378,-2.58916404287,-0.282405416965,0.0237940859794,1.56997692525,1.15945299725,1.77722654502,2.98457802137,-1.70026101914,-1.18428363204,-3.13462997307,2.47967257818,-3.06139141003,-3.33533022483,1.78348285884,3.65876099269,-0.542083423932,4.338555275,0.646300950845,2.75761772871,-1.33789882819,3.41355988423,-2.1038232104,-1.58832200845,4.30315493663,-0.497908014457,1.43125514845,1.23661852837,1.89458917022,-1.27604429007,-0.118665337562,-2.98061999162,0.96282290379,-0.317447299958,0.177331671019,0.190233225426,-2.25885749382,0.633996060689,-0.931709454854,-0.453512817619,-1.06709086379,-0.45003234585,-2.11921728969,0.742342797123,-1.2796056505,-3.18736832539,1.89475087484,0.647759524982,3.05645425161,1.20850815674,-2.71339397748,-0.888974133234,2.6798757871,0.973526877165,-3.10087224166,0.282148707707,-0.588648343086,1.1617284895,-0.947238893711,1.91763001402,2.77221791545,0.242102493444,-2.7309236304,1.19404949462,-2.29922574123,-0.496662088036,-1.43388394435,-0.541529648303,0.914798926115,-1.00208673149,0.693878029583,-1.63149386843,-1.92279982587,-2.83413622906,1.15527868609,1.48624739955,0.0722957122324,-2.01015367587,2.79194158167,-1.34159947316,0.350978424549,-0.150799014967,0.594457630018,-0.702615435521,-2.49834770679,-3.44722706755,0.724352367323,-1.91413194974,-1.50618719021,0.208274304816,2.56051458041,-3.38282206297,-2.67611726205,4.30181331436,2.60196872592,0.980345343721,4.28195017179,2.45016477822,-0.720569800933,0.134198579739,-0.29681619644,-0.620866637628,0.0668065834062,-0.820043117604,0.427079674204,1.07770038346,-1.89850125671,-0.367198590239,0.309245206813,1.49165853987,-1.93249949853,-0.770264412958,0.697864535651,-1.92503979524,0.36664308548,0.6772959585,-0.407557226819,-0.110297719638,0.0780190831417,1.13796422362,-2.93108891884,-0.108831601143,0.0333983715381,-0.582767866453,1.68451089442,1.07477574031,0.759609896341,1.02592154245,-1.07680930615,0.977406439981,-2.15689084132,0.897650267382,0.871076323589,0.485362575054,-0.271094031335,0.392738024197,-1.50007651523,4.16120113373,-0.87542103827,0.770962069035,-0.193105610213,2.63168554207,-0.0860587771735,-1.02318051895,1.64206330359,-1.97631421804,-0.459768193164,-0.987577437561,-3.05661367973,0.700944906869,-2.85832208077 
+-2.40668356418,3.32200128635,-0.583146995504,5.17893602371,0.543722619215,-3.61351331234,-2.15219051798,0.154239282607,-1.86185589939,1.86499222438,0.546306239763,0.173791361054,4.68988918622,1.45787520011,4.61635592778,4.10645994823,-3.2520207723,2.82534058571,1.75578262289,1.28921755393,-2.56118538936,-0.681506864627,1.08718702157,-1.73322505633,1.85559087117,2.59411209822,4.86438429197,0.952494197489,-2.00043742815,1.56013310157,-2.24776257197,-3.37023128669,-4.3081034263,2.49645762126,0.0613088496522,1.5614004375,0.196160220802,6.71882646243,-0.515890210072,4.46806035837,6.49843154748,-1.07791967916,3.66291252851,0.340969046157,-0.717211693128,0.893422653279,4.23518612067,1.59024640679,4.00953623931,7.01554282506,3.3829888622,-5.28307714462,-2.56433442275,-1.21852455298,0.420509056251,3.97645592849,-3.46140729904,2.4203199927,0.499145697951,-3.22149805546,-0.210846113366,-1.82363392035,-0.608880066672,6.9203904438,-1.60331305107,-0.572833641767,-0.809020875096,-3.67446678479,-0.751598751347,-4.0169324255,-2.54423304001,3.43391434272,2.22814426263,0.720494257411,2.44403583368,0.126800663272,1.21261574904,2.80068611622,-1.46503902833,-1.02387386938,-2.22691595475,2.92893217762,4.35140001932,2.05282717824,-1.44687641621,-1.2482182169,-2.92161394775,-1.7117234171,-0.664106516638,-4.8541015784,-5.77170533816,-4.39334596634,-3.39425205867,2.10928462108,1.63525372922,2.20211301041,1.6979695034,-3.62859933059,-5.0955384318,-3.70584682147,0.913468626738,-7.92930506388,-5.18711395264,-2.14751714547,0.553891262807,-1.69585991979,-3.80843970299,5.93398868561,-2.32868751923,-1.18235898415,2.63725592931,-1.31388532559,0.924713171173,-3.68923300982,1.09287478288,0.447131590248,-1.02456968466,-1.82614021699,-1.27993409872,1.58124616583,-3.71338141124,2.08220694741,-2.52321253032,-1.8201927487,-0.489585822324,2.26087673823,-3.07679171085,-2.40032638788,-2.84321398576,-1.48280228813,-0.933238696854,-4.71049805482,-2.02947084586,3.95902432919,4.56408443928,2.77234577179,1.14790276547,3.24662017902,8.24014697393,-2.22661842028,2.16570036093,-1.47694238861,-1.56150964896,3.00861291885,-1.58600352287,-2.14261006952,3.36371217092,-0.277815688848,-2.55071312587,3.11163931847,-3.03255870501,-5.94063932737,-4.34915611903,-1.83065024058,0.344852973223,1.66785877029,-2.92896215598,1.02625600656,3.99294057846,-0.764026923974,-1.21331283232,1.14239682655,2.6062800312,0.555238639911,4.5995118173,4.17675596714,-4.47169959545,0.607188218708,4.99268372536,-1.10778329849,0.359094379742,2.3692166694,0.923014166752,1.39561937173,0.489449826081,-3.64099951267,1.49465563099,-0.864940508206,-3.8856684494,3.41578993161,3.80568179767,-3.16751228809,-1.90362671534,-1.0676062123,-0.827274825275,0.810656501699,-2.94211922248,3.80980886777,-0.505323204397,-2.70784498771,5.20668672403,-4.93021412532,-4.60470018069,-0.988903661569,-3.12164619764,-0.759834496776,-1.40789370815,-1.30719569206,3.67482577324,1.25514381965,0.729897277155,-0.221074349482,0.727831269502,-0.159013110398,2.35894515037,-1.60380238533,2.4536198473,-0.0437957082188,-2.46773814758,1.21704642216,-0.603128572703,-1.80407706489,3.83205666224,-6.8485059007,-0.767830495338,3.48311652978,-1.5156415077,-0.384740158121,-2.00051572681,1.33816781203,2.74709336281,-4.9876317811,-0.8754006657,1.1287828366,4.12337694327,-0.0656415704896,0.988705775539,1.21024437666,-5.10868624687,-0.0440934690972,0.0288263560326,-0.0765196786313,-2.51989612102,0.279547793863,3.3720527331,0.871332397062,-1.06302194118,3.50864712556,-5.51388967832,-2.0657237343,1.25920955737,-0.851355524064,0.
682309628327,1.77832262437,0.827240066528,2.64016712666,-1.44682307978,-1.31921160618,3.49327129126,-0.484558734299,0.692844864529,1.00374541759,2.69691859166,0.154326318701,3.57687735557,2.06113112768,0.991898488825,-2.44635528803,3.95126618067,0.472989312112,2.33190206448,-1.30573364337,-0.437735764186,-0.251160595852,-4.47043835958,-1.51135720338,0.506121761999,-2.44358267824,0.0295987832554,-0.774288076561,1.33123704235,-9.26131312053,-1.16106868347,2.29522511721,-0.143810934227,1.58851175785,-0.934488321146,2.50735031327,-2.19833483537,0.610350404581,0.244342085619,-0.716844118736,2.41659238497,-1.20272970358,-0.134129219055,1.19137221933,1.4639560167,2.79779875596,0.0395902216937,-1.13805999756,-1.18215333223,-4.94711904526,2.09147545735,-2.13449596683,-5.07175304095,3.36638139486,3.70780602773,-0.945894616344,2.34982962509,-3.65934572061,1.50665946653,-1.83905771414,0.419523326158,-3.01953722636,-2.5896670572,-3.02772776922,-0.675756167273,2.18817773163,0.581919515134,-2.15692337871,-0.136594539186,-0.149565262596,-0.947531465589,-2.10921741764,2.44600348274,-0.959342634677,-1.03096477588,-0.498095233439,-2.70281470935,0.375763909419,-1.34648666104,-1.03886758149,0.246556117833,1.06395082176,1.52048031847,4.41094911337,1.58565980355,-0.538471474896,-3.59832179228,-1.84744771719,-1.98041345438,0.181751922071,-1.86992271225,2.09672110558,-3.00351278146,-2.34073953231,1.90364366372,-3.77574122826,-1.82476956447,-1.66754270395,-4.17944864114,-1.643569568,3.2956170102,4.84715448697,1.54389404712,0.413878052236,2.01489253759,3.26832122485,0.128817051054,5.05713614782,1.29822279056,3.8207182916,1.3051289777,2.15857723474,-1.16148341576,-2.10272764564,2.65485213935,3.33767395735,-0.225942747493,-0.0608929246157,0.386773107847,3.04139202913,0.880819526515,3.79432223876,1.34475161622,-1.15084494869,-2.72890689214,-2.20355211159,4.0270291551,0.831315397334,3.15832736333,-1.64833269834,-1.15337079207,4.42843692621,3.73798665524,1.77370616277,-0.414466093183,-5.21718411287,2.14873480677,-3.09902131875,-0.431480846305,-2.21315110326,-2.32947000265,-7.03267769655,0.620159295995,0.669400061817,-1.29065409263,0.639066412349,0.412046761511,1.52948790789,1.63768410901,-3.5861120669,1.49905408064,3.24001261135,-1.20556717555,-1.63470236778,-0.0621758023897,-2.13516124328,1.88267453392,-0.0397303390498 
+-1.94168348829,1.77759615302,0.00324969291651,2.76537520647,0.356809294373,-3.04903903445,-0.571212081513,0.542071000835,-0.3627079765,1.24325743755,-0.427730951508,0.239423566062,3.11484637578,0.816348610718,2.79456279387,3.34600726088,-2.36868370374,0.960648590526,0.492966024081,1.63726032575,-0.520594614346,-0.710762829333,0.599766778151,-1.0725793888,1.89727054477,1.25175032437,2.9051876543,1.70391878923,-1.64619573633,0.92607907027,-1.19849523346,-2.36430278246,-2.74358758171,2.20087053259,0.111789479652,-0.408449259351,-0.328728172382,4.23223049204,-0.694623126908,2.42802311579,2.87807498376,-0.740068741241,1.62616318464,0.592944381834,0.159329541226,0.917487897477,1.93800289412,0.471566647292,2.55488344431,3.85311472585,1.20064109365,-2.55722387075,-1.1082152708,-1.02037551522,0.175513311128,2.44115464489,-1.72615523438,0.765665462018,0.977433196902,-1.83432733496,0.349625592828,-1.36312687636,0.715892488958,4.3416105775,-1.06046443701,-0.509649908741,0.497787223061,-1.32805623293,-0.711287250494,-3.12120837728,-0.91976089676,0.324390023948,1.0570640707,1.21327393075,0.919257143736,0.591331963142,0.457342879871,1.42952108284,-0.507037276824,-0.131746374767,-0.758843362331,0.87595297138,1.89250633915,1.27972093304,-1.20407567422,-1.03513803601,-1.98328871648,-0.134607525474,-1.06089170476,-3.55350990852,-3.87543780128,-3.44827607234,-1.73182748934,0.474614856841,0.146985151768,1.38470371882,1.86053468158,-2.31340465764,-2.91458182531,-2.94684989293,0.104040072957,-5.12927719911,-2.74529750084,-1.73663758914,-0.050694579085,-1.07352878233,-2.26959511399,3.31798489332,-2.0237891674,-0.758091919621,1.11872776558,-0.914398193459,0.528536718686,-2.51926944176,1.82730301281,-0.9518930234,-0.139356404841,-1.27573636502,-1.00687736809,0.303327493866,-2.61500696495,0.686027350427,-1.85459803333,-0.927233275571,-0.823916974465,1.41521901513,-2.20459855944,-1.11158712417,-2.42684030851,-0.775827880999,-0.958329697748,-2.08249924024,-1.46892851353,2.33831092993,2.1452542154,1.52739960074,2.11092267672,1.58193236212,4.76442153255,-0.500175990462,1.07725728969,-0.0380358799299,-0.134679699142,1.48794374386,-0.768634611766,-0.826269167265,3.30978691737,0.666516949734,-0.977266769807,2.49315859,-2.23554114183,-2.87566772004,-2.56910360535,-2.05376130993,-0.0498415172097,0.0825265093635,-2.29967758298,1.79486357128,2.97849754334,0.294754260181,-0.787193464239,1.07911070456,2.28606698573,1.0229565537,3.31828083058,2.20088501116,-2.9493214941,0.495515686273,3.67637633016,-0.658510478187,1.48509448349,0.636143160462,-0.197983719906,1.11500002464,0.257854927381,-2.22390707294,-0.292455268702,-0.33748634686,-2.67893269607,1.85805808067,3.25056247751,-1.99736208757,-0.936352628816,-1.63319216132,0.0197323211029,0.691710200409,-1.60304873069,2.2222357738,-0.494030532042,-1.08713753581,3.57224936565,-3.01036009913,-1.89224094709,0.0352522730817,-1.16673329006,-0.490160132845,-0.675803392779,-1.24046576689,1.81585133443,0.898491098881,0.944541202783,-1.58925671836,0.205119132002,0.531537915468,0.253908309937,-0.676644065382,2.24614178936,1.33602100372,-0.244497090975,1.76761640933,-1.35158142765,-0.414446250596,3.73523249467,-6.43518327713,1.29597712395,4.63066692571,-0.613321131865,0.561347877184,0.0711209956796,-0.127557443778,3.31162866752,-3.6926500086,-0.285006345312,0.5099318854,4.93547654788,0.868819684585,0.137249038219,1.1507523783,-4.40680729866,-0.0998956763242,0.600819382666,-0.423278113404,-2.2000334398,1.6212370952,3.27790774345,1.55507115324,-1.22907078028,-0.029405062197,-3.98268408457,-0.9904
95022685,3.63038349777,0.218821062246,-0.752823298723,-0.248150258065,1.06529252927,0.178199325207,1.01655516048,-1.81574172656,3.30965251942,-1.68384102901,-1.26371297797,1.5262250487,3.47741630872,-0.265562386513,-0.801813617449,2.82347845296,0.909660657868,-3.02272153417,1.71411415577,0.936149520477,2.0352847523,-2.75044326146,-2.03834780852,1.15331309106,-1.52446875255,0.960695721459,0.943234353662,-0.174043610891,-1.19030439148,-2.10528520733,-0.142430415601,-8.60760105293,-1.17194890261,0.969270079944,-0.40029415071,2.89137278279,2.72696355025,2.00195770899,-3.36749429703,-0.503019749323,-0.34973009636,1.07590580434,1.94033220887,0.0662941793603,-0.844664263724,-0.205224726594,-0.927714525858,1.82151385903,0.716765152912,-0.505078462759,-1.39169801553,-3.75972258488,3.03527408441,-3.67419142961,-4.80039191882,3.66278994322,2.01150090774,1.10328300466,1.56799778958,-1.6706875356,2.33891484579,-2.23406077822,-0.0790168158239,-2.29835296949,-3.19061029037,-3.09279531479,-1.82349063297,0.689713494777,2.95120071332,-0.454457020759,-1.06216011772,1.86404781302,0.412750553488,-0.312192496856,-0.901166524788,-1.71619956255,-0.00137017687125,-0.982375823656,-2.71353883425,-2.3097029229,-1.10547592401,0.557556620639,-0.718295222919,0.482262715501,0.333214447946,6.6798358191,1.22103029927,-0.80201618895,-4.47059836705,-1.35593414923,-2.29788345019,0.258600590626,-0.521844846309,0.437594954967,-2.18392724584,-0.493631593385,1.37908267339,-2.26255252202,-0.30756078084,-0.831326435408,-3.69798319499,-0.223937482238,4.03011242072,3.48706426779,0.441070608024,0.828923836351,1.9435341994,1.40788590272,0.878239062827,4.71550399621,-1.09901936968,3.4838750726,0.68982342432,2.16981327931,-1.96828734874,-2.21202177366,0.926186291775,1.88568594058,-1.7648316586,0.236547902774,2.64866965254,1.89112312635,2.27105943719,2.17706474463,0.199846277238,-0.520975260338,-4.22671780745,1.23446348428,2.73591025611,-0.260378292885,2.32260232011,-1.45908730944,-2.6201878802,3.38336368958,3.02514527659,0.979315823712,-1.99782266637,-1.60707169851,0.0483122205721,-4.34822138787,-0.213511068026,-2.52483056227,-1.12644841035,-6.88962921143,0.44326542365,0.096239015261,-0.0212235212322,-0.688477359512,-0.351519578299,2.47742046833,1.44010951281,-1.97519741376,2.6616740036,2.26513570666,-0.766266692481,-2.78300611377,-0.376965727806,-2.54099842787,2.187827818,1.03102740248 diff --git a/tests/data/activitynet_features/v_test2.csv b/tests/data/activitynet_features/v_test2.csv new file mode 100644 index 0000000000000000000000000000000000000000..a568674f6cf39b61fa91c9fa1b1934343d8ee667 --- /dev/null +++ b/tests/data/activitynet_features/v_test2.csv @@ -0,0 +1,6 @@ 
+f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,f100,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118,f119,f120,f121,f122,f123,f124,f125,f126,f127,f128,f129,f130,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140,f141,f142,f143,f144,f145,f146,f147,f148,f149,f150,f151,f152,f153,f154,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164,f165,f166,f167,f168,f169,f170,f171,f172,f173,f174,f175,f176,f177,f178,f179,f180,f181,f182,f183,f184,f185,f186,f187,f188,f189,f190,f191,f192,f193,f194,f195,f196,f197,f198,f199,f200,f201,f202,f203,f204,f205,f206,f207,f208,f209,f210,f211,f212,f213,f214,f215,f216,f217,f218,f219,f220,f221,f222,f223,f224,f225,f226,f227,f228,f229,f230,f231,f232,f233,f234,f235,f236,f237,f238,f239,f240,f241,f242,f243,f244,f245,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290,f291,f292,f293,f294,f295,f296,f297,f298,f299,f300,f301,f302,f303,f304,f305,f306,f307,f308,f309,f310,f311,f312,f313,f314,f315,f316,f317,f318,f319,f320,f321,f322,f323,f324,f325,f326,f327,f328,f329,f330,f331,f332,f333,f334,f335,f336,f337,f338,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348,f349,f350,f351,f352,f353,f354,f355,f356,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366,f367,f368,f369,f370,f371,f372,f373,f374,f375,f376,f377,f378,f379,f380,f381,f382,f383,f384,f385,f386,f387,f388,f389,f390,f391,f392,f393,f394,f395,f396,f397,f398,f399 
+-2.50391422427,1.68599787994,-6.01226188664,-0.125473405835,-4.05747392075,6.31113406836,3.125083399,-1.28819161128,-0.594363160034,-4.04687042561,3.33266554158,2.05021273438,5.06569788016,-1.51135614382,-1.75754686884,-0.330255823582,2.89510802927,-0.73977406509,-7.89353751824,-3.45772308633,1.17079686934,-4.14460512795,-1.39475490187,3.86253584502,0.447348279778,3.92883117367,-4.46848521844,-3.76229701362,1.69349113829,-3.27463325871,0.924009592578,2.12999677853,2.85659594768,-4.17102590297,5.99293164916,10.2884632288,1.83231558377,1.4478797998,-4.38947245616,3.90167659309,-1.85908630842,-3.78404481822,-4.00131390917,-5.05896560394,-5.12547527286,-1.43005141799,-0.799648821025,-3.57910595264,-2.2926393485,5.31605148185,-4.44407908701,2.30758203368,4.12896344555,-2.10192899924,-1.57365770347,1.46184540219,1.02006796352,0.693975594963,-0.882507590565,-0.268305251769,-1.78810432009,-1.44049936972,-1.30807676828,-2.54602796889,1.91918086343,-1.87330246853,-1.19116743588,-4.94173944111,3.41346881759,1.04477840726,-3.87883468949,-1.6401990057,-3.11963649974,3.10739194639,2.00107403406,-3.01992488162,-2.17734208151,1.18544464156,-3.26027744456,-1.38117784752,-1.12807281493,1.23731617227,-4.22769494609,-2.31104123998,-2.73342858264,-2.60609814517,-3.91516964902,-1.43564934755,5.86923505644,10.8698481406,-0.0644558026284,1.29974983175,11.9821762355,2.63645925008,-0.800439528532,0.305979802689,10.4448009584,3.89998507623,10.3629906773,0.987935663397,1.06111665476,1.15934493999,4.74597180691,-0.53357543254,-5.53819862455,-1.08892905758,-2.84128587559,2.54403880204,3.08628575869,2.26009004126,2.77060999349,-0.582374569877,-1.77802346002,-0.2937931835,1.02838354244,3.37142584142,-6.2468072647,2.20336157741,4.02669576097,7.7139954797,-2.62292807265,-1.63856477894,5.24209850422,-5.95689444574,10.9237309757,5.56173629091,-0.06239338509,-0.11586309122,10.5260359799,0.0455641002992,-0.143587274683,6.85981490484,1.30256727268,0.099060309792,-0.99507694974,-2.39523977029,0.646837872527,-0.549287130061,0.528060432284,0.478981495421,-2.87669151504,-1.24631201746,-2.76280551886,-4.99648601327,1.56782352093,1.72098800023,-0.0553381940814,-5.35496277362,-1.12433242997,-0.526286978024,4.84426140262,-1.67891876845,-0.0265676538691,-3.17656040053,0.26415708479,4.03517758548,1.4993594204,3.83278299704,-2.77651900406,-0.861125229206,11.2030357751,-3.15313750697,-2.50459314309,1.78739187732,-7.82420403389,0.904809594012,-4.18456179152,0.60995901817,-1.44564015234,3.83168430104,-0.00437937539048,-2.3451228437,5.58568740368,2.97791145801,4.32271502614,-1.54512997459,0.536759116431,-1.1815032758,-3.14896126398,-6.86535051022,-2.70346348657,0.0113500145858,-2.77794979296,2.35137890776,-2.64285167165,-3.95364762035,-5.22867795339,6.15572625407,-6.91736113212,-1.52054794698,-2.80880080933,0.30321730122,-5.91560237718,-7.42976562356,-1.07937648743,-3.26394725639,5.0495641506,-0.553299233738,3.96384933141,-2.30659410078,-1.92410211898,-0.0740623548288,-0.741995456365,1.25729537246,3.06146581722,2.64592689772,-0.768545938319,-0.368544330909,-4.14440217226,1.39461226592,0.549227126659,-2.66866894906,2.50084337145,-6.41121511041,0.753405646177,0.280067476256,0.0344201303652,1.11097541213,-0.756136736626,-0.134220118965,5.6025168238,-2.69538654726,-1.20349766834,-2.90915489789,-3.07136878235,5.78831844318,4.79880530822,-1.54153241949,-4.93687499883,-1.02846407186,2.11793406884,1.81036372992,0.928447236083,-1.67445344365,5.93752378918,5.25534441684,-1.32955752029,5.02874157984,-8.32498580794,1.22665544488,0.729978278127,3.76998885
216,1.18933444305,-4.01561953996,-1.91036380149,-2.01600540918,-2.19074894269,-6.06838036269,1.91566910093,3.16219263298,-5.36112836713,-3.03646755643,2.60723549671,-4.73392456058,-1.27864055974,1.65558185437,0.35871136493,-1.97445669054,2.00282359886,0.766041404302,0.935142604145,0.146960995005,0.90301123882,0.584378651645,2.43738964301,2.14986027277,2.13076803503,3.4849176696,3.37372560032,1.19906408345,-3.25606738189,-7.18101082565,-1.28755031363,0.930275378818,0.638566405974,4.33632120663,3.7835789624,3.41258601273,-0.279865853117,-0.651737863704,-4.7223025058,5.75545690528,-0.820105519292,-4.00676441302,2.11396374954,2.60952237005,-0.820631582523,-0.546553676079,5.33481172893,1.34852465273,2.93794032376,-1.33422280837,0.00903616898423,-2.36627310158,-4.99107783527,4.48972757256,3.85615534734,0.528791357535,5.58767522678,0.127227772965,0.973913995567,-1.8062694088,2.32322553868,-0.442473914737,-0.123751487135,-1.67863033336,0.0891421785383,2.82212784306,-0.478511586228,-3.3537200428,-0.522387139102,-4.25974474021,2.87018204241,-0.111813521457,3.94839403804,3.74490500576,-2.30623158975,1.49538655047,0.530469396242,5.1296629385,-0.453469798231,0.306027388129,0.35104102143,-2.34272025863,2.87870763106,0.212640115016,0.719817214469,-0.20345939615,-0.506974699062,5.3592568385,-2.28140813929,2.88992723737,1.65410613199,4.48693866632,-0.09672872709,-1.87582435405,-2.46928755752,-3.56278716312,1.74785164057,2.74009034813,-7.29490411233,-3.16100976408,0.847520336401,2.92602454656,-0.0986801903656,-2.16201799224,-3.39690165524,1.53765563161,-1.41997380147,2.71161737728,-0.0167333157083,1.75945290337,2.10004583364,0.765974609689,1.79493778887,3.43569638106,1.49552039321,1.90617850633,-0.592973705882,4.00305455331,0.0335191789012,1.05186070161,2.48385107847,4.89055257951,2.06091725733,-0.18432842804,-4.0123498625,-1.32194922277,2.87064841629,-2.07818711219,0.695646315956,-2.8474977249,-0.372025591391,0.277543174562,0.348284025789,-0.54074715731,2.48928393808,-5.685446576,-1.66416304574,-7.02726226008,-4.88155203391,-5.57406386037,-4.91916411608,-7.94337537982,-3.65389317081,-2.97659988583,-5.97952768511,-0.575712613136,-3.38044490327,1.89594224776,-0.106777342905,-1.21814931744,2.66339186237,2.37583883107,-2.34277046832,0.0847875222918,2.1196259109,-2.034442402,0.994460807731,-5.99126604669 
+-3.61196599602,1.54396823943,-7.05199570656,0.70936037898,-4.42450754642,5.79873381853,4.79998759627,-1.51375595927,0.041889913378,-5.36947724223,3.11711617708,1.87290850281,5.37537143231,-0.140440261367,-1.07927534082,-0.8091666732,4.91609726548,-1.47799203396,-8.695467484,-4.09717354178,-1.04496299029,-3.85961924196,-2.10038466751,3.32289713025,-0.286860848963,3.96218072772,-4.39675701856,-4.40787660479,3.73622534722,-2.87716412544,0.454319910706,2.42820411325,3.82069679498,-2.79692421705,4.38538633883,10.2156878471,3.4358463645,2.12645539939,-4.04702971578,3.87549848557,-3.44834155142,-4.70891635418,-3.76960349679,-4.85522414446,-4.31793097854,-1.22963698059,0.447048375012,-2.53883199245,-3.42271156311,4.74730663896,-3.28625443876,1.15255518705,4.48008643985,-2.00973020792,0.25715895891,2.01633035838,1.72455749959,2.46865062863,-2.55920924097,-0.941734179414,-1.01115750857,-1.55530408025,-1.35561941266,-1.23846808225,4.0139059037,-2.82922329605,-1.54500077367,-4.14823132754,3.46829478144,1.42298098058,-3.60501238108,-0.478655001521,-2.27799000442,3.80441823602,0.555091810227,-4.56343603134,-3.86684781313,2.51266635656,-2.34452754557,-3.54211790189,-1.63034411222,-1.93864814639,-3.73451783657,-1.60328631774,-2.4672175467,-3.80095796585,-4.04769252539,-1.72506986559,5.59767432213,11.0820033073,-0.191732565476,1.90799899697,11.6760043621,4.55487689376,-0.31670263633,0.824923895671,8.5647937417,6.5042055428,11.780738759,1.50271001905,-0.0258838802575,0.435441556572,3.30290358961,0.377896644174,-6.5453125,-1.00815881342,-4.10386363864,1.63551698476,3.23607475758,1.42431855202,2.55384192467,-0.456127088517,-1.94804133773,0.550055715443,0.636448504358,2.32128318697,-6.70778397321,2.73787901104,3.27784690857,8.87038059237,-3.74099546671,-1.75985428691,4.34281664491,-6.43530688286,12.9979223013,6.78234988451,-0.806176937745,-0.697792875396,12.720209074,1.51877520681,0.540385435523,6.74378789664,0.843219137377,-0.0813938416541,0.253477528694,-0.220510208608,-0.133373232186,0.959342181682,1.10231779218,0.231312006339,-1.99769770503,-2.40456032157,-2.95679311156,-5.95258055926,1.98243983686,2.28856839836,-0.382299264148,-5.90337668657,-2.26504155695,-2.81989197582,5.54886015653,-2.23119397462,0.655153363942,-3.77459974289,1.65176175833,5.3708147645,0.977352631095,1.60295453668,-4.00599938631,-1.69029248208,10.0866486311,-3.23101823926,-3.1206391573,-0.391065824031,-6.68118602037,2.16630054861,-4.7760153234,0.383674836252,-2.48520847857,2.07149813026,-1.99720753431,-1.20698849112,6.08765767813,2.54862617255,4.67334094047,-2.9711537391,0.948479612171,-1.01456621587,-3.11699818373,-6.72917854786,-2.92183075547,0.496130555124,-1.61810724959,4.37298168838,-1.93378492743,-1.86215627491,-4.90786517859,8.62715418338,-7.5756526351,-3.27301322818,-1.76513157338,0.75444869213,-6.96635819673,-8.78930687905,-1.7524562791,-2.41629351974,3.68741244673,-1.43222312816,3.23068808318,-1.59724262357,-3.27234983742,1.24265492261,-0.0109941303718,2.80159805715,2.48849355877,3.07970299125,-0.557770807296,0.432648000119,-3.69374324679,0.0467125833038,0.424763832987,-3.38139162659,3.42404463887,-4.51077946425,2.03796033263,0.507232870907,-0.506469908358,1.50909484178,-1.27529992908,-0.255473581143,6.49730739594,-3.27221466898,0.583703720573,-2.57865453363,-2.25019647181,5.4004673481,4.42697024941,-0.0842542231125,-3.7730645895,-0.905618444086,2.8413999021,1.14175421931,0.425801990927,-0.551772788169,4.81836385727,2.67149700224,-1.60633691549,3.67677226961,-7.09939215183,3.07843704373,-0.603567731382,1.07058879137,-0.284542
271494,-2.65182375908,-0.966910338403,-2.21251030267,-1.5918459788,-6.73685925007,2.16504070461,3.16708334088,-5.73397156,-0.0308346152315,3.96178902388,-4.34651784301,-0.626209998878,2.96317673624,1.55037861467,-1.6240209043,-0.916502046583,2.22772178277,1.73989147246,0.425792780239,2.44748416841,1.27179720402,3.01824558973,0.45870998502,1.6810954839,4.9340551734,4.52931187153,1.22776987255,-4.30461632609,-8.0007350564,0.293104887008,2.59760651291,-2.09017359019,2.84267843664,3.92640956045,4.39850687385,0.263943502309,-2.52996243984,-4.9456074357,3.01140740514,0.060671949388,-3.45182769299,3.45659797787,-0.717935073377,-1.70038859993,-0.159526935219,4.78994245529,1.73284136951,3.39466386437,-3.02896884084,0.745040215552,-2.42295794487,-5.48635936975,5.81924671531,4.81498251557,0.588836860656,5.34480842352,-1.69491340667,-0.931661537289,-1.47670565099,1.95115838945,4.33551876547,-2.35900820047,-2.03742983938,-2.51175971031,2.00818323493,-1.02861073502,-2.83876619935,-1.42532885447,-3.22665929496,3.24723680019,2.50910392105,1.66940991878,1.98924016655,-2.976414603,2.39372268021,0.0301794916395,2.93753557801,-2.53472368196,-0.224031038582,2.22086050436,-4.60367997885,0.344105190041,0.892087735609,-0.732750460502,-0.0278959076854,-2.04538312331,4.39118845462,-1.92525613308,2.48760456741,2.12224386633,4.20933679342,-0.160378366895,-0.847533833979,-2.68713091612,-2.85529101193,1.45633238703,3.13940095305,-6.84778351784,-3.07674325108,2.9240462061,1.66283178181,0.366562292727,-0.474471753836,-2.22659401149,2.12781591714,-0.698044653983,3.11203145981,-0.0878812848356,2.08509909212,2.37360790372,-0.383632448313,2.85876693129,1.43884898126,2.44588458538,1.13197429609,0.669784083962,2.82567384094,-0.303028093278,0.0804680705045,1.01148720384,3.96722738147,3.78676999509,0.484674140066,-5.0017509222,0.154588726159,2.53468632102,-2.48899200261,0.211847947538,-2.28771493435,-0.277051561698,1.01623694403,0.347248692065,-1.88412645785,0.431219244007,-5.62209599018,-2.32514169514,-6.17786878348,-4.5459565401,-5.45559768676,-5.25804600716,-7.30329209566,-4.18787643314,-1.41929989755,-6.36565381289,0.691979244352,-5.4266118586,0.243365764617,-0.33372869622,-1.60025772154,2.65902011394,1.72226278037,-3.51518207789,0.837280854209,2.64499332011,-0.451456475259,4.05596930012,-4.51415959 
+-4.72683149606,1.45348708808,-8.07086817742,1.63604789376,-4.73549800873,5.20675960303,6.51230325818,-1.76839387298,0.728590119478,-6.74178866983,2.8130164218,1.58456622004,5.62148933888,1.37578496694,-0.371593541978,-1.41557620727,7.0383985126,-2.29083102226,-9.45700079202,-4.80206114411,-3.43986400128,-3.55278479934,-2.83554306328,2.7735268724,-1.13780232042,3.92281681627,-4.25488941192,-5.10927104115,5.96552311688,-2.43940485954,-0.0862283119556,2.73895709873,4.89024929762,-1.2763541922,2.57022780523,9.9613841939,5.07362765074,2.82582543075,-3.62501172424,3.7390643692,-5.19941673696,-5.66170942306,-3.52688271404,-4.6018137145,-3.43782470346,-0.992310488373,1.76652944327,-1.43113125652,-4.60094419718,3.99586562991,-2.03482079327,-0.160126103461,4.7740121144,-1.88776037335,2.26084538698,2.65253681004,2.54412336618,4.38450802416,-4.3977601847,-1.7176710071,-0.0724306311467,-1.70681380391,-1.41692107796,0.200332455933,6.24482979595,-3.83351793349,-1.88694544792,-3.24113301516,3.48263311743,1.83456811458,-3.1987385869,0.769642335775,-1.36940517485,4.47494917393,-1.01712017417,-6.15526720286,-5.62981627226,3.9166711688,-1.23287549198,-5.84563351884,-2.13252854615,-5.38287308335,-3.12790068805,-0.774887352436,-2.1297221756,-5.0906492424,-4.12367990136,-1.97023809493,5.23813544751,11.0778312242,-0.275825666287,2.59604639888,11.1118171802,6.55417260289,0.203035293669,1.38965836696,6.4515772891,9.32944820284,13.2775346517,2.04594562918,-1.18929040372,-0.312611132264,1.6740041858,1.40754847616,-7.60108621597,-0.907561735809,-5.39238245725,0.626936522051,3.35088065982,0.46351477623,2.31236622334,-0.229608643204,-2.07551843763,1.55680642903,0.263669897775,1.0858634612,-7.05738488197,3.32455673039,2.40335632682,10.0899427987,-4.92568675757,-1.80175588966,3.28225847542,-6.88330174923,14.9608820614,8.02759130716,-1.60224438258,-1.24848374822,14.9900966168,3.09142677188,1.2888044706,6.5442295146,0.330789602659,-0.286123776287,1.62822659672,2.06531837225,-0.982651502788,2.60571396113,1.63691263556,0.01017631717,-1.03312850952,-3.68506930947,-3.12813932538,-6.89839523554,2.3975418067,2.95167421162,-0.811870859787,-6.43306355715,-3.44969232738,-5.32219609171,6.3486418271,-2.75835331619,1.37597230494,-4.40136899472,3.19074914694,6.78243587256,0.445585229398,-0.808829127549,-5.32398023844,-2.61561192304,8.69628513216,-3.31122705817,-3.75478894711,-2.72484310418,-5.34768217325,3.53855306476,-5.38706000924,0.145446739923,-3.58612233102,0.120355840028,-4.15744045019,0.0731746891131,6.55438641787,1.99956796408,4.91731314421,-4.42644771397,1.40971697062,-0.784811406731,-3.00484983444,-6.53485749721,-3.15200479388,1.03534908369,-0.301970368177,6.51142239392,-1.10611675471,0.418995252622,-4.4721977675,11.1724257183,-8.21665349245,-5.11762260079,-0.615411399901,1.18636612185,-8.06906448126,-10.1247596884,-2.49426667422,-1.32065032601,2.17061477065,-2.33631666951,2.38926856876,-0.913166025876,-4.7118704623,2.72928834141,0.775672697726,4.4457443577,1.71014433921,3.57591197133,-0.235582885593,1.25215531408,-3.14634150744,-1.4078004086,0.365033659041,-4.17761438727,4.40297134757,-2.42336025477,3.4580388701,0.689679874331,-1.04557964027,1.87770598858,-1.80380414367,-0.417696796171,7.45841611862,-3.81225969553,2.56200723887,-2.21683688522,-1.32409115911,4.95071142197,3.92624093532,1.46352795839,-2.46225001812,-0.77849281013,3.50410349012,0.434351972267,-0.0288636657596,0.669223650095,3.49293913841,-0.137764969467,-1.96554630518,2.1402142328,-5.7265598917,5.16214273542,-2.05637966395,-1.8495585683,-1.87955528319,-1.2564454
8416,0.00674796104395,-2.43147389591,-0.893102669418,-7.49637273312,2.34914988339,3.13358963132,-6.12764425039,3.23036705017,5.41211955786,-3.91730147004,0.0444684042034,4.39211372912,2.92113072753,-1.25977230668,-4.12997387886,3.87697173372,2.66106281221,0.736292763781,4.03323895753,2.06197661877,3.74714529276,-1.27023549199,1.21123526514,6.49754122019,5.87128979206,1.2765970856,-5.3870420897,-8.90884536504,2.01509624004,4.4446681577,-5.09674575568,1.20312212527,4.04149165512,5.50566021562,0.953406482342,-4.55933359832,-5.21267021895,-0.036395560507,1.1284481287,-2.80024212361,4.99020810008,-4.41919901133,-2.62691727608,0.226202066541,4.16152264595,2.0979556495,3.87861913562,-4.9043425262,1.60233154863,-2.46861632347,-6.0349463439,7.17580538869,5.88561519026,0.718002053499,5.10737453699,-3.68287960738,-3.00543767631,-1.03803471714,1.53446617425,9.3747028375,-4.76337719411,-2.39580845952,-5.3522044754,1.13427948356,-1.6372946959,-2.29562118411,-2.37800694928,-2.10207263529,3.68294849873,5.38075784862,-0.940855975155,0.0137967544802,-3.74462119222,3.33829092682,-0.57550301969,0.537392029762,-4.84174327537,-0.825007719694,4.19546295956,-7.04726793528,-2.39606908321,1.61995286934,-2.34724253952,0.159427139386,-3.66048334882,3.28457990646,-1.59395935536,2.02604223549,2.65396766722,3.91925804377,-0.170175538174,0.293078864813,-2.97810955763,-2.11363542974,1.19750591725,3.54246556639,-6.34636378288,-2.98813998103,5.24311850287,0.266658103764,0.848274391745,1.48310565829,-0.99932412535,2.74228922785,0.028886015862,3.42641401768,-0.174800014277,2.45710129201,2.67823993087,-1.63095737636,3.88755993008,-0.699719142316,3.417716069,0.163006665744,2.16666536272,1.66770118028,-0.553962221444,-1.03107923508,-0.689737435581,2.84424331307,5.59421723187,1.20365538374,-6.0307972002,1.79253649413,2.07976007581,-2.97050522506,-0.320198328197,-1.71101762295,-0.148553741649,1.92997103455,0.389586392492,-3.34172380107,-1.60005307674,-5.45010868966,-3.076508376,-5.23991111994,-4.07970976352,-5.24768321514,-5.51570352555,-6.46153886914,-4.78648862958,0.280570674728,-6.66282331825,2.05202573478,-7.5744939363,-1.66311737061,-0.568106225319,-1.98653774977,2.69276298046,1.04291445166,-4.88652718305,1.6799737481,3.19981912076,1.09642167091,7.33881660357,-2.92239319682 
+-3.95618640951,2.16822504699,-7.02749201775,2.07438584924,-3.7952008903,4.66516063452,5.66598080516,-1.93683131397,0.83286083467,-6.31038688779,1.93803728581,0.415994385479,4.63695873261,2.03064954996,-0.546765608815,-2.54600209773,6.67720080018,-2.60086139083,-8.36665858864,-5.08000973701,-3.84362360537,-3.51486208201,-2.64075744003,3.07348869205,-1.94571852326,3.0428294871,-3.48582068503,-5.26945194721,6.5893364191,-2.27115260124,-0.558212063015,2.65741990924,5.38911813021,-0.610317340195,1.36496483032,7.88430027903,4.24496084571,2.5491838041,-2.95291282773,2.46365449905,-5.8806508565,-5.27971760869,-3.57540645719,-4.17462575197,-3.20521330357,-0.712461964526,1.66458856776,-1.43753664225,-4.29921654403,2.28583934903,-1.82383457958,-1.12579432636,3.8323690407,-1.60873620778,2.88645622611,3.1870587337,3.35539863348,4.68089458585,-5.01220222473,-2.40511398852,1.23198447682,-2.04995642841,-1.54208872378,0.738531426192,6.23694182634,-3.66800229013,-1.47559821933,-2.51566377998,2.96481087386,1.93647179783,-1.85266061902,0.897218961718,-1.2290535754,3.62848708004,-1.39016747028,-5.53799726665,-5.19588583469,3.79989851355,0.365908132196,-5.86183534264,-1.74588927373,-6.0965897572,-2.17361679807,0.099301538021,-1.49651467532,-5.28756560326,-3.35764337569,-1.22807119251,4.41288296581,8.37310397655,0.329299056678,3.0666925776,8.31520066255,6.03162533879,0.254658643305,1.52927615046,5.15474370718,9.92706954478,13.1178707933,1.9851475221,-1.25251645445,-0.040588879585,0.598402907254,2.09637820482,-7.39962798595,-0.736607771963,-4.72784618586,0.148764773328,2.82482881815,-0.363951296807,2.18847515703,0.851648719757,-1.44513312698,2.82303802848,0.789665968129,-0.284895439446,-5.39480451405,3.52706449866,1.50199447424,9.94445934776,-4.85012166024,-0.775828022365,2.07768519119,-6.15859429717,12.0614514388,7.37984260201,-1.64554053068,-0.434650133851,14.1951656962,3.12879480362,1.52092895806,5.6518155706,0.0597475437445,-0.432820611596,2.15243572235,1.70108392119,-1.19518387556,3.0659382844,0.729992161989,0.512096637264,-0.702464946806,-4.23238757848,-2.71316921115,-6.04356548428,2.08492669598,3.63833817005,-1.76652027816,-5.79197620272,-3.09022756994,-6.01349622488,6.92608562946,-2.03923279405,1.31198180869,-4.27980091691,3.90416300416,6.64981202126,0.73166857958,-1.23485268474,-5.4199275887,-3.10880723954,6.33416883498,-3.2787891686,-3.49453917981,-2.87733795069,-3.98702534318,3.87149213552,-5.16316780805,0.178835353982,-3.50880401373,-0.771996193229,-4.59445316195,0.868211128412,5.75491086721,0.921819759609,3.39493911088,-3.67554339618,1.67544182837,-0.174868727922,-2.08721256792,-5.95615169048,-3.12308293462,1.30280533791,0.644019361586,6.33218312264,-0.25693573624,1.04176057992,-3.36895969659,10.1426500809,-7.50808531523,-4.85486101508,-0.170589606464,0.612994321586,-7.87276499986,-8.79793308139,-2.78509446978,0.942439908986,1.39931613266,-1.95726648182,1.68011825532,-1.75475023031,-4.74921035767,3.71489373327,0.868516312915,4.43326895118,-0.263135685322,3.9764669311,0.911694865376,0.85224120736,-2.35560669035,-1.62565724194,1.2212044698,-4.61154775619,4.34895780444,-1.68536224604,4.06422766924,-0.0101673817625,-0.609392880799,1.22532760024,-1.5149737785,-0.805999085308,7.55067921162,-2.93719872087,3.43533396363,-2.10260034561,-0.721583162695,4.52110221148,2.69720968336,1.40812491387,-1.62846618414,-0.822517428993,2.23470644593,0.491862057373,0.920802225173,0.962496383188,1.928562572,-0.802637988328,-2.72160144806,1.0092707017,-4.93745543241,6.46554609537,-2.43392473698,-2.37087579571,-2.17133839786,
-1.93240495443,-0.362681306601,-2.54449704886,-0.17978616923,-8.05280478001,1.39086142182,2.67881788671,-6.08614060402,3.92572582901,5.49754135013,-3.72346940279,-0.242022804468,4.81397798061,4.11047571898,-1.36651873588,-5.34488024235,4.95870956659,3.41118116498,0.89432107985,3.33253220856,2.74165137768,5.04070746183,-0.415948872567,1.31926612794,6.72856174469,7.17419068098,1.49098495662,-4.98007160067,-9.318038764,2.46224850535,5.27640871287,-6.26628448487,0.635381773711,3.60578859449,6.173201437,2.24732711256,-4.89329962254,-5.55538270712,-1.49875565291,2.64946635843,-2.09067063332,6.20336785316,-6.25677093268,-2.50105109721,-0.0861860245474,3.59812706232,1.57726798058,3.84794261813,-5.72557672262,2.46239029348,-2.29553559303,-6.28103302002,6.47278197646,6.46319063902,1.48405849189,5.35767221928,-4.23237529636,-3.51878979206,-0.00904786854982,1.29577608407,8.77539933744,-5.03432886004,-2.11539484441,-6.16999167681,1.0546652633,-1.90332779229,-2.35973056435,-2.26917619407,-1.82008438647,4.08268388271,6.31470301866,-3.08372749806,-1.22069035709,-4.38186541558,3.19182102323,-1.42976873428,-0.223793095648,-5.89660835981,-1.25134502113,3.99110957295,-7.45729860783,-2.86559789747,1.66721295506,-3.13464591861,0.162813140824,-3.38049943731,2.39996716856,-2.15944387913,1.63885930896,3.04169135332,3.98578349114,0.511457957626,0.823394746482,-3.67019996286,-2.25544205963,1.80545994013,3.28000457585,-6.05162557602,-3.00187867403,6.49878694773,-0.326051785648,0.684602611069,3.36035886407,-1.228521097,2.57487190307,-0.46660696879,2.10812581897,-0.305482617393,2.75176966548,2.83328473449,-1.89653189778,2.65913075805,-0.83185869336,2.94031493856,-1.53106848534,3.9481344676,2.79967945367,0.710376281441,-1.93211027801,-2.24844452739,1.20713421225,5.22792970717,1.27727831364,-5.73701616764,2.55549032926,0.93986610532,-3.48593280315,-0.51567519635,-1.94204506159,0.172434514092,3.41956290126,0.900014420896,-3.65240677357,0.294835821394,-4.22226468399,-3.63110159874,-4.85140349388,-2.80221052408,-4.28761808038,-4.3011406827,-4.58334078341,-5.13591312647,0.760158468181,-5.32113479346,2.1639226532,-7.19870259762,-3.37775546551,-0.481121961772,-1.74219072804,3.14396611452,1.24187298924,-6.32387711763,2.16209208607,3.14260455966,-0.531431690456,7.58907546639,-2.70918695331 
+-1.8262197373,3.46346980632,-4.49737847328,2.16065784335,-1.95281272531,4.15987870455,2.97505878091,-2.04312422812,0.517240521534,-4.57863372207,0.651493945123,-1.38716154456,2.76521640778,2.06453320265,-1.35841371835,-4.05420722187,4.5255736053,-2.54840182424,-5.94124334216,-5.05016509056,-2.81190917194,-3.67080595732,-1.77554705888,3.98575968385,-2.7226165092,1.5568113219,-2.26458370864,-5.039455657,6.05570743084,-2.2971960926,-0.980765613618,2.29306887269,5.47656385183,-0.560339287222,0.599394002372,4.49311896622,1.63815704465,1.56890586585,-2.10052304626,0.367122573851,-5.79060420752,-3.93543901801,-3.83389718414,-3.62215631246,-3.43940053463,-0.401958022528,0.53790163979,-2.24713481337,-2.93054077685,-0.115260034797,-2.36293837487,-1.84129033774,1.99996710196,-1.21648682684,2.51857071042,3.64827320695,4.16069695712,3.80975566268,-4.74414714813,-3.02875908077,2.80003528588,-2.53125300467,-1.71329928577,0.627459498644,4.61502670049,-2.65913504899,-0.521180084049,-1.92113643766,2.06333797991,1.81511531889,0.170950590374,0.216834144594,-1.64254453719,1.68837884694,-0.898700688779,-3.32811955512,-3.17814456463,2.586751405,2.31587963283,-4.22904350161,-0.718470119983,-4.84180047393,-0.968689443319,1.00650176987,-0.650119360983,-4.69666749001,-1.9845663628,0.225895456072,3.25188346505,3.7214648661,1.43130174562,3.38060764193,3.90915834465,3.69100523353,-0.0311959603429,1.36241446137,4.44646521807,8.91873133182,11.7640150213,1.48888324678,-0.522589141129,0.966836237908,-0.0783090251672,2.53949897528,-6.29179323673,-0.514931401759,-2.65529345423,0.0529807021281,1.83676325411,-1.09529018879,2.14935440898,2.54911320939,-0.268381892444,4.27633724332,1.96361587535,-1.7532244429,-2.28158183038,3.45261253476,0.581260310115,8.81487321379,-3.86599963188,1.0199303931,0.769287056774,-4.58845028997,5.65657658659,5.3673800838,-1.14614109278,1.36291043639,11.190714488,2.05933052063,1.38084109723,4.25990110517,-0.0372709861028,-0.537679631114,2.06346488953,-0.573661087751,-0.94866688013,2.67103304624,-1.21616028428,1.53528989427,-0.828803119361,-4.25112649202,-1.87550593019,-3.89059672713,1.24767834112,4.34198590755,-3.09971417338,-4.30684231401,-1.61756324589,-5.39918883562,7.34323934078,-0.421631439929,0.682198462336,-3.61873383403,4.0223959434,5.40389529228,1.60736526251,-0.229281746149,-4.63487404227,-3.29047121048,3.27148656696,-3.16514086127,-2.58948973179,-1.45728034504,-2.60679310679,3.4551587224,-4.33734436273,0.408040666226,-2.58206250429,-0.901034320293,-3.78914433122,1.31350503355,4.04259036302,-0.537077046634,0.599152674081,-1.33412403017,1.80021581352,0.709162880183,-0.588873988985,-5.10033129215,-2.90737100959,1.37433995247,1.32315881923,4.48205828667,0.607754543123,0.468945158216,-1.78444975466,6.53551485418,-5.82657202005,-3.07283199429,-0.23395036608,-0.685120877029,-6.73997298956,-5.55178875566,-2.75079527497,4.04717801094,1.16547028959,-0.65315918714,1.06632480771,-3.69622943163,-3.77567577362,4.33929287315,0.461161797867,3.2264848721,-3.09787745297,4.30806201577,2.65380481884,-0.426790667771,-1.38944570095,-0.951971263289,2.73767599046,-4.78429574967,3.55026640534,-1.92020277738,4.08365875006,-1.34608724355,0.52991460502,-0.16311301112,-0.636902204679,-1.35728861392,7.01656522274,-1.04194504768,3.51204951167,-2.16685251236,-0.352366254777,4.10601737738,0.943123545944,0.196937171517,-1.13858823061,-0.989929439423,-0.42760035634,1.10082056798,2.88289318532,0.586945049761,0.191918394566,0.0784438192848,-3.76375469208,0.170746930539,-4.56917799234,7.20640591383,-2.03627742291,-1.162803858,-
1.52358367979,-4.10165442705,-1.70038300634,-2.58114453554,0.544036584797,-8.4628292799,-0.39101603627,1.92033407032,-5.73090517998,2.77129089892,4.59895916582,-3.6993329978,-1.21856652394,4.50981304645,5.16903883219,-1.81281971931,-5.11896033764,5.63131075621,4.03798224807,0.94242796123,0.983446211217,3.34165998936,6.74135187387,2.30066786289,1.84391614258,5.99896759033,8.44891975641,1.82473051131,-3.49935032487,-9.36754787922,1.99033073068,5.37617359877,-6.11145027161,0.840345951319,2.77300786376,6.52381299019,3.97701953709,-4.00499682903,-5.95263335466,-1.81812204599,4.49724048495,-1.33929533959,7.18550524711,-6.75075093508,-1.61648165077,-0.901867833736,3.08160940886,0.417978906632,3.44625248551,-5.78684989214,3.32444414914,-1.96475922584,-6.30903808594,4.28494357228,6.6853062272,2.70926908404,5.95959033489,-3.74483410478,-2.90718505382,1.44551556975,1.18541310728,4.11113968076,-3.76723547578,-1.37445659459,-5.52958389044,1.54767837942,-1.92233352125,-2.86162799716,-1.39507161677,-2.14555080593,4.45648285389,5.85169536352,-4.88964028836,-1.92099967062,-4.92469491005,2.25878873229,-2.46324559539,0.196596455872,-6.0487574029,-1.55177951395,2.21588104457,-6.40127635479,-1.69814975291,1.22380428523,-3.3257760489,0.0335933651775,-1.73429207817,1.6753979218,-3.37144515515,1.30529874444,3.32560100317,4.3093956852,1.69156472504,0.913729201853,-4.65146396518,-3.03416330755,3.03830222517,2.53774605692,-5.90589033127,-3.08939878225,6.98781540632,-0.339520339072,0.055654079916,5.17970392108,-2.50772905916,1.84376598061,-1.84338212296,-0.387204087972,-0.467715920136,2.99068574548,2.88048758626,-1.45433287024,-0.196598118542,0.482496773005,1.41811820263,-3.74789556999,5.93477154017,5.5825525865,3.06712063462,-2.68137378276,-3.70440641046,-0.800623167756,3.29453107536,0.885642924309,-4.48956893444,2.6876345849,-0.693841806652,-4.02581026554,-0.468509003222,-2.75538569212,0.6321949444,5.32430804849,1.74790291831,-3.13624450207,5.02021304012,-2.23322165832,-4.04383488655,-4.85901101112,-0.93985485792,-2.78530479312,-2.02520967245,-1.95793826431,-5.30569067717,0.360007514108,-2.79794396788,1.37599081039,-5.00351879596,-4.95408667088,-0.162458266318,-1.04320560053,3.89612897635,2.07402950018,-7.80881192922,2.38426132559,2.64415159821,-4.4487659061,5.65304963946,-3.48982639909 diff --git a/tests/data/annotations/action_test_anno.json b/tests/data/annotations/action_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..28ef0acbb60b0d7507c5f2e32fb7cd9ca2494284 --- /dev/null +++ b/tests/data/annotations/action_test_anno.json @@ -0,0 +1,34 @@ + { + "v_test1": { + "duration_second": 1, + "duration_frame": 30, + "annotations": [ + { + "segment": [ + 0.3, + 0.6 + ], + "label": "Rock climbing" + } + ], + "feature_frame": 30, + "fps": 30.0, + "rfps": 30 + }, + "v_test2": { + "duration_second": 2, + "duration_frame": 48, + "annotations": [ + { + "segment": [ + 1.0, + 2.0 + ], + "label": "Drinking beer" + } + ], + "feature_frame": 48, + "fps": 24.0, + "rfps": 24.0 + } + } diff --git a/tests/data/annotations/audio_feature_test_list.txt b/tests/data/annotations/audio_feature_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2bc7268bca69f8bfd61b29e2b72d316f05634f2 --- /dev/null +++ b/tests/data/annotations/audio_feature_test_list.txt @@ -0,0 +1,2 @@ +test 100 127 +test 100 127 diff --git a/tests/data/annotations/audio_test_list.txt b/tests/data/annotations/audio_test_list.txt new file mode 100644 index 
0000000000000000000000000000000000000000..247dea4f4d910fa839b5283296ad08f26d4d3d64 --- /dev/null +++ b/tests/data/annotations/audio_test_list.txt @@ -0,0 +1,2 @@ +test.wav 100 127 +test.wav 100 127 diff --git a/tests/data/annotations/hvu_frame_test_anno.json b/tests/data/annotations/hvu_frame_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..fa9e5da4f1fc7571ac05a6d42ca869fd710006a7 --- /dev/null +++ b/tests/data/annotations/hvu_frame_test_anno.json @@ -0,0 +1,24 @@ +[ + { + "frame_dir":"imgs", + "total_frames":5, + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + }, + { + "frame_dir":"imgs", + "total_frames":5, + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + } +] diff --git a/tests/data/annotations/hvu_video_eval_test_anno.json b/tests/data/annotations/hvu_video_eval_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..a77398ba6c9df29d0f7d4e38cc2246ea80559cf2 --- /dev/null +++ b/tests/data/annotations/hvu_video_eval_test_anno.json @@ -0,0 +1,18 @@ +[ + { + "filename":"test.mp4", + "label":{ + "action": [2], + "scene": [2], + "object": [1] + } + }, + { + "filename":"test.avi", + "label":{ + "action": [1], + "scene": [1], + "object": [2] + } + } +] diff --git a/tests/data/annotations/hvu_video_test_anno.json b/tests/data/annotations/hvu_video_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..f5b2c58f2b428ce8e27b3bd23e35cf3ad3514f1a --- /dev/null +++ b/tests/data/annotations/hvu_video_test_anno.json @@ -0,0 +1,22 @@ +[ + { + "filename":"tmp.mp4", + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + }, + { + "filename":"tmp.mp4", + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + } +] diff --git a/tests/data/annotations/proposal_normalized_list.txt b/tests/data/annotations/proposal_normalized_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d43bee42a0cf59f5c6ab204eb62b163070e1630 --- /dev/null +++ b/tests/data/annotations/proposal_normalized_list.txt @@ -0,0 +1,18 @@ +# 0 +imgs +5 +1 +2 +3 0.2000 0.4000 +3 0.6000 1.0000 +10 +3 1.0000 1.0000 0.2000 0.4000 +3 0.5000 0.5000 0.2000 0.6000 +3 0.3333 0.3333 0.2000 0.8000 +3 0.5000 0.5000 0.2000 1.0000 +3 0.0000 0.0000 0.4000 0.6000 +3 0.3333 0.5000 0.4000 0.8000 +3 0.6666 0.6666 0.4000 1.0000 +3 0.5000 1.0000 0.6000 0.8000 +3 1.0000 1.0000 0.6000 1.0000 +3 0.5000 1.0000 0.8000 1.0000 diff --git a/tests/data/annotations/proposal_test_list.txt b/tests/data/annotations/proposal_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba446556f439892c25614bc35bdcea37ddad56a1 --- /dev/null +++ b/tests/data/annotations/proposal_test_list.txt @@ -0,0 +1,18 @@ +# 0 +imgs +5 +1 +2 +3 1 2 +3 3 5 +10 +3 1.0000 1.0000 1 2 +3 0.5000 0.5000 1 3 +3 0.3333 0.3333 1 4 +3 0.5000 0.5000 1 5 +3 0.0000 0.0000 2 3 +3 0.3333 0.5000 2 4 +3 0.6666 0.6666 2 5 +3 0.5000 1.0000 3 4 +3 1.0000 1.0000 3 5 +3 0.5000 1.0000 4 5 diff --git a/tests/data/annotations/rawframe_test_list.txt b/tests/data/annotations/rawframe_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2c5b7d86afb4d6f46df30e10bd4740a1c3526c2 --- /dev/null +++ 
b/tests/data/annotations/rawframe_test_list.txt @@ -0,0 +1,2 @@ +imgs 5 127 +imgs 5 127 diff --git a/tests/data/annotations/rawframe_test_list_multi_label.txt b/tests/data/annotations/rawframe_test_list_multi_label.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5247c74dffba7560243f2c075b3d93adde7eaf7 --- /dev/null +++ b/tests/data/annotations/rawframe_test_list_multi_label.txt @@ -0,0 +1,2 @@ +imgs 5 1 +imgs 5 3 5 diff --git a/tests/data/annotations/rawframe_test_list_with_offset.txt b/tests/data/annotations/rawframe_test_list_with_offset.txt new file mode 100644 index 0000000000000000000000000000000000000000..620ed78a877063c458a6adca621165c7d2f94874 --- /dev/null +++ b/tests/data/annotations/rawframe_test_list_with_offset.txt @@ -0,0 +1,2 @@ +imgs 2 5 127 +imgs 2 5 127 diff --git a/tests/data/annotations/rawvideo_test_anno.json b/tests/data/annotations/rawvideo_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..f67ad8e56794680cbd806697cf0d4ea0c6ad1f5f --- /dev/null +++ b/tests/data/annotations/rawvideo_test_anno.json @@ -0,0 +1,8 @@ +[ + { + "video_dir":"rawvideo_dataset", + "label":1, + "num_clips":2, + "positive_clip_inds":[0] + } +] diff --git a/tests/data/annotations/rawvideo_test_anno.txt b/tests/data/annotations/rawvideo_test_anno.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bdd573e27f57a09032e8874d5746d58ee5899fe --- /dev/null +++ b/tests/data/annotations/rawvideo_test_anno.txt @@ -0,0 +1 @@ +rawvideo_dataset 1 2 0 diff --git a/tests/data/annotations/sample.pkl b/tests/data/annotations/sample.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8bd245b9a6269c66c85bfc9b75722ac1315a13bf --- /dev/null +++ b/tests/data/annotations/sample.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eacddfc5e56956a7af7e3a3f7c57fac9d8cf5892e36f16274a1f2f5217f16c2 +size 278252 diff --git a/tests/data/annotations/video_test_list.txt b/tests/data/annotations/video_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a7fb78cb27309683dc65a61a717206a9d460d33 --- /dev/null +++ b/tests/data/annotations/video_test_list.txt @@ -0,0 +1,2 @@ +test.mp4 0 +test.mp4 0 diff --git a/tests/data/annotations/video_test_list_multi_label.txt b/tests/data/annotations/video_test_list_multi_label.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f1a510d7981711a49a9875c35a3dc1cdd601341 --- /dev/null +++ b/tests/data/annotations/video_test_list_multi_label.txt @@ -0,0 +1,2 @@ +test.mp4 0 3 +test.mp4 0 2 4 diff --git a/tests/data/annotations/video_text_test_list.json b/tests/data/annotations/video_text_test_list.json new file mode 100644 index 0000000000000000000000000000000000000000..99e968fe88499ca6264b612bc73c722eab14c3f5 --- /dev/null +++ b/tests/data/annotations/video_text_test_list.json @@ -0,0 +1 @@ +{"test.mp4": ["A person is cleaning a swimming pool", "A person is using a cleaning machine to clean the swimming pool"]} \ No newline at end of file diff --git a/tests/data/ava_dataset/action_list.txt b/tests/data/ava_dataset/action_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a5d442bbf947694c274798064263bdca31255d2 --- /dev/null +++ b/tests/data/ava_dataset/action_list.txt @@ -0,0 +1,16 @@ +item { + name: "action1" + id: 12 +} +item { + name: "action2" + id: 17 +} +item { + name: "action3" + id: 79 +} +item { + name: "action3" + id: 80 +} diff --git 
a/tests/data/ava_dataset/ava_excluded_timestamps_sample.csv b/tests/data/ava_dataset/ava_excluded_timestamps_sample.csv new file mode 100644 index 0000000000000000000000000000000000000000..3b353a69e9dc311688c8470fc6ec5da18e3a273d --- /dev/null +++ b/tests/data/ava_dataset/ava_excluded_timestamps_sample.csv @@ -0,0 +1,2 @@ +0f39OWEqJ24,0903 +_-Z6wFjXtGQ,0902 diff --git a/tests/data/ava_dataset/ava_proposals_sample.pkl b/tests/data/ava_dataset/ava_proposals_sample.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e6113a28ca8da045ffda783d4cdb227012bd8c3f --- /dev/null +++ b/tests/data/ava_dataset/ava_proposals_sample.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c688e3fdd42655fd4d744813a4ba507d1349917f28215b8c9eba03a3aa9e4b8 +size 476 diff --git a/tests/data/ava_dataset/ava_sample.csv b/tests/data/ava_dataset/ava_sample.csv new file mode 100644 index 0000000000000000000000000000000000000000..888369ce272399e3a514da987f2293a326a7e933 --- /dev/null +++ b/tests/data/ava_dataset/ava_sample.csv @@ -0,0 +1,8 @@ +0f39OWEqJ24,0902,0.031,0.162,0.670,0.995,12,0 +0f39OWEqJ24,0902,0.031,0.162,0.670,0.995,17,0 +0f39OWEqJ24,0902,0.031,0.162,0.670,0.995,79,0 +0f39OWEqJ24,0903,0.034,0.189,0.669,0.980,12,0 +0f39OWEqJ24,0903,0.034,0.189,0.669,0.980,17,0 +_-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0 +_-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0 +_-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,80,0 diff --git a/tests/data/bsp_features/v_test1.npy b/tests/data/bsp_features/v_test1.npy new file mode 100644 index 0000000000000000000000000000000000000000..57e96ee51702ddd221ebf17961feb07cca02b9c4 --- /dev/null +++ b/tests/data/bsp_features/v_test1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf3a0c38540d78aac01ce02a17af366cd95c54e05d80748fafa294f2ad19964 +size 170368 diff --git a/tests/data/eval_detection/action_list.txt b/tests/data/eval_detection/action_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ca0ac4a6732343148d6b90c884f275c6c68fe2 --- /dev/null +++ b/tests/data/eval_detection/action_list.txt @@ -0,0 +1,12 @@ +item { + name: "action1" + id: 1 +} +item { + name: "action2" + id: 2 +} +item { + name: "action3" + id: 3 +} diff --git a/tests/data/eval_detection/gt.csv b/tests/data/eval_detection/gt.csv new file mode 100644 index 0000000000000000000000000000000000000000..b68e8f60d8692d48477fef5d431d51a611d1c353 --- /dev/null +++ b/tests/data/eval_detection/gt.csv @@ -0,0 +1,12 @@ +3reY9zJKhqN,1774,0.278,0.203,0.964,0.677,3,0 +3reY9zJKhqN,1774,0.050,0.230,0.522,0.952,1,1 +3reY9zJKhqN,1774,0.154,0.039,0.757,0.743,1,2 +3reY9zJKhqN,1774,0.428,0.482,0.659,0.607,2,3 +HmR8SmNIoxu,1384,0.278,0.296,0.729,0.957,3,0 +HmR8SmNIoxu,1384,0.254,0.371,0.677,0.859,3,1 +HmR8SmNIoxu,1384,0.061,0.318,0.584,0.710,1,2 +HmR8SmNIoxu,1384,0.484,0.483,0.895,0.837,3,3 +5HNXoce1raG,1097,0.195,0.031,1.000,0.664,2,0 +5HNXoce1raG,1097,0.047,0.218,0.512,0.504,1,1 +5HNXoce1raG,1097,0.362,0.465,0.932,0.696,2,2 +5HNXoce1raG,1097,0.446,0.156,0.856,0.951,3,3 diff --git a/tests/data/eval_detection/pred.csv b/tests/data/eval_detection/pred.csv new file mode 100644 index 0000000000000000000000000000000000000000..ff14331acc0ead0199bbe05902b4b68222cb0c8a --- /dev/null +++ b/tests/data/eval_detection/pred.csv @@ -0,0 +1,30 @@ +3reY9zJKhqN,1774,0.072,0.470,0.840,0.898,2,0.655 +3reY9zJKhqN,1774,0.230,0.215,0.781,0.534,1,0.949 +3reY9zJKhqN,1774,0.195,0.128,0.643,0.944,1,0.640 +3reY9zJKhqN,1774,0.236,0.189,0.689,0.740,3,0.681 
+3reY9zJKhqN,1774,0.375,0.371,0.726,0.804,3,0.425 +3reY9zJKhqN,1774,0.024,0.398,0.776,0.719,1,0.160 +3reY9zJKhqN,1774,0.477,0.135,0.959,0.967,2,0.753 +3reY9zJKhqN,1774,0.435,0.071,0.966,0.578,1,0.088 +3reY9zJKhqN,1774,0.089,0.494,0.583,0.669,1,0.084 +3reY9zJKhqN,1774,0.136,0.129,0.507,0.532,1,0.041 +HmR8SmNIoxu,1384,0.152,0.299,0.599,0.577,1,0.060 +HmR8SmNIoxu,1384,0.360,0.170,0.731,0.987,3,0.138 +HmR8SmNIoxu,1384,0.348,0.193,0.533,0.727,2,0.429 +HmR8SmNIoxu,1384,0.242,0.396,0.875,0.907,2,0.470 +HmR8SmNIoxu,1384,0.496,0.023,0.730,0.673,3,0.473 +HmR8SmNIoxu,1384,0.038,0.025,0.843,0.570,1,0.606 +HmR8SmNIoxu,1384,0.156,0.193,0.836,0.836,2,0.388 +HmR8SmNIoxu,1384,0.433,0.072,0.962,0.755,3,0.787 +HmR8SmNIoxu,1384,0.430,0.026,0.948,0.524,2,0.518 +HmR8SmNIoxu,1384,0.273,0.210,0.907,0.712,3,0.396 +5HNXoce1raG,1097,0.331,0.328,0.783,0.825,3,0.157 +5HNXoce1raG,1097,0.140,0.195,0.558,0.983,3,0.989 +5HNXoce1raG,1097,0.130,0.207,0.761,0.523,2,0.976 +5HNXoce1raG,1097,0.145,0.444,0.611,0.571,1,0.560 +5HNXoce1raG,1097,0.448,0.116,0.513,0.657,1,0.131 +5HNXoce1raG,1097,0.468,0.361,0.511,0.512,2,0.608 +5HNXoce1raG,1097,0.321,0.093,0.749,0.841,1,0.298 +5HNXoce1raG,1097,0.018,0.137,0.650,0.832,3,0.390 +5HNXoce1raG,1097,0.002,0.417,0.851,0.573,1,0.083 +5HNXoce1raG,1097,0.130,0.389,0.872,0.611,2,0.912 diff --git a/tests/data/eval_detection/proposal.pkl b/tests/data/eval_detection/proposal.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1f55c278e90030cf48a6c6d0b4ca3a0234c4d1a3 --- /dev/null +++ b/tests/data/eval_detection/proposal.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c86e60cdae116dda8683d7bb68d3ac49e0beea8042371122acea1ee2f63faa1e +size 2085 diff --git a/tests/data/eval_localization/gt.json b/tests/data/eval_localization/gt.json new file mode 100644 index 0000000000000000000000000000000000000000..a82f034b59c1195b6e2bcc8f39857e30d202e40b --- /dev/null +++ b/tests/data/eval_localization/gt.json @@ -0,0 +1,46 @@ +{ + "v_bYUmtLBL7W4": { + "duration": 224.49, + "subset": "validation", + "resolution": "1920x1080", + "url": "https://www.youtube.com/watch?v=bYUmtLBL7W4", + "annotations": [ + { + "segment": [ + 11.553655226209049, + 57.06805460218409 + ], + "label": "Wakeboarding" + }, + { + "segment": [ + 68.62170982839314, + 126.03987519500778 + ], + "label": "Wakeboarding" + }, + { + "segment": [ + 135.4928658346334, + 201.31368954758187 + ], + "label": "Wakeboarding" + } + ] + }, + "v_hDPLy21Yyuk": { + "duration": 76.23, + "subset": "validation", + "resolution": "1280x720", + "url": "https://www.youtube.com/watch?v=hDPLy21Yyuk", + "annotations": [ + { + "segment": [ + 21.392480499219968, + 76.161 + ], + "label": "Cleaning shoes" + } + ] + } +} diff --git a/tests/data/eval_localization/result.json b/tests/data/eval_localization/result.json new file mode 100644 index 0000000000000000000000000000000000000000..04ea06e4b9b597b6d9a642d76cbd3d97a54789b8 --- /dev/null +++ b/tests/data/eval_localization/result.json @@ -0,0 +1,120 @@ +{ + "results": { + "bYUmtLBL7W4": [ + { + "label": "Wakeboarding", + "score": 0.6533445119857788, + "segment": [ + 0.0, + 206.3465619982159 + ] + }, + { + "label": "Wakeboarding", + "score": 0.5620265007019043, + "segment": [ + 33.64346119536128, + 206.3465619982159 + ] + }, + { + "label": "Wakeboarding", + "score": 0.4421495497226715, + "segment": [ + 148.03122925958965, + 204.1036645851918 + ] + }, + { + "label": "Wakeboarding", + "score": 0.31284379959106445, + "segment": [ + 0.0, + 123.35935771632472 + ] + }, + { + "label": 
"Wakeboarding", + "score": 0.2897574603557587, + "segment": [ + 67.28692239072257, + 206.3465619982159 + ] + }, + { + "label": "Wakeboarding", + "score": 0.284942090511322, + "segment": [ + 33.64346119536128, + 125.60225512934882 + ] + }, + { + "label": "Wakeboarding", + "score": 0.12905514240264893, + "segment": [ + 0.0, + 53.829537912578054 + ] + }, + { + "label": "Wakeboarding", + "score": 0.12616874277591705, + "segment": [ + 67.28692239072257, + 123.35935771632472 + ] + }, + { + "label": "Wakeboarding", + "score": 0.12591737508773804, + "segment": [ + 100.93038358608386, + 204.1036645851918 + ] + }, + { + "label": "Wakeboarding", + "score": 0.10444077104330064, + "segment": [ + 38.12925602140946, + 53.829537912578054 + ] + } + ], + "hDPLy21Yyuk": [ + { + "label": "Cleaning shoes", + "score": 0.5667440891265869, + "segment": [ + 21.222965776805253, + 75.03834328227572 + ] + }, + { + "label": "Cleaning shoes", + "score": 0.414698988199234, + "segment": [ + 21.222965776805253, + 43.96185768052516 + ] + }, + { + "label": "Cleaning shoes", + "score": 0.21768000721931455, + "segment": [ + 0.0, + 75.03834328227572 + ] + }, + { + "label": "Cleaning shoes", + "score": 0.10800375044345856, + "segment": [ + 29.560559474835888, + 70.49056490153174 + ] + } + ] + } +} diff --git a/tests/data/eval_multisports/data_samples.pkl b/tests/data/eval_multisports/data_samples.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ca2b050c40607b5d7f13ffcfbdce7489991e7266 --- /dev/null +++ b/tests/data/eval_multisports/data_samples.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d2edf0d7168573a7007df44b0078379310038265ab9452ad28cc630a68e48b +size 368784 diff --git a/tests/data/eval_multisports/gt.pkl b/tests/data/eval_multisports/gt.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8a74fbb169c76b4e5f849b0308c9498c98ad1aae --- /dev/null +++ b/tests/data/eval_multisports/gt.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc51c0efc5516e0472a568ecbdb782c6557c706c72c3129a19d2e261a5d86d4 +size 7891 diff --git a/tests/data/imgs/img_00001.jpg b/tests/data/imgs/img_00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e846e5af2e8cad0c4d99f440b0d1d7709f82fd26 Binary files /dev/null and b/tests/data/imgs/img_00001.jpg differ diff --git a/tests/data/imgs/img_00002.jpg b/tests/data/imgs/img_00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6d7c81b31702ec1e861eb94fbeed4509d4a23d75 Binary files /dev/null and b/tests/data/imgs/img_00002.jpg differ diff --git a/tests/data/imgs/img_00003.jpg b/tests/data/imgs/img_00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6289b32ecf59281f846de097bad6d577b9fb59a4 Binary files /dev/null and b/tests/data/imgs/img_00003.jpg differ diff --git a/tests/data/imgs/img_00004.jpg b/tests/data/imgs/img_00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a75094d0d5b64889ed0c36c5ecdb98428ecd3b94 Binary files /dev/null and b/tests/data/imgs/img_00004.jpg differ diff --git a/tests/data/imgs/img_00005.jpg b/tests/data/imgs/img_00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..25828b83669ad72e1e76fb249282899798167eaf Binary files /dev/null and b/tests/data/imgs/img_00005.jpg differ diff --git a/tests/data/imgs/img_00006.jpg b/tests/data/imgs/img_00006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7f0fa6ca5ce2bd44b3c2108c9024cafcd9e12fc7 Binary 
files /dev/null and b/tests/data/imgs/img_00006.jpg differ diff --git a/tests/data/imgs/img_00007.jpg b/tests/data/imgs/img_00007.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2ebc51fe1b110d26e1201299eaad86c4ee5c0460 Binary files /dev/null and b/tests/data/imgs/img_00007.jpg differ diff --git a/tests/data/imgs/img_00008.jpg b/tests/data/imgs/img_00008.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f9747042fbb3c2409c013b288076bdc9d2a0d3aa Binary files /dev/null and b/tests/data/imgs/img_00008.jpg differ diff --git a/tests/data/imgs/img_00009.jpg b/tests/data/imgs/img_00009.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b4a74ebb0debc4fcbd9c96e75fb679383a05fbbb Binary files /dev/null and b/tests/data/imgs/img_00009.jpg differ diff --git a/tests/data/imgs/img_00010.jpg b/tests/data/imgs/img_00010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9944e620895f613649e97c7cd74a4c3f6d1ab746 Binary files /dev/null and b/tests/data/imgs/img_00010.jpg differ diff --git a/tests/data/imgs/x_00001.jpg b/tests/data/imgs/x_00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..705ba4b6aee3fd579f0c6ff3edf709b27bacdb8b Binary files /dev/null and b/tests/data/imgs/x_00001.jpg differ diff --git a/tests/data/imgs/x_00002.jpg b/tests/data/imgs/x_00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f5016755fb98a2f7f81913b85d1c8e728f6098bb Binary files /dev/null and b/tests/data/imgs/x_00002.jpg differ diff --git a/tests/data/imgs/x_00003.jpg b/tests/data/imgs/x_00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f419d712d874e67305799c39818c8375e3c66d15 Binary files /dev/null and b/tests/data/imgs/x_00003.jpg differ diff --git a/tests/data/imgs/x_00004.jpg b/tests/data/imgs/x_00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb52d25933899bc3bee8a29e36fa22554f0b2e31 Binary files /dev/null and b/tests/data/imgs/x_00004.jpg differ diff --git a/tests/data/imgs/x_00005.jpg b/tests/data/imgs/x_00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..399fda2544f4c819ec86bcd460d90fd90a27c1c2 Binary files /dev/null and b/tests/data/imgs/x_00005.jpg differ diff --git a/tests/data/imgs/y_00001.jpg b/tests/data/imgs/y_00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..743b0b2a6d1c16093a5a20b1d0583d791145d4b9 Binary files /dev/null and b/tests/data/imgs/y_00001.jpg differ diff --git a/tests/data/imgs/y_00002.jpg b/tests/data/imgs/y_00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..37f84d07eec4abc29311f743636c9e33d67bbe15 Binary files /dev/null and b/tests/data/imgs/y_00002.jpg differ diff --git a/tests/data/imgs/y_00003.jpg b/tests/data/imgs/y_00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..938a5b6cdc6280c477f88f1815177888200aead5 Binary files /dev/null and b/tests/data/imgs/y_00003.jpg differ diff --git a/tests/data/imgs/y_00004.jpg b/tests/data/imgs/y_00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..af4c666c4c411c97ab8d48034ddaaa0b8c05855a Binary files /dev/null and b/tests/data/imgs/y_00004.jpg differ diff --git a/tests/data/imgs/y_00005.jpg b/tests/data/imgs/y_00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41e05d707236b7cf9cb3c48d01ac3b8d2cbfe3bf Binary files /dev/null and b/tests/data/imgs/y_00005.jpg differ diff --git 
a/tests/data/lfb/lfb_unittest.pkl b/tests/data/lfb/lfb_unittest.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9d93e4da01f55ba007e0682846c315d9d64594b3 --- /dev/null +++ b/tests/data/lfb/lfb_unittest.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9e8ec0dcce016f4f35d69e3386ba5c4e449ad623eddd97de92c42b79670c0da +size 81082 diff --git a/tests/data/multisports_dataset/multisports_proposals_sample.pkl b/tests/data/multisports_dataset/multisports_proposals_sample.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4ee2f3e359be2aa7fd84f8b78e3f564c72bcf8da --- /dev/null +++ b/tests/data/multisports_dataset/multisports_proposals_sample.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97e778f7f2b9adf28215625ee71aaa62b522931cdf2945235ce5fa06a7968dcb +size 2982 diff --git a/tests/data/multisports_dataset/multisports_sample.csv b/tests/data/multisports_dataset/multisports_sample.csv new file mode 100644 index 0000000000000000000000000000000000000000..d457a7399b591f74491c23b19250d3cc25a3d163 --- /dev/null +++ b/tests/data/multisports_dataset/multisports_sample.csv @@ -0,0 +1,9 @@ +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,377,0.706,0.439,0.794,0.811,11,0 +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,378,0.689,0.438,0.794,0.804,11,0 +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,379,0.672,0.419,0.802,0.797,11,0 +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,380,0.680,0.361,0.791,0.783,11,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,443,0.109,0.669,0.345,0.768,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,444,0.112,0.668,0.347,0.767,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,445,0.115,0.663,0.350,0.761,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,446,0.117,0.644,0.352,0.757,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,447,0.118,0.636,0.352,0.754,1,0 \ No newline at end of file diff --git a/tests/data/proposals/v_test1.csv b/tests/data/proposals/v_test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..9d4f4a575bf295cc7d28ebb987285779eaeef4ea --- /dev/null +++ b/tests/data/proposals/v_test1.csv @@ -0,0 +1,10 @@ +tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa +0.1,0.2,0.95,0.96,0.97,0.85,0.84 +0.2,0.3,0.94,0.95,0.96,0.84,0.83 +0.3,0.4,0.93,0.94,0.95,0.83,0.82 +0.4,0.5,0.92,0.93,0.94,0.82,0.81 +0.5,0.6,0.91,0.92,0.93,0.81,0.80 +0.6,0.7,0.90,0.91,0.92,0.80,0.79 +0.5,0.7,0.90,0.91,0.92,0.80,0.79 +0.6,0.8,0.90,0.91,0.92,0.80,0.79 +0.4,0.7,0.90,0.91,0.92,0.80,0.79 diff --git a/tests/data/proposals/v_test2.csv b/tests/data/proposals/v_test2.csv new file mode 100644 index 0000000000000000000000000000000000000000..a2863df6e6cd7e527490d9919cf896bda45995e7 --- /dev/null +++ b/tests/data/proposals/v_test2.csv @@ -0,0 +1,7 @@ +tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa +0.1,0.2,0.95,0.96,0.97,0.75,0.74 +0.2,0.3,0.94,0.95,0.96,0.74,0.73 +0.3,0.4,0.93,0.94,0.95,0.73,0.72 +0.4,0.5,0.92,0.93,0.94,0.72,0.71 +0.5,0.6,0.91,0.92,0.93,0.71,0.70 +0.6,0.7,0.90,0.91,0.92,0.70,0.79 diff --git a/tests/data/rawvideo_dataset/part_0.mp4 b/tests/data/rawvideo_dataset/part_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fb5b091c6ca597ad605b3b125eb47cf24295468d --- /dev/null +++ b/tests/data/rawvideo_dataset/part_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019048c7c7e6ceb3b4821f027ff3cda0e198ab33dd28b32d9ee7e397be87b4b3 +size 158581 diff --git a/tests/data/rawvideo_dataset/part_1.mp4 b/tests/data/rawvideo_dataset/part_1.mp4 new 
file mode 100644 index 0000000000000000000000000000000000000000..fb5b091c6ca597ad605b3b125eb47cf24295468d --- /dev/null +++ b/tests/data/rawvideo_dataset/part_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019048c7c7e6ceb3b4821f027ff3cda0e198ab33dd28b32d9ee7e397be87b4b3 +size 158581 diff --git a/tests/data/tem_results/v_test1.csv b/tests/data/tem_results/v_test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..5ec36a078ec9483906af43074bb3dee54d941ca0 --- /dev/null +++ b/tests/data/tem_results/v_test1.csv @@ -0,0 +1,11 @@ +action,start,end,tmin,tmax +3.711169585585594177e-02,5.839086771011352539e-01,1.464508026838302612e-01,0.0,0.1 +1.555041410028934479e-02,3.062666654586791992e-01,2.622193098068237305e-01,0.1,0.2 +1.146762818098068237e-02,1.464279890060424805e-01,3.260520696640014648e-01,0.2,0.3 +1.371797081083059311e-02,1.365097165107727051e-01,3.570831716060638428e-01,0.3,0.4 +1.519643329083919525e-02,1.688144057989120483e-01,3.057994544506072998e-01,0.4,0.5 +1.968025043606758118e-02,1.974480003118515015e-01,2.933082580566406250e-01,0.5,0.6 +2.251588553190231323e-02,1.885317713022232056e-01,3.326449990272521973e-01,0.6,0.7 +2.402217499911785126e-02,1.918197423219680786e-01,3.420312106609344482e-01,0.7,0.8 +2.045033127069473267e-02,1.970291137695312500e-01,3.339000344276428223e-01,0.8,0.9 +3.435279428958892822e-02,5.583426356315612793e-01,1.250019371509552002e-01,0.9,1.0 diff --git a/tests/data/tem_results/v_test2.csv b/tests/data/tem_results/v_test2.csv new file mode 100644 index 0000000000000000000000000000000000000000..79f0685ea72932ea1bbef6716410b6ec41896718 --- /dev/null +++ b/tests/data/tem_results/v_test2.csv @@ -0,0 +1,11 @@ +action,start,end,tmin,tmax +5.711169585585594177e-02,7.839086771011352539e-01,3.464508026838302612e-01,0.0,0.1 +2.555041410028934479e-02,3.062666654586791992e-01,3.622193098068237305e-01,0.1,0.2 +2.146762818098068237e-02,2.464279890060424805e-01,3.260520696640014648e-01,0.2,0.3 +1.371797081083059311e-02,1.365097165107727051e-01,3.570831716060638428e-01,0.3,0.4 +1.519643329083919525e-02,1.688144057989120483e-01,3.057994544506072998e-01,0.4,0.5 +1.968025043606758118e-02,1.974480003118515015e-01,2.933082580566406250e-01,0.5,0.6 +2.251588553190231323e-02,1.885317713022232056e-01,3.326449990272521973e-01,0.6,0.7 +2.402217499911785126e-02,1.918197423219680786e-01,3.420312106609344482e-01,0.7,0.8 +2.045033127069473267e-02,1.970291137695312500e-01,3.339000344276428223e-01,0.8,0.9 +3.435279428958892822e-02,5.583426356315612793e-01,1.250019371509552002e-01,0.9,1.0 diff --git a/tests/data/test.avi b/tests/data/test.avi new file mode 100644 index 0000000000000000000000000000000000000000..b65a08432cb327a7592f72e244a73cb8dcdb9d88 --- /dev/null +++ b/tests/data/test.avi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3ebb11e80d2900071ff929633b7476a33ee1698ac9a91206e2ba64c1c28920 +size 294566 diff --git a/tests/data/test.jpg b/tests/data/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d88aea0ac50bce6efdde58c2248bbd25d1ae9122 Binary files /dev/null and b/tests/data/test.jpg differ diff --git a/tests/data/test.mp4 b/tests/data/test.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..883a242d1bfe37908c2d9545de5bba067029de33 --- /dev/null +++ b/tests/data/test.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38eff15224f44090631dc29b28804f5781c07c1a579e918a74d3e3bb9d12cb59 +size 1352828 diff --git a/tests/data/test.wav 
b/tests/data/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..4ff616e5b8f1b8468cc319ea845ffa02da6bf7bb --- /dev/null +++ b/tests/data/test.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c139b1dcd0ebebbe6417038d75126b25fbf259b7993eedce5130bc200f55049 +size 419710 diff --git a/tests/datasets/__init__.py b/tests/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..41c3272a7cce96aeda4fccabef088e589347e63a --- /dev/null +++ b/tests/datasets/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseTestDataset + +__all__ = ['BaseTestDataset'] diff --git a/tests/datasets/base.py b/tests/datasets/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e72a3a1e0b8e536f0bb1040600fc22e0ec7de600 --- /dev/null +++ b/tests/datasets/base.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +from mmengine import ConfigDict + + +class BaseTestDataset: + + @classmethod + def setup_class(cls): + # prefix path + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../data/')) + cls.ann_file_prefix = osp.join(cls.data_prefix, 'annotations') + + # annotations path + cls.action_ann_file = osp.join(cls.ann_file_prefix, + 'action_test_anno.json') + cls.audio_feature_ann_file = osp.join(cls.ann_file_prefix, + 'audio_feature_test_list.txt') + cls.audio_ann_file = osp.join(cls.ann_file_prefix, + 'audio_test_list.txt') + cls.frame_ann_file_multi_label = osp.join( + cls.ann_file_prefix, 'rawframe_test_list_multi_label.txt') + cls.frame_ann_file_with_offset = osp.join( + cls.ann_file_prefix, 'rawframe_test_list_with_offset.txt') + cls.frame_ann_file = osp.join(cls.ann_file_prefix, + 'rawframe_test_list.txt') + cls.hvu_frame_ann_file = osp.join(cls.ann_file_prefix, + 'hvu_frame_test_anno.json') + cls.hvu_video_ann_file = osp.join(cls.ann_file_prefix, + 'hvu_video_test_anno.json') + cls.hvu_video_eval_ann_file = osp.join( + cls.ann_file_prefix, 'hvu_video_eval_test_anno.json') + cls.proposal_ann_file = osp.join(cls.ann_file_prefix, + 'proposal_test_list.txt') + cls.proposal_norm_ann_file = osp.join(cls.ann_file_prefix, + 'proposal_normalized_list.txt') + cls.rawvideo_test_anno_json = osp.join(cls.ann_file_prefix, + 'rawvideo_test_anno.json') + cls.rawvideo_test_anno_txt = osp.join(cls.ann_file_prefix, + 'rawvideo_test_anno.txt') + cls.video_ann_file = osp.join(cls.ann_file_prefix, + 'video_test_list.txt') + cls.video_ann_file_multi_label = osp.join( + cls.ann_file_prefix, 'video_test_list_multi_label.txt') + cls.video_text_ann_file = osp.join(cls.ann_file_prefix, + 'video_text_test_list.json') + cls.pose_ann_file = osp.join(cls.ann_file_prefix, 'sample.pkl') + + # pipeline configuration + cls.action_pipeline = [] + cls.audio_feature_pipeline = [ + dict(type='LoadAudioFeature'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='AudioFeatureSelector') + ] + cls.audio_pipeline = [ + dict(type='AudioDecodeInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='AudioDecode') + ] + cls.frame_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='RawFrameDecode', io_backend='disk') + ] + cls.proposal_pipeline = [ + dict( + type='SampleProposalFrames', + clip_len=1, + body_segments=5, + aug_segments=(2, 2), + aug_ratio=0.5), + dict(type='RawFrameDecode', io_backend='disk') + ] 
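+        # Test-mode counterpart of ``proposal_pipeline`` above: the same
+        # SampleProposalFrames settings, but run with mode='test'.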
+ cls.proposal_test_pipeline = [ + dict( + type='SampleProposalFrames', + clip_len=1, + body_segments=5, + aug_segments=(2, 2), + aug_ratio=0.5, + mode='test'), + dict(type='RawFrameDecode', io_backend='disk') + ] + cls.proposal_train_cfg = ConfigDict( + dict( + ssn=dict( + assigner=dict( + positive_iou_threshold=0.7, + background_iou_threshold=0.01, + incomplete_iou_threshold=0.5, + background_coverage_threshold=0.02, + incomplete_overlap_threshold=0.01), + sampler=dict( + num_per_video=8, + positive_ratio=1, + background_ratio=1, + incomplete_ratio=6, + add_gt_as_proposals=True), + loss_weight=dict( + comp_loss_weight=0.1, reg_loss_weight=0.1), + debug=False))) + cls.proposal_test_cfg = ConfigDict( + dict( + ssn=dict( + sampler=dict(test_interval=6, batch_size=16), + evaluater=dict( + top_k=2000, + nms=0.2, + softmax_before_filter=True, + cls_top_k=2)))) + cls.proposal_test_cfg_topall = ConfigDict( + dict( + ssn=dict( + sampler=dict(test_interval=6, batch_size=16), + evaluater=dict( + top_k=-1, + nms=0.2, + softmax_before_filter=True, + cls_top_k=2)))) + cls.rawvideo_pipeline = [] + cls.video_pipeline = [ + dict(type='OpenCVInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='OpenCVDecode') + ] + + cls.video_text_pipeline = [ + dict(type='OpenCVInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='OpenCVDecode'), + dict(type='CLIPTokenize') + ] + + cls.hvu_categories = [ + 'action', 'attribute', 'concept', 'event', 'object', 'scene' + ] + cls.hvu_category_nums = [739, 117, 291, 69, 1679, 248] + cls.hvu_categories_for_eval = ['action', 'scene', 'object'] + cls.hvu_category_nums_for_eval = [3, 3, 3] + + cls.filename_tmpl = 'img_{:05d}.jpg' diff --git a/tests/datasets/test_ava_dataset.py b/tests/datasets/test_ava_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a327a031e54bf2408b78bc04dbae38de4a02c7fe --- /dev/null +++ b/tests/datasets/test_ava_dataset.py @@ -0,0 +1,353 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
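+# Unit tests for AVADataset (including its MultiSports usage) and
+# AVAKineticsDataset, built on the small fixtures added above under
+# tests/data/ava_dataset and tests/data/multisports_dataset: sample CSV
+# annotations, the excluded-timestamps list and LFS-tracked proposal pickles.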
+import os.path as osp + +import mmengine +import numpy as np +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from mmaction.datasets import AVADataset, AVAKineticsDataset +from mmaction.utils import register_all_modules + + +class TestAVADataset: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), './../data', 'ava_dataset')) + cls.label_file = osp.join(cls.data_prefix, 'action_list.txt') + cls.ann_file = osp.join(cls.data_prefix, 'ava_sample.csv') + cls.exclude_file = osp.join(cls.data_prefix, + 'ava_excluded_timestamps_sample.csv') + cls.proposal_file = osp.join(cls.data_prefix, + 'ava_proposals_sample.pkl') + cls.pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2) + ] + cls.proposal = mmengine.load(cls.proposal_file) + + def test_ava_dataset(self): + register_all_modules() + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + self.exclude_file, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + # custom classes + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + self.exclude_file, + label_file=self.label_file, + custom_classes=[17, 79], + num_classes=3, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # ava_infos = ava_dataset.video_infos + target_labels = np.array([1, 2]) + labels = np.zeros([3]) + labels[target_labels] = 1. + target_labels = labels[None, ...] + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + None, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + None, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + del ava_dataset + + def test_ava_pipeline(self): + register_all_modules() + target_keys = [ + 'frame_dir', 'video_id', 'timestamp', 'img_key', 'shot_info', + 'fps', 'filename_tmpl', 'modality', 'start_index', + 'timestamp_start', 'timestamp_end', 'proposals', 'scores', + 'frame_inds', 'clip_len', 'frame_interval', 'gt_labels', + 'gt_bboxes', 'entity_ids' + ] + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + self.exclude_file, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + result = ava_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] == 1 + assert result['timestamp_start'] == 900 + assert result['timestamp_end'] == 1800 + assert_array_equal(result['proposals'], + np.array([[0.011, 0.157, 0.655, 0.983]])) + assert_array_equal(result['scores'], np.array([0.998163])) + + assert result['clip_len'] == 32 + assert result['frame_interval'] == 2 + assert len(result['frame_inds']) == 32 + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + None, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # Try to get a sample + result = ava_dataset[0] + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] == 1 + assert result['timestamp_start'] == 900 + assert result['timestamp_end'] == 1800 + + +class TestMultiSportsDataset: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + 
osp.join( + osp.dirname(__file__), './../data', 'multisports_dataset')) + cls.ann_file = osp.join(cls.data_prefix, 'multisports_sample.csv') + cls.proposal_file = osp.join(cls.data_prefix, + 'multisports_proposals_sample.pkl') + cls.pipeline = [ + dict(type='DecordInit'), + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='DecordDecode') + ] + cls.proposal = mmengine.load(cls.proposal_file) + + def test_multisports_dataset(self): + register_all_modules() + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + + del ava_dataset + + def test_ava_pipeline(self): + register_all_modules() + target_keys = [ + 'filename', 'video_id', 'timestamp', 'img_key', 'shot_info', 'fps', + 'filename_tmpl', 'modality', 'start_index', 'timestamp_start', + 'timestamp_end', 'proposals', 'scores', 'frame_inds', 'clip_len', + 'frame_interval', 'gt_labels', 'gt_bboxes', 'entity_ids' + ] + + def mock_video_reader(filename): + from unittest.mock import MagicMock + container = MagicMock() + container.__len__.return_value = 100 + container.get_avg_fps.return_value = 24 + frame_batch = MagicMock() + frame_batch.asnumpy.return_value = np.zeros((32, 720, 1280, 3)) + container.get_batch.return_value = frame_batch + return container + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + + # Mock a decord Container + ava_dataset.pipeline.transforms[ + 0]._get_video_reader = mock_video_reader + result = ava_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + assert result['modality'] == 'RGB' + assert result['fps'] == 1 + assert result['start_index'] == 0 + + h, w = result['imgs'][0].shape[:2] + scale_factor = np.array([w, h, w, h]) + gt_bboxes = np.array([[0.71097612, 0.44144461, 0.79291363, 0.80873633], + [0.19915699, 0.40121613, 0.29834411, + 0.79667876]]) + assert_array_almost_equal( + result['proposals'], gt_bboxes * scale_factor, decimal=4) + assert_array_almost_equal(result['scores'], + np.array([0.994165, 0.9902001])) + + assert result['clip_len'] == 32 + assert result['frame_interval'] == 2 + assert len(result['frame_inds']) == 32 + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + # Mock a decord Container + ava_dataset.pipeline.transforms[ + 0]._get_video_reader = mock_video_reader + # Try to get a sample + result = ava_dataset[0] + assert result['modality'] == 'RGB' + assert result['fps'] == 1 + assert result['start_index'] == 0 + + +class TestAVAKineticsDataset: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), './../data', 'ava_dataset')) + cls.label_file = osp.join(cls.data_prefix, 'action_list.txt') + cls.ann_file = osp.join(cls.data_prefix, 'ava_sample.csv') + cls.exclude_file = osp.join(cls.data_prefix, + 
'ava_excluded_timestamps_sample.csv') + cls.proposal_file = osp.join(cls.data_prefix, + 'ava_proposals_sample.pkl') + cls.pipeline = [ + dict(dict(type='SampleAVAFrames', clip_len=32, frame_interval=2)) + ] + cls.proposal = mmengine.load(cls.proposal_file) + + def test_ava_kinetics_dataset(self): + register_all_modules() + ava_dataset = AVAKineticsDataset( + self.ann_file, + self.exclude_file, + self.pipeline, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + # custom classes + ava_dataset = AVAKineticsDataset( + self.ann_file, + self.exclude_file, + self.pipeline, + label_file=self.label_file, + custom_classes=[17, 79], + num_classes=3, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # ava_infos = ava_dataset.video_infos + target_labels = np.array([1, 2]) + labels = np.zeros([3]) + labels[target_labels] = 1. + target_labels = labels[None, ...] + + ava_dataset = AVAKineticsDataset( + self.ann_file, + None, + self.pipeline, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + ava_dataset = AVAKineticsDataset( + self.ann_file, + None, + self.pipeline, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + del ava_dataset + + def test_ava_kinetics_pipeline(self): + register_all_modules() + target_keys = [ + 'frame_dir', 'video_id', 'timestamp', 'img_key', 'shot_info', + 'fps', 'filename_tmpl', 'modality', 'start_index', + 'timestamp_start', 'timestamp_end', 'proposals', 'scores', + 'frame_inds', 'clip_len', 'frame_interval', 'gt_labels', + 'gt_bboxes', 'entity_ids' + ] + + ava_dataset = AVAKineticsDataset( + self.ann_file, + self.exclude_file, + self.pipeline, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + result = ava_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] == 0 + assert result['timestamp_start'] == 900 + assert result['timestamp_end'] == 1800 + assert_array_equal(result['proposals'], + np.array([[0.011, 0.157, 0.655, 0.983]])) + assert_array_equal(result['scores'], np.array([0.998163])) + + assert result['clip_len'] == 32 + assert result['frame_interval'] == 2 + assert len(result['frame_inds']) == 32 + + ava_dataset = AVAKineticsDataset( + self.ann_file, + None, + self.pipeline, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # Try to get a sample + result = ava_dataset[0] + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] >= 0 + assert result['timestamp_start'] > 0 + assert result['timestamp_end'] > result['timestamp_start'] diff --git a/tests/datasets/test_pose_dataset.py b/tests/datasets/test_pose_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..383d20acd1a6094c401f3cdbdc00a72b8be902bc --- /dev/null +++ b/tests/datasets/test_pose_dataset.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
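+# Unit tests for PoseDataset: the annotation file is
+# BaseTestDataset.pose_ann_file (annotations/sample.pkl), and the checks
+# below cover split selection plus filtering by box_thr and valid_ratio.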
+import numpy as np +import pytest + +from mmaction.datasets import PoseDataset +from .base import BaseTestDataset + + +class TestPoseDataset(BaseTestDataset): + + def test_pose_dataset(self): + ann_file = self.pose_ann_file + data_prefix = dict(video='root') + dataset = PoseDataset( + ann_file=ann_file, + pipeline=[], + split='train', + box_thr=0.5, + data_prefix=data_prefix) + assert len(dataset) == 100 + item = dataset[0] + assert item['frame_dir'].startswith(data_prefix['video']) + + dataset = PoseDataset( + ann_file=ann_file, + pipeline=[], + split='train', + valid_ratio=0.2, + box_thr=0.9) + assert len(dataset) == 84 + for item in dataset: + assert np.all(item['box_score'][item['anno_inds']] >= 0.9) + assert item['valid'][0.9] / item['total_frames'] >= 0.2 + + dataset = PoseDataset( + ann_file=ann_file, + pipeline=[], + split='train', + valid_ratio=0.3, + box_thr=0.7) + assert len(dataset) == 87 + for item in dataset: + assert np.all(item['box_score'][item['anno_inds']] >= 0.7) + assert item['valid'][0.7] / item['total_frames'] >= 0.3 + + with pytest.raises(AssertionError): + dataset = PoseDataset( + ann_file=ann_file, pipeline=[], valid_ratio=0.2, box_thr=0.55) diff --git a/tests/datasets/test_rawframe_dataset.py b/tests/datasets/test_rawframe_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..63a5338129121d733535fb7b52fdd9a1867e0653 --- /dev/null +++ b/tests/datasets/test_rawframe_dataset.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import RawframeDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestRawframDataset(BaseTestDataset): + + def test_rawframe_dataset(self): + rawframe_dataset = RawframeDataset(self.frame_ann_file, + self.frame_pipeline, + {'img': self.data_prefix}) + assert rawframe_dataset.start_index == 1 + + def test_rawframe_dataset_with_offset(self): + register_all_modules() + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, {'img': self.data_prefix}, + with_offset=True) + assert rawframe_dataset.start_index == 1 + + def test_rawframe_dataset_multi_label(self): + register_all_modules() + rawframe_dataset = RawframeDataset( + self.frame_ann_file_multi_label, + self.frame_pipeline, {'img': self.data_prefix}, + multi_class=True, + num_classes=100) + assert rawframe_dataset.start_index == 1 + + def test_dataset_realpath(self): + register_all_modules() + dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline, + {'img': '.'}) + dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline, + {'img': 's3://good'}) + assert dataset.data_prefix == {'img': 's3://good'} + + dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline) + + def test_rawframe_pipeline(self): + target_keys = [ + 'frame_dir', 'total_frames', 'label', 'filename_tmpl', + 'start_index', 'modality' + ] + + # RawframeDataset not in test mode + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + test_mode=False) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset in multi-class tasks + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + multi_class=True, + num_classes=400, + test_mode=False) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # 
RawframeDataset with offset + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, {'img': self.data_prefix}, + with_offset=True, + num_classes=400, + test_mode=False) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys + ['offset']) + + # RawframeDataset in test mode + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + test_mode=True) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset in multi-class tasks in test mode + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + multi_class=True, + num_classes=400, + test_mode=True) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset with offset + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, {'img': self.data_prefix}, + with_offset=True, + num_classes=400, + test_mode=True) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys + ['offset']) diff --git a/tests/datasets/test_repeataug_dataset.py b/tests/datasets/test_repeataug_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..184092d0c12be3406551d319bc545f8d9c477148 --- /dev/null +++ b/tests/datasets/test_repeataug_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import RepeatAugDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestVideoDataset(BaseTestDataset): + register_all_modules() + + def test_video_dataset(self): + with pytest.raises(AssertionError): + # Currently only support decord backend + video_dataset = RepeatAugDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + + video_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', clip_len=4, frame_interval=2, + num_clips=1), + dict(type='DecordDecode') + ] + + video_dataset = RepeatAugDataset( + self.video_ann_file, + video_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + assert len(video_dataset) == 2 + assert video_dataset.start_index == 3 + + video_dataset = RepeatAugDataset( + self.video_ann_file, + video_pipeline, + data_prefix={'video': self.data_prefix}) + assert video_dataset.start_index == 0 + + def test_video_dataset_multi_label(self): + video_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', clip_len=4, frame_interval=2, + num_clips=1), + dict(type='DecordDecode') + ] + video_dataset = RepeatAugDataset( + self.video_ann_file_multi_label, + video_pipeline, + data_prefix={'video': self.data_prefix}, + multi_class=True, + num_classes=100) + assert video_dataset.start_index == 0 + + def test_video_pipeline(self): + video_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', clip_len=4, frame_interval=2, + num_clips=1), + dict(type='DecordDecode') + ] + target_keys = ['filename', 'label', 'start_index', 'modality'] + + # RepeatAugDataset not in test mode + video_dataset = RepeatAugDataset( + self.video_ann_file, + video_pipeline, + data_prefix={'video': self.data_prefix}) + result = video_dataset[0] + assert isinstance(result, (list, tuple)) + assert assert_dict_has_keys(result[0], target_keys) diff --git 
a/tests/datasets/test_video_dataset.py b/tests/datasets/test_video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfe9dd800a92bab44cbe9d6a57e8108423ce644 --- /dev/null +++ b/tests/datasets/test_video_dataset.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import VideoDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestVideoDataset(BaseTestDataset): + register_all_modules() + + def test_video_dataset(self): + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + assert len(video_dataset) == 2 + assert video_dataset.start_index == 3 + + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}) + assert video_dataset.start_index == 0 + + def test_video_dataset_multi_label(self): + video_dataset = VideoDataset( + self.video_ann_file_multi_label, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + multi_class=True, + num_classes=100) + assert video_dataset.start_index == 0 + + def test_video_pipeline(self): + target_keys = ['filename', 'label', 'start_index', 'modality'] + + # VideoDataset not in test mode + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=False) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # VideoDataset in test mode + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=True) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) diff --git a/tests/datasets/test_video_text_dataset.py b/tests/datasets/test_video_text_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..49f150b867cc5e64b3efe1d6cf7b1914659a0a27 --- /dev/null +++ b/tests/datasets/test_video_text_dataset.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
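+# Unit tests for VideoTextDataset: annotations come from
+# BaseTestDataset.video_text_ann_file (annotations/video_text_test_list.json)
+# and the pipeline is BaseTestDataset.video_text_pipeline, which ends with
+# a CLIPTokenize step.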
+from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import VideoTextDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestVideoTextDataset(BaseTestDataset): + register_all_modules() + + def test_video_dataset(self): + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + assert len(video_dataset) == 2 + assert video_dataset.start_index == 3 + + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}) + assert video_dataset.start_index == 0 + + def test_video_pipeline(self): + target_keys = ['filename', 'text', 'start_index', 'modality', 'imgs'] + + # VideoTextDataset not in test mode + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=False) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # VideoTextDataset in test mode + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=True) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py new file mode 100644 index 0000000000000000000000000000000000000000..746b1d4ae83775e645f011ec314e2442c0428360 --- /dev/null +++ b/tests/datasets/transforms/test_formating.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import unittest + +import numpy as np +import pytest +import torch +from mmengine.structures import InstanceData +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_equal + +from mmaction.datasets.transforms import (FormatAudioShape, FormatGCNInput, + FormatShape, PackActionInputs, + Transpose) +from mmaction.registry import TRANSFORMS +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + +register_all_modules() + + +class TestPackActionInputs(unittest.TestCase): + + def test_transform(self): + # none input + with self.assertRaises(ValueError): + results = PackActionInputs()(dict()) + + # keypoint input + results = dict(keypoint=np.random.randn(2, 300, 17, 3), label=1) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (2, 300, 17, 3)) + self.assertEqual(results['data_samples'].gt_label, + torch.LongTensor([1])) + + # heatmap_imgs input + results = dict(heatmap_imgs=np.random.randn(2, 17, 56, 56), label=1) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (2, 17, 56, 56)) + self.assertEqual(results['data_samples'].gt_label, + torch.LongTensor([1])) + + # audios input + results = dict(audios=np.random.randn(3, 1, 128, 80), label=[1]) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertEqual(results['inputs'].shape, (3, 1, 128, 80)) + 
self.assertIsInstance(results['inputs'], torch.Tensor) + + # text input + results = dict(text=np.random.randn(77)) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertEqual(results['inputs'].shape, (77, )) + self.assertIsInstance(results['inputs'], torch.Tensor) + + # imgs input with label + data = dict( + imgs=np.random.randn(2, 256, 256, 3), + label=[1], + filename='test.txt', + original_shape=(256, 256, 3), + img_shape=(256, 256, 3), + flip_direction='vertical') + + transform = PackActionInputs() + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertEqual(results['data_samples'].img_shape, (256, 256, 3)) + self.assertEqual(results['data_samples'].gt_label, + torch.LongTensor([1])) + + # Test grayscale image + data['imgs'] = data['imgs'].mean(-1) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (2, 256, 256)) + + # imgs input with gt_bboxes + data = dict( + imgs=np.random.randn(256, 256, 3), + gt_bboxes=np.array([[0, 0, 340, 224]]), + gt_labels=[1], + proposals=np.array([[0, 0, 340, 224]]), + filename='test.txt') + + transform = PackActionInputs() + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].proposals, InstanceData) + + # imgs and text input + data = dict( + imgs=np.random.randn(2, 256, 256, 3), text=np.random.randn(77)) + + transform = PackActionInputs(collect_keys=('imgs', 'text')) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], dict) + self.assertEqual(results['inputs']['imgs'].shape, (2, 256, 256, 3)) + self.assertEqual(results['inputs']['text'].shape, (77, )) + + def test_repr(self): + cfg = dict( + type='PackActionInputs', meta_keys=['flip_direction', 'img_shape']) + transform = TRANSFORMS.build(cfg) + self.assertEqual( + repr(transform), 'PackActionInputs(collect_keys=None, ' + "meta_keys=['flip_direction', 'img_shape'])") + + +class TestPackLocalizationInputs(unittest.TestCase): + + def test_transform(self): + # raw_feature input + data = dict( + raw_feature=np.random.randn(400, 5), + gt_bbox=np.array([[0.1, 0.3], [0.375, 0.625]]), + filename='test.txt') + + cfg = dict(type='PackLocalizationInputs', keys=('gt_bbox', )) + transform = TRANSFORMS.build(cfg) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + + del data['raw_feature'] + with self.assertRaises(ValueError): + transform(copy.deepcopy(data)) + + # bsp_feature input + data['bsp_feature'] = np.random.randn(100, 32) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + 
self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + + def test_repr(self): + cfg = dict( + type='PackLocalizationInputs', + meta_keys=['video_name', 'feature_frame']) + transform = TRANSFORMS.build(cfg) + self.assertEqual( + repr(transform), + "PackLocalizationInputs(meta_keys=['video_name', 'feature_frame'])" + ) + + +def test_transpose(): + results = dict(imgs=np.random.randn(256, 256, 3)) + keys = ['imgs'] + order = [2, 0, 1] + transpose = Transpose(keys, order) + results = transpose(results) + assert results['imgs'].shape == (3, 256, 256) + assert repr(transpose) == transpose.__class__.__name__ + \ + f'(keys={keys}, order={order})' + + +def test_format_shape(): + with pytest.raises(ValueError): + # invalid input format + FormatShape('NHWC') + + # 'NCHW' input format (RGB Modality) + results = dict( + imgs=np.random.randn(3, 224, 224, 3), num_clips=1, clip_len=3) + format_shape = FormatShape('NCHW') + assert format_shape(results)['input_shape'] == (3, 3, 224, 224) + + # `NCHW` input format (Flow Modality) + results = dict( + imgs=np.random.randn(3, 224, 224, 2), + num_clips=1, + clip_len=3, + modality='Flow') + format_shape = FormatShape('NCHW') + assert format_shape(results)['input_shape'] == (1, 6, 224, 224) + + # `NCTHW` input format with num_clips=1, clip_len=3 + results = dict( + imgs=np.random.randn(3, 224, 224, 3), num_clips=1, clip_len=3) + format_shape = FormatShape('NCTHW') + assert format_shape(results)['input_shape'] == (1, 3, 3, 224, 224) + + # `NCTHW` input format with num_clips=2, clip_len=3 + results = dict( + imgs=np.random.randn(18, 224, 224, 3), num_clips=2, clip_len=3) + assert format_shape(results)['input_shape'] == (6, 3, 3, 224, 224) + target_keys = ['imgs', 'input_shape'] + assert assert_dict_has_keys(results, target_keys) + + # `NCTHW` input format with imgs and heatmap_imgs + results = dict( + imgs=np.random.randn(6, 224, 224, 3), + heatmap_imgs=np.random.randn(12, 17, 56, 56), + num_clips=2, + clip_len=dict(RGB=3, Pose=6)) + + results = format_shape(results) + assert results['input_shape'] == (2, 3, 3, 224, 224) + assert results['heatmap_input_shape'] == (2, 17, 6, 56, 56) + + assert repr(format_shape) == "FormatShape(input_format='NCTHW')" + + # `NCTHW_Heatmap` input format + results = dict( + imgs=np.random.randn(12, 17, 56, 56), num_clips=2, clip_len=6) + format_shape = FormatShape('NCTHW_Heatmap') + assert format_shape(results)['input_shape'] == (2, 17, 6, 56, 56) + + # `NPTCHW` input format + results = dict( + imgs=np.random.randn(72, 224, 224, 3), + num_clips=9, + clip_len=1, + num_proposals=8) + format_shape = FormatShape('NPTCHW') + assert format_shape(results)['input_shape'] == (8, 9, 3, 224, 224) + + +def test_format_audio_shape(): + with pytest.raises(ValueError): + # invalid input format + FormatAudioShape('XXXX') + + # `NCTF` input format + results = dict(audios=np.random.randn(3, 128, 8)) + format_shape = FormatAudioShape('NCTF') + assert format_shape(results)['input_shape'] == (3, 1, 128, 8) + assert repr(format_shape) == format_shape.__class__.__name__ + \ + "(input_format='NCTF')" + + +def test_format_gcn_input(): + with pytest.raises(AssertionError): + FormatGCNInput(mode='invalid') + + results = dict( + keypoint=np.random.randn(2, 10, 17, 2), + keypoint_score=np.random.randn(2, 10, 17)) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results 
= format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 17, 3) + assert repr(format_shape) == 'FormatGCNInput(num_person=2, mode=zero)' + + results = dict(keypoint=np.random.randn(2, 40, 25, 3), num_clips=4) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (4, 2, 10, 25, 3) + + results = dict(keypoint=np.random.randn(1, 10, 25, 3)) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 25, 3) + assert_array_equal(results['keypoint'][:, 1], np.zeros((1, 10, 25, 3))) + + results = dict(keypoint=np.random.randn(1, 10, 25, 3)) + format_shape = FormatGCNInput(num_person=2, mode='loop') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 25, 3) + assert_array_equal(results['keypoint'][:, 1], results['keypoint'][:, 0]) + + results = dict(keypoint=np.random.randn(3, 10, 25, 3)) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 25, 3) diff --git a/tests/datasets/transforms/test_loading.py b/tests/datasets/transforms/test_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e13687709b0c6213a5b87389830788d96c2e90 --- /dev/null +++ b/tests/datasets/transforms/test_loading.py @@ -0,0 +1,748 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import platform + +import mmcv +import numpy as np +import pytest +import torch +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal + +from mmaction.datasets.transforms import (DecordDecode, DecordInit, + GenerateLocalizationLabels, + LoadAudioFeature, LoadHVULabel, + LoadLocalizationFeature, + LoadProposals, LoadRGBFromFile, + OpenCVDecode, OpenCVInit, PIMSDecode, + PIMSInit, PyAVDecode, + PyAVDecodeMotionVector, PyAVInit) + +from mmaction.datasets.transforms import RawFrameDecode # isort:skip + + +class BaseTestLoading: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../data')) + cls.img_path = osp.join(cls.data_prefix, 'test.jpg') + cls.video_path = osp.join(cls.data_prefix, 'test.mp4') + cls.wav_path = osp.join(cls.data_prefix, 'test.wav') + cls.audio_spec_path = osp.join(cls.data_prefix, 'test.npy') + cls.img_dir = osp.join(cls.data_prefix, 'imgs') + cls.raw_feature_dir = osp.join(cls.data_prefix, 'activitynet_features') + cls.bsp_feature_dir = osp.join(cls.data_prefix, 'bsp_features') + cls.proposals_dir = osp.join(cls.data_prefix, 'proposals') + + cls.total_frames = 5 + cls.filename_tmpl = 'img_{:05}.jpg' + cls.flow_filename_tmpl = '{}_{:05d}.jpg' + video_total_frames = len(mmcv.VideoReader(cls.video_path)) + cls.audio_total_frames = video_total_frames + + cls.video_results = dict( + filename=cls.video_path, + label=1, + total_frames=video_total_frames, + start_index=0) + cls.audio_results = dict( + audios=np.random.randn(1280, ), + audio_path=cls.wav_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.audio_feature_results = dict( + audios=np.random.randn(128, 80), + audio_path=cls.audio_spec_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.filename_tmpl, + start_index=1, + modality='RGB', + offset=0, + label=1) + 
cls.flow_frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.flow_filename_tmpl, + modality='Flow', + offset=0, + label=1) + cls.action_results = dict( + video_name='v_test1', + data_prefix=cls.raw_feature_dir, + temporal_scale=5, + boundary_ratio=0.1, + duration_second=10, + duration_frame=10, + feature_frame=8, + annotations=[{ + 'segment': [3.0, 5.0], + 'label': 'Rock climbing' + }]) + cls.action_results['feature_path'] = osp.join(cls.raw_feature_dir, + 'v_test1.csv') + + cls.ava_results = dict( + fps=30, timestamp=902, timestamp_start=840, shot_info=(0, 27000)) + + cls.hvu_label_example1 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[0], object=[2, 3], scene=[0, 1])) + cls.hvu_label_example2 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[1], scene=[1, 2], concept=[1])) + + +class TestDecode(BaseTestLoading): + + def test_pyav_init(self): + target_keys = ['video_reader', 'total_frames'] + video_result = copy.deepcopy(self.video_results) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + assert assert_dict_has_keys(pyav_init_result, target_keys) + assert pyav_init_result['total_frames'] == 300 + assert repr( + pyav_init) == f'{pyav_init.__class__.__name__}(io_backend=disk)' + + def test_pyav_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + # test PyAV with 2 dim input and start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 2)[:, np.newaxis] + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert repr(pyav_decode) == (f'{pyav_decode.__class__.__name__}(' + f'multi_thread={False}, mode=accurate)') + + # test PyAV with 1 dim input and start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # PyAV with multi thread and start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode(multi_thread=True) + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert repr(pyav_decode) == (f'{pyav_decode.__class__.__name__}(' + f'multi_thread={True}, 
mode=accurate)') + + # test PyAV with 2 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, + 2)[:, np.newaxis] + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test PyAV with 1 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # PyAV with multi thread + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode(multi_thread=True) + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # PyAV with efficient mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode(multi_thread=True, mode='efficient') + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert pyav_decode_result['video_reader'] is None + + assert (repr(pyav_decode) == pyav_decode.__class__.__name__ + + f'(multi_thread={True}, mode=efficient)') + + def test_pims_init(self): + target_keys = ['video_reader', 'total_frames'] + video_result = copy.deepcopy(self.video_results) + pims_init = PIMSInit() + pims_init_result = pims_init(video_result) + assert assert_dict_has_keys(pims_init_result, target_keys) + assert pims_init_result['total_frames'] == 300 + + pims_init = PIMSInit(mode='efficient') + pims_init_result = pims_init(video_result) + assert assert_dict_has_keys(pims_init_result, target_keys) + assert pims_init_result['total_frames'] == 300 + + assert repr(pims_init) == (f'{pims_init.__class__.__name__}' + f'(io_backend=disk, mode=efficient)') + + def test_pims_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 2)[:, np.newaxis] + pims_init = PIMSInit() + pims_init_result = pims_init(video_result) + + pims_decode = PIMSDecode() + pims_decode_result 
= pims_decode(pims_init_result) + assert assert_dict_has_keys(pims_decode_result, target_keys) + assert pims_decode_result['original_shape'] == (256, 340) + assert np.shape(pims_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + def test_decord_init(self): + target_keys = ['video_reader', 'total_frames', 'avg_fps'] + video_result = copy.deepcopy(self.video_results) + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + assert assert_dict_has_keys(decord_init_result, target_keys) + assert decord_init_result['total_frames'] == len( + decord_init_result['video_reader']) + assert decord_init_result['avg_fps'] == 30 + + assert repr(decord_init) == (f'{decord_init.__class__.__name__}(' + f'io_backend=disk, ' + f'num_threads=1)') + + def test_decord_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + # test Decord with 2 dim input using accurate mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 3)[:, np.newaxis] + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode() + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test Decord with 1 dim input using accurate mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 3) + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode() + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test Decord with 2 dim input using efficient mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 3)[:, np.newaxis] + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode(mode='efficient') + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test Decord with 1 dim input using efficient mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 3) + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode(mode='efficient') + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert repr(decord_decode) == (f'{decord_decode.__class__.__name__}(' 
+ f'mode=efficient)') + + def test_opencv_init(self): + target_keys = ['new_path', 'video_reader', 'total_frames'] + video_result = copy.deepcopy(self.video_results) + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + assert assert_dict_has_keys(opencv_init_result, target_keys) + assert opencv_init_result['total_frames'] == len( + opencv_init_result['video_reader']) + assert repr(opencv_init) == (f'{opencv_init.__class__.__name__}(' + f'io_backend=disk)') + + def test_opencv_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + # test OpenCV with 2 dim input when start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 2)[:, np.newaxis] + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + opencv_decode = OpenCVDecode() + opencv_decode_result = opencv_decode(video_result) + assert assert_dict_has_keys(opencv_decode_result, target_keys) + assert opencv_decode_result['original_shape'] == (256, 340) + assert np.shape(opencv_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test OpenCV with 2 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, + 2)[:, np.newaxis] + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + opencv_decode = OpenCVDecode() + opencv_decode_result = opencv_decode(video_result) + assert assert_dict_has_keys(opencv_decode_result, target_keys) + assert opencv_decode_result['original_shape'] == (256, 340) + assert np.shape(opencv_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test OpenCV with 1 dim input when start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 3) + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + # test OpenCV with 1 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 3) + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + opencv_decode = OpenCVDecode() + opencv_decode_result = opencv_decode(video_result) + assert assert_dict_has_keys(opencv_decode_result, target_keys) + assert opencv_decode_result['original_shape'] == (256, 340) + assert np.shape(opencv_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + def test_rawframe_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape', 'modality'] + + # test frame selector with 2 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 2)[:, + np.newaxis] + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + + inputs['gt_bboxes'] = np.array([[0, 0, 1, 1]]) + inputs['proposals'] = np.array([[0, 0, 1, 1]]) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert 
results['original_shape'] == (240, 320) + + # test frame selector with 2 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 2)[:, + np.newaxis] + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input when start_index = 0 + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 5) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 5) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 2) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 2) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input for flow images + inputs = copy.deepcopy(self.flow_frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 2) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 2) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input for flow images + inputs = copy.deepcopy(self.flow_frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 2) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 2) + assert results['original_shape'] == (240, 320) + + return + # cannot install turbojpeg for CI + if 
platform.system() != 'Windows': + # test frame selector in turbojpeg decoding backend + # when start_index = 0 + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 5) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode( + io_backend='disk', decoding_backend='turbojpeg') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), + 240, 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector in turbojpeg decoding backend + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 5) + frame_selector = RawFrameDecode( + io_backend='disk', decoding_backend='turbojpeg') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), + 240, 320, 3) + assert results['original_shape'] == (240, 320) + assert repr(frame_selector) == ( + f'{frame_selector.__class__.__name__}(io_backend=disk, ' + f'decoding_backend=turbojpeg)') + + def test_pyav_decode_motion_vector(self): + pyav_init = PyAVInit() + pyav = PyAVDecodeMotionVector() + + # test pyav with 2-dim input + results = { + 'filename': self.video_path, + 'frame_inds': np.arange(0, 32, 1)[:, np.newaxis] + } + results = pyav_init(results) + results = pyav(results) + target_keys = ['motion_vectors'] + assert assert_dict_has_keys(results, target_keys) + + # test pyav with 1 dim input + results = { + 'filename': self.video_path, + 'frame_inds': np.arange(0, 32, 1) + } + pyav_init = PyAVInit() + results = pyav_init(results) + pyav = PyAVDecodeMotionVector() + results = pyav(results) + + assert assert_dict_has_keys(results, target_keys) + + +class TestLoad(BaseTestLoading): + + def test_load_hvu_label(self): + hvu_label_example1 = copy.deepcopy(self.hvu_label_example1) + hvu_label_example2 = copy.deepcopy(self.hvu_label_example2) + categories = hvu_label_example1['categories'] + category_nums = hvu_label_example1['category_nums'] + num_tags = sum(category_nums) + num_categories = len(categories) + + loader = LoadHVULabel() + assert repr(loader) == (f'{loader.__class__.__name__}(' + f'hvu_initialized={False})') + + result1 = loader(hvu_label_example1) + label1 = torch.zeros(num_tags) + mask1 = torch.zeros(num_tags) + category_mask1 = torch.zeros(num_categories) + + assert repr(loader) == (f'{loader.__class__.__name__}(' + f'hvu_initialized={True})') + + label1[[0, 4, 5, 7, 8]] = 1. + mask1[:10] = 1. + category_mask1[:3] = 1. + + assert torch.all(torch.eq(label1, result1['label'])) + assert torch.all(torch.eq(mask1, result1['mask'])) + assert torch.all(torch.eq(category_mask1, result1['category_mask'])) + + result2 = loader(hvu_label_example2) + label2 = torch.zeros(num_tags) + mask2 = torch.zeros(num_tags) + category_mask2 = torch.zeros(num_categories) + + label2[[1, 8, 9, 11]] = 1. + mask2[:2] = 1. + mask2[7:] = 1. + category_mask2[[0, 2, 3]] = 1. 
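+        # with category_nums=[2, 5, 3, 2] the flattened tag indices are
+        # action 0-1, object 2-6, scene 7-9, concept 10-11; example2 has no
+        # 'object' tags, so the object slots 2-6 stay unmasked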
+ + assert torch.all(torch.eq(label2, result2['label'])) + assert torch.all(torch.eq(mask2, result2['mask'])) + assert torch.all(torch.eq(category_mask2, result2['category_mask'])) + + def test_load_localization_feature(self): + target_keys = ['raw_feature'] + + action_result = copy.deepcopy(self.action_results) + + # test error cases + with pytest.raises(TypeError): + load_localization_feature = LoadLocalizationFeature( + 'unsupport_ext') + + # test normal cases + load_localization_feature = LoadLocalizationFeature() + load_localization_feature_result = load_localization_feature( + action_result) + assert assert_dict_has_keys(load_localization_feature_result, + target_keys) + assert load_localization_feature_result['raw_feature'].shape == (400, + 5) + assert repr(load_localization_feature + ) == f'{load_localization_feature.__class__.__name__}' + + def test_load_proposals(self): + target_keys = [ + 'bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score', + 'reference_temporal_iou' + ] + + action_result = copy.deepcopy(self.action_results) + + # test error cases + with pytest.raises(NotImplementedError): + load_proposals = LoadProposals(5, self.proposals_dir, + self.bsp_feature_dir, + 'unsupport_ext') + + with pytest.raises(NotImplementedError): + load_proposals = LoadProposals(5, self.proposals_dir, + self.bsp_feature_dir, '.csv', + 'unsupport_ext') + + # test normal cases + load_proposals = LoadProposals(5, self.proposals_dir, + self.bsp_feature_dir) + load_proposals_result = load_proposals(action_result) + assert assert_dict_has_keys(load_proposals_result, target_keys) + assert load_proposals_result['bsp_feature'].shape[0] == 5 + assert load_proposals_result['tmin'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmin'], np.arange(0.1, 0.6, 0.1), decimal=4) + assert load_proposals_result['tmax'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmax'], np.arange(0.2, 0.7, 0.1), decimal=4) + assert load_proposals_result['tmin_score'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmin_score'], + np.arange(0.95, 0.90, -0.01), + decimal=4) + assert load_proposals_result['tmax_score'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmax_score'], + np.arange(0.96, 0.91, -0.01), + decimal=4) + assert load_proposals_result['reference_temporal_iou'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['reference_temporal_iou'], + np.arange(0.85, 0.80, -0.01), + decimal=4) + assert repr(load_proposals) == ( + f'{load_proposals.__class__.__name__}(' + f'top_k={5}, ' + f'pgm_proposals_dir={self.proposals_dir}, ' + f'pgm_features_dir={self.bsp_feature_dir}, ' + f'proposal_ext=.csv, ' + f'feature_ext=.npy)') + + def test_load_audio_feature(self): + target_keys = ['audios'] + inputs = copy.deepcopy(self.audio_feature_results) + load_audio_feature = LoadAudioFeature() + results = load_audio_feature(inputs) + assert assert_dict_has_keys(results, target_keys) + + # test when no audio feature file exists + inputs = copy.deepcopy(self.audio_feature_results) + inputs['audio_path'] = 'foo/foo/bar.npy' + load_audio_feature = LoadAudioFeature() + results = load_audio_feature(inputs) + assert results['audios'].shape == (640, 80) + assert assert_dict_has_keys(results, target_keys) + assert repr(load_audio_feature) == ( + f'{load_audio_feature.__class__.__name__}(' + f'pad_method=zero)') + + +class TestLocalization(BaseTestLoading): + + def test_generate_localization_label(self): + action_result = 
copy.deepcopy(self.action_results) + action_result['raw_feature'] = np.random.randn(400, 5) + + # test default setting + target_keys = ['gt_bbox'] + generate_localization_labels = GenerateLocalizationLabels() + generate_localization_labels_result = generate_localization_labels( + action_result) + assert assert_dict_has_keys(generate_localization_labels_result, + target_keys) + + assert_array_almost_equal( + generate_localization_labels_result['gt_bbox'], [[0.375, 0.625]], + decimal=4) + + +class TestLoadImageFromFile: + + def test_load_img(self): + data_prefix = osp.join(osp.dirname(__file__), '../../data') + + results = dict(img_path=osp.join(data_prefix, 'test.jpg')) + transform = LoadRGBFromFile() + results = transform(copy.deepcopy(results)) + assert results['img_path'] == osp.join(data_prefix, 'test.jpg') + assert results['img'].shape == (240, 320, 3) + assert results['img'].dtype == np.uint8 + assert results['img_shape'] == (240, 320) + assert results['ori_shape'] == (240, 320) + assert repr(transform) == transform.__class__.__name__ + \ + "(ignore_empty=False, to_float32=False, color_type='color', " + \ + "imdecode_backend='cv2', io_backend='disk')" + + # to_float32 + transform = LoadRGBFromFile(to_float32=True) + results = transform(copy.deepcopy(results)) + assert results['img'].dtype == np.float32 + + # test load empty + fake_img_path = osp.join(data_prefix, 'fake.jpg') + results['img_path'] = fake_img_path + transform = LoadRGBFromFile(ignore_empty=False) + with pytest.raises(FileNotFoundError): + transform(copy.deepcopy(results)) + transform = LoadRGBFromFile(ignore_empty=True) + assert transform(copy.deepcopy(results)) is None diff --git a/tests/datasets/transforms/test_pose_transforms.py b/tests/datasets/transforms/test_pose_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..b4cde29fdce490ec10061674b42e08a5d7e8e91b --- /dev/null +++ b/tests/datasets/transforms/test_pose_transforms.py @@ -0,0 +1,697 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
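+"""Unit tests for the pose and skeleton data transforms."""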
+import copy +import copy as cp +import os.path as osp +from collections import defaultdict + +import numpy as np +import pytest +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from mmaction.datasets.transforms import (DecompressPose, GeneratePoseTarget, + GenSkeFeat, JointToBone, + MergeSkeFeat, MMCompact, MMDecode, + MMUniformSampleFrames, PadTo, + PoseCompact, PoseDecode, + PreNormalize2D, PreNormalize3D, + ToMotion, UniformSampleFrames) + + +class TestPoseTransforms: + + @staticmethod + def test_decompress_pose(): + + def get_mode(arr): + cnt = defaultdict(lambda: 0) + for num in arr: + cnt[num] += 1 + max_val = max(cnt.values()) + return [k for k in cnt if cnt[k] == max_val], max_val + + total_frames = 100 + img_shape = (224, 224) + frame_inds = np.random.choice(range(100), size=120) + frame_inds.sort() + anno_flag = np.random.random(120) > 0.1 + anno_inds = np.array([i for i, f in enumerate(anno_flag) if f]) + kp = np.random.random([120, 17, 3]) + results = dict( + frame_inds=frame_inds, + keypoint=kp, + total_frames=total_frames, + img_shape=img_shape) + + inp = cp.deepcopy(results) + + decompress_pose = DecompressPose(squeeze=True, max_person=100) + + assert str(decompress_pose) == ( + 'DecompressPose(squeeze=True, max_person=100)') + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person == get_mode(frame_inds)[1] + assert num_frame == len(set(frame_inds)) + + inp = cp.deepcopy(results) + decompress_pose = DecompressPose(squeeze=False, max_person=100) + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person == get_mode(frame_inds)[1] + assert num_frame == total_frames + + inp = cp.deepcopy(results) + inp['anno_inds'] = anno_inds + decompress_pose = DecompressPose(squeeze=True, max_person=100) + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person == get_mode(frame_inds[anno_inds])[1] + assert num_frame == len(set(frame_inds[anno_inds])) + + inp = cp.deepcopy(results) + inp['anno_inds'] = anno_inds + decompress_pose = DecompressPose(squeeze=True, max_person=2) + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person <= 2 + assert num_frame == len(set(frame_inds[anno_inds])) + + @staticmethod + def test_generate_pose_target(): + img_shape = (64, 64) + kp = np.array([[[[24, 24], [40, 40], [24, 40]]]]) + kpscore = np.array([[[1., 1., 1.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=True, + left_kp=(1, ), + right_kp=(2, ), + left_limb=(0, ), + right_limb=(1, ), + skeletons=()) + assert str(generate_pose_target) 
== ('GeneratePoseTarget(sigma=1, ' + 'use_score=True, with_kp=True, ' + 'with_limb=False, skeletons=(), ' + 'double=False, left_kp=(1,), ' + 'right_kp=(2,), left_limb=(0,), ' + 'right_limb=(1,), scaling=1.0)') + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) + assert_array_almost_equal(return_results['imgs'][0], + return_results['imgs'][1]) + + results = dict(img_shape=img_shape, keypoint=kp, modality='Pose') + + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) + assert_array_almost_equal(return_results['imgs'][0], + return_results['imgs'][1]) + + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) + assert_array_almost_equal(return_results['imgs'][0], + return_results['imgs'][1]) + + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + double=True, + left_limb=(0, ), + right_limb=(1, ), + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(copy.deepcopy(results)) + imgs = return_results['imgs'] + assert imgs.shape == (16, 3, 64, 64) + assert_array_almost_equal(imgs[0], imgs[1]) + assert_array_almost_equal(imgs[:8, 2], imgs[8:, 2, :, ::-1]) + assert_array_almost_equal(imgs[:8, 0], imgs[8:, 1, :, ::-1]) + assert_array_almost_equal(imgs[:8, 1], imgs[8:, 0, :, ::-1]) + + img_shape = (64, 64) + kp = np.array([[[[24, 24], [40, 40], [24, 40]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget( + sigma=1, with_kp=True, skeletons=()) + return_results = generate_pose_target(copy.deepcopy(results)) + assert_array_almost_equal(return_results['imgs'], 0) + + img_shape = (64, 64) + kp = np.array([[[[24, 24], [40, 40], [24, 40]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(copy.deepcopy(results)) + assert_array_almost_equal(return_results['imgs'], 0) + + img_shape = (64, 64) + kp = np.array([[[[124, 124], [140, 140], [124, 140]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) + assert_array_almost_equal(return_results['imgs'], 0) + + img_shape = (64, 64) + kp = np.array([[[[124., 124.], [140., 140.], [124., 140.]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = 
GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(results) + assert_array_almost_equal(return_results['imgs'], 0) + + @staticmethod + def test_pose_compact(): + results = {} + results['img_shape'] = (100, 100) + fake_kp = np.zeros([1, 4, 2, 2]) + fake_kp[:, :, 0] = [10, 10] + fake_kp[:, :, 1] = [90, 90] + results['keypoint'] = fake_kp + + pose_compact = PoseCompact( + padding=0, threshold=0, hw_ratio=None, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 80) + assert str(pose_compact) == ( + 'PoseCompact(padding=0, threshold=0, hw_ratio=None, ' + 'allow_imgpad=False)') + + pose_compact = PoseCompact( + padding=0.3, threshold=0, hw_ratio=None, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + + pose_compact = PoseCompact( + padding=0.3, threshold=0, hw_ratio=None, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (104, 104) + + pose_compact = PoseCompact( + padding=0, threshold=100, hw_ratio=None, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + + pose_compact = PoseCompact( + padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 106) + + @staticmethod + def test_pre_normalize3d(): + target_keys = ['keypoint', 'total_frames', 'body_center'] + + results = dict(keypoint=np.random.randn(2, 40, 25, 3), total_frames=40) + + pre_normalize3d = PreNormalize3D( + align_center=True, align_spine=True, align_shoulder=False) + + inp = copy.deepcopy(results) + ret1 = pre_normalize3d(inp) + + inp = copy.deepcopy(ret1) + ret2 = pre_normalize3d(inp) + + assert_array_equal(ret2['body_center'], np.zeros(3)) + assert_array_equal(ret1['keypoint'], ret2['keypoint']) + + pre_normalize3d = PreNormalize3D( + align_center=True, align_spine=False, align_shoulder=True) + + inp = copy.deepcopy(results) + ret3 = pre_normalize3d(inp) + + inp = copy.deepcopy(ret3) + ret4 = pre_normalize3d(inp) + + assert_array_equal(ret4['body_center'], np.zeros(3)) + assert_array_equal(ret3['keypoint'], ret4['keypoint']) + + assert assert_dict_has_keys(ret1, target_keys) + assert repr(pre_normalize3d) == 'PreNormalize3D(zaxis=[0, 1], ' \ + 'xaxis=[8, 4], align_center=True, ' \ + 'align_spine=False, ' \ + 'align_shoulder=True)' + + @staticmethod + def test_pre_normalize2d(): + + def check_pose_normalize(origin_kps, target_kps, h, w): + target_kps[..., 0] = target_kps[..., 0] * w / 2 + w / 2 + target_kps[..., 1] = target_kps[..., 1] * h / 2 + h / 2 + assert_array_almost_equal(origin_kps, target_kps, decimal=4) + + results = dict( + keypoint=np.random.randn(1, 40, 17, 2), img_shape=(480, 854)) + pre_normalize_2d = PreNormalize2D(img_shape=(1080, 1920)) + inp = copy.deepcopy(results) + ret1 = pre_normalize_2d(inp) + check_pose_normalize( + results['keypoint'], ret1['keypoint'], h=480, w=854) + + results = dict(keypoint=np.random.randn(1, 40, 17, 2)) + pre_normalize_2d = PreNormalize2D(img_shape=(1080, 1920)) + inp = copy.deepcopy(results) + ret2 = pre_normalize_2d(inp) + check_pose_normalize( + results['keypoint'], ret2['keypoint'], h=1080, w=1920) + + assert repr(pre_normalize_2d) == \ + 'PreNormalize2D(img_shape=(1080, 1920))' + + @staticmethod + def test_joint_to_bone(): + with 
pytest.raises(ValueError): + JointToBone(dataset='invalid') + + with pytest.raises(AssertionError): + JointToBone()(dict(keypoint=np.random.randn(2, 15, 25, 4))) + + results = dict(keypoint=np.random.randn(2, 15, 25, 3)) + joint_to_bone = JointToBone(dataset='nturgb+d') + center_index = 20 + results = joint_to_bone(results) + assert_array_equal(results['keypoint'][..., center_index, :], + np.zeros((2, 15, 3))) + + results = dict(keypoint=np.random.randn(2, 15, 18, 3)) + joint_to_bone = JointToBone(dataset='openpose') + center_index = 0 + center_score = results['keypoint'][..., center_index, 2] + results = joint_to_bone(results) + assert_array_equal(results['keypoint'][..., center_index, :2], + np.zeros((2, 15, 2))) + assert_array_almost_equal(results['keypoint'][..., center_index, 2], + center_score) + + results = dict(keypoint=np.random.randn(2, 15, 17, 3)) + joint_to_bone = JointToBone(dataset='coco') + center_index = 0 + center_score = results['keypoint'][..., center_index, 2] + results = joint_to_bone(results) + assert_array_equal(results['keypoint'][..., center_index, :2], + np.zeros((2, 15, 2))) + assert_array_almost_equal(results['keypoint'][..., center_index, 2], + center_score) + + results = dict(keypoint=np.random.randn(2, 15, 17, 3)) + joint_to_bone = JointToBone(dataset='coco', target='bone') + results = joint_to_bone(results) + assert assert_dict_has_keys(results, ['keypoint', 'bone']) + assert repr(joint_to_bone) == 'JointToBone(dataset=coco, target=bone)' + + @staticmethod + def test_to_motion(): + with pytest.raises(AssertionError): + ToMotion()(dict(keypoint=np.random.randn(2, 15, 25, 4))) + + with pytest.raises(KeyError): + ToMotion(source='j')(dict(keypoint=np.random.randn(2, 15, 25, 4))) + + results = dict(keypoint=np.random.randn(2, 15, 25, 3)) + to_motion = ToMotion() + results = to_motion(results) + assert_array_equal(results['motion'][:, -1, :, :], np.zeros( + (2, 25, 3))) + assert assert_dict_has_keys(results, ['keypoint', 'motion']) + assert repr(to_motion) == 'ToMotion(dataset=nturgb+d, ' \ + 'source=keypoint, target=motion)' + + @staticmethod + def test_merge_ske_feat(): + with pytest.raises(KeyError): + MergeSkeFeat()(dict(b=np.random.randn(2, 15, 25, 3))) + + results = dict( + j=np.random.randn(2, 10, 25, 3), b=np.random.randn(2, 10, 25, 3)) + merge_ske_feat = MergeSkeFeat(feat_list=['j', 'b']) + results = merge_ske_feat(results) + + assert assert_dict_has_keys(results, ['keypoint']) + assert results['keypoint'].shape == (2, 10, 25, 6) + assert repr(merge_ske_feat) == "MergeSkeFeat(feat_list=['j', 'b'], " \ + 'target=keypoint, axis=-1)' + + @staticmethod + def test_gen_ske_feat(): + results = dict(keypoint=np.random.randn(1, 10, 25, 3)) + + gen_ske_feat = GenSkeFeat(dataset='nturgb+d', feats=['j']) + inp = copy.deepcopy(results) + ret1 = gen_ske_feat(inp) + assert_array_equal(ret1['keypoint'], results['keypoint']) + + gen_ske_feat = GenSkeFeat( + dataset='nturgb+d', feats=['j', 'b', 'jm', 'bm']) + inp = copy.deepcopy(results) + ret2 = gen_ske_feat(inp) + assert ret2['keypoint'].shape == (1, 10, 25, 12) + + results = dict( + keypoint=np.random.randn(1, 10, 17, 2), + keypoint_score=np.random.randn(1, 10, 17)) + gen_ske_feat = GenSkeFeat(dataset='coco', feats=['j', 'b', 'jm', 'bm']) + results = gen_ske_feat(results) + assert results['keypoint'].shape == (1, 10, 17, 12) + assert assert_dict_has_keys(results, ['keypoint']) + assert not assert_dict_has_keys(results, ['j', 'b', 'jm', 'bm']) + assert repr(gen_ske_feat) == 'GenSkeFeat(dataset=coco, ' \ + "feats=['j', 
'b', 'jm', 'bm'], axis=-1)" + + @staticmethod + def test_uniform_sample_frames(): + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + + assert repr(sampling) == ('UniformSampleFrames(clip_len=8, ' + 'num_clips=1, test_mode=True, seed=0)') + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 2, 4, 6, 8, 9, 11, 13])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 1, 2, 3, 4, 5, 6, 0])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=8, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 8 + assert len(sampling_results['frame_inds']) == 64 + + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=4, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 4 + assert_array_equal( + sampling_results['frame_inds'], + np.array([ + 4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56, + 0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60 + ])) + + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + @staticmethod + def test_pad_to(): + with pytest.raises(AssertionError): + PadTo(length=4, mode='invalid') + + results = dict( + keypoint=np.random.randn(2, 3, 17, 3), 
+ total_frames=3, + start_index=0) + + inp = copy.deepcopy(results) + pad_to = PadTo(length=6, mode='loop') + ret1 = pad_to(inp) + kp = ret1['keypoint'] + assert_array_equal(kp[:, :3], kp[:, 3:]) + + inp = copy.deepcopy(results) + pad_to = PadTo(length=6, mode='zero') + ret2 = pad_to(inp) + kp = ret2['keypoint'] + assert ret2['total_frames'] == 6 + assert_array_equal(kp[:, 3:], np.zeros((2, 3, 17, 3))) + + @staticmethod + def test_pose_decode(): + kp = np.random.random([1, 16, 17, 2]) + kpscore = np.random.random([1, 16, 17]) + frame_inds = np.array([2, 4, 6, 8, 10]) + results = dict( + keypoint=kp, keypoint_score=kpscore, frame_inds=frame_inds) + pose_decode = PoseDecode() + assert repr(pose_decode) == 'PoseDecode()' + decode_results = pose_decode(results) + assert_array_almost_equal(decode_results['keypoint'], kp[:, + frame_inds]) + assert_array_almost_equal(decode_results['keypoint_score'], + kpscore[:, frame_inds]) + + results = dict(keypoint=kp, keypoint_score=kpscore, total_frames=16) + pose_decode = PoseDecode() + decode_results = pose_decode(results) + assert_array_almost_equal(decode_results['keypoint'], kp) + assert_array_almost_equal(decode_results['keypoint_score'], kpscore) + + @staticmethod + def test_mm_uniform_sample_frames(): + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=True, seed=0) + assert repr(sampling) == ('MMUniformSampleFrames(' + "clip_len={'RGB': 8, 'Pose': 32}, " + 'num_clips=1, test_mode=True, seed=0)') + + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert_array_equal(sampling_results['RGB_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + assert_array_equal( + sampling_results['Pose_inds'], + np.array([ + 0, 3, 5, 6, 9, 11, 13, 15, 17, 19, 21, 22, 24, 27, 28, 30, 32, + 34, 36, 39, 40, 43, 45, 46, 48, 51, 53, 55, 57, 58, 61, 62 + ])) + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True, + seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 10 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert len(sampling_results['RGB_inds']) == 80 + assert len(sampling_results['Pose_inds']) == 320 + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=False) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['RGB_inds']) == 8 + assert len(sampling_results['Pose_inds']) == 32 + + @staticmethod + def test_mm_decode(): + mm_decode = MMDecode() + + # Pose only test + pose_raw_results = dict( + modality=['Pose'], + Pose_inds=np.array([2, 4, 6, 8, 10]), + keypoint=np.random.random([1, 16, 17, 2]), + img_shape=(1080, 1920)) + rgb_raw_results = dict( + modality=['RGB'], + RGB_inds=np.array([2, 4, 6, 8, 10]), + frame_dir=osp.join(osp.dirname(__file__), '../../data/test')) + + # test pose w/o `keypoint_score` + mm_decode(copy.deepcopy(pose_raw_results)) + + 
# test pose with `keypoint_score` + pose_raw_results['keypoint_score'] = np.random.random([1, 16, 17]) + pose_results = mm_decode(copy.deepcopy(pose_raw_results)) + + # test rgb + rgb_results = mm_decode(copy.deepcopy(rgb_raw_results)) + + # test pose and rgb + pose_rgb_raw_results = { + **rgb_raw_results, + **pose_raw_results, 'modality': ['RGB', 'Pose'] + } + pose_rgb_results = mm_decode(copy.deepcopy(pose_rgb_raw_results)) + + assert_array_equal(pose_rgb_results['keypoint_score'], + pose_results['keypoint_score']) + scaled_keypoint = copy.deepcopy(pose_results['keypoint']) + oh, ow = pose_results['img_shape'] + nh, nw = pose_rgb_results['img_shape'] + scaled_keypoint[..., 0] *= (nw / ow) + scaled_keypoint[..., 1] *= (nh / oh) + assert_array_equal(pose_rgb_results['keypoint'], scaled_keypoint) + assert_array_equal(pose_rgb_results['imgs'], rgb_results['imgs']) + assert assert_dict_has_keys( + pose_rgb_results, ['filename', 'img_shape', 'original_shape']) + assert repr(mm_decode) == 'MMDecode(io_backend=disk)' + + @staticmethod + def test_mm_compact(): + results = {} + results['img_shape'] = (100, 100) + fake_kp = np.zeros([1, 4, 2, 2]) + fake_kp[:, :, 0] = [10, 10] + fake_kp[:, :, 1] = [90, 90] + results['keypoint'] = fake_kp + results['imgs'] = list(np.zeros([3, 100, 100, 3])) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 80) + assert ret['imgs'][0].shape[:-1] == (80, 80) + assert str(pose_compact) == ( + 'MMCompact(padding=0, threshold=0, hw_ratio=(1, 1), ' + 'allow_imgpad=False)') + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (104, 104) + assert ret['imgs'][0].shape[:-1] == (104, 104) + + pose_compact = MMCompact( + padding=0, threshold=100, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 106) + assert ret['imgs'][0].shape[:-1] == (80, 106) diff --git a/tests/datasets/transforms/test_processing.py b/tests/datasets/transforms/test_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..1c44715379fede42f8a7ade6392f2f2f41245943 --- /dev/null +++ b/tests/datasets/transforms/test_processing.py @@ -0,0 +1,872 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
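+"""Unit tests for the image processing transforms (crop, flip, etc.)."""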
+import copy + +import mmcv +import numpy as np +import pytest +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal + +from mmaction.datasets.transforms import (CenterCrop, ColorJitter, Flip, Fuse, + MultiScaleCrop, RandomCrop, + RandomResizedCrop, Resize, TenCrop, + ThreeCrop) + + +def check_crop(origin_imgs, result_imgs, result_bbox, num_crops=1): + """Check if the result_bbox is in correspond to result_imgs.""" + + def check_single_crop(origin_imgs, result_imgs, result_bbox): + result_img_shape = result_imgs[0].shape[:2] + crop_w = result_bbox[2] - result_bbox[0] + crop_h = result_bbox[3] - result_bbox[1] + crop_shape = (crop_h, crop_w) + if not crop_shape == result_img_shape: + return False + left, top, right, bottom = result_bbox + return np.array_equal( + np.array(origin_imgs)[:, top:bottom, left:right, :], + np.array(result_imgs)) + + if result_bbox.ndim == 1: + return check_single_crop(origin_imgs, result_imgs, result_bbox) + if result_bbox.ndim == 2: + num_batch = len(origin_imgs) + for i, bbox in enumerate(result_bbox): + if num_crops == 10: + if (i // num_batch) % 2 == 0: + flag = check_single_crop([origin_imgs[i % num_batch]], + [result_imgs[i]], bbox) + else: + flag = check_single_crop([origin_imgs[i % num_batch]], + [np.flip(result_imgs[i], axis=1)], + bbox) + else: + flag = check_single_crop([origin_imgs[i % num_batch]], + [result_imgs[i]], bbox) + if not flag: + return False + return True + else: + # bbox has a wrong dimension + return False + + +def check_flip(origin_imgs, result_imgs, flip_type): + """Check if the origin_imgs are flipped correctly into result_imgs in + different flip_types.""" + n, _, _, _ = np.shape(origin_imgs) + if flip_type == 'horizontal': + for i in range(n): + if np.any(result_imgs[i] != np.fliplr(origin_imgs[i])): + return False + else: + # yapf: disable + for i in range(n): + if np.any(result_imgs[i] != np.transpose(np.fliplr(np.transpose(origin_imgs[i], (1, 0, 2))), (1, 0, 2))): # noqa:E501 + return False + # yapf: enable + return True + + +class TestColor: + + @staticmethod + def test_color_jitter(): + imgs = list( + np.random.randint(0, 255, size=(3, 112, 112, 3), dtype=np.uint8)) + results = dict(imgs=imgs) + + color_jitter = ColorJitter() + assert color_jitter.brightness == (0.5, 1.5) + assert color_jitter.contrast == (0.5, 1.5) + assert color_jitter.saturation == (0.5, 1.5) + assert color_jitter.hue == (-0.1, 0.1) + + color_jitter_results = color_jitter(results) + target_keys = ['imgs'] + + assert assert_dict_has_keys(color_jitter_results, target_keys) + assert np.shape(color_jitter_results['imgs']) == (3, 112, 112, 3) + for img in color_jitter_results['imgs']: + assert np.all(img >= 0) + assert np.all(img <= 255) + + assert repr(color_jitter) == (f'{color_jitter.__class__.__name__}(' + f'brightness={(0.5, 1.5)}, ' + f'contrast={(0.5, 1.5)}, ' + f'saturation={(0.5, 1.5)}, ' + f'hue={-0.1, 0.1})') + + +class TestCrops: + + @staticmethod + def test_random_crop(): + with pytest.raises(TypeError): + # size must be an int + RandomCrop(size=(112, 112)) + with pytest.raises(AssertionError): + # "size > height" or "size > width" is not allowed + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=320) + random_crop(results) + + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + + # General case + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224) + results['gt_bboxes'] = np.array([[0, 0, 
340, 224]]) + results['proposals'] = np.array([[0, 0, 340, 224]]) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 224 + + # Test the case that no need for cropping + imgs = list(np.random.rand(2, 224, 224, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 224 + + # Test the one-side-equal case + imgs = list(np.random.rand(2, 224, 225, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 224 + + assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(size={224}, lazy={False})') + + @staticmethod + def test_random_resized_crop(): + with pytest.raises(TypeError): + # area_range must be a tuple of float + RandomResizedCrop(area_range=0.5) + with pytest.raises(TypeError): + # aspect_ratio_range must be a tuple of float + RandomResizedCrop(area_range=(0.08, 1.0), aspect_ratio_range=0.1) + + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + # There will be a slight difference because of rounding + eps = 0.01 + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + results['gt_bboxes'] = np.array([[0, 0, 340, 256]]) + results['proposals'] = np.array([[0, 0, 340, 256]]) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + + with pytest.raises(AssertionError): + # area_range[0] > area_range[1], which is wrong + random_crop = RandomResizedCrop(area_range=(0.9, 0.7)) + random_crop(results) + with pytest.raises(AssertionError): + # 0 > area_range[0] and area_range[1] > 1, which is wrong + random_crop = RandomResizedCrop(aspect_ratio_range=(-0.1, 2.0)) + random_crop(results) + + random_crop = RandomResizedCrop() + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert ((0.08 - eps <= h * w / 256 / 341) + and (h * w / 256 / 341 <= 1 + eps)) + assert (3. / 4. - eps <= h / w) and (h / w - eps <= 4. / 3.) 
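+        # the checks above verify that the crop area stays within area_range
+        # and its h/w ratio within aspect_ratio_range, up to a rounding eps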
+ assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(area_range={(0.08, 1.0)}, ' + f'aspect_ratio_range={(3 / 4, 4 / 3)}, ' + f'lazy={False})') + + random_crop = RandomResizedCrop( + area_range=(0.9, 0.9), aspect_ratio_range=(10.0, 10.1)) + # Test fallback cases by very big area range + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 256 + + @staticmethod + def test_multi_scale_crop(): + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop(0.5) + + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop('224') + + with pytest.raises(TypeError): + # scales must be tuple. + MultiScaleCrop( + 224, scales=[ + 1, + ]) + + with pytest.raises(ValueError): + # num_fix_crops must be in [5, 13] + MultiScaleCrop(224, num_fixed_crops=6) + + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'scales'] + + # MultiScaleCrop with normal crops. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + results['gt_bboxes'] = np.array([[0, 0, 340, 256]]) + results['proposals'] = np.array([[0, 0, 340, 256]]) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_results = multi_scale_crop(results) + assert assert_dict_has_keys(multi_scale_crop_results, target_keys) + assert check_crop(imgs, multi_scale_crop_results['imgs'], + multi_scale_crop_results['crop_bbox']) + assert multi_scale_crop_results['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with more fixed crops. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0, + num_fixed_crops=13) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_results = multi_scale_crop(results) + assert assert_dict_has_keys(multi_scale_crop_results, target_keys) + assert check_crop(imgs, multi_scale_crop_results['imgs'], + multi_scale_crop_results['crop_bbox']) + assert multi_scale_crop_results['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with random crop. 
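+        # (random_crop=True samples the crop offset at random instead of
+        # picking one of the fixed candidate positions)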
+ imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=True, + max_wh_scale_gap=0) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_results = multi_scale_crop(results) + assert assert_dict_has_keys(multi_scale_crop_results, target_keys) + assert check_crop(imgs, multi_scale_crop_results['imgs'], + multi_scale_crop_results['crop_bbox']) + assert (multi_scale_crop_results['img_shape'] in [(256, 256), + (204, 204)]) + + assert repr(multi_scale_crop) == ( + f'{multi_scale_crop.__class__.__name__}' + f'(input_size={(224, 224)}, scales={(1, 0.8)}, ' + f'max_wh_scale_gap={0}, random_crop={True}, ' + f'num_fixed_crops=5, lazy={False})') + + @staticmethod + def test_center_crop(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop('224') + + # center crop with crop_size 224 + # add kps in test_center_crop + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + + results['gt_bboxes'] = np.array([[0, 0, 320, 240]]) + results['proposals'] = np.array([[0, 0, 320, 240]]) + center_crop = CenterCrop(crop_size=224) + center_crop_results = center_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'keypoint'] + assert assert_dict_has_keys(center_crop_results, target_keys) + assert check_crop(imgs, center_crop_results['imgs'], + center_crop_results['crop_bbox']) + assert np.all( + center_crop_results['crop_bbox'] == np.array([48, 8, 272, 232])) + assert center_crop_results['img_shape'] == (224, 224) + assert np.all(center_crop_results['keypoint'] == 112) + + assert repr(center_crop) == (f'{center_crop.__class__.__name__}' + f'(crop_size={(224, 224)}, lazy={False})') + + @staticmethod + def test_three_crop(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + ThreeCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + ThreeCrop('224') + + # three crop with crop_size 120 + imgs = list(np.random.rand(2, 240, 120, 3)) + results = dict(imgs=imgs) + three_crop = ThreeCrop(crop_size=120) + three_crop_results = three_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(three_crop_results, target_keys) + assert check_crop(imgs, three_crop_results['imgs'], + three_crop_results['crop_bbox'], 3) + assert three_crop_results['img_shape'] == (120, 120) + + # three crop with crop_size 224 + imgs = list(np.random.rand(2, 224, 224, 3)) + results = dict(imgs=imgs) + three_crop = ThreeCrop(crop_size=224) + three_crop_results = three_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(three_crop_results, target_keys) + assert check_crop(imgs, three_crop_results['imgs'], + three_crop_results['crop_bbox'], 3) + assert three_crop_results['img_shape'] == (224, 224) + + assert repr(three_crop) == (f'{three_crop.__class__.__name__}' + f'(crop_size={(224, 224)})') + + @staticmethod + def test_ten_crop(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + TenCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + TenCrop('224') + + # ten crop with crop_size 256 + imgs = list(np.random.rand(2, 256, 256, 3)) + results = dict(imgs=imgs) + ten_crop = TenCrop(crop_size=224) + 
ten_crop_results = ten_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(ten_crop_results, target_keys) + assert check_crop(imgs, ten_crop_results['imgs'], + ten_crop_results['crop_bbox'], 10) + assert ten_crop_results['img_shape'] == (224, 224) + + assert repr(ten_crop) == (f'{ten_crop.__class__.__name__}' + f'(crop_size={(224, 224)})') + + +class TestFlip: + + @staticmethod + def test_flip(): + with pytest.raises(ValueError): + # direction must be in ['horizontal', 'vertical'] + Flip(direction='vertically') + + target_keys = ['imgs', 'flip_direction', 'modality'] + + # do not flip imgs. + imgs = list(np.random.rand(2, 64, 64, 3)) + results = dict(imgs=copy.deepcopy(imgs), modality='RGB') + flip = Flip(flip_ratio=0, direction='horizontal') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + assert np.array_equal(imgs, results['imgs']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + # always flip imgs horizontally. + imgs = list(np.random.rand(2, 64, 64, 3)) + results = dict(imgs=copy.deepcopy(imgs), modality='RGB') + results['gt_bboxes'] = np.array([[0, 0, 60, 60]]) + results['proposals'] = np.array([[0, 0, 60, 60]]) + flip = Flip(flip_ratio=1, direction='horizontal') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + if flip_results['flip'] is True: + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + # flip flow images horizontally + imgs = [ + np.arange(16).reshape(4, 4).astype(np.float32), + np.arange(16, 32).reshape(4, 4).astype(np.float32) + ] + results = dict(imgs=copy.deepcopy(imgs), modality='Flow') + flip = Flip(flip_ratio=1, direction='horizontal') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + imgs = [x.reshape(4, 4, 1) for x in imgs] + flip_results['imgs'] = [ + x.reshape(4, 4, 1) for x in flip_results['imgs'] + ] + if flip_results['flip'] is True: + assert check_flip([imgs[0]], + [mmcv.iminvert(flip_results['imgs'][0])], + flip_results['flip_direction']) + assert check_flip([imgs[1]], [flip_results['imgs'][1]], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + # always flip imgs vertically. + imgs = list(np.random.rand(2, 64, 64, 3)) + results = dict(imgs=copy.deepcopy(imgs), modality='RGB') + flip = Flip(flip_ratio=1, direction='vertical') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + if flip_results['flip'] is True: + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + assert repr(flip) == (f'{flip.__class__.__name__}' + f'(flip_ratio={1}, direction=vertical, ' + f'flip_label_map={None}, lazy={False})') + + # transform label for the flipped image with the specific label. + _flip_label_map = {4: 6} + imgs = list(np.random.rand(2, 64, 64, 3)) + + # the label should be mapped.
+ results = dict(imgs=copy.deepcopy(imgs), modality='RGB', label=4) + flip = Flip( + flip_ratio=1, + direction='horizontal', + flip_label_map=_flip_label_map) + flip_results = flip(results) + assert results['label'] == 6 + + # the label should not be mapped. + results = dict(imgs=copy.deepcopy(imgs), modality='RGB', label=3) + flip = Flip( + flip_ratio=1, + direction='horizontal', + flip_label_map=_flip_label_map) + flip_results = flip(results) + assert results['label'] == 3 + + # flip the keypoints + results = dict( + keypoint=np.array([[1, 1], [63, 63]]).reshape([1, 1, 2, 2]), + modality='Pose', + img_shape=(64, 64)) + flip = Flip( + flip_ratio=1, direction='horizontal', left_kp=[0], right_kp=[1]) + flip_results = flip(results) + assert_array_almost_equal(flip_results['keypoint'][0, 0], + np.array([[1, 63], [63, 1]])) + + results = dict( + keypoint=np.array([[1, 1], [63, 63]]).reshape([1, 1, 2, 2]), + modality='Pose', + img_shape=(64, 64)) + flip = Flip( + flip_ratio=1, direction='horizontal', left_kp=[], right_kp=[]) + flip_results = flip(results) + assert_array_almost_equal(flip_results['keypoint'][0, 0], + np.array([[63, 1], [1, 63]])) + + with pytest.raises(AssertionError): + results = dict( + keypoint=np.array([[1, 1], [63, 63]]).reshape([1, 1, 2, 2]), + modality='Pose', + img_shape=(64, 64)) + flip = Flip( + flip_ratio=1, direction='vertical', left_kp=[], right_kp=[]) + flip_results = flip(results) + + +class TestLazy: + + @staticmethod + def test_init_lazy(): + from mmaction.datasets.transforms.processing import \ + _init_lazy_if_proper # noqa: E501 + with pytest.raises(AssertionError): + # use lazy operation but "lazy" not in results + result = dict(lazy=dict(), img_shape=[64, 64]) + _init_lazy_if_proper(result, False) + + lazy_keys = [ + 'original_shape', 'crop_bbox', 'flip', 'flip_direction', + 'interpolation' + ] + + # 'img_shape' not in results + result = dict(imgs=list(np.random.randn(3, 64, 64, 3))) + _init_lazy_if_proper(result, True) + assert assert_dict_has_keys(result, ['imgs', 'lazy', 'img_shape']) + assert assert_dict_has_keys(result['lazy'], lazy_keys) + + # 'img_shape' in results + result = dict(img_shape=[64, 64]) + _init_lazy_if_proper(result, True) + assert assert_dict_has_keys(result, ['lazy', 'img_shape']) + assert assert_dict_has_keys(result['lazy'], lazy_keys) + + # do not use lazy operation + result = dict(img_shape=[64, 64]) + _init_lazy_if_proper(result, False) + assert assert_dict_has_keys(result, ['img_shape']) + assert 'lazy' not in result + + @staticmethod + def test_random_crop_lazy(): + with pytest.raises(TypeError): + # size must be an int + RandomCrop(size=(112, 112), lazy=True) + with pytest.raises(AssertionError): + # "size > height" or "size > width" is not allowed + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=320, lazy=True) + random_crop(results) + + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'lazy'] + + # General case + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224, lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert 'lazy' not in random_crop_result_fuse + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result_fuse['img_shape'] + assert h == w == 224 + + # Test the case that no need 
for cropping + imgs = list(np.random.rand(2, 224, 224, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224, lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert 'lazy' not in random_crop_result_fuse + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result_fuse['img_shape'] + assert h == w == 224 + + # Test the one-side-equal case + imgs = list(np.random.rand(2, 224, 225, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224, lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert 'lazy' not in random_crop_result_fuse + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result_fuse['img_shape'] + assert h == w == 224 + + assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(size={224}, lazy={True})') + + @staticmethod + def test_random_resized_crop_lazy(): + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'lazy'] + # There will be a slight difference because of rounding + eps = 0.01 + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + + with pytest.raises(AssertionError): + # area_range[0] > area_range[1], which is wrong + random_crop = RandomResizedCrop(area_range=(0.9, 0.7), lazy=True) + random_crop(results) + with pytest.raises(AssertionError): + # 0 > area_range[0] and area_range[1] > 1, which is wrong + random_crop = RandomResizedCrop( + aspect_ratio_range=(-0.1, 2.0), lazy=True) + random_crop(results) + + random_crop = RandomResizedCrop(lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert ((0.08 - eps <= h * w / 256 / 341) + and (h * w / 256 / 341 <= 1 + eps)) + assert (3. / 4. - eps <= h / w) and (h / w - eps <= 4. / 3.) + assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(area_range={(0.08, 1.0)}, ' + f'aspect_ratio_range={(3 / 4, 4 / 3)}, ' + f'lazy={True})') + + random_crop = RandomResizedCrop( + area_range=(0.9, 0.9), aspect_ratio_range=(10.0, 10.1), lazy=True) + # Test fallback cases by very big area range + imgs = np.random.rand(2, 256, 341, 3) + results = dict(imgs=imgs) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 256 + + @staticmethod + def test_multi_scale_crop_lazy(): + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop(0.5, lazy=True) + + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop('224', lazy=True) + + with pytest.raises(TypeError): + # scales must be tuple. 
+ MultiScaleCrop( + 224, scales=[ + 1, + ], lazy=True) + + with pytest.raises(ValueError): + # num_fixed_crops must be 5 or 13 + MultiScaleCrop(224, num_fixed_crops=6, lazy=True) + + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'scales'] + + # MultiScaleCrop with normal crops. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0, + lazy=True) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_result = multi_scale_crop(results) + assert id(imgs) == id(multi_scale_crop_result['imgs']) + assert assert_dict_has_keys(multi_scale_crop_result, target_keys) + multi_scale_crop_result_fuse = Fuse()(multi_scale_crop_result) + assert check_crop(imgs, multi_scale_crop_result_fuse['imgs'], + multi_scale_crop_result['crop_bbox']) + assert multi_scale_crop_result_fuse['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with more fixed crops. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0, + num_fixed_crops=13, + lazy=True) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_result = multi_scale_crop(results) + assert id(imgs) == id(multi_scale_crop_result['imgs']) + assert assert_dict_has_keys(multi_scale_crop_result, target_keys) + multi_scale_crop_result_fuse = Fuse()(multi_scale_crop_result) + assert check_crop(imgs, multi_scale_crop_result_fuse['imgs'], + multi_scale_crop_result['crop_bbox']) + assert multi_scale_crop_result_fuse['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with random crop. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=True, + max_wh_scale_gap=0, + lazy=True) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_result = multi_scale_crop(results) + assert id(imgs) == id(multi_scale_crop_result['imgs']) + assert assert_dict_has_keys(multi_scale_crop_result, target_keys) + multi_scale_crop_result_fuse = Fuse()(multi_scale_crop_result) + assert check_crop(imgs, multi_scale_crop_result_fuse['imgs'], + multi_scale_crop_result['crop_bbox']) + assert (multi_scale_crop_result_fuse['img_shape'] in [(256, 256), + (204, 204)]) + + assert repr(multi_scale_crop) == ( + f'{multi_scale_crop.__class__.__name__}' + f'(input_size={(224, 224)}, scales={(1, 0.8)}, ' + f'max_wh_scale_gap={0}, random_crop={True}, ' + f'num_fixed_crops={5}, lazy={True})') + + @staticmethod + def test_resize_lazy(): + with pytest.raises(ValueError): + # scale must be positive + Resize(-0.5, lazy=True) + + with pytest.raises(TypeError): + # scale must be tuple of int + Resize('224', lazy=True) + + target_keys = [ + 'imgs', 'img_shape', 'keep_ratio', 'scale_factor', 'modality' + ] + + # scale with -1 to indicate np.inf + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs, modality='RGB') + resize = Resize(scale=(-1, 256), keep_ratio=True, lazy=True) + resize_results = resize(results) + assert id(imgs) == id(resize_results['imgs']) + assert assert_dict_has_keys(resize_results, target_keys) + resize_results_fuse = Fuse()(resize_results) + assert np.all(resize_results_fuse['scale_factor'] == np.array( + [341 / 320, 256 / 240], dtype=np.float32)) + assert resize_results_fuse['img_shape'] == (256, 341) + + # scale with a normal tuple (320, 320) + imgs = list(np.random.rand(2, 240, 320, 3))
+ results = dict(imgs=imgs, modality='RGB') + resize = Resize(scale=(320, 320), keep_ratio=False, lazy=True) + resize_results = resize(results) + assert id(imgs) == id(resize_results['imgs']) + assert assert_dict_has_keys(resize_results, target_keys) + resize_results_fuse = Fuse()(resize_results) + assert np.all(resize_results_fuse['scale_factor'] == np.array( + [1, 320 / 240], dtype=np.float32)) + assert resize_results_fuse['img_shape'] == (320, 320) + + # scale with a normal tuple (341, 256) + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs, modality='RGB') + resize = Resize(scale=(341, 256), keep_ratio=False, lazy=True) + resize_results = resize(results) + assert id(imgs) == id(resize_results['imgs']) + assert assert_dict_has_keys(resize_results, target_keys) + resize_results_fuse = Fuse()(resize_results) + assert np.all(resize_results_fuse['scale_factor'] == np.array( + [341 / 320, 256 / 240], dtype=np.float32)) + assert resize_results_fuse['img_shape'] == (256, 341) + + assert repr(resize) == (f'{resize.__class__.__name__}' + f'(scale={(341, 256)}, keep_ratio={False}, ' + + f'interpolation=bilinear, lazy={True})') + + @staticmethod + def test_flip_lazy(): + with pytest.raises(ValueError): + Flip(direction='vertically', lazy=True) + + target_keys = ['imgs', 'flip_direction', 'modality'] + + # do not flip imgs. + imgs = list(np.random.rand(2, 64, 64, 3)) + imgs_tmp = imgs.copy() + results = dict(imgs=imgs_tmp, modality='RGB') + flip = Flip(flip_ratio=0, direction='horizontal', lazy=True) + flip_results = flip(results) + assert id(imgs_tmp) == id(flip_results['imgs']) + assert assert_dict_has_keys(flip_results, target_keys) + flip_results_fuse = Fuse()(flip_results) + assert np.equal(imgs, results['imgs']).all() + assert id(flip_results['imgs']) == id(results['imgs']) + assert flip_results_fuse['imgs'][0].shape == (64, 64, 3) + + # always flip imgs horizontally. + imgs = list(np.random.rand(2, 64, 64, 3)) + imgs_tmp = imgs.copy() + results = dict(imgs=imgs_tmp, modality='RGB') + flip = Flip(flip_ratio=1, direction='horizontal', lazy=True) + flip_results = flip(results) + assert id(imgs_tmp) == id(flip_results['imgs']) + assert assert_dict_has_keys(flip_results, target_keys) + flip_results_fuse = Fuse()(flip_results) + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert flip_results_fuse['imgs'][0].shape == (64, 64, 3) + + # always flip imgs vertically.
+ imgs = list(np.random.rand(2, 64, 64, 3)) + imgs_tmp = imgs.copy() + results = dict(imgs=imgs_tmp, modality='RGB') + flip = Flip(flip_ratio=1, direction='vertical', lazy=True) + flip_results = flip(results) + assert id(imgs_tmp) == id(flip_results['imgs']) + assert assert_dict_has_keys(flip_results, target_keys) + flip_results_fuse = Fuse()(flip_results) + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert flip_results_fuse['imgs'][0].shape == (64, 64, 3) + + assert repr(flip) == (f'{flip.__class__.__name__}' + f'(flip_ratio={1}, direction=vertical, ' + f'flip_label_map={None}, lazy={True})') + + @staticmethod + def test_center_crop_lazy(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop('224') + + # center crop with crop_size 224 + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs) + center_crop = CenterCrop(crop_size=224, lazy=True) + center_crop_results = center_crop(results) + + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(center_crop_results, target_keys) + center_crop_results_fuse = Fuse()(center_crop_results) + assert check_crop(imgs, center_crop_results_fuse['imgs'], + center_crop_results['crop_bbox']) + assert np.all(center_crop_results_fuse['crop_bbox'] == np.array( + [48, 8, 272, 232])) + assert center_crop_results_fuse['img_shape'] == (224, 224) + + assert repr(center_crop) == (f'{center_crop.__class__.__name__}' + f'(crop_size={(224, 224)}, lazy={True})') diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..69ba27d3cd52270af5e24563382e6c8756883d50 --- /dev/null +++ b/tests/datasets/transforms/test_sampling.py @@ -0,0 +1,863 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import os.path as osp + +import mmcv +import numpy as np +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_equal + +from mmaction.datasets.transforms import (AudioFeatureSelector, + DenseSampleFrames, SampleAVAFrames, + SampleFrames, UntrimmedSampleFrames) + + +class BaseTestLoading: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../data')) + cls.img_path = osp.join(cls.data_prefix, 'test.jpg') + cls.video_path = osp.join(cls.data_prefix, 'test.mp4') + cls.wav_path = osp.join(cls.data_prefix, 'test.wav') + cls.audio_spec_path = osp.join(cls.data_prefix, 'test.npy') + cls.img_dir = osp.join(cls.data_prefix, 'imgs') + cls.raw_feature_dir = osp.join(cls.data_prefix, 'activitynet_features') + cls.bsp_feature_dir = osp.join(cls.data_prefix, 'bsp_features') + cls.proposals_dir = osp.join(cls.data_prefix, 'proposals') + + cls.total_frames = 5 + cls.filename_tmpl = 'img_{:05}.jpg' + cls.flow_filename_tmpl = '{}_{:05d}.jpg' + video_total_frames = len(mmcv.VideoReader(cls.video_path)) + cls.audio_total_frames = video_total_frames + + cls.video_results = dict( + filename=cls.video_path, + label=1, + total_frames=video_total_frames, + start_index=0) + cls.audio_results = dict( + audios=np.random.randn(1280, ), + audio_path=cls.wav_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.audio_feature_results = dict( + audios=np.random.randn(128, 80), + audio_path=cls.audio_spec_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.filename_tmpl, + start_index=1, + modality='RGB', + offset=0, + label=1) + cls.flow_frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.flow_filename_tmpl, + modality='Flow', + offset=0, + label=1) + cls.action_results = dict( + video_name='v_test1', + data_prefix=cls.raw_feature_dir, + temporal_scale=5, + boundary_ratio=0.1, + duration_second=10, + duration_frame=10, + feature_frame=8, + annotations=[{ + 'segment': [3.0, 5.0], + 'label': 'Rock climbing' + }]) + """ + from mmaction.datasets.ssn_dataset import SSNInstance + cls.proposal_results = dict( + frame_dir=cls.img_dir, + video_id='imgs', + total_frames=cls.total_frames, + filename_tmpl=cls.filename_tmpl, + start_index=1, + out_proposals=[[['imgs', SSNInstance(1, 4, 10, 1, 1, 1)], 0], + [['imgs', SSNInstance(2, 5, 10, 2, 1, 1)], 0]]) + """ + + cls.ava_results = dict( + fps=30, timestamp=902, timestamp_start=840, shot_info=(0, 27000)) + + cls.hvu_label_example1 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[0], object=[2, 3], scene=[0, 1])) + cls.hvu_label_example2 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[1], scene=[1, 2], concept=[1])) + + +class TestSampling(BaseTestLoading): + + def test_sample_frames(self): + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames' + ] + + # Sample Frame with tail Frames + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=3, frame_interval=1, num_clips=5, keep_tail_frames=True) + sample_frames = SampleFrames(**config) + sample_frames(video_result) + sample_frames(frame_result) + + # Sample Frame with no temporal_jitter + # 
clip_len=3, frame_interval=1, num_clips=5 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=3, frame_interval=1, num_clips=5, temporal_jitter=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 15 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 15 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={3}, ' + f'frame_interval={1}, ' + f'num_clips={5}, ' + f'temporal_jitter={False}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={False})') + + # Sample Frame with no temporal_jitter + # clip_len=5, frame_interval=1, num_clips=5, + # out_of_bound_opt='repeat_last' + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=5, + frame_interval=1, + num_clips=5, + temporal_jitter=False, + out_of_bound_opt='repeat_last') + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={5}, ' + f'frame_interval={1}, ' + f'num_clips={5}, ' + f'temporal_jitter={False}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=repeat_last, ' + f'test_mode={False})') + + def check_monotonous(arr): + length = arr.shape[0] + for i in range(length - 1): + if arr[i] > arr[i + 1]: + return False + return True + + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 25 + frame_inds = sample_frames_results['frame_inds'].reshape([5, 5]) + for i in range(5): + assert check_monotonous(frame_inds[i]) + + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 25 + frame_inds = sample_frames_results['frame_inds'].reshape([5, 5]) + for i in range(5): + assert check_monotonous(frame_inds[i]) + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame with temporal_jitter + # clip_len=4, frame_interval=2, num_clips=5 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, frame_interval=2, num_clips=5, temporal_jitter=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 20 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 20 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={2}, ' + f'num_clips={5}, ' + f'temporal_jitter={True}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={False})') + + # Sample Frame with no temporal_jitter in test mode + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + 
config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 24 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 24 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'temporal_jitter={False}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={True})') + + # Sample Frame with no temporal_jitter in test mode + # clip_len=3, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=3, + frame_interval=1, + num_clips=6, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 18 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 18 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame with no temporal_jitter to get clip_offsets + # clip_len=1, frame_interval=1, num_clips=8 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 6 + config = dict( + clip_len=1, + frame_interval=1, + num_clips=8, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 8 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([1, 2, 2, 3, 4, 5, 5, 6])) + + # Sample Frame with no temporal_jitter to get clip_offsets + # clip_len=1, frame_interval=1, num_clips=8 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 6 + config = dict( + clip_len=1, + frame_interval=1, + num_clips=8, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 8 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([1, 2, 2, 3, 4, 5, 5, 6])) + + # Sample Frame with no temporal_jitter to get clip_offsets zero + # clip_len=6, frame_interval=1, num_clips=1 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 5 + config = dict( + clip_len=6, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + test_mode=True) + sample_frames = 
SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 6 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 6 + assert_array_equal(sample_frames_results['frame_inds'], + [1, 2, 3, 4, 5, 1]) + + # Sample Frame with no temporal_jitter to get avg_interval <= 0 + # clip_len=12, frame_interval=1, num_clips=20 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 30 + config = dict( + clip_len=12, + frame_interval=1, + num_clips=20, + temporal_jitter=False, + test_mode=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 240 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 240 + assert np.max(sample_frames_results['frame_inds']) <= 30 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame with no temporal_jitter to get clip_offsets + # clip_len=1, frame_interval=1, num_clips=8 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 6 + config = dict( + clip_len=1, + frame_interval=1, + num_clips=8, + temporal_jitter=False, + test_mode=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert sample_frames_results['start_index'] == 0 + assert len(sample_frames_results['frame_inds']) == 8 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 8 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([1, 2, 3, 3, 4, 5, 5, 6])) + + # Sample Frame with no temporal_jitter to get clip_offsets zero + # clip_len=12, frame_interval=1, num_clips=2 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 10 + config = dict( + clip_len=12, + frame_interval=1, + num_clips=2, + temporal_jitter=False, + test_mode=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 24 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 24 + assert np.max(sample_frames_results['frame_inds']) <= 10 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame using twice sample + # clip_len=12, frame_interval=1, num_clips=2 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 40 + config = dict( + clip_len=12, + frame_interval=1, + num_clips=2, + temporal_jitter=False, + twice_sample=True, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) 
+ assert len(sample_frames_results['frame_inds']) == 48 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 48 + assert np.max(sample_frames_results['frame_inds']) <= 40 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + def test_dense_sample_frames(self): + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames' + ] + + # Dense sample with no temporal_jitter in test mode + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + temporal_jitter=False, + test_mode=True) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 240 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 240 + assert repr(dense_sample_frames) == ( + f'{dense_sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'sample_range={64}, ' + f'num_sample_positions={10}, ' + f'temporal_jitter={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={True})') + + # Dense sample with no temporal_jitter + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, frame_interval=1, num_clips=6, temporal_jitter=False) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 24 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 24 + + # Dense sample with no temporal_jitter, sample_range=32 in test mode + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + sample_range=32, + temporal_jitter=False, + test_mode=True) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 240 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 240 + + # Dense sample with no temporal_jitter, sample_range=32 + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + sample_range=32, + temporal_jitter=False) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert 
len(dense_sample_frames_results['frame_inds']) == 24 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 24 + assert repr(dense_sample_frames) == ( + f'{dense_sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'sample_range={32}, ' + f'num_sample_positions={10}, ' + f'temporal_jitter={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={False})') + + # Dense sample with no temporal_jitter, sample_range=1000 to check mod + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + sample_range=1000, + temporal_jitter=False) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 24 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 24 + + # Dense sample with no temporal_jitter in test mode + # sample_range=32, num_sample_positions=5 + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + num_sample_positions=5, + sample_range=32, + temporal_jitter=False, + test_mode=True) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 120 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 120 + assert repr(dense_sample_frames) == ( + f'{dense_sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'sample_range={32}, ' + f'num_sample_positions={5}, ' + f'temporal_jitter={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={True})') + + def test_untrim_sample_frames(self): + + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames' + ] + + frame_result = dict( + frame_dir=None, + total_frames=100, + filename_tmpl=None, + modality='RGB', + start_index=0, + label=1) + video_result = copy.deepcopy(self.video_results) + + config = dict(clip_len=1, clip_interval=16) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(frame_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 6 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([8, 24, 40, 56, 72, 88])) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict(clip_len=1, clip_interval=16) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + frame_inds = np.array(list(range(8, 300, 16))) + assert 
len(sample_frames_results['frame_inds']) == frame_inds.shape[0] + assert_array_equal(sample_frames_results['frame_inds'], frame_inds) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict(clip_len=1, clip_interval=16) + sample_frames = UntrimmedSampleFrames(**config) + frame_result_ = copy.deepcopy(frame_result) + frame_result_['start_index'] = 1 + sample_frames_results = sample_frames(frame_result_) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 6 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([8, 24, 40, 56, 72, 88]) + 1) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict(clip_len=3, clip_interval=16) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(frame_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 18 + assert_array_equal( + sample_frames_results['frame_inds'], + np.array([ + 7, 8, 9, 23, 24, 25, 39, 40, 41, 55, 56, 57, 71, 72, 73, 87, + 88, 89 + ])) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={3}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict( + clip_len=3, clip_interval=16, frame_interval=4) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(frame_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 18 + assert_array_equal( + sample_frames_results['frame_inds'], + np.array([ + 4, 8, 12, 20, 24, 28, 36, 40, 44, 52, 56, 60, 68, 72, 76, 84, + 88, 92 + ])) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={3}, ' + f'clip_interval={16}, ' + f'frame_interval={4})') + + def test_sample_ava_frames(self): + target_keys = [ + 'fps', 'timestamp', 'timestamp_start', 'shot_info', 'frame_inds', + 'clip_len', 'frame_interval' + ] + config = dict(clip_len=32, frame_interval=2) + sample_ava_dataset = SampleAVAFrames(**config) + ava_result = sample_ava_dataset(results=self.ava_results) + assert assert_dict_has_keys(ava_result, target_keys) + assert ava_result['clip_len'] == 32 + assert ava_result['frame_interval'] == 2 + assert len(ava_result['frame_inds']) == 32 + assert repr(sample_ava_dataset) == ( + f'{sample_ava_dataset.__class__.__name__}(' + f'clip_len={32}, ' + f'frame_interval={2}, ' + f'test_mode={False})') + + # add test case in Issue #306 + config = dict(clip_len=8, frame_interval=8) + sample_ava_dataset = SampleAVAFrames(**config) + ava_result = sample_ava_dataset(results=self.ava_results) + assert assert_dict_has_keys(ava_result, target_keys) + assert ava_result['clip_len'] == 8 + assert ava_result['frame_interval'] == 8 + assert len(ava_result['frame_inds']) == 8 + assert repr(sample_ava_dataset) == ( + f'{sample_ava_dataset.__class__.__name__}(' + f'clip_len={8}, ' + f'frame_interval={8}, ' + f'test_mode={False})') + + """ TODO + def test_sample_proposal_frames(self): + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames', 'start_index' + ] + + # test error cases + with pytest.raises(TypeError): + proposal_result = copy.deepcopy(self.proposal_results) + config = dict( + 
clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=('error', 'error'), + aug_ratio=0.5, + temporal_jitter=False) + sample_frames = SampleProposalFrames(**config) + sample_frames(proposal_result) + + # test normal cases + # Sample Frame with no temporal_jitter + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposal_results) + proposal_result['total_frames'] = 9 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=False) + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=train)') + + # Sample Frame with temporal_jitter + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposal_results) + proposal_result['total_frames'] = 9 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=True) + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={True}, ' + f'mode=train)') + + # Sample Frame with no temporal_jitter in val mode + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposals) + proposal_result['total_frames'] = 9 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=False, + mode='val') + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=val)') + + # Sample Frame with no temporal_jitter in test mode + # test_interval=2 + proposal_result = copy.deepcopy(self.proposals) + proposal_result['out_proposals'] = None + proposal_result['total_frames'] = 10 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + test_interval=2, + temporal_jitter=False, + mode='test') + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 5 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + 
f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={2}, ' + f'temporal_jitter={False}, ' + f'mode=test)') + + # Sample Frame with no temporal_jitter to get clip_offsets zero + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposals) + proposal_result['total_frames'] = 3 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=False) + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=train)') + + # Sample Frame with no temporal_jitter to + # get clip_offsets zero in val mode + # clip_len=1, frame_interval=1 + # body_segments=4, aug_segments=(2, 2) + proposal_result = copy.deepcopy(self.proposals) + proposal_result['total_frames'] = 3 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=4, + aug_segments=(2, 2), + aug_ratio=0.5, + temporal_jitter=False, + mode='val') + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 16 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={4}, ' + f'aug_segments={(2, 2)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=val)') + """ + + def test_audio_feature_selector(self): + target_keys = ['audios'] + # test frame selector with 2 dim input + inputs = copy.deepcopy(self.audio_feature_results) + inputs['frame_inds'] = np.arange(0, self.audio_total_frames, + 2)[:, np.newaxis] + inputs['num_clips'] = 1 + inputs['length'] = 1280 + audio_feature_selector = AudioFeatureSelector() + results = audio_feature_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert repr(audio_feature_selector) == ( + f'{audio_feature_selector.__class__.__name__}(' + f'fix_length={128})') diff --git a/tests/datasets/transforms/test_text_transforms.py b/tests/datasets/transforms/test_text_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..e3444db34e32d824a8d06c20ef72f11019a3f430 --- /dev/null +++ b/tests/datasets/transforms/test_text_transforms.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.datasets.transforms import CLIPTokenize + + +class TestTextTransforms: + + @staticmethod + def test_clip_tokenize(): + results = {'text': 'Hello, MMAction2 2.0!'} + clip_tokenize = CLIPTokenize() + results = clip_tokenize(results) + assert results['text'].shape[0] == 77 + assert results['text'].dtype == torch.int32 diff --git a/tests/datasets/transforms/test_wrappers.py b/tests/datasets/transforms/test_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..86a1099202b4622fb5a4b4d78abedb8d014230b5 --- /dev/null +++ b/tests/datasets/transforms/test_wrappers.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import pytest +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal + +from mmaction.datasets.transforms import CenterCrop, ImgAug + + +def check_flip(origin_imgs, result_imgs, flip_type): + """Check if the origin_imgs are flipped correctly into result_imgs in + different flip_types.""" + n, _, _, _ = np.shape(origin_imgs) + if flip_type == 'horizontal': + for i in range(n): + if np.any(result_imgs[i] != np.fliplr(origin_imgs[i])): + return False + else: + # yapf: disable + for i in range(n): + if np.any(result_imgs[i] != np.transpose(np.fliplr(np.transpose(origin_imgs[i], (1, 0, 2))), (1, 0, 2))): # noqa:E501 + return False + # yapf: enable + return True + + +class TestAugumentations: + + @staticmethod + def test_ImgAug(): + + with pytest.raises(ValueError): + # transforms only support one string, 'default' + ImgAug(transforms='test') + + with pytest.raises(ValueError): + # transforms only support string or list of dicts + # or iaa.Augmenter object + ImgAug(transforms=dict(type='Rotate')) + + with pytest.raises(AssertionError): + # each dict must have a `type` key + ImgAug(transforms=[dict(rotate=(-30, 30))]) + + with pytest.raises(AttributeError): + # `type` must be available in ImgAug + ImgAug(transforms=[dict(type='BlaBla')]) + + with pytest.raises(TypeError): + # `type` must be str or iaa available type + ImgAug(transforms=[dict(type=CenterCrop)]) + + from imgaug import augmenters as iaa + + # check default configs + target_keys = ['imgs', 'img_shape', 'modality'] + imgs = list(np.random.randint(0, 255, (1, 64, 64, 3)).astype(np.uint8)) + results = dict(imgs=imgs, modality='RGB') + default_ImgAug = ImgAug(transforms='default') + default_results = default_ImgAug(results) + assert_dict_has_keys(default_results, target_keys) + assert default_results['img_shape'] == (64, 64) + + # check flip (both images and bboxes) + target_keys = ['imgs', 'gt_bboxes', 'proposals', 'img_shape'] + imgs = list(np.random.rand(1, 64, 64, 3).astype(np.float32)) + results = dict( + imgs=imgs, + modality='RGB', + proposals=np.array([[0, 0, 25, 35]]), + img_shape=(64, 64), + gt_bboxes=np.array([[0, 0, 25, 35]])) + ImgAug_flip = ImgAug(transforms=[dict(type='Fliplr')]) + flip_results = ImgAug_flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + assert check_flip(imgs, flip_results['imgs'], 'horizontal') + assert_array_almost_equal(flip_results['gt_bboxes'], + np.array([[39, 0, 64, 35]])) + assert_array_almost_equal(flip_results['proposals'], + np.array([[39, 0, 64, 35]])) + transforms = iaa.Sequential([iaa.Fliplr()]) + assert repr(ImgAug_flip) == f'ImgAug(transforms={transforms})' + + # check crop (both images and bboxes) + target_keys = ['crop_bbox', 'gt_bboxes', 'imgs', 'img_shape'] + imgs = list(np.random.rand(1, 122, 122, 3)) + results = dict( + imgs=imgs, + modality='RGB', + img_shape=(122, 122), + gt_bboxes=np.array([[1.5, 2.5, 110, 64]])) + ImgAug_center_crop = ImgAug(transforms=[ + dict( + type=iaa.CropToFixedSize, + width=100, + height=100, + position='center') + ]) + crop_results = ImgAug_center_crop(results) + assert_dict_has_keys(crop_results, target_keys) + assert_array_almost_equal(crop_results['gt_bboxes'], + np.array([[0., 0., 99., 53.]])) + assert 'proposals' not in results + transforms = iaa.Sequential( + [iaa.CropToFixedSize(width=100, height=100, position='center')]) + assert repr(ImgAug_center_crop) == f'ImgAug(transforms={transforms})' + + # check resize (images only) + target_keys = ['imgs', 
'img_shape'] + imgs = list(np.random.rand(1, 64, 64, 3)) + results = dict(imgs=imgs, modality='RGB') + transforms = iaa.Resize(32) + ImgAug_resize = ImgAug(transforms=transforms) + resize_results = ImgAug_resize(results) + assert_dict_has_keys(resize_results, target_keys) + assert resize_results['img_shape'] == (32, 32) + assert repr(ImgAug_resize) == f'ImgAug(transforms={transforms})' diff --git a/tests/engine/optimizers/test_swin_optim_wrapper_constructor.py b/tests/engine/optimizers/test_swin_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..d9ceba67e03dbf509efc139405ce41530bb7932c --- /dev/null +++ b/tests/engine/optimizers/test_swin_optim_wrapper_constructor.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.engine.optimizers import SwinOptimWrapperConstructor + + +class SubModel(nn.Module): + + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(2, 2, kernel_size=1, groups=2) + self.gn = nn.GroupNorm(2, 2) + self.fc = nn.Linear(2, 2) + self.param1 = nn.Parameter(torch.ones(1)) + + +class ExampleModel(nn.Module): + + def __init__(self): + super().__init__() + self.param1 = nn.Parameter(torch.ones(1)) + self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d(4, 2, kernel_size=1) + self.bn = nn.BatchNorm2d(2) + self.sub = SubModel() + self.fc = nn.Linear(2, 1) + + +base_lr = 0.01 +base_wd = 0.0001 +betas = (0.9, 0.999) + + +def test_swin_optim_wrapper_constructor(): + model = ExampleModel() + optim_wrapper_cfg = dict( + optimizer=dict( + type='AdamW', lr=base_lr, weight_decay=base_wd, betas=betas)) + paramwise_cfg = { + 'base.param1': dict(lr_mult=2.), + 'base.conv1.weight': dict(lr_mult=3.), + 'bn': dict(decay_mult=0.), + 'sub': dict(lr_mult=0.1), + 'sub.conv1.bias': dict(decay_mult=0.1), + 'gn': dict(decay_mult=0.), + } + constructor = SwinOptimWrapperConstructor(optim_wrapper_cfg, paramwise_cfg) + optim_wrapper = constructor(model) + + optimizer = optim_wrapper.optimizer + param_groups = optimizer.param_groups + assert isinstance(optimizer, torch.optim.AdamW) + assert optimizer.defaults['lr'] == base_lr + assert optimizer.defaults['weight_decay'] == base_wd + model_parameters = list(model.parameters()) + assert len(param_groups) == len(model_parameters) + for i, param in enumerate(model_parameters): + param_group = param_groups[i] + assert torch.equal(param_group['params'][0], param) + assert param_group['betas'] == betas + + # param1 + param1 = param_groups[0] + assert param1['lr'] == base_lr * paramwise_cfg['base.param1']['lr_mult'] + assert param1['weight_decay'] == base_wd + # conv1.weight + conv1_weight = param_groups[1] + assert conv1_weight['lr'] == \ + base_lr * paramwise_cfg['base.conv1.weight']['lr_mult'] + assert conv1_weight['weight_decay'] == base_wd + # conv2.weight + conv2_weight = param_groups[2] + assert conv2_weight['lr'] == base_lr + assert conv2_weight['weight_decay'] == base_wd + # conv2.bias + conv2_bias = param_groups[3] + assert conv2_bias['lr'] == base_lr + assert conv2_bias['weight_decay'] == base_wd + # bn.weight + bn_weight = param_groups[4] + assert bn_weight['lr'] == base_lr + assert bn_weight['weight_decay'] == \ + base_wd * paramwise_cfg['bn']['decay_mult'] + # bn.bias + bn_bias = param_groups[5] + assert bn_bias['lr'] == base_lr + assert bn_bias['weight_decay'] == \ + base_wd * paramwise_cfg['bn']['decay_mult'] + # sub.param1 + sub_param1 = param_groups[6] + assert sub_param1['lr'] == base_lr * 
paramwise_cfg['sub']['lr_mult'] + assert sub_param1['weight_decay'] == base_wd + # sub.conv1.weight + sub_conv1_weight = param_groups[7] + assert sub_conv1_weight['lr'] == base_lr * paramwise_cfg['sub']['lr_mult'] + assert sub_conv1_weight['weight_decay'] == base_wd + # sub.conv1.bias + sub_conv1_bias = param_groups[8] + assert sub_conv1_bias['lr'] == base_lr * paramwise_cfg['sub']['lr_mult'] + assert sub_conv1_bias['weight_decay'] == \ + base_wd * paramwise_cfg['sub.conv1.bias']['decay_mult'] + # sub.gn.weight + sub_gn_weight = param_groups[9] + assert sub_gn_weight['lr'] == base_lr * paramwise_cfg['sub']['lr_mult'] + assert sub_gn_weight['weight_decay'] == \ + base_wd * paramwise_cfg['gn']['decay_mult'] + # sub.gn.bias + sub_gn_bias = param_groups[10] + assert sub_gn_bias['lr'] == base_lr * paramwise_cfg['sub']['lr_mult'] + assert sub_gn_bias['weight_decay'] == \ + base_wd * paramwise_cfg['gn']['decay_mult'] + # sub.fc.weight + sub_fc_weight = param_groups[11] + assert sub_fc_weight['lr'] == base_lr * paramwise_cfg['sub']['lr_mult'] + assert sub_fc_weight['weight_decay'] == base_wd + # sub.fc.bias + sub_fc_bias = param_groups[12] + assert sub_fc_bias['lr'] == base_lr * paramwise_cfg['sub']['lr_mult'] + assert sub_fc_bias['weight_decay'] == base_wd + # fc.weight + fc_weight = param_groups[13] + assert fc_weight['lr'] == base_lr + assert fc_weight['weight_decay'] == base_wd + # fc.bias + fc_bias = param_groups[14] + assert fc_bias['lr'] == base_lr + assert fc_bias['weight_decay'] == base_wd diff --git a/tests/evaluation/metrics/test_acc_metric.py b/tests/evaluation/metrics/test_acc_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e51ce0d61c62c2a24f00df509b188d9a46a54635 --- /dev/null +++ b/tests/evaluation/metrics/test_acc_metric.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
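Note: the assertions above reduce to simple per-parameter scaling of the base learning rate and weight decay. The sketch below is a minimal stand-in, not the actual `SwinOptimWrapperConstructor` (whose key matching and `base.` prefix handling are more involved); `build_param_groups` and the toy model are illustrative names only.

```python
import torch
import torch.nn as nn


def build_param_groups(model, base_lr, base_wd, paramwise_cfg):
    """One AdamW group per parameter, scaled by the first matching cfg key.

    Simplified sketch: the real constructor's matching rules are richer.
    """
    groups = []
    for name, param in model.named_parameters():
        lr_mult, decay_mult = 1.0, 1.0
        for key, cfg in paramwise_cfg.items():
            if key in name:
                lr_mult = cfg.get('lr_mult', 1.0)
                decay_mult = cfg.get('decay_mult', 1.0)
                break
        groups.append(dict(params=[param],
                           lr=base_lr * lr_mult,
                           weight_decay=base_wd * decay_mult))
    return groups


toy = nn.Sequential(nn.Conv2d(3, 4, 1), nn.BatchNorm2d(4))
optimizer = torch.optim.AdamW(
    build_param_groups(toy, base_lr=0.01, base_wd=1e-4,
                       paramwise_cfg={'1.': dict(decay_mult=0.)}))
# the BatchNorm parameters ('1.weight', '1.bias') end up with weight_decay=0
assert all(g['weight_decay'] == 0. for g in optimizer.param_groups[2:])
```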
+import os.path as osp +import platform +from unittest import TestCase + +import numpy as np +import pytest +import torch +from mmengine import load +from numpy.testing import assert_array_almost_equal + +from mmaction.evaluation import AccMetric, ConfusionMatrix, MultiSportsMetric +from mmaction.evaluation.functional import ava_eval +from mmaction.registry import METRICS +from mmaction.structures import ActionDataSample + + +def generate_data(num_classes=5, random_label=False, multi_label=False): + data_batch = [] + data_samples = [] + for i in range(num_classes * 10): + scores = torch.randn(num_classes) + if multi_label: + label = torch.ones_like(scores) + elif random_label: + label = torch.randint(num_classes, size=[1]) + else: + label = torch.LongTensor([scores.argmax().item()]) + data_sample = dict(pred_score=scores, gt_label=label) + data_samples.append(data_sample) + return data_batch, data_samples + + +def test_acc_metric(): + num_classes = 32 + metric = AccMetric(metric_list=('top_k_accuracy', 'mean_class_accuracy')) + data_batch, predictions = generate_data( + num_classes=num_classes, random_label=True) + metric.process(data_batch, predictions) + eval_results = metric.compute_metrics(metric.results) + assert 0.0 <= eval_results['top1'] <= eval_results['top5'] <= 1.0 + assert 0.0 <= eval_results['mean1'] <= 1.0 + metric.results.clear() + + data_batch, predictions = generate_data( + num_classes=num_classes, random_label=False) + metric.process(data_batch, predictions) + eval_results = metric.compute_metrics(metric.results) + assert eval_results['top1'] == eval_results['top5'] == 1.0 + assert eval_results['mean1'] == 1.0 + + metric = AccMetric( + metric_list=('mean_average_precision', 'mmit_mean_average_precision')) + data_batch, predictions = generate_data( + num_classes=num_classes, multi_label=True) + metric.process(data_batch, predictions) + eval_results = metric.compute_metrics(metric.results) + assert eval_results['mean_average_precision'] == 1.0 + assert eval_results['mmit_mean_average_precision'] == 1.0 + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Multiprocess Fail') +def test_ava_detection(): + data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../data/eval_detection')) + + gt_path = osp.join(data_prefix, 'gt.csv') + result_path = osp.join(data_prefix, 'pred.csv') + label_map = osp.join(data_prefix, 'action_list.txt') + + # eval bbox + detection = ava_eval(result_path, 'mAP', label_map, gt_path, None) + assert_array_almost_equal(detection['overall'], 0.09385522) + + +def test_multisport_detection(): + data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../data/eval_multisports')) + + gt_path = osp.join(data_prefix, 'gt.pkl') + result_path = osp.join(data_prefix, 'data_samples.pkl') + + result_datasamples = load(result_path) + metric = MultiSportsMetric(gt_path) + metric.process(None, result_datasamples) + eval_result = metric.compute_metrics(metric.results) + assert eval_result['frameAP'] == 83.6506 + assert eval_result['v_map@0.2'] == 37.5 + assert eval_result['v_map@0.5'] == 37.5 + assert eval_result['v_map_0.10:0.90'] == 29.1667 + + +class TestConfusionMatrix(TestCase): + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + pred = [ + ActionDataSample().set_pred_score(i).set_pred_label( + j).set_gt_label(k).to_dict() for i, j, k in zip([ + torch.tensor([0.7, 0.0, 0.3]), + torch.tensor([0.5, 0.2, 0.3]), + torch.tensor([0.4, 0.5, 0.1]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 
0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0]) + ] + + # Test with score (use score instead of label if score exists) + metric = METRICS.build(dict(type='ConfusionMatrix')) + metric.process(None, pred) + res = metric.evaluate(6) + self.assertIsInstance(res, dict) + self.assertTensorEqual( + res['confusion_matrix/result'], + torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with label + for sample in pred: + del sample['pred_score'] + metric = METRICS.build(dict(type='ConfusionMatrix')) + metric.process(None, pred) + with self.assertRaisesRegex(AssertionError, + 'Please specify the `num_classes`'): + metric.evaluate(6) + + metric = METRICS.build(dict(type='ConfusionMatrix', num_classes=3)) + metric.process(None, pred) + self.assertIsInstance(res, dict) + self.assertTensorEqual( + res['confusion_matrix/result'], + torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + def test_calculate(self): + y_true = np.array([0, 0, 1, 2, 1, 0]) + y_label = torch.tensor([0, 0, 1, 2, 2, 2]) + y_score = [ + [0.7, 0.0, 0.3], + [0.5, 0.2, 0.3], + [0.4, 0.5, 0.1], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + ] + + # Test with score + cm = ConfusionMatrix.calculate(y_score, y_true) + self.assertIsInstance(cm, torch.Tensor) + self.assertTensorEqual( + cm, torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with label + with self.assertRaisesRegex(AssertionError, + 'Please specify the `num_classes`'): + ConfusionMatrix.calculate(y_label, y_true) + + cm = ConfusionMatrix.calculate(y_label, y_true, num_classes=3) + self.assertIsInstance(cm, torch.Tensor) + self.assertTensorEqual( + cm, torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with invalid inputs + with self.assertRaisesRegex(TypeError, " is not"): + ConfusionMatrix.calculate(y_label, 'hi') + + def test_plot(self): + import matplotlib.pyplot as plt + + cm = torch.tensor([[2, 0, 1], [0, 1, 1], [0, 0, 1]]) + fig = ConfusionMatrix.plot(cm, include_values=True, show=False) + + self.assertIsInstance(fig, plt.Figure) + + def assertTensorEqual(self, + tensor: torch.Tensor, + value: float, + msg=None, + **kwarg): + tensor = tensor.to(torch.float32) + value = torch.tensor(value).float() + try: + torch.testing.assert_allclose(tensor, value, **kwarg) + except AssertionError as e: + self.fail(self._formatMessage(msg, str(e))) diff --git a/tests/evaluation/metrics/test_metric_utils.py b/tests/evaluation/metrics/test_metric_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f24fcbbe1e931f65c2a96d11b902e04940d887c9 --- /dev/null +++ b/tests/evaluation/metrics/test_metric_utils.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
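For reference, the 3x3 matrix asserted in the ConfusionMatrix tests above can be reproduced with a few lines of plain NumPy (rows are ground-truth classes, columns are argmax predictions). This is an independent sanity check, not the library implementation.

```python
import numpy as np

y_score = np.array([
    [0.7, 0.0, 0.3],
    [0.5, 0.2, 0.3],
    [0.4, 0.5, 0.1],
    [0.0, 0.0, 1.0],
    [0.0, 0.0, 1.0],
    [0.0, 0.0, 1.0],
])
y_true = np.array([0, 0, 1, 2, 1, 0])

num_classes = y_score.shape[1]
y_pred = y_score.argmax(axis=1)
cm = np.zeros((num_classes, num_classes), dtype=np.int64)
for gt, pred in zip(y_true, y_pred):
    cm[gt, pred] += 1          # row = ground truth, column = prediction

assert (cm == np.array([[2, 0, 1],
                        [0, 1, 1],
                        [0, 0, 1]])).all()
```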
+import random + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from mmaction.evaluation.functional import (average_recall_at_avg_proposals, + confusion_matrix, + get_weighted_score, + pairwise_temporal_iou, + top_k_classes) + + +def test_top_k_accurate_classes(): + scores = [ + np.array([0.1, 0.2, 0.3, 0.4]), # 3 + np.array([0.2, 0.3, 0.4, 0.1]), # 2 + np.array([0.3, 0.4, 0.1, 0.2]), # 1 + np.array([0.4, 0.1, 0.2, 0.3]), # 0 + np.array([0.25, 0.1, 0.3, 0.35]), # 3 + np.array([0.2, 0.15, 0.3, 0.35]), # 3 + ] + label = np.array([3, 2, 2, 1, 3, 3], dtype=np.int64) + + with pytest.raises(AssertionError): + top_k_classes(scores, label, 1, mode='wrong') + + results_top1 = top_k_classes(scores, label, 1) + results_top3 = top_k_classes(scores, label, 3) + assert len(results_top1) == 1 + assert len(results_top3) == 3 + assert results_top3[0] == results_top1[0] + assert results_top1 == [(3, 1.)] + assert results_top3 == [(3, 1.), (2, 0.5), (1, 0.0)] + + label = np.array([3, 2, 1, 1, 3, 0], dtype=np.int64) + results_top1 = top_k_classes(scores, label, 1, mode='inaccurate') + results_top3 = top_k_classes(scores, label, 3, mode='inaccurate') + assert len(results_top1) == 1 + assert len(results_top3) == 3 + assert results_top3[0] == results_top1[0] + assert results_top1 == [(0, 0.)] + assert results_top3 == [(0, 0.0), (1, 0.5), (2, 1.0)] + + +def test_pairwise_temporal_iou(): + target_segments = np.array([]) + candidate_segments = np.array([]) + with pytest.raises(ValueError): + pairwise_temporal_iou(target_segments, candidate_segments) + + # test temporal iou + target_segments = np.array([[1, 2], [2, 3]]) + candidate_segments = np.array([[2, 3], [2.5, 3]]) + temporal_iou = pairwise_temporal_iou(candidate_segments, target_segments) + assert_array_equal(temporal_iou, [[0, 0], [1, 0.5]]) + + # test temporal overlap_self + target_segments = np.array([[1, 2], [2, 3]]) + candidate_segments = np.array([[2, 3], [2.5, 3]]) + temporal_iou, temporal_overlap_self = pairwise_temporal_iou( + candidate_segments, target_segments, calculate_overlap_self=True) + assert_array_equal(temporal_overlap_self, [[0, 0], [1, 1]]) + + # test temporal overlap_self when candidate_segments is 1d + target_segments = np.array([[1, 2], [2, 3]]) + candidate_segments = np.array([2.5, 3]) + temporal_iou, temporal_overlap_self = pairwise_temporal_iou( + candidate_segments, target_segments, calculate_overlap_self=True) + assert_array_equal(temporal_overlap_self, [0, 1]) + + +def test_average_recall_at_avg_proposals(): + ground_truth1 = { + 'v_test1': np.array([[0, 1], [1, 2]]), + 'v_test2': np.array([[0, 1], [1, 2]]) + } + ground_truth2 = {'v_test1': np.array([[0, 1]])} + proposals1 = { + 'v_test1': np.array([[0, 1, 1], [1, 2, 1]]), + 'v_test2': np.array([[0, 1, 1], [1, 2, 1]]) + } + proposals2 = { + 'v_test1': np.array([[10, 11, 0.6], [11, 12, 0.4]]), + 'v_test2': np.array([[10, 11, 0.6], [11, 12, 0.4]]) + } + proposals3 = { + 'v_test1': np.array([[i, i + 1, 1 / (i + 1)] for i in range(100)]) + } + + recall, avg_recall, proposals_per_video, auc = ( + average_recall_at_avg_proposals(ground_truth1, proposals1, 4)) + assert_array_equal(recall, [[0.] * 49 + [0.5] * 50 + [1.]] * 10) + assert_array_equal(avg_recall, [0.] 
* 49 + [0.5] * 50 + [1.]) + assert_array_almost_equal( + proposals_per_video, np.arange(0.02, 2.02, 0.02), decimal=10) + assert auc == 25.5 + + recall, avg_recall, proposals_per_video, auc = ( + average_recall_at_avg_proposals(ground_truth1, proposals2, 4)) + assert_array_equal(recall, [[0.] * 100] * 10) + assert_array_equal(avg_recall, [0.] * 100) + assert_array_almost_equal( + proposals_per_video, np.arange(0.02, 2.02, 0.02), decimal=10) + assert auc == 0 + + recall, avg_recall, proposals_per_video, auc = ( + average_recall_at_avg_proposals(ground_truth2, proposals3, 100)) + assert_array_equal(recall, [[1.] * 100] * 10) + assert_array_equal(avg_recall, ([1.] * 100)) + assert_array_almost_equal( + proposals_per_video, np.arange(1, 101, 1), decimal=10) + assert auc == 99.0 + + +def test_get_weighted_score(): + score_a = [ + np.array([-0.2203, -0.7538, 1.8789, 0.4451, -0.2526]), + np.array([-0.0413, 0.6366, 1.1155, 0.3484, 0.0395]), + np.array([0.0365, 0.5158, 1.1067, -0.9276, -0.2124]), + np.array([0.6232, 0.9912, -0.8562, 0.0148, 1.6413]) + ] + score_b = [ + np.array([-0.0413, 0.6366, 1.1155, 0.3484, 0.0395]), + np.array([0.0365, 0.5158, 1.1067, -0.9276, -0.2124]), + np.array([0.6232, 0.9912, -0.8562, 0.0148, 1.6413]), + np.array([-0.2203, -0.7538, 1.8789, 0.4451, -0.2526]) + ] + weighted_score = get_weighted_score([score_a], [1]) + assert np.all(np.isclose(np.array(score_a), np.array(weighted_score))) + coeff_a, coeff_b = 2., 1. + weighted_score = get_weighted_score([score_a, score_b], [coeff_a, coeff_b]) + ground_truth = [ + x * coeff_a + y * coeff_b for x, y in zip(score_a, score_b) + ] + assert np.all(np.isclose(np.array(ground_truth), np.array(weighted_score))) + + +def gt_confusion_matrix(gt_labels, pred_labels, normalize=None): + """Calculate the ground truth confusion matrix.""" + max_index = max(max(gt_labels), max(pred_labels)) + confusion_mat = np.zeros((max_index + 1, max_index + 1), dtype=np.int64) + for gt, pred in zip(gt_labels, pred_labels): + confusion_mat[gt][pred] += 1 + del_index = [] + for i in range(max_index): + if sum(confusion_mat[i]) == 0 and sum(confusion_mat[:, i]) == 0: + del_index.append(i) + confusion_mat = np.delete(confusion_mat, del_index, axis=0) + confusion_mat = np.delete(confusion_mat, del_index, axis=1) + + if normalize is not None: + confusion_mat = np.array(confusion_mat, dtype=np.float64) + m, n = confusion_mat.shape + if normalize == 'true': + for i in range(m): + s = np.sum(confusion_mat[i], dtype=float) + if s == 0: + continue + confusion_mat[i, :] = confusion_mat[i, :] / s + print(confusion_mat[i, :]) + elif normalize == 'pred': + for i in range(n): + s = sum(confusion_mat[:, i]) + if s == 0: + continue + confusion_mat[:, i] = confusion_mat[:, i] / s + elif normalize == 'all': + s = np.sum(confusion_mat) + if s != 0: + confusion_mat /= s + + return confusion_mat + + +def test_confusion_matrix(): + # custom confusion_matrix + gt_labels = [np.int64(random.randint(0, 9)) for _ in range(100)] + pred_labels = np.random.randint(10, size=100, dtype=np.int64) + + for normalize in [None, 'true', 'pred', 'all']: + cf_mat = confusion_matrix(pred_labels, gt_labels, normalize) + gt_cf_mat = gt_confusion_matrix(gt_labels, pred_labels, normalize) + assert_array_equal(cf_mat, gt_cf_mat) + + with pytest.raises(ValueError): + # normalize must be in ['true', 'pred', 'all', None] + confusion_matrix([1], [1], 'unsupport') + + with pytest.raises(TypeError): + # y_pred must be list or np.ndarray + confusion_matrix(0.5, [1]) + + with pytest.raises(TypeError): + # 
y_real must be list or np.ndarray + confusion_matrix([1], 0.5) + + with pytest.raises(TypeError): + # y_pred dtype must be np.int64 + confusion_matrix([0.5], [1]) + + with pytest.raises(TypeError): + # y_real dtype must be np.int64 + confusion_matrix([1], [0.5]) diff --git a/tests/evaluation/metrics/test_retrieval_metric.py b/tests/evaluation/metrics/test_retrieval_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..d4709b723a02ff4165bf4de3551e6de736decc17 --- /dev/null +++ b/tests/evaluation/metrics/test_retrieval_metric.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np +import pytest +import torch + +from mmaction.evaluation.metrics import RetrievalMetric, RetrievalRecall +from mmaction.registry import METRICS +from mmaction.structures import ActionDataSample + + +def generate_data(num_samples=5, feat_dim=10, random_label=False): + data_batch = [] + data_samples = [] + for i in range(num_samples): + if random_label: + video_feature = torch.randn(feat_dim) + text_feature = torch.randn(feat_dim) + else: + video_feature = torch.randn(feat_dim) + text_feature = video_feature.clone() + + data_sample = dict( + features=dict( + video_feature=video_feature, text_feature=text_feature)) + data_samples.append(data_sample) + return data_batch, data_samples + + +def test_acc_metric(): + with pytest.raises(ValueError): + RetrievalMetric(metric_list='R100') + + num_samples = 20 + metric = RetrievalMetric() + data_batch, predictions = generate_data( + num_samples=num_samples, random_label=True) + metric.process(data_batch, predictions) + eval_results = metric.compute_metrics(metric.results) + assert 0.0 <= eval_results['R1'] <= eval_results['R5'] <= eval_results[ + 'R10'] <= 100.0 + assert 0.0 <= eval_results['MdR'] <= num_samples + assert 0.0 <= eval_results['MnR'] <= num_samples + + metric.results.clear() + + data_batch, predictions = generate_data( + num_samples=num_samples, random_label=False) + metric.process(data_batch, predictions) + eval_results = metric.compute_metrics(metric.results) + assert eval_results['R1'] == eval_results['R5'] == eval_results[ + 'R10'] == 100.0 + assert eval_results['MdR'] == eval_results['MnR'] == 1.0 + + +class TestRetrievalRecall(TestCase): + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + pred = [ + ActionDataSample().set_pred_score(i).set_gt_label(k).to_dict() + for i, k in zip([ + torch.tensor([0.7, 0.0, 0.3]), + torch.tensor([0.5, 0.2, 0.3]), + torch.tensor([0.4, 0.5, 0.1]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + ], [[0], [0], [1], [2], [2], [0]]) + ] + + # Test with score (use score instead of label if score exists) + metric = METRICS.build(dict(type='RetrievalRecall', topk=1)) + metric.process(None, pred) + recall = metric.evaluate(6) + self.assertIsInstance(recall, dict) + self.assertAlmostEqual( + recall['retrieval/Recall@1'], 5 / 6 * 100, places=4) + + # Test with invalid topk + with self.assertRaisesRegex(RuntimeError, 'selected index k'): + metric = METRICS.build(dict(type='RetrievalRecall', topk=10)) + metric.process(None, pred) + metric.evaluate(6) + + with self.assertRaisesRegex(ValueError, '`topk` must be a'): + METRICS.build(dict(type='RetrievalRecall', topk=-1)) + + # Test initialization + metric = METRICS.build(dict(type='RetrievalRecall', topk=5)) + self.assertEqual(metric.topk, (5, )) + + # Test initialization + metric = 
METRICS.build(dict(type='RetrievalRecall', topk=(1, 2, 5))) + self.assertEqual(metric.topk, (1, 2, 5)) + + def test_calculate(self): + """Test using the metric from static method.""" + + # seq of indices format + y_true = [[0, 2, 5, 8, 9], [1, 4, 6]] + y_pred = [np.arange(10)] * 2 + + # test with average is 'macro' + recall_score = RetrievalRecall.calculate( + y_pred, y_true, topk=1, pred_indices=True, target_indices=True) + expect_recall = 50. + self.assertEqual(recall_score[0].item(), expect_recall) + + # test with tensor input + y_true = torch.Tensor([[1, 0, 1, 0, 0, 1, 0, 0, 1, 1], + [0, 1, 0, 0, 1, 0, 1, 0, 0, 0]]) + y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2) + recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=1) + expect_recall = 50. + self.assertEqual(recall_score[0].item(), expect_recall) + + # test with topk is 5 + y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2) + recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=2) + expect_recall = 100. + self.assertEqual(recall_score[0].item(), expect_recall) + + # test with topk is (1, 5) + y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2) + recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=(1, 5)) + expect_recalls = [50., 100.] + self.assertEqual(len(recall_score), len(expect_recalls)) + for i in range(len(expect_recalls)): + self.assertEqual(recall_score[i].item(), expect_recalls[i]) + + # Test with invalid pred + y_pred = dict() + y_true = [[0, 2, 5, 8, 9], [1, 4, 6]] + with self.assertRaisesRegex(AssertionError, '`pred` must be Seq'): + RetrievalRecall.calculate(y_pred, y_true, True, True) + + # Test with invalid target + y_true = dict() + y_pred = [np.arange(10)] * 2 + with self.assertRaisesRegex(AssertionError, '`target` must be Seq'): + RetrievalRecall.calculate( + y_pred, y_true, topk=1, pred_indices=True, target_indices=True) + + # Test with different length `pred` with `target` + y_true = [[0, 2, 5, 8, 9], [1, 4, 6]] + y_pred = [np.arange(10)] * 3 + with self.assertRaisesRegex(AssertionError, 'Length of `pred`'): + RetrievalRecall.calculate( + y_pred, y_true, topk=1, pred_indices=True, target_indices=True) + + # Test with invalid pred + y_true = [[0, 2, 5, 8, 9], dict()] + y_pred = [np.arange(10)] * 2 + with self.assertRaisesRegex(AssertionError, '`target` should be'): + RetrievalRecall.calculate( + y_pred, y_true, topk=1, pred_indices=True, target_indices=True) + + # Test with invalid target + y_true = [[0, 2, 5, 8, 9], [1, 4, 6]] + y_pred = [np.arange(10), dict()] + with self.assertRaisesRegex(AssertionError, '`pred` should be'): + RetrievalRecall.calculate( + y_pred, y_true, topk=1, pred_indices=True, target_indices=True) diff --git a/tests/models/backbones/__init__.py b/tests/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/backbones/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/tests/models/backbones/test_aagcn.py b/tests/models/backbones/test_aagcn.py new file mode 100644 index 0000000000000000000000000000000000000000..0eed3341c90273377521c82e83a9f82d1c3073d8 --- /dev/null +++ b/tests/models/backbones/test_aagcn.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
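The Recall@k values asserted in the RetrievalRecall tests above follow from a simple definition: a query counts as recalled when any of its top-k retrieved indices is relevant. The hand-rolled sketch below (`recall_at_k` is an illustrative helper, not mmaction API) reproduces the 50% / 100% figures.

```python
import numpy as np


def recall_at_k(pred_indices, target_indices, k):
    # a query is a hit if any of its top-k retrieved indices is relevant
    hits = [
        len(set(pred[:k]) & set(target)) > 0
        for pred, target in zip(pred_indices, target_indices)
    ]
    return 100.0 * np.mean(hits)


pred = [np.arange(10)] * 2              # retrieved indices, best first
target = [[0, 2, 5, 8, 9], [1, 4, 6]]   # relevant indices per query

assert recall_at_k(pred, target, k=1) == 50.0   # only the first query hits
assert recall_at_k(pred, target, k=2) == 100.0  # index 1 is relevant for query 2
```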
+import torch + +from mmaction.models import AAGCN +from mmaction.utils import register_all_modules + + +def test_aagcn_backbone(): + """Test AAGCN backbone.""" + + register_all_modules() + + mode = 'spatial' + batch_size, num_person, num_frames = 2, 2, 150 + + # openpose-18 layout + num_joints = 18 + model = AAGCN(graph_cfg=dict(layout='openpose', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 18]) + + # nturgb+d layout + num_joints = 25 + model = AAGCN(graph_cfg=dict(layout='nturgb+d', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 25]) + + # coco layout + num_joints = 17 + model = AAGCN(graph_cfg=dict(layout='coco', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 17]) + + # custom settings + # disable the attention module to degenerate AAGCN to AGCN + model = AAGCN( + graph_cfg=dict(layout='coco', mode=mode), gcn_attention=False) + model.init_weights() + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 17]) diff --git a/tests/models/backbones/test_c2d.py b/tests/models/backbones/test_c2d.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5d7a484fe9f411f125a0672d2fc0215256812a --- /dev/null +++ b/tests/models/backbones/test_c2d.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import C2D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_c2d_backbone(): + """Test c2d backbone.""" + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # c2d inference test + c2d_r50 = C2D(depth=50) + c2d_r50.init_weights() + c2d_r50.train() + feat = c2d_r50(imgs) + assert feat.shape == torch.Size([1, 2048, 4, 2, 2]) + + c2d_r101 = C2D(depth=101) + c2d_r101.init_weights() + c2d_r101.train() + feat = c2d_r101(imgs) + assert feat.shape == torch.Size([1, 2048, 4, 2, 2]) diff --git a/tests/models/backbones/test_c3d.py b/tests/models/backbones/test_c3d.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca7879d63a155c4f5de30273579fac0c64e10b3 --- /dev/null +++ b/tests/models/backbones/test_c3d.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import C3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_c3d_backbone(): + """Test c3d backbone.""" + input_shape = (1, 3, 16, 24, 24) + imgs = generate_backbone_demo_inputs(input_shape) + + # c3d inference test + c3d = C3D(out_dim=512) + c3d.init_weights() + c3d.train() + feat = c3d(imgs) + assert feat.shape == torch.Size([1, 4096]) + + # c3d with bn inference test + c3d_bn = C3D(out_dim=512, norm_cfg=dict(type='BN3d')) + c3d_bn.init_weights() + c3d_bn.train() + feat = c3d_bn(imgs) + assert feat.shape == torch.Size([1, 4096]) diff --git a/tests/models/backbones/test_mobilenet_v2.py b/tests/models/backbones/test_mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..b2d57b8f9760e9f602fbe0fd47e634b23765e4f4 --- /dev/null +++ b/tests/models/backbones/test_mobilenet_v2.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
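The backbone tests in this patch share one pattern: build a small random clip, run a forward pass, and assert the feature shape implied by the network's overall stride. `generate_backbone_demo_inputs` is assumed here to be roughly equivalent to the stand-in below, i.e. a random tensor of the requested shape.

```python
import torch


def generate_demo_inputs(input_shape=(1, 3, 64, 64)):
    # random float tensor with the requested (N, C, H, W) or (N, C, T, H, W) shape
    return torch.randn(input_shape)


# e.g. the C2D test above: an 8x64x64 clip with spatial stride 32 and temporal
# pooling by 2 yields a (1, 2048, 4, 2, 2) feature map for a ResNet-50 stem.
imgs = generate_demo_inputs((1, 3, 8, 64, 64))
assert (imgs.shape[2] // 2, imgs.shape[3] // 32, imgs.shape[4] // 32) == (4, 2, 2)
```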
+import pytest +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import MobileNetV2 +from mmaction.testing import check_norm_state, generate_backbone_demo_inputs + + +def test_mobilenetv2_backbone(): + """Test MobileNetV2. + + Modified from mmclassification. + """ + from torch.nn.modules import GroupNorm + + from mmaction.models.backbones.mobilenet_v2 import InvertedResidual + + def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + def is_block(modules): + """Check if is ResNet building block.""" + if isinstance(modules, (InvertedResidual, )): + return True + return False + + with pytest.raises(TypeError): + # pretrained must be a string path + model = MobileNetV2() + model.init_weights(pretrained=0) + + with pytest.raises(ValueError): + # frozen_stages must in range(1, 9) + MobileNetV2(frozen_stages=9) + + with pytest.raises(ValueError): + # tout_indices in range(-1, 8) + MobileNetV2(out_indices=[8]) + + input_shape = (1, 3, 224, 224) + imgs = generate_backbone_demo_inputs(input_shape) + + # Test MobileNetV2 with first stage frozen + frozen_stages = 1 + model = MobileNetV2(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.conv1.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test MobileNetV2 with all stages frozen + frozen_stages = 8 + model = MobileNetV2(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.modules(): + if not isinstance(mod, MobileNetV2): + assert mod.training is False + for param in mod.parameters(): + assert param.requires_grad is False + + # Test MobileNetV2 with norm_eval=True + model = MobileNetV2(norm_eval=True) + model.init_weights() + model.train() + + assert check_norm_state(model.modules(), False) + + # Test MobileNetV2 forward with widen_factor=1.0, pretrained + model = MobileNetV2( + widen_factor=1.0, + out_indices=range(0, 8), + pretrained='mmcls://mobilenet_v2') + model.init_weights() + model.train() + + assert check_norm_state(model.modules(), True) + + feat = model(imgs) + assert len(feat) == 8 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + assert feat[7].shape == torch.Size((1, 1280, 7, 7)) + + # Test MobileNetV2 forward with widen_factor=0.5 + model = MobileNetV2(widen_factor=0.5, out_indices=range(0, 7)) + model.init_weights() + model.train() + + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 8, 112, 112)) + assert feat[1].shape == torch.Size((1, 16, 56, 56)) + assert feat[2].shape == torch.Size((1, 16, 28, 28)) + assert feat[3].shape == torch.Size((1, 32, 14, 14)) + assert feat[4].shape == torch.Size((1, 48, 14, 14)) + assert feat[5].shape == torch.Size((1, 80, 7, 7)) + assert feat[6].shape == torch.Size((1, 160, 7, 7)) + + # Test MobileNetV2 forward with widen_factor=2.0 + model = 
MobileNetV2(widen_factor=2.0) + model.init_weights() + model.train() + + feat = model(imgs) + assert feat.shape == torch.Size((1, 2560, 7, 7)) + + # Test MobileNetV2 forward with out_indices=None + model = MobileNetV2(widen_factor=1.0) + model.init_weights() + model.train() + + feat = model(imgs) + assert feat.shape == torch.Size((1, 1280, 7, 7)) + + # Test MobileNetV2 forward with dict(type='ReLU') + model = MobileNetV2( + widen_factor=1.0, act_cfg=dict(type='ReLU'), out_indices=range(0, 7)) + model.init_weights() + model.train() + + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + + # Test MobileNetV2 with GroupNorm forward + model = MobileNetV2(widen_factor=1.0, out_indices=range(0, 7)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + model.init_weights() + model.train() + + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + + # Test MobileNetV2 with BatchNorm forward + model = MobileNetV2( + widen_factor=1.0, + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True), + out_indices=range(0, 7)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + model.init_weights() + model.train() + + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + + # Test MobileNetV2 with layers 1, 3, 5 out forward + model = MobileNetV2(widen_factor=1.0, out_indices=(0, 2, 4)) + model.init_weights() + model.train() + + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 32, 28, 28)) + assert feat[2].shape == torch.Size((1, 96, 14, 14)) + + # Test MobileNetV2 with checkpoint forward + model = MobileNetV2( + widen_factor=1.0, with_cp=True, out_indices=range(0, 7)) + for m in model.modules(): + if is_block(m): + assert m.with_cp + model.init_weights() + model.train() + + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) diff --git a/tests/models/backbones/test_mobilenet_v2_tsm.py b/tests/models/backbones/test_mobilenet_v2_tsm.py new file mode 100644 index 
0000000000000000000000000000000000000000..57e1004ebdd6dd85fd04a6ab9ceaeb4ac380a931 --- /dev/null +++ b/tests/models/backbones/test_mobilenet_v2_tsm.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import MobileNetV2TSM +from mmaction.testing import generate_backbone_demo_inputs + + +def test_mobilenetv2_tsm_backbone(): + """Test mobilenetv2_tsm backbone.""" + from mmcv.cnn import ConvModule + + from mmaction.models.backbones.mobilenet_v2 import InvertedResidual + from mmaction.models.backbones.resnet_tsm import TemporalShift + + input_shape = (8, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # mobilenetv2_tsm with width_mult = 1.0 + mobilenetv2_tsm = MobileNetV2TSM(pretrained='mmcls://mobilenet_v2') + mobilenetv2_tsm.init_weights() + for cur_module in mobilenetv2_tsm.modules(): + if isinstance(cur_module, InvertedResidual) and \ + len(cur_module.conv) == 3 and \ + cur_module.use_res_connect: + assert isinstance(cur_module.conv[0], TemporalShift) + assert cur_module.conv[0].num_segments == \ + mobilenetv2_tsm.num_segments + assert cur_module.conv[0].shift_div == mobilenetv2_tsm.shift_div + assert isinstance(cur_module.conv[0].net, ConvModule) + + # TSM-MobileNetV2 with widen_factor = 1.0 forword + feat = mobilenetv2_tsm(imgs) + assert feat.shape == torch.Size([8, 1280, 2, 2]) + + # mobilenetv2 with widen_factor = 0.5 forword + mobilenetv2_tsm_05 = MobileNetV2TSM(widen_factor=0.5, pretrained2d=False) + mobilenetv2_tsm_05.init_weights() + feat = mobilenetv2_tsm_05(imgs) + assert feat.shape == torch.Size([8, 1280, 2, 2]) + + # mobilenetv2 with widen_factor = 1.5 forword + mobilenetv2_tsm_15 = MobileNetV2TSM(widen_factor=1.5, pretrained2d=False) + mobilenetv2_tsm_15.init_weights() + feat = mobilenetv2_tsm_15(imgs) + assert feat.shape == torch.Size([8, 1920, 2, 2]) diff --git a/tests/models/backbones/test_mobileone_tsm.py b/tests/models/backbones/test_mobileone_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..b4d6ea87a3203037cb524c3878ca71264ee06a66 --- /dev/null +++ b/tests/models/backbones/test_mobileone_tsm.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
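The TSM backbones exercised here (MobileNetV2TSM above, MobileOneTSM below) wrap 2D blocks in a TemporalShift module. The sketch below shows the standard temporal-shift operation such a module performs; it is a generic reference implementation, not code copied from mmaction.

```python
import torch


def temporal_shift(x, num_segments, shift_div=8):
    """Shift 1/shift_div of channels one step forward in time, another
    1/shift_div one step backward, and leave the rest untouched."""
    n, c, h, w = x.shape
    x = x.view(n // num_segments, num_segments, c, h, w)
    fold = c // shift_div
    out = torch.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                    # shift towards the past
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]    # shift towards the future
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]               # unshifted channels
    return out.view(n, c, h, w)


x = torch.randn(8, 16, 4, 4)            # 8 frames = one clip with num_segments=8
y = temporal_shift(x, num_segments=8)
assert y.shape == x.shape
```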
+import os +import tempfile + +import torch +from mmengine.runner import load_checkpoint, save_checkpoint +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix + +from mmaction.models.backbones.mobileone_tsm import MobileOneTSM +from mmaction.testing import generate_backbone_demo_inputs + + +def test_mobileone_tsm_backbone(): + """Test MobileOne TSM backbone.""" + + from mmpretrain.models.backbones.mobileone import MobileOneBlock + + from mmaction.models.backbones.resnet_tsm import TemporalShift + + model = MobileOneTSM('s0', pretrained2d=False) + model.init_weights() + for cur_module in model.modules(): + if isinstance(cur_module, TemporalShift): + # TemporalShift is a wrapper of MobileOneBlock + assert isinstance(cur_module.net, MobileOneBlock) + assert cur_module.num_segments == model.num_segments + assert cur_module.shift_div == model.shift_div + + inputs = generate_backbone_demo_inputs((8, 3, 64, 64)) + + feat = model(inputs) + assert feat.shape == torch.Size([8, 1024, 2, 2]) + + model = MobileOneTSM('s1', pretrained2d=False) + feat = model(inputs) + assert feat.shape == torch.Size([8, 1280, 2, 2]) + + model = MobileOneTSM('s2', pretrained2d=False) + feat = model(inputs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + model = MobileOneTSM('s3', pretrained2d=False) + feat = model(inputs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + model = MobileOneTSM('s4', pretrained2d=False) + feat = model(inputs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + +def test_mobileone_init_weight(): + checkpoint = ('https://download.openmmlab.com/mmclassification/v0' + '/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth') + # ckpt = torch.load(checkpoint)['state_dict'] + model = MobileOneTSM( + arch='s0', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, prefix='backbone')) + model.init_weights() + ori_ckpt = _load_checkpoint_with_prefix( + 'backbone', model.init_cfg['checkpoint'], map_location='cpu') + for name, param in model.named_parameters(): + ori_name = name.replace('.net', '') + assert torch.allclose(param, ori_ckpt[ori_name]), \ + f'layer {name} fail to load from pretrained checkpoint' + + +def test_load_deploy_mobileone(): + # Test output before and load from deploy checkpoint + model = MobileOneTSM('s0', pretrained2d=False) + inputs = generate_backbone_demo_inputs((8, 3, 64, 64)) + tmpdir = tempfile.gettempdir() + ckpt_path = os.path.join(tmpdir, 'ckpt.pth') + model.switch_to_deploy() + model.eval() + outputs = model(inputs) + + model_deploy = MobileOneTSM('s0', pretrained2d=False, deploy=True) + save_checkpoint(model.state_dict(), ckpt_path) + load_checkpoint(model_deploy, ckpt_path) + + outputs_load = model_deploy(inputs) + for feat, feat_load in zip(outputs, outputs_load): + assert torch.allclose(feat, feat_load) + os.remove(ckpt_path) diff --git a/tests/models/backbones/test_mvit.py b/tests/models/backbones/test_mvit.py new file mode 100644 index 0000000000000000000000000000000000000000..bff1d5d7382234b98acf29d43f6c8b1d0ee856d2 --- /dev/null +++ b/tests/models/backbones/test_mvit.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
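`test_load_deploy_mobileone` above is an instance of a common re-parameterization check: save the training-time weights, load them into the deploy-mode model, and confirm the outputs still match. A generic version of that pattern, using plain `torch.save` / `torch.load` instead of the mmengine helpers, might look like this (`check_deploy_equivalence` is an illustrative name).

```python
import os
import tempfile

import torch
import torch.nn as nn


def check_deploy_equivalence(model, deploy_model, inputs, atol=1e-6):
    model.eval()
    deploy_model.eval()
    with torch.no_grad():
        ref = model(inputs)
    # round-trip the weights through a checkpoint file
    with tempfile.TemporaryDirectory() as tmpdir:
        ckpt = os.path.join(tmpdir, 'ckpt.pth')
        torch.save(model.state_dict(), ckpt)
        deploy_model.load_state_dict(torch.load(ckpt))
    with torch.no_grad():
        out = deploy_model(inputs)
    return torch.allclose(ref, out, atol=atol)


# trivial usage: two identically shaped models become equivalent after loading
m1, m2 = nn.Linear(4, 2), nn.Linear(4, 2)
assert check_deploy_equivalence(m1, m2, torch.randn(3, 4))
```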
+import math +from copy import deepcopy +from unittest import TestCase + +import torch + +from mmaction.models import MViT + + +class TestMViT(TestCase): + + def setUp(self): + self.cfg = dict(arch='tiny', drop_path_rate=0.1) + + def test_structure(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'not in default archs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + MViT(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'Custom arch needs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 4096 + } + MViT(**cfg) + + # Test custom arch + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [2, 5, 8] + } + stage_indices = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + model = MViT(**cfg) + self.assertEqual(model.embed_dims, 96) + self.assertEqual(model.num_layers, 10) + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + self.assertEqual(block.out_dims, 96 * 2**(stage)) + + # Test out_indices + cfg = deepcopy(self.cfg) + cfg['out_scales'] = {1: 1} + with self.assertRaisesRegex(AssertionError, "get "): + MViT(**cfg) + cfg['out_scales'] = [0, 13] + with self.assertRaisesRegex(AssertionError, 'Invalid out_scales 13'): + MViT(**cfg) + + # Test model structure + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + stage_indices = [0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3] + self.assertEqual(len(model.blocks), 10) + dpr_inc = 0.1 / (10 - 1) + dpr = 0 + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + print(i, stage) + self.assertEqual(block.attn.num_heads, 2**stage) + if dpr > 0: + self.assertAlmostEqual(block.drop_path.drop_prob, dpr) + dpr += dpr_inc + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv3d', + mode='fan_in', + nonlinearity='linear') + ] + cfg['use_abs_pos_embed'] = True + model = MViT(**cfg) + ori_weight = model.patch_embed.projection.weight.clone().detach() + # The pos_embed is all zero before initialize + self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.))) + + model.init_weights() + initialized_weight = model.patch_embed.projection.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) + + def test_forward(self): + imgs = torch.randn(1, 3, 6, 64, 64) + + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token, cls_token = outs[-1] + self.assertEqual(patch_token.shape, (1, 768, 3, 2, 2)) + + # Test forward with multi out scales + cfg = deepcopy(self.cfg) + cfg['out_scales'] = (0, 1, 2, 3) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 4) + for stage, out in enumerate(outs): + stride = 2**stage + patch_token, cls_token = out + self.assertEqual(patch_token.shape, + (1, 96 * stride, 3, 16 // stride, 16 // stride)) + self.assertEqual(cls_token.shape, (1, 96 * stride)) + + # Test forward with dynamic input size + imgs1 = torch.randn(1, 3, 2, 64, 64) + imgs2 = torch.randn(1, 3, 2, 96, 96) + imgs3 = torch.randn(1, 3, 2, 96, 128) + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + for imgs in [imgs1, imgs2, imgs3]: + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token, 
cls_token = outs[-1] + expect_feat_shape = (math.ceil(imgs.shape[2] / 2), + math.ceil(imgs.shape[3] / 32), + math.ceil(imgs.shape[4] / 32)) + self.assertEqual(patch_token.shape, (1, 768, *expect_feat_shape)) + self.assertEqual(cls_token.shape, (1, 768)) diff --git a/tests/models/backbones/test_resnet.py b/tests/models/backbones/test_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..dab195461fd74b72b8053cfc089105e5e793bd11 --- /dev/null +++ b/tests/models/backbones/test_resnet.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +import torch.nn as nn +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import ResNet +from mmaction.testing import check_norm_state, generate_backbone_demo_inputs + + +def test_resnet_backbone(): + """Test resnet backbone.""" + with pytest.raises(KeyError): + # ResNet depth should be in [18, 34, 50, 101, 152] + ResNet(20) + + with pytest.raises(AssertionError): + # In ResNet: 1 <= num_stages <= 4 + ResNet(50, num_stages=0) + + with pytest.raises(AssertionError): + # In ResNet: 1 <= num_stages <= 4 + ResNet(50, num_stages=5) + + with pytest.raises(AssertionError): + # len(strides) == len(dilations) == num_stages + ResNet(50, strides=(1, ), dilations=(1, 1), num_stages=3) + + with pytest.raises(TypeError): + # pretrain must be a str + resnet50 = ResNet(50, pretrained=0) + resnet50.init_weights() + + with pytest.raises(AssertionError): + # style must be in ['pytorch', 'caffe'] + ResNet(18, style='tensorflow') + + with pytest.raises(AssertionError): + # assert not with_cp + ResNet(18, with_cp=True) + + # resnet with depth 18, norm_eval False, initial weights + resnet18 = ResNet(18) + resnet18.init_weights() + + # resnet with depth 50, norm_eval True + resnet50 = ResNet(50, norm_eval=True) + resnet50.init_weights() + resnet50.train() + assert check_norm_state(resnet50.modules(), False) + + # resnet with depth 50, norm_eval True, pretrained + resnet50_pretrain = ResNet( + pretrained='torchvision://resnet50', depth=50, norm_eval=True) + resnet50_pretrain.init_weights() + resnet50_pretrain.train() + assert check_norm_state(resnet50_pretrain.modules(), False) + + # resnet with depth 50, norm_eval True, frozen_stages 1 + frozen_stages = 1 + resnet50_frozen = ResNet(50, frozen_stages=frozen_stages) + resnet50_frozen.init_weights() + resnet50_frozen.train() + assert resnet50_frozen.conv1.bn.training is False + for layer in resnet50_frozen.conv1.modules(): + for param in layer.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(resnet50_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # resnet with depth 50, partial batchnorm + resnet_pbn = ResNet(50, partial_bn=True) + resnet_pbn.train() + count_bn = 0 + for m in resnet_pbn.modules(): + if isinstance(m, nn.BatchNorm2d): + count_bn += 1 + if count_bn >= 2: + assert m.weight.requires_grad is False + assert m.bias.requires_grad is False + assert m.training is False + else: + assert m.weight.requires_grad is True + assert m.bias.requires_grad is True + assert m.training is True + + input_shape = (1, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # resnet with depth 18 inference + resnet18 = ResNet(18, norm_eval=False) + resnet18.init_weights() + resnet18.train() + feat = resnet18(imgs) + assert 
feat.shape == torch.Size([1, 512, 2, 2]) + + # resnet with depth 50 inference + resnet50 = ResNet(50, norm_eval=False) + resnet50.init_weights() + resnet50.train() + feat = resnet50(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) + + # resnet with depth 50 in caffe style inference + resnet50_caffe = ResNet(50, style='caffe', norm_eval=False) + resnet50_caffe.init_weights() + resnet50_caffe.train() + feat = resnet50_caffe(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) + + resnet50_flow = ResNet( + depth=50, pretrained='torchvision://resnet50', in_channels=10) + input_shape = (1, 10, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = resnet50_flow(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) + + resnet50 = ResNet( + depth=50, pretrained='torchvision://resnet50', in_channels=3) + input_shape = (1, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = resnet50(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) diff --git a/tests/models/backbones/test_resnet2plus1d.py b/tests/models/backbones/test_resnet2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..8af90d5a5ac2ab0af011111233ef2ecb541f1d82 --- /dev/null +++ b/tests/models/backbones/test_resnet2plus1d.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import ResNet2Plus1d +from mmaction.testing import generate_backbone_demo_inputs + + +def test_resnet2plus1d_backbone(): + # Test r2+1d backbone + with pytest.raises(AssertionError): + # r2+1d does not support inflation + ResNet2Plus1d(50, None, pretrained2d=True) + + with pytest.raises(AssertionError): + # r2+1d requires conv(2+1)d module + ResNet2Plus1d( + 50, None, pretrained2d=False, conv_cfg=dict(type='Conv3d')) + + frozen_stages = 1 + r2plus1d_34_frozen = ResNet2Plus1d( + 34, + None, + conv_cfg=dict(type='Conv2plus1d'), + pretrained2d=False, + frozen_stages=frozen_stages, + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(1, 1, 1, 1), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 2, 2, 2)) + r2plus1d_34_frozen.init_weights() + r2plus1d_34_frozen.train() + assert r2plus1d_34_frozen.conv1.conv.bn_s.training is False + assert r2plus1d_34_frozen.conv1.bn.training is False + for param in r2plus1d_34_frozen.conv1.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(r2plus1d_34_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + r2plus1d_34_frozen = r2plus1d_34_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = r2plus1d_34_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 512, 1, 2, 2]) + else: + feat = r2plus1d_34_frozen(imgs) + assert feat.shape == torch.Size([1, 512, 1, 2, 2]) + + r2plus1d_50_frozen = ResNet2Plus1d( + 50, + None, + conv_cfg=dict(type='Conv2plus1d'), + pretrained2d=False, + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(1, 1, 1, 1), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 2, 2, 2), + frozen_stages=frozen_stages) + r2plus1d_50_frozen.init_weights() + + 
r2plus1d_50_frozen.train() + assert r2plus1d_50_frozen.conv1.conv.bn_s.training is False + assert r2plus1d_50_frozen.conv1.bn.training is False + for param in r2plus1d_50_frozen.conv1.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(r2plus1d_50_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + r2plus1d_50_frozen = r2plus1d_50_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = r2plus1d_50_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 2048, 1, 2, 2]) + else: + feat = r2plus1d_50_frozen(imgs) + assert feat.shape == torch.Size([1, 2048, 1, 2, 2]) diff --git a/tests/models/backbones/test_resnet3d.py b/tests/models/backbones/test_resnet3d.py new file mode 100644 index 0000000000000000000000000000000000000000..2467ac04236f7d3050083a770b9867d8abad0da0 --- /dev/null +++ b/tests/models/backbones/test_resnet3d.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import platform + +import pytest +import torch +import torch.nn as nn +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import ResNet3d, ResNet3dLayer +from mmaction.testing import check_norm_state, generate_backbone_demo_inputs + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_resnet3d_backbone(): + """Test resnet3d backbone.""" + with pytest.raises(AssertionError): + # In ResNet3d: 1 <= num_stages <= 4 + ResNet3d(34, None, num_stages=0) + + with pytest.raises(AssertionError): + # In ResNet3d: 1 <= num_stages <= 4 + ResNet3d(34, None, num_stages=5) + + with pytest.raises(AssertionError): + # In ResNet3d: 1 <= num_stages <= 4 + ResNet3d(50, None, num_stages=0) + + with pytest.raises(AssertionError): + # In ResNet3d: 1 <= num_stages <= 4 + ResNet3d(50, None, num_stages=5) + + with pytest.raises(AssertionError): + # len(spatial_strides) == len(temporal_strides) + # == len(dilations) == num_stages + ResNet3d( + 50, + None, + spatial_strides=(1, ), + temporal_strides=(1, 1), + dilations=(1, 1, 1), + num_stages=4) + + with pytest.raises(AssertionError): + # len(spatial_strides) == len(temporal_strides) + # == len(dilations) == num_stages + ResNet3d( + 34, + None, + spatial_strides=(1, ), + temporal_strides=(1, 1), + dilations=(1, 1, 1), + num_stages=4) + + with pytest.raises(TypeError): + # pretrain must be str or None. + resnet3d_34 = ResNet3d(34, ['resnet', 'bninception']) + resnet3d_34.init_weights() + + with pytest.raises(TypeError): + # pretrain must be str or None. 
+ resnet3d_50 = ResNet3d(50, ['resnet', 'bninception']) + resnet3d_50.init_weights() + + # resnet3d with depth 34, no pretrained, norm_eval True + resnet3d_34 = ResNet3d(34, None, pretrained2d=False, norm_eval=True) + resnet3d_34.init_weights() + resnet3d_34.train() + assert check_norm_state(resnet3d_34.modules(), False) + + # resnet3d with depth 50, no pretrained, norm_eval True + resnet3d_50 = ResNet3d(50, None, pretrained2d=False, norm_eval=True) + resnet3d_50.init_weights() + resnet3d_50.train() + assert check_norm_state(resnet3d_50.modules(), False) + + # resnet3d with depth 50, pretrained2d, norm_eval True + resnet3d_50_pretrain = ResNet3d( + 50, 'torchvision://resnet50', norm_eval=True) + resnet3d_50_pretrain.init_weights() + resnet3d_50_pretrain.train() + assert check_norm_state(resnet3d_50_pretrain.modules(), False) + from mmengine.runner.checkpoint import _load_checkpoint + chkp_2d = _load_checkpoint('torchvision://resnet50') + for name, module in resnet3d_50_pretrain.named_modules(): + if len(name.split('.')) == 4: + # layer.block.module.submodule + prefix = name.split('.')[:2] + module_type = name.split('.')[2] + submodule_type = name.split('.')[3] + + if module_type == 'downsample': + name2d = name.replace('conv', '0').replace('bn', '1') + else: + layer_id = name.split('.')[2][-1] + name2d = prefix[0] + '.' + prefix[1] + '.' + \ + submodule_type + layer_id + + if isinstance(module, nn.Conv3d): + conv2d_weight = chkp_2d[name2d + '.weight'] + conv3d_weight = getattr(module, 'weight').data + assert torch.equal( + conv3d_weight, + conv2d_weight.data.unsqueeze(2).expand_as(conv3d_weight) / + conv3d_weight.shape[2]) + if getattr(module, 'bias') is not None: + conv2d_bias = chkp_2d[name2d + '.bias'] + conv3d_bias = getattr(module, 'bias').data + assert torch.equal(conv2d_bias, conv3d_bias) + + elif isinstance(module, nn.BatchNorm3d): + for pname in ['weight', 'bias', 'running_mean', 'running_var']: + param_2d = chkp_2d[name2d + '.' 
+ pname] + param_3d = getattr(module, pname).data + assert torch.equal(param_2d, param_3d) + + conv3d = resnet3d_50_pretrain.conv1.conv + assert torch.equal( + conv3d.weight, + chkp_2d['conv1.weight'].unsqueeze(2).expand_as(conv3d.weight) / + conv3d.weight.shape[2]) + conv3d = resnet3d_50_pretrain.layer3[2].conv2.conv + assert torch.equal( + conv3d.weight, chkp_2d['layer3.2.conv2.weight'].unsqueeze(2).expand_as( + conv3d.weight) / conv3d.weight.shape[2]) + + # resnet3d with depth 34, no pretrained, norm_eval False + resnet3d_34_no_bn_eval = ResNet3d( + 34, None, pretrained2d=False, norm_eval=False) + resnet3d_34_no_bn_eval.init_weights() + resnet3d_34_no_bn_eval.train() + assert check_norm_state(resnet3d_34_no_bn_eval.modules(), True) + + # resnet3d with depth 50, no pretrained, norm_eval False + resnet3d_50_no_bn_eval = ResNet3d( + 50, None, pretrained2d=False, norm_eval=False) + resnet3d_50_no_bn_eval.init_weights() + resnet3d_50_no_bn_eval.train() + assert check_norm_state(resnet3d_50_no_bn_eval.modules(), True) + + # resnet3d with depth 34, no pretrained, frozen_stages, norm_eval False + frozen_stages = 1 + resnet3d_34_frozen = ResNet3d( + 34, None, pretrained2d=False, frozen_stages=frozen_stages) + resnet3d_34_frozen.init_weights() + resnet3d_34_frozen.train() + assert resnet3d_34_frozen.conv1.bn.training is False + for param in resnet3d_34_frozen.conv1.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(resnet3d_34_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + # test zero_init_residual + for m in resnet3d_34_frozen.modules(): + if hasattr(m, 'conv2'): + assert torch.equal(m.conv2.bn.weight, + torch.zeros_like(m.conv2.bn.weight)) + assert torch.equal(m.conv2.bn.bias, + torch.zeros_like(m.conv2.bn.bias)) + + # resnet3d with depth 50, no pretrained, frozen_stages, norm_eval False + frozen_stages = 1 + resnet3d_50_frozen = ResNet3d( + 50, None, pretrained2d=False, frozen_stages=frozen_stages) + resnet3d_50_frozen.init_weights() + resnet3d_50_frozen.train() + assert resnet3d_50_frozen.conv1.bn.training is False + for param in resnet3d_50_frozen.conv1.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(resnet3d_50_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + # test zero_init_residual + for m in resnet3d_50_frozen.modules(): + if hasattr(m, 'conv3'): + assert torch.equal(m.conv3.bn.weight, + torch.zeros_like(m.conv3.bn.weight)) + assert torch.equal(m.conv3.bn.bias, + torch.zeros_like(m.conv3.bn.bias)) + + # resnet3d frozen with depth 34 inference + input_shape = (1, 3, 6, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_34_frozen = resnet3d_34_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_34_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 512, 3, 2, 2]) + else: + feat = resnet3d_34_frozen(imgs) + assert feat.shape == torch.Size([1, 512, 3, 2, 2]) + + # resnet3d with depth 50 inference + input_shape = (1, 3, 6, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 
'parrots': + if torch.cuda.is_available(): + resnet3d_50_frozen = resnet3d_50_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_50_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + else: + feat = resnet3d_50_frozen(imgs) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + + # resnet3d with depth 50 in caffe style inference + resnet3d_50_caffe = ResNet3d(50, None, pretrained2d=False, style='caffe') + resnet3d_50_caffe.init_weights() + resnet3d_50_caffe.train() + + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_50_caffe = resnet3d_50_caffe.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_50_caffe(imgs_gpu) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + else: + feat = resnet3d_50_caffe(imgs) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + + # resnet3d with depth 34 in caffe style inference + resnet3d_34_caffe = ResNet3d(34, None, pretrained2d=False, style='caffe') + resnet3d_34_caffe.init_weights() + resnet3d_34_caffe.train() + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_34_caffe = resnet3d_34_caffe.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_34_caffe(imgs_gpu) + assert feat.shape == torch.Size([1, 512, 3, 2, 2]) + else: + feat = resnet3d_34_caffe(imgs) + assert feat.shape == torch.Size([1, 512, 3, 2, 2]) + + # resnet3d with depth with 3x3x3 inflate_style inference + resnet3d_50_1x1x1 = ResNet3d( + 50, None, pretrained2d=False, inflate_style='3x3x3') + resnet3d_50_1x1x1.init_weights() + resnet3d_50_1x1x1.train() + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_50_1x1x1 = resnet3d_50_1x1x1.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_50_1x1x1(imgs_gpu) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + else: + feat = resnet3d_50_1x1x1(imgs) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + + resnet3d_34_1x1x1 = ResNet3d( + 34, None, pretrained2d=False, inflate_style='3x3x3') + resnet3d_34_1x1x1.init_weights() + resnet3d_34_1x1x1.train() + + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_34_1x1x1 = resnet3d_34_1x1x1.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_34_1x1x1(imgs_gpu) + assert feat.shape == torch.Size([1, 512, 3, 2, 2]) + else: + feat = resnet3d_34_1x1x1(imgs) + assert feat.shape == torch.Size([1, 512, 3, 2, 2]) + + # resnet3d with non-local module + non_local_cfg = dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian') + non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)) + resnet3d_nonlocal = ResNet3d( + 50, + None, + pretrained2d=False, + non_local=non_local, + non_local_cfg=non_local_cfg) + resnet3d_nonlocal.init_weights() + for layer_name in ['layer2', 'layer3']: + layer = getattr(resnet3d_nonlocal, layer_name) + for i, _ in enumerate(layer): + if i % 2 == 0: + assert hasattr(layer[i], 'non_local_block') + + feat = resnet3d_nonlocal(imgs) + assert feat.shape == torch.Size([1, 2048, 3, 2, 2]) + + +def test_resnet3d_layer(): + with pytest.raises(AssertionError): + ResNet3dLayer(22, None) + + with pytest.raises(AssertionError): + ResNet3dLayer(50, None, stage=4) + + res_layer = ResNet3dLayer(50, None, stage=3, norm_eval=True) + res_layer.init_weights() + res_layer.train() + input_shape = (1, 1024, 1, 4, 4) + imgs = 
generate_backbone_demo_inputs(input_shape) + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + res_layer = res_layer.cuda() + imgs_gpu = imgs.cuda() + feat = res_layer(imgs_gpu) + assert feat.shape == torch.Size([1, 2048, 1, 2, 2]) + else: + feat = res_layer(imgs) + assert feat.shape == torch.Size([1, 2048, 1, 2, 2]) + + res_layer = ResNet3dLayer( + 50, 'torchvision://resnet50', stage=3, all_frozen=True) + res_layer.init_weights() + res_layer.train() + imgs = generate_backbone_demo_inputs(input_shape) + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + res_layer = res_layer.cuda() + imgs_gpu = imgs.cuda() + feat = res_layer(imgs_gpu) + assert feat.shape == torch.Size([1, 2048, 1, 2, 2]) + else: + feat = res_layer(imgs) + assert feat.shape == torch.Size([1, 2048, 1, 2, 2]) diff --git a/tests/models/backbones/test_resnet3d_csn.py b/tests/models/backbones/test_resnet3d_csn.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4da7ac45eb59847520ac916c1143e526bbb15b --- /dev/null +++ b/tests/models/backbones/test_resnet3d_csn.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +import torch.nn as nn +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import ResNet3dCSN +from mmaction.testing import generate_backbone_demo_inputs + + +def test_resnet_csn_backbone(): + """Test resnet_csn backbone.""" + with pytest.raises(ValueError): + # Bottleneck mode must be "ip" or "ir" + ResNet3dCSN(152, None, bottleneck_mode='id') + + input_shape = (2, 3, 6, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + resnet3d_csn_frozen = ResNet3dCSN( + 152, None, bn_frozen=True, norm_eval=True) + resnet3d_csn_frozen.train() + for m in resnet3d_csn_frozen.modules(): + if isinstance(m, _BatchNorm): + for param in m.parameters(): + assert param.requires_grad is False + + # Interaction-preserved channel-separated bottleneck block + resnet3d_csn_ip = ResNet3dCSN(152, None, bottleneck_mode='ip') + resnet3d_csn_ip.init_weights() + resnet3d_csn_ip.train() + for i, layer_name in enumerate(resnet3d_csn_ip.res_layers): + layers = getattr(resnet3d_csn_ip, layer_name) + num_blocks = resnet3d_csn_ip.stage_blocks[i] + assert len(layers) == num_blocks + for layer in layers: + assert isinstance(layer.conv2, nn.Sequential) + assert len(layer.conv2) == 2 + assert layer.conv2[1].groups == layer.planes + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_csn_ip = resnet3d_csn_ip.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_csn_ip(imgs_gpu) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + else: + feat = resnet3d_csn_ip(imgs) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + + # Interaction-reduced channel-separated bottleneck block + resnet3d_csn_ir = ResNet3dCSN(152, None, bottleneck_mode='ir') + resnet3d_csn_ir.init_weights() + resnet3d_csn_ir.train() + for i, layer_name in enumerate(resnet3d_csn_ir.res_layers): + layers = getattr(resnet3d_csn_ir, layer_name) + num_blocks = resnet3d_csn_ir.stage_blocks[i] + assert len(layers) == num_blocks + for layer in layers: + assert isinstance(layer.conv2, nn.Sequential) + assert len(layer.conv2) == 1 + assert layer.conv2[0].groups == layer.planes + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_csn_ir = resnet3d_csn_ir.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_csn_ir(imgs_gpu) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + else: + feat = 
resnet3d_csn_ir(imgs) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + + # Set training status = False + resnet3d_csn_ip = ResNet3dCSN(152, None, bottleneck_mode='ip') + resnet3d_csn_ip.init_weights() + resnet3d_csn_ip.train(False) + for module in resnet3d_csn_ip.children(): + assert module.training is False diff --git a/tests/models/backbones/test_resnet3d_slowfast.py b/tests/models/backbones/test_resnet3d_slowfast.py new file mode 100644 index 0000000000000000000000000000000000000000..4d8fbc38eb2b069a27840fce17bd54364af7d46d --- /dev/null +++ b/tests/models/backbones/test_resnet3d_slowfast.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import ResNet3dSlowFast +from mmaction.testing import generate_backbone_demo_inputs + + +def test_slowfast_backbone(): + """Test SlowFast backbone.""" + with pytest.raises(TypeError): + # cfg should be a dict + ResNet3dSlowFast(slow_pathway=list(['foo', 'bar'])) + with pytest.raises(KeyError): + # pathway type should be implemented + ResNet3dSlowFast(slow_pathway=dict(type='resnext')) + + # test slowfast with slow inflated + sf_50_inflate = ResNet3dSlowFast( + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained='torchvision://resnet50', + pretrained2d=True, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1))) + sf_50_inflate.init_weights() + sf_50_inflate.train() + + # test slowfast with no lateral connection + sf_50_wo_lateral = ResNet3dSlowFast( + None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1))) + sf_50_wo_lateral.init_weights() + sf_50_wo_lateral.train() + + # slowfast w/o lateral connection inference test + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = sf_50_wo_lateral(imgs) + + assert isinstance(feat, tuple) + assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2]) + assert feat[1].shape == torch.Size([1, 256, 8, 2, 2]) + + # test slowfast with frozen stages config + frozen_slow = 3 + sf_50 = ResNet3dSlowFast( + None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + pretrained2d=True, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + frozen_stages=frozen_slow)) + sf_50.init_weights() + sf_50.train() + + for stage in range(1, sf_50.slow_path.num_stages): + lateral_name = sf_50.slow_path.lateral_connections[stage - 1] + conv_lateral = getattr(sf_50.slow_path, lateral_name) + for mod in conv_lateral.modules(): + if isinstance(mod, _BatchNorm): + if stage <= frozen_slow: + assert mod.training is False + else: + assert mod.training is True + for param in conv_lateral.parameters(): + if stage <= frozen_slow: + assert param.requires_grad is False + else: + assert param.requires_grad is True + + # test slowfast with normal config + sf_50 = ResNet3dSlowFast() + sf_50.init_weights() + sf_50.train() + + # slowfast inference test + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = sf_50(imgs) + + assert isinstance(feat, tuple) + assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2]) + assert feat[1].shape == torch.Size([1, 256, 8, 2, 2]) diff --git 
a/tests/models/backbones/test_resnet3d_slowonly.py b/tests/models/backbones/test_resnet3d_slowonly.py new file mode 100644 index 0000000000000000000000000000000000000000..7557edbd2765c0d4cd2c10227ea3db2e4cefb4b3 --- /dev/null +++ b/tests/models/backbones/test_resnet3d_slowonly.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import ResNet3dSlowOnly +from mmaction.testing import generate_backbone_demo_inputs + + +def test_slowonly_backbone(): + """Test SlowOnly backbone.""" + with pytest.raises(AssertionError): + # SlowOnly should contain no lateral connection + ResNet3dSlowOnly(depth=50, pretrained=None, lateral=True) + + # test SlowOnly for PoseC3D + so_50 = ResNet3dSlowOnly( + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)) + so_50.init_weights() + so_50.train() + + # test SlowOnly with normal config + so_50 = ResNet3dSlowOnly(depth=50, pretrained=None) + so_50.init_weights() + so_50.train() + + # SlowOnly inference test + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + so_50 = so_50.cuda() + imgs_gpu = imgs.cuda() + feat = so_50(imgs_gpu) + else: + feat = so_50(imgs) + assert feat.shape == torch.Size([1, 2048, 8, 2, 2]) diff --git a/tests/models/backbones/test_resnet_audio.py b/tests/models/backbones/test_resnet_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..826ba3f67cfd8f7a412df3c0456c7f98a91f1bd5 --- /dev/null +++ b/tests/models/backbones/test_resnet_audio.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import ResNetAudio +from mmaction.testing import generate_backbone_demo_inputs +from mmaction.utils import register_all_modules + + +def test_resnet_audio_backbone(): + """Test ResNetAudio backbone.""" + input_shape = (1, 1, 16, 16) + spec = generate_backbone_demo_inputs(input_shape) + # inference + register_all_modules() + audioonly = ResNetAudio(50, None) + audioonly.init_weights() + audioonly.train() + feat = audioonly(spec) + assert feat.shape == torch.Size([1, 1024, 2, 2]) diff --git a/tests/models/backbones/test_resnet_omni.py b/tests/models/backbones/test_resnet_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f5da14046d3d68588eea1ab0fe8e70cc02f027 --- /dev/null +++ b/tests/models/backbones/test_resnet_omni.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torchvision + +from mmaction.models import OmniResNet +from mmaction.testing import generate_backbone_demo_inputs + + +def test_omni_resnet_backbone(): + """Test OmniResNet backbone.""" + _ = OmniResNet() + + resnet50 = torchvision.models.resnet50() + params = resnet50.state_dict() + torch.save(params, './r50.pth') + model = OmniResNet(pretrain_2d='./r50.pth') + + input_shape = (2, 3, 8, 64, 64) + videos = generate_backbone_demo_inputs(input_shape) + feat = model(videos) + assert feat.shape == torch.Size([2, 2048, 8, 2, 2]) + + input_shape = (2, 3, 64, 64) + images = generate_backbone_demo_inputs(input_shape) + feat = model(images) + assert feat.shape == torch.Size([2, 2048, 2, 2]) diff --git a/tests/models/backbones/test_resnet_tin.py b/tests/models/backbones/test_resnet_tin.py new file mode 100644 index 0000000000000000000000000000000000000000..26f0aab13d463dceb09af09db4a5ecced94d525e --- /dev/null +++ b/tests/models/backbones/test_resnet_tin.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +import torch.nn as nn + +from mmaction.models import ResNetTIN +from mmaction.testing import generate_backbone_demo_inputs + + +@pytest.mark.skipif( + not torch.cuda.is_available(), reason='requires CUDA support') +def test_resnet_tin_backbone(): + """Test resnet_tin backbone.""" + with pytest.raises(AssertionError): + # num_segments should be positive + resnet_tin = ResNetTIN(50, num_segments=-1) + resnet_tin.init_weights() + + from mmaction.models.backbones.resnet_tin import (CombineNet, + TemporalInterlace) + + # resnet_tin with normal config + resnet_tin = ResNetTIN(50) + resnet_tin.init_weights() + for layer_name in resnet_tin.res_layers: + layer = getattr(resnet_tin, layer_name) + blocks = list(layer.children()) + for block in blocks: + assert isinstance(block.conv1.conv, CombineNet) + assert isinstance(block.conv1.conv.net1, TemporalInterlace) + assert ( + block.conv1.conv.net1.num_segments == resnet_tin.num_segments) + assert block.conv1.conv.net1.shift_div == resnet_tin.shift_div + + # resnet_tin with partial batchnorm + resnet_tin_pbn = ResNetTIN(50, partial_bn=True) + resnet_tin_pbn.train() + count_bn = 0 + for m in resnet_tin_pbn.modules(): + if isinstance(m, nn.BatchNorm2d): + count_bn += 1 + if count_bn >= 2: + assert m.training is False + assert m.weight.requires_grad is False + assert m.bias.requires_grad is False + else: + assert m.training is True + assert m.weight.requires_grad is True + assert m.bias.requires_grad is True + + input_shape = (8, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape).cuda() + resnet_tin = resnet_tin.cuda() + + # resnet_tin with normal cfg inference + feat = resnet_tin(imgs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) diff --git a/tests/models/backbones/test_resnet_tsm.py b/tests/models/backbones/test_resnet_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..9f852df43eaae48a70fa61825a95c905f21f1f39 --- /dev/null +++ b/tests/models/backbones/test_resnet_tsm.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved.
+import copy +from unittest import TestCase + +import pytest +import torch +import torch.nn as nn + +from mmaction.models import ResNetTSM +from mmaction.models.backbones.resnet import Bottleneck +from mmaction.models.backbones.resnet_tsm import NL3DWrapper, TemporalShift +from mmaction.testing import generate_backbone_demo_inputs + + +class Test_ResNet_TSM(TestCase): + + def setUp(self): + input_shape = (8, 3, 64, 64) + self.imgs = generate_backbone_demo_inputs(input_shape) + + def test_init(self): + with pytest.raises(NotImplementedError): + # shift_place must be block or blockres + resnet_tsm_50_block = ResNetTSM(50, shift_place='Block') + resnet_tsm_50_block.init_weights() + + def test_init_from_scratch(self): + resnet_tsm_50 = ResNetTSM(50, pretrained=None, pretrained2d=False) + resnet_tsm_50.init_weights() + + def test_resnet_tsm_temporal_shift_blockres(self): + # resnet_tsm with depth 50 + resnet_tsm_50 = ResNetTSM(50, pretrained='torchvision://resnet50') + resnet_tsm_50.init_weights() + for layer_name in resnet_tsm_50.res_layers: + layer = getattr(resnet_tsm_50, layer_name) + blocks = list(layer.children()) + for block in blocks: + assert isinstance(block.conv1.conv, TemporalShift) + assert block.conv1.conv.num_segments == resnet_tsm_50.num_segments # noqa: E501 + assert block.conv1.conv.shift_div == resnet_tsm_50.shift_div + assert isinstance(block.conv1.conv.net, nn.Conv2d) + feat = resnet_tsm_50(self.imgs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + def test_resnet_tsm_temporal_shift_block(self): + # resnet_tsm with depth 50, no pretrained, shift_place is block + resnet_tsm_50_block = ResNetTSM( + 50, shift_place='block', pretrained='torchvision://resnet50') + resnet_tsm_50_block.init_weights() + for layer_name in resnet_tsm_50_block.res_layers: + layer = getattr(resnet_tsm_50_block, layer_name) + blocks = list(layer.children()) + for block in blocks: + assert isinstance(block, TemporalShift) + assert block.num_segments == resnet_tsm_50_block.num_segments + assert block.num_segments == resnet_tsm_50_block.num_segments + assert block.shift_div == resnet_tsm_50_block.shift_div + assert isinstance(block.net, Bottleneck) + + def test_resnet_tsm_temporal_pool(self): + # resnet_tsm with depth 50, no pretrained, use temporal_pool + resnet_tsm_50_temporal_pool = ResNetTSM( + 50, temporal_pool=True, pretrained='torchvision://resnet50') + resnet_tsm_50_temporal_pool.init_weights() + for layer_name in resnet_tsm_50_temporal_pool.res_layers: + layer = getattr(resnet_tsm_50_temporal_pool, layer_name) + blocks = list(layer.children()) + + if layer_name == 'layer2': + assert len(blocks) == 2 + assert isinstance(blocks[1], nn.MaxPool3d) + blocks = copy.deepcopy(blocks[0]) + + for block in blocks: + assert isinstance(block.conv1.conv, TemporalShift) + if layer_name == 'layer1': + assert block.conv1.conv.num_segments == \ + resnet_tsm_50_temporal_pool.num_segments + else: + assert block.conv1.conv.num_segments == \ + resnet_tsm_50_temporal_pool.num_segments // 2 + assert block.conv1.conv.shift_div == resnet_tsm_50_temporal_pool.shift_div # noqa: E501 + assert isinstance(block.conv1.conv.net, nn.Conv2d) + + feat = resnet_tsm_50_temporal_pool(self.imgs) + assert feat.shape == torch.Size([4, 2048, 2, 2]) + + def test_resnet_tsm_non_local(self): + # resnet_tsm with non-local module + non_local_cfg = dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian') + non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)) 
+ resnet_tsm_nonlocal = ResNetTSM( + 50, + non_local=non_local, + non_local_cfg=non_local_cfg, + pretrained='torchvision://resnet50') + resnet_tsm_nonlocal.init_weights() + for layer_name in ['layer2', 'layer3']: + layer = getattr(resnet_tsm_nonlocal, layer_name) + for i, _ in enumerate(layer): + if i % 2 == 0: + assert isinstance(layer[i], NL3DWrapper) + + feat = resnet_tsm_nonlocal(self.imgs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + def test_resnet_tsm_full(self): + non_local_cfg = dict( + sub_sample=True, + use_scale=False, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian') + non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)) + resnet_tsm_50_full = ResNetTSM( + 50, + pretrained='torchvision://resnet50', + non_local=non_local, + non_local_cfg=non_local_cfg, + temporal_pool=True) + resnet_tsm_50_full.init_weights() + + input_shape = (16, 3, 32, 32) + imgs = generate_backbone_demo_inputs(input_shape) + feat = resnet_tsm_50_full(imgs) + assert feat.shape == torch.Size([8, 2048, 1, 1]) diff --git a/tests/models/backbones/test_rgbposeconv3d.py b/tests/models/backbones/test_rgbposeconv3d.py new file mode 100644 index 0000000000000000000000000000000000000000..1c44ed5d8db28fabdc24109d9bb5d0da217f0298 --- /dev/null +++ b/tests/models/backbones/test_rgbposeconv3d.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import RGBPoseConv3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_rgbposeconv3d(): + """Test RGBPoseConv3D backbone.""" + + with pytest.raises(AssertionError): + RGBPoseConv3D(pose_drop_path=1.1, rgb_drop_path=1.1) + + rgbposec3d = RGBPoseConv3D() + rgbposec3d.init_weights() + rgbposec3d.train() + + imgs_shape = (1, 3, 8, 224, 224) + heatmap_imgs_shape = (1, 17, 32, 56, 56) + imgs = generate_backbone_demo_inputs(imgs_shape) + heatmap_imgs = generate_backbone_demo_inputs(heatmap_imgs_shape) + + (x_rgb, x_pose) = rgbposec3d(imgs, heatmap_imgs) + + assert x_rgb.shape == torch.Size([1, 2048, 8, 7, 7]) + assert x_pose.shape == torch.Size([1, 512, 32, 7, 7]) diff --git a/tests/models/backbones/test_stgcn.py b/tests/models/backbones/test_stgcn.py new file mode 100644 index 0000000000000000000000000000000000000000..45bcb4bb54c1fd4b2fcd028c6fd1c1f3a869808d --- /dev/null +++ b/tests/models/backbones/test_stgcn.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import STGCN + + +def test_stgcn_backbone(): + """Test STGCN backbone.""" + + mode = 'stgcn_spatial' + batch_size, num_person, num_frames = 2, 2, 150 + + # openpose-18 layout + num_joints = 18 + model = STGCN(graph_cfg=dict(layout='openpose', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 18]) + + # nturgb+d layout + num_joints = 25 + model = STGCN(graph_cfg=dict(layout='nturgb+d', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 25]) + + # coco layout + num_joints = 17 + model = STGCN(graph_cfg=dict(layout='coco', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 17]) + + # custom settings + # instantiate STGCN++ + model = STGCN( + graph_cfg=dict(layout='coco', mode='spatial'), + gcn_adaptive='init', + gcn_with_res=True, + tcn_type='mstcn') + model.init_weights() + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 17]) diff --git a/tests/models/backbones/test_swin.py b/tests/models/backbones/test_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..be3921f1361ffe0428b814e7f41ad47705c3c3b4 --- /dev/null +++ b/tests/models/backbones/test_swin.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import SwinTransformer3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_swin_backbone(): + """Test swin backbone.""" + with pytest.raises(AssertionError): + SwinTransformer3D(arch='-t') + + with pytest.raises(AssertionError): + SwinTransformer3D(arch={'embed_dims': 96}) + + with pytest.raises(AssertionError): + SwinTransformer3D(arch={ + 'embed_dims': 96, + 'depths': [2, 2, 6], + 'num_heads': [3, 6, 12, 24] + }) + + with pytest.raises(AssertionError): + SwinTransformer3D( + arch={ + 'embed_dims': 96, + 'depths': [2, 2, 6, 2, 2], + 'num_heads': [3, 6, 12, 24, 48] + }) + + with pytest.raises(AssertionError): + SwinTransformer3D(arch='t', out_indices=(4, )) + + with pytest.raises(TypeError): + swin_t = SwinTransformer3D(arch='t', pretrained=[0, 1, 1]) + swin_t.init_weights() + + with pytest.raises(TypeError): + swin_t = SwinTransformer3D(arch='t') + swin_t.init_weights(pretrained=[0, 1, 1]) + + swin_b = SwinTransformer3D(arch='b', pretrained=None, pretrained2d=False) + swin_b.init_weights() + swin_b.train() + + pretrained_url = 'https://download.openmmlab.com/mmaction/v1.0/' \ + 'recognition/swin/swin_tiny_patch4_window7_224.pth' + + swin_t_pre = SwinTransformer3D( + arch='t', pretrained=pretrained_url, pretrained2d=True) + swin_t_pre.init_weights() + swin_t_pre.train() + + from mmengine.runner.checkpoint import _load_checkpoint + ckpt_2d = _load_checkpoint(pretrained_url, map_location='cpu') + state_dict = ckpt_2d['model'] + + patch_embed_weight2d = state_dict['patch_embed.proj.weight'].data + patch_embed_weight3d = swin_t_pre.patch_embed.proj.weight.data + assert torch.equal( + patch_embed_weight3d, + patch_embed_weight2d.unsqueeze(2).expand_as(patch_embed_weight3d) / + patch_embed_weight3d.shape[2]) + + norm = swin_t_pre.norm3 + assert torch.equal(norm.weight.data, state_dict['norm.weight']) + assert 
torch.equal(norm.bias.data, state_dict['norm.bias']) + + for name, param in swin_t_pre.named_parameters(): + if 'relative_position_bias_table' in name: + bias2d = state_dict[name] + assert torch.equal( + param.data, bias2d.repeat(2 * swin_t_pre.window_size[0] - 1, + 1)) + + frozen_stages = 1 + swin_t_frozen = SwinTransformer3D( + arch='t', + pretrained=None, + pretrained2d=False, + frozen_stages=frozen_stages) + swin_t_frozen.init_weights() + swin_t_frozen.train() + for param in swin_t_frozen.patch_embed.parameters(): + assert param.requires_grad is False + for i in range(frozen_stages): + layer = swin_t_frozen.layers[i] + for param in layer.parameters(): + assert param.requires_grad is False + + input_shape = (1, 3, 6, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + feat = swin_t_frozen(imgs) + assert feat.shape == torch.Size([1, 768, 3, 2, 2]) + + input_shape = (1, 3, 5, 63, 63) + imgs = generate_backbone_demo_inputs(input_shape) + feat = swin_t_frozen(imgs) + assert feat.shape == torch.Size([1, 768, 3, 2, 2]) + + swin_t_all_stages = SwinTransformer3D(arch='t', out_indices=(0, 1, 2, 3)) + feats = swin_t_all_stages(imgs) + assert feats[0].shape == torch.Size([1, 96, 3, 16, 16]) + assert feats[1].shape == torch.Size([1, 192, 3, 8, 8]) + assert feats[2].shape == torch.Size([1, 384, 3, 4, 4]) + assert feats[3].shape == torch.Size([1, 768, 3, 2, 2]) + + swin_t_all_stages_after_ds = SwinTransformer3D( + arch='t', out_indices=(0, 1, 2, 3), out_after_downsample=True) + feats = swin_t_all_stages_after_ds(imgs) + assert feats[0].shape == torch.Size([1, 192, 3, 8, 8]) + assert feats[1].shape == torch.Size([1, 384, 3, 4, 4]) + assert feats[2].shape == torch.Size([1, 768, 3, 2, 2]) + assert feats[3].shape == torch.Size([1, 768, 3, 2, 2]) diff --git a/tests/models/backbones/test_tanet.py b/tests/models/backbones/test_tanet.py new file mode 100644 index 0000000000000000000000000000000000000000..82b74428fc49a1ad243d31bafe83c1f64a99467e --- /dev/null +++ b/tests/models/backbones/test_tanet.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pytest +import torch + +from mmaction.models import TANet +from mmaction.testing import generate_backbone_demo_inputs + + +def test_tanet_backbone(): + """Test tanet backbone.""" + with pytest.raises(NotImplementedError): + # TA-Blocks are only based on Bottleneck block now + tanet_18 = TANet(18, 8) + tanet_18.init_weights() + + from mmaction.models.backbones.resnet import Bottleneck + from mmaction.models.backbones.tanet import TABlock + + # tanet with depth 50 + tanet_50 = TANet(50, 8) + tanet_50.init_weights() + + for layer_name in tanet_50.res_layers: + layer = getattr(tanet_50, layer_name) + blocks = list(layer.children()) + for block in blocks: + assert isinstance(block, TABlock) + assert isinstance(block.block, Bottleneck) + assert block.tam.num_segments == block.num_segments + assert block.tam.in_channels == block.block.conv1.out_channels + + input_shape = (8, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = tanet_50(imgs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + input_shape = (16, 3, 32, 32) + imgs = generate_backbone_demo_inputs(input_shape) + feat = tanet_50(imgs) + assert feat.shape == torch.Size([16, 2048, 1, 1]) diff --git a/tests/models/backbones/test_timesformer.py b/tests/models/backbones/test_timesformer.py new file mode 100644 index 0000000000000000000000000000000000000000..81843e08483502dac2eaa67ab1a722cce354203b --- /dev/null +++ b/tests/models/backbones/test_timesformer.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import TimeSformer +from mmaction.testing import generate_backbone_demo_inputs + + +def test_timesformer_backbone(): + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # divided_space_time + timesformer = TimeSformer( + 8, 64, 16, embed_dims=768, attention_type='divided_space_time') + timesformer.init_weights() + from mmaction.models.common import (DividedSpatialAttentionWithNorm, + DividedTemporalAttentionWithNorm, + FFNWithNorm) + assert isinstance(timesformer.transformer_layers.layers[0].attentions[0], + DividedTemporalAttentionWithNorm) + assert isinstance(timesformer.transformer_layers.layers[11].attentions[1], + DividedSpatialAttentionWithNorm) + assert isinstance(timesformer.transformer_layers.layers[0].ffns[0], + FFNWithNorm) + assert hasattr(timesformer, 'time_embed') + assert timesformer.patch_embed.num_patches == 16 + + cls_tokens = timesformer(imgs) + assert cls_tokens.shape == torch.Size([1, 768]) + + # space_only + timesformer = TimeSformer( + 8, 64, 16, embed_dims=512, num_heads=8, attention_type='space_only') + timesformer.init_weights() + + assert not hasattr(timesformer, 'time_embed') + assert timesformer.patch_embed.num_patches == 16 + + cls_tokens = timesformer(imgs) + assert cls_tokens.shape == torch.Size([1, 512]) + + # joint_space_time + input_shape = (1, 3, 2, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + timesformer = TimeSformer( + 2, + 64, + 8, + embed_dims=256, + num_heads=8, + attention_type='joint_space_time') + timesformer.init_weights() + + assert hasattr(timesformer, 'time_embed') + assert timesformer.patch_embed.num_patches == 64 + + cls_tokens = timesformer(imgs) + assert cls_tokens.shape == torch.Size([1, 256]) + + with pytest.raises(AssertionError): + # unsupported attention type + timesformer = TimeSformer( + 8, 64, 16, attention_type='wrong_attention_type') + + with pytest.raises(AssertionError): + # Wrong transformer_layers type + timesformer = 
TimeSformer(8, 64, 16, transformer_layers='wrong_type') diff --git a/tests/models/backbones/test_uniformer.py b/tests/models/backbones/test_uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..eb3c8d173aed930f972ea6f430de1792d03ab473 --- /dev/null +++ b/tests/models/backbones/test_uniformer.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import UniFormer +from mmaction.testing import generate_backbone_demo_inputs + + +def test_uniformer_backbone(): + """Test uniformer backbone.""" + input_shape = (1, 3, 16, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + model = UniFormer( + depth=[3, 4, 8, 3], + embed_dim=[64, 128, 320, 512], + head_dim=64, + drop_path_rate=0.1) + model.init_weights() + + model.eval() + assert model(imgs).shape == torch.Size([1, 512, 8, 2, 2]) diff --git a/tests/models/backbones/test_uniformerv2.py b/tests/models/backbones/test_uniformerv2.py new file mode 100644 index 0000000000000000000000000000000000000000..be4357bb026383db8e0f8aa30146e217eec6cafa --- /dev/null +++ b/tests/models/backbones/test_uniformerv2.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import UniFormerV2 +from mmaction.testing import generate_backbone_demo_inputs + + +def test_uniformerv2_backbone(): + """Test uniformer backbone.""" + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + model = UniFormerV2( + input_resolution=64, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=8, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + clip_pretrained=False, + mlp_dropout=[0.5, 0.5, 0.5, 0.5]) + model.init_weights() + + model.eval() + assert model(imgs).shape == torch.Size([1, 768]) + + # SthSth + input_shape = (1, 3, 16, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + model = UniFormerV2( + input_resolution=64, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=16, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=True, + no_lmhra=False, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + clip_pretrained=False, + mlp_dropout=[0.5, 0.5, 0.5, 0.5]) + model.init_weights() + + model.eval() + assert model(imgs).shape == torch.Size([1, 768]) diff --git a/tests/models/backbones/test_vit_mae.py b/tests/models/backbones/test_vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..da184c675d588a46020a4cd8b698e9bee3d97bfa --- /dev/null +++ b/tests/models/backbones/test_vit_mae.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import VisionTransformer + + +def test_vit_backbone(): + """Test vit backbone.""" + x = torch.randn(1, 3, 8, 64, 64) + model = VisionTransformer( + img_size=64, + num_frames=8, + qkv_bias=True, + drop_path_rate=0.2, + init_values=0.1) + model.init_weights() + + assert model(x).shape == torch.Size([1, 768]) + model.eval() + assert model(x).shape == torch.Size([1, 768]) + + model = VisionTransformer( + img_size=64, + num_frames=8, + use_learnable_pos_emb=True, + drop_rate=0.1, + use_mean_pooling=False) + model.init_weights() + + assert model(x).shape == torch.Size([1, 768]) + model.eval() + assert model(x).shape == torch.Size([1, 768]) diff --git a/tests/models/backbones/test_x3d.py b/tests/models/backbones/test_x3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5f4c86b6b3e99b6ed216dd2ed3556e025f600b9e --- /dev/null +++ b/tests/models/backbones/test_x3d.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import X3D +from mmaction.testing import check_norm_state, generate_backbone_demo_inputs + + +def test_x3d_backbone(): + """Test x3d backbone.""" + with pytest.raises(AssertionError): + # In X3D: 1 <= num_stages <= 4 + X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, num_stages=0) + + with pytest.raises(AssertionError): + # In X3D: 1 <= num_stages <= 4 + X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, num_stages=5) + + with pytest.raises(AssertionError): + # len(spatial_strides) == num_stages + X3D(gamma_w=1.0, + gamma_b=2.25, + gamma_d=2.2, + spatial_strides=(1, 2), + num_stages=4) + + with pytest.raises(AssertionError): + # se_style in ['half', 'all'] + X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, se_style=None) + + with pytest.raises(AssertionError): + # se_ratio should be None or > 0 + X3D(gamma_w=1.0, + gamma_b=2.25, + gamma_d=2.2, + se_style='half', + se_ratio=0) + + # x3d_s, no pretrained, norm_eval True + x3d_s = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, norm_eval=True) + x3d_s.init_weights() + x3d_s.train() + assert check_norm_state(x3d_s.modules(), False) + + # x3d_l, no pretrained, norm_eval True + x3d_l = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=5.0, norm_eval=True) + x3d_l.init_weights() + x3d_l.train() + assert check_norm_state(x3d_l.modules(), False) + + # x3d_s, no pretrained, norm_eval False + x3d_s = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, norm_eval=False) + x3d_s.init_weights() + x3d_s.train() + assert check_norm_state(x3d_s.modules(), True) + + # x3d_l, no pretrained, norm_eval False + x3d_l = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=5.0, norm_eval=False) + x3d_l.init_weights() + x3d_l.train() + assert check_norm_state(x3d_l.modules(), True) + + # x3d_s, no pretrained, frozen_stages, norm_eval False + frozen_stages = 1 + x3d_s_frozen = X3D( + gamma_w=1.0, + gamma_b=2.25, + gamma_d=2.2, + norm_eval=False, + frozen_stages=frozen_stages) + + x3d_s_frozen.init_weights() + x3d_s_frozen.train() + assert x3d_s_frozen.conv1_t.bn.training is False + for param in x3d_s_frozen.conv1_s.parameters(): + assert param.requires_grad is False + for param in x3d_s_frozen.conv1_t.parameters(): + assert param.requires_grad is False + + for i in range(1, frozen_stages + 1): + layer = getattr(x3d_s_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # test 
zero_init_residual, zero_init_residual is True by default + for m in x3d_s_frozen.modules(): + if hasattr(m, 'conv3'): + assert torch.equal(m.conv3.bn.weight, + torch.zeros_like(m.conv3.bn.weight)) + assert torch.equal(m.conv3.bn.bias, + torch.zeros_like(m.conv3.bn.bias)) + + # x3d_s inference + input_shape = (1, 3, 13, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + x3d_s_frozen = x3d_s_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = x3d_s_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 432, 13, 2, 2]) + else: + feat = x3d_s_frozen(imgs) + assert feat.shape == torch.Size([1, 432, 13, 2, 2]) + + # x3d_m inference + input_shape = (1, 3, 16, 96, 96) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + x3d_s_frozen = x3d_s_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = x3d_s_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 432, 16, 3, 3]) + else: + feat = x3d_s_frozen(imgs) + assert feat.shape == torch.Size([1, 432, 16, 3, 3]) diff --git a/tests/models/common/test_conv2plus1d.py b/tests/models/common/test_conv2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc30984d0c6eceb441b87c81d8def1d99996455 --- /dev/null +++ b/tests/models/common/test_conv2plus1d.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models.common import Conv2plus1d + + +def test_conv2plus1d(): + with pytest.raises(AssertionError): + # Length of kernel size, stride and padding must be the same + Conv2plus1d(3, 8, (2, 2)) + + conv_2plus1d = Conv2plus1d(3, 8, 2) + conv_2plus1d.init_weights() + + assert torch.equal(conv_2plus1d.bn_s.weight, + torch.ones_like(conv_2plus1d.bn_s.weight)) + assert torch.equal(conv_2plus1d.bn_s.bias, + torch.zeros_like(conv_2plus1d.bn_s.bias)) + + x = torch.rand(1, 3, 8, 256, 256) + output = conv_2plus1d(x) + assert output.shape == torch.Size([1, 8, 7, 255, 255]) diff --git a/tests/models/common/test_conv_audio.py b/tests/models/common/test_conv_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..448be07d7b1c9cd343c875449f8377d63b6ba980 --- /dev/null +++ b/tests/models/common/test_conv_audio.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models.common import ConvAudio + + +def test_conv_audio(): + conv_audio = ConvAudio(3, 8, 3) + conv_audio.init_weights() + + x = torch.rand(1, 3, 8, 8) + output = conv_audio(x) + assert output.shape == torch.Size([1, 16, 8, 8]) + + conv_audio_sum = ConvAudio(3, 8, 3, op='sum') + output = conv_audio_sum(x) + assert output.shape == torch.Size([1, 8, 8, 8]) diff --git a/tests/models/common/test_sub_batchnorm3d.py b/tests/models/common/test_sub_batchnorm3d.py new file mode 100644 index 0000000000000000000000000000000000000000..ade756ec083d59d6bfd006ef23301f9e17a1e75f --- /dev/null +++ b/tests/models/common/test_sub_batchnorm3d.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmaction.models.common import SubBatchNorm3D + + +def test_SubBatchNorm3D(): + _cfg = dict(num_splits=2) + num_features = 4 + sub_batchnorm_3d = SubBatchNorm3D(num_features, **_cfg) + assert sub_batchnorm_3d.bn.num_features == num_features + assert sub_batchnorm_3d.split_bn.num_features == num_features * 2 + + assert sub_batchnorm_3d.bn.affine is False + assert sub_batchnorm_3d.split_bn.affine is False diff --git a/tests/models/common/test_tam.py b/tests/models/common/test_tam.py new file mode 100644 index 0000000000000000000000000000000000000000..ee72498a36c7c4f71ec73dc375b4de1412271ada --- /dev/null +++ b/tests/models/common/test_tam.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models.common import TAM + + +def test_TAM(): + """test TAM.""" + with pytest.raises(AssertionError): + # alpha must be a positive integer + TAM(16, 8, alpha=0, beta=4) + + with pytest.raises(AssertionError): + # beta must be a positive integer + TAM(16, 8, alpha=2, beta=0) + + with pytest.raises(AssertionError): + # the channels number of x should be equal to self.in_channels of TAM + tam = TAM(16, 8) + x = torch.rand(64, 8, 112, 112) + tam(x) + + tam = TAM(16, 8) + x = torch.rand(32, 16, 112, 112) + output = tam(x) + assert output.shape == torch.Size([32, 16, 112, 112]) diff --git a/tests/models/common/test_transformer.py b/tests/models/common/test_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2ae2329922532d3ec71cf2371f5b7d710ce5b7 --- /dev/null +++ b/tests/models/common/test_transformer.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.testing import assert_params_all_zeros + +from mmaction.models.common import (DividedSpatialAttentionWithNorm, + DividedTemporalAttentionWithNorm, + FFNWithNorm) + + +def test_divided_temporal_attention_with_norm(): + _cfg = dict(embed_dims=768, num_heads=12, num_frames=8) + divided_temporal_attention = DividedTemporalAttentionWithNorm(**_cfg) + assert isinstance(divided_temporal_attention.norm, nn.LayerNorm) + assert assert_params_all_zeros(divided_temporal_attention.temporal_fc) + + x = torch.rand(1, 1 + 8 * 14 * 14, 768) + output = divided_temporal_attention(x) + assert output.shape == torch.Size([1, 1 + 8 * 14 * 14, 768]) + + +def test_divided_spatial_attention_with_norm(): + _cfg = dict(embed_dims=512, num_heads=8, num_frames=4, dropout_layer=None) + divided_spatial_attention = DividedSpatialAttentionWithNorm(**_cfg) + assert isinstance(divided_spatial_attention.dropout_layer, nn.Identity) + assert isinstance(divided_spatial_attention.norm, nn.LayerNorm) + + x = torch.rand(1, 1 + 4 * 14 * 14, 512) + output = divided_spatial_attention(x) + assert output.shape == torch.Size([1, 1 + 4 * 14 * 14, 512]) + + +def test_ffn_with_norm(): + _cfg = dict( + embed_dims=256, feedforward_channels=256 * 2, norm_cfg=dict(type='LN')) + ffn_with_norm = FFNWithNorm(**_cfg) + assert isinstance(ffn_with_norm.norm, nn.LayerNorm) + + x = torch.rand(1, 1 + 4 * 14 * 14, 256) + output = ffn_with_norm(x) + assert output.shape == torch.Size([1, 1 + 4 * 14 * 14, 256]) diff --git a/tests/models/data_preprocessors/__init__.py b/tests/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/data_preprocessors/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/tests/models/data_preprocessors/test_data_preprocessor.py b/tests/models/data_preprocessors/test_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..b7b482d2f06177c41f2576131d6df8546498cab0 --- /dev/null +++ b/tests/models/data_preprocessors/test_data_preprocessor.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import ActionDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_shape): + data = { + 'inputs': + [torch.randint(0, 255, input_shape) for _ in range(batch_size)], + 'data_samples': + [ActionDataSample().set_gt_label(2) for _ in range(batch_size)] + } + return data + + +def test_data_preprocessor(): + with pytest.raises(ValueError): + ActionDataPreprocessor( + mean=[1, 1], std=[0, 0], format_shape='NCTHW_Heatmap') + with pytest.raises(ValueError): + psr = ActionDataPreprocessor(format_shape='NCTHW_Heatmap', to_rgb=True) + psr(generate_dummy_data(1, (3, 224, 224))) + + raw_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW') + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1] - psr.mean) / psr.std) + + psr = ActionDataPreprocessor(format_shape='NCTHW', to_rgb=True) + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0][:, [2, 1, 0]]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1][:, [2, 1, 0]]) + + register_all_modules() + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW', + blending=dict(type='MixupBlending', num_classes=5)) + data = psr(deepcopy(raw_data), training=True) + assert data['data_samples'][0].gt_label.shape == (5, ) + assert data['data_samples'][1].gt_label.shape == (5, ) + + raw_data = generate_dummy_data(2, (1, 3, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW', + to_rgb=True) + data = psr(deepcopy(raw_data)) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0][:, [2, 1, 0]] - psr.mean) / + psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1][:, [2, 1, 0]] - psr.mean) / + psr.std) + + psr = ActionDataPreprocessor() + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1]) + + raw_2d_data = generate_dummy_data(2, (3, 224, 224)) + raw_3d_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + raw_data = (raw_2d_data, raw_3d_data) + + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='MIX2d3d') + data = psr(raw_data) + assert_array_equal(data[0]['inputs'][0], + (raw_2d_data['inputs'][0] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[0]['inputs'][1], + (raw_2d_data['inputs'][1] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + 
assert_array_equal(data[1]['inputs'][0], + (raw_3d_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data[1]['inputs'][1], + (raw_3d_data['inputs'][1] - psr.mean) / psr.std) + + raw_data = generate_dummy_data(2, (77, )) + psr = ActionDataPreprocessor(to_float32=False) + data = psr(raw_data) + assert data['inputs'].dtype == raw_data['inputs'][0].dtype diff --git a/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..054d1056a6d3f3a91c7e14ce9361e01185b466e3 --- /dev/null +++ b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import MultiModalDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_keys, input_shapes): + data = dict() + data['data_samples'] = [ + ActionDataSample().set_gt_label(2) for _ in range(batch_size) + ] + data['inputs'] = dict() + for key, shape in zip(input_keys, input_shapes): + data['inputs'][key] = [ + torch.randint(0, 255, shape) for _ in range(batch_size) + ] + + return data + + +def test_multimodal_data_preprocessor(): + with pytest.raises(AssertionError): + MultiModalDataPreprocessor( + preprocessors=dict(imgs=dict(format_shape='NCTHW'))) + + register_all_modules() + data_keys = ('imgs', 'heatmap_imgs') + data_shapes = ((1, 3, 8, 224, 224), (1, 17, 32, 64, 64)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs'].shape == (2, 1, 3, 8, 224, 224) + assert data['inputs']['heatmap_imgs'].shape == (2, 1, 17, 32, 64, 64) + psr_imgs = psr.preprocessors['imgs'] + assert_array_equal(data['inputs']['imgs'][0], + (raw_data['inputs']['imgs'][0] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['imgs'][1], + (raw_data['inputs']['imgs'][1] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['heatmap_imgs'][0], + raw_data['inputs']['heatmap_imgs'][0]) + assert_array_equal(data['inputs']['heatmap_imgs'][1], + raw_data['inputs']['heatmap_imgs'][1]) + + data_keys = ('imgs_2D', 'imgs_3D') + data_shapes = ((1, 3, 224, 224), (1, 3, 8, 224, 224)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs_2D=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + imgs_3D=dict( + type='ActionDataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[57.5, 57.5, 57.5], + format_shape='NCTHW'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs_2D'].shape == (2, 1, 3, 224, 224) + assert data['inputs']['imgs_3D'].shape == (2, 1, 3, 8, 224, 224) + psr_imgs2d = psr.preprocessors['imgs_2D'] + psr_imgs3d = psr.preprocessors['imgs_3D'] + assert_array_equal(data['inputs']['imgs_2D'][0], + (raw_data['inputs']['imgs_2D'][0] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_2D'][1], 
+ (raw_data['inputs']['imgs_2D'][1] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_3D'][0], + (raw_data['inputs']['imgs_3D'][0] - psr_imgs3d.mean) / + psr_imgs3d.std) + assert_array_equal(data['inputs']['imgs_3D'][1], + (raw_data['inputs']['imgs_3D'][1] - psr_imgs3d.mean) / + psr_imgs3d.std) diff --git a/tests/models/heads/test_feature_head.py b/tests/models/heads/test_feature_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1f13574f5ed6efdb0eac0a8f6415f93da46539c7 --- /dev/null +++ b/tests/models/heads/test_feature_head.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import pytest +import torch + +from mmaction.models import FeatureHead +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_recognizer_cfg +from mmaction.utils import register_all_modules + + +class TestFeatureHead(TestCase): + + def test_2d_recognizer(self): + register_all_modules() + config = get_recognizer_cfg( + 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['pretrained'] = None + config.model['cls_head'] = dict( + type='FeatureHead', average_clips='score') + + recognizer = MODELS.build(config.model) + + input_shape = [3, 3, 32, 32] + data_batch = { + 'inputs': [torch.randint(0, 256, input_shape)], + 'data_samples': [ActionDataSample().set_gt_label(2)] + } + feat = recognizer.test_step(data_batch) + assert isinstance(feat, torch.Tensor) + assert feat.shape == torch.Size([1, 2048]) + + def test_3d_recognizer(self): + register_all_modules() + config = get_recognizer_cfg( + 'slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + config.model['backbone']['pretrained2d'] = False + config.model['cls_head'] = dict( + type='FeatureHead', average_clips='score') + + recognizer = MODELS.build(config.model) + input_shape = [1, 3, 4, 32, 32] + data_batch = { + 'inputs': [torch.randint(0, 256, input_shape)], + 'data_samples': [ActionDataSample().set_gt_label(2)] + } + feat = recognizer.test_step(data_batch) + assert isinstance(feat, torch.Tensor) + assert feat.shape == torch.Size([1, 2048]) + + def test_3d_backbone(self): + with pytest.raises(NotImplementedError): + head = FeatureHead(spatial_type='test') + + head = FeatureHead(average_clips='score') + x = torch.rand(1, 64, 2, 7, 7) + feat = head(x) + assert feat.shape == torch.Size([1, 64]) + + head = FeatureHead(spatial_type=None, average_clips='score') + feat = head(x) + assert feat.shape == torch.Size([1, 64, 7, 7]) + + head = FeatureHead(temporal_type=None, average_clips='score') + feat = head(x) + assert feat.shape == torch.Size([1, 64, 2]) + + head = FeatureHead( + spatial_type=None, temporal_type=None, average_clips='score') + feat = head(x) + assert feat.shape == torch.Size([1, 64, 2, 7, 7]) + + def test_slowfast_backbone(self): + head = FeatureHead(backbone_name='slowfast', average_clips='score') + x_slow = torch.rand(1, 64, 2, 7, 7) + x_fast = torch.rand(1, 32, 6, 7, 7) + x = (x_slow, x_fast) + feat = head(x) + assert feat.shape == torch.Size([1, 96]) + + head = FeatureHead( + backbone_name='slowfast', spatial_type=None, average_clips='score') + feat = head(x) + assert feat.shape == torch.Size([1, 96, 7, 7]) + + with pytest.raises(AssertionError): + head = FeatureHead( + backbone_name='slowfast', + temporal_type=None, + average_clips='score') + feat = head(x) + + def 
test_2d_backbone(self): + head = FeatureHead(average_clips='score') + x = torch.rand(2, 64, 7, 7) + with pytest.raises(AssertionError): + feat = head(x) + + feat = head(x, num_segs=2) + assert feat.shape == torch.Size([1, 64]) + + x = torch.rand(2, 64, 7, 7) + head = FeatureHead(spatial_type=None, average_clips='score') + feat = head(x, num_segs=2) + assert feat.shape == torch.Size([1, 64, 7, 7]) + + head = FeatureHead(temporal_type=None, average_clips='score') + feat = head(x, num_segs=2) + assert feat.shape == torch.Size([1, 2, 64]) + + def test_tsm_backbone(self): + head = FeatureHead(backbone_name='tsm', average_clips='score') + x = torch.rand(2, 64, 7, 7) + with pytest.raises(AssertionError): + feat = head(x) + with pytest.raises(AssertionError): + feat = head(x, num_segs=2) + + head = FeatureHead(num_segments=2, average_clips='score') + feat = head(x, num_segs=2) + assert feat.shape == torch.Size([1, 64]) + + x = torch.rand(2, 64, 7, 7) + head = FeatureHead( + num_segments=2, spatial_type=None, average_clips='score') + feat = head(x, num_segs=2) + assert feat.shape == torch.Size([1, 64, 7, 7]) + + def test_gcn_backbone(self): + # N, M, C, T, V + head = FeatureHead(backbone_name='gcn', average_clips='score') + x = torch.rand(1, 5, 64, 2, 7) + feat = head(x) + assert feat.shape == torch.Size([1, 64]) diff --git a/tests/models/heads/test_gcn_head.py b/tests/models/heads/test_gcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..20d1f534ea3211fe98fea797bc90ec63c7b79a11 --- /dev/null +++ b/tests/models/heads/test_gcn_head.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import GCNHead + + +def test_gcn_head(): + """Test GCNHead.""" + with pytest.raises(AssertionError): + GCNHead(4, 5)(torch.rand((1, 2, 6, 75, 17))) + + gcn_head = GCNHead(num_classes=60, in_channels=256) + gcn_head.init_weights() + feat = torch.rand(1, 2, 256, 75, 25) + cls_scores = gcn_head(feat) + assert gcn_head.num_classes == 60 + assert gcn_head.in_channels == 256 + assert cls_scores.shape == torch.Size([1, 60]) + + gcn_head = GCNHead(num_classes=60, in_channels=256, dropout=0.1) + gcn_head.init_weights() + feat = torch.rand(1, 2, 256, 75, 25) + cls_scores = gcn_head(feat) + assert gcn_head.num_classes == 60 + assert gcn_head.in_channels == 256 + assert cls_scores.shape == torch.Size([1, 60]) diff --git a/tests/models/heads/test_i3d_head.py b/tests/models/heads/test_i3d_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f3632849afd90fa41e30dbdca971fe3cb8491fac --- /dev/null +++ b/tests/models/heads/test_i3d_head.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn + +from mmaction.models import I3DHead + + +def test_i3d_head(): + """Test loss method, layer construction, attributes and forward function in + i3d head.""" + i3d_head = I3DHead(num_classes=4, in_channels=2048) + i3d_head.init_weights() + + assert i3d_head.num_classes == 4 + assert i3d_head.dropout_ratio == 0.5 + assert i3d_head.in_channels == 2048 + assert i3d_head.init_std == 0.01 + + assert isinstance(i3d_head.dropout, nn.Dropout) + assert i3d_head.dropout.p == i3d_head.dropout_ratio + + assert isinstance(i3d_head.fc_cls, nn.Linear) + assert i3d_head.fc_cls.in_features == i3d_head.in_channels + assert i3d_head.fc_cls.out_features == i3d_head.num_classes + + assert isinstance(i3d_head.avg_pool, nn.AdaptiveAvgPool3d) + assert i3d_head.avg_pool.output_size == (1, 1, 1) + + input_shape = (3, 2048, 4, 7, 7) + feat = torch.rand(input_shape) + + # i3d head inference + cls_scores = i3d_head(feat) + assert cls_scores.shape == torch.Size([3, 4]) diff --git a/tests/models/heads/test_mvit_head.py b/tests/models/heads/test_mvit_head.py new file mode 100644 index 0000000000000000000000000000000000000000..95873cdbe9a7870f3b99f2f770bc6bc6e8839583 --- /dev/null +++ b/tests/models/heads/test_mvit_head.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +import torch.nn as nn + +from mmaction.models import MViTHead + + +class TestMViTHead(TestCase): + DEFAULT_ARGS = dict(in_channels=768, num_classes=5) + fake_feats = ([torch.rand(4, 768, 3, 2, 2), torch.rand(4, 768)], ) + + def test_init(self): + head = MViTHead(**self.DEFAULT_ARGS) + head.init_weights() + self.assertEqual(head.dropout.p, head.dropout_ratio) + self.assertIsInstance(head.fc_cls, nn.Linear) + self.assertEqual(head.num_classes, 5) + self.assertEqual(head.dropout_ratio, 0.5) + self.assertEqual(head.in_channels, 768) + self.assertEqual(head.init_std, 0.02) + + def test_pre_logits(self): + head = MViTHead(**self.DEFAULT_ARGS) + pre_logits = head.pre_logits(self.fake_feats) + self.assertIs(pre_logits, self.fake_feats[-1][1]) + + def test_forward(self): + head = MViTHead(**self.DEFAULT_ARGS) + cls_score = head(self.fake_feats) + self.assertEqual(cls_score.shape, (4, 5)) diff --git a/tests/models/heads/test_omni_head.py b/tests/models/heads/test_omni_head.py new file mode 100644 index 0000000000000000000000000000000000000000..bea7738fb7fb425c5cccb32f5d09dcd2dba27789 --- /dev/null +++ b/tests/models/heads/test_omni_head.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import OmniHead + + +class obj(): + + def __init__(self, name, value): + super(obj, self).__init__() + setattr(self, name, value) + + +def testOmniHead(): + head = OmniHead(image_classes=100, video_classes=200, in_channels=400) + + image_feat = torch.randn(2, 400, 8, 8) + image_score = head(image_feat) + assert image_score.shape == torch.Size([2, 100]) + + video_feat = torch.randn(2, 400, 8, 8, 8) + video_score = head(video_feat) + assert video_score.shape == torch.Size([2, 200]) + + head = OmniHead( + image_classes=100, + video_classes=200, + in_channels=400, + video_nl_head=True) + + video_feat = torch.randn(2, 400, 8, 8, 8) + video_score = head(video_feat) + assert video_score.shape == torch.Size([2, 200]) + data_samples = [obj('gt_label', torch.tensor(1)) for _ in range(2)] + losses = head.loss_by_feat(video_score, data_samples) + assert 'loss_cls' in losses + + image_feat = torch.randn(1, 400, 8, 8) + head.eval() + image_score = head(image_feat) + assert image_score.shape == torch.Size([1, 100]) + data_samples = [obj('gt_label', torch.tensor(1))] + losses = head.loss_by_feat(image_score, data_samples) + assert 'loss_cls' in losses diff --git a/tests/models/heads/test_rgbpose_head.py b/tests/models/heads/test_rgbpose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4a5e6a8fdbe19b33966e25fa097b63de56a0dbfc --- /dev/null +++ b/tests/models/heads/test_rgbpose_head.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.models import RGBPoseHead + + +def test_rgbpose_head(): + """Test RGBPoseHead.""" + rgbpose_head = RGBPoseHead( + num_classes=4, + in_channels=[2048, 512], + dropout=dict(rgb=0.51, pose=0.49)) + rgbpose_head.init_weights() + + assert rgbpose_head.num_classes == 4 + assert rgbpose_head.dropout == dict(rgb=0.51, pose=0.49) + assert rgbpose_head.in_channels == [2048, 512] + assert rgbpose_head.init_std == 0.01 + + assert isinstance(rgbpose_head.dropout_rgb, nn.Dropout) + assert isinstance(rgbpose_head.dropout_pose, nn.Dropout) + assert rgbpose_head.dropout_rgb.p == rgbpose_head.dropout['rgb'] + assert rgbpose_head.dropout_pose.p == rgbpose_head.dropout['pose'] + + assert isinstance(rgbpose_head.fc_rgb, nn.Linear) + assert isinstance(rgbpose_head.fc_pose, nn.Linear) + assert rgbpose_head.fc_rgb.in_features == rgbpose_head.in_channels[0] + assert rgbpose_head.fc_rgb.out_features == rgbpose_head.num_classes + assert rgbpose_head.fc_pose.in_features == rgbpose_head.in_channels[1] + assert rgbpose_head.fc_pose.out_features == rgbpose_head.num_classes + + assert isinstance(rgbpose_head.avg_pool, nn.AdaptiveAvgPool3d) + assert rgbpose_head.avg_pool.output_size == (1, 1, 1) + + feat_rgb = torch.rand((2, 2048, 8, 7, 7)) + feat_pose = torch.rand((2, 512, 32, 7, 7)) + + cls_scores = rgbpose_head((feat_rgb, feat_pose)) + assert cls_scores['rgb'].shape == torch.Size([2, 4]) + assert cls_scores['pose'].shape == torch.Size([2, 4]) diff --git a/tests/models/heads/test_slowfast_head.py b/tests/models/heads/test_slowfast_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e2009d516599e8e475f6cd60825557838decdb2f --- /dev/null +++ b/tests/models/heads/test_slowfast_head.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn + +from mmaction.models import SlowFastHead + + +def test_slowfast_head(): + """Test loss method, layer construction, attributes and forward function in + slowfast head.""" + sf_head = SlowFastHead(num_classes=4, in_channels=2304) + sf_head.init_weights() + + assert sf_head.num_classes == 4 + assert sf_head.dropout_ratio == 0.8 + assert sf_head.in_channels == 2304 + assert sf_head.init_std == 0.01 + + assert isinstance(sf_head.dropout, nn.Dropout) + assert sf_head.dropout.p == sf_head.dropout_ratio + + assert isinstance(sf_head.fc_cls, nn.Linear) + assert sf_head.fc_cls.in_features == sf_head.in_channels + assert sf_head.fc_cls.out_features == sf_head.num_classes + + assert isinstance(sf_head.avg_pool, nn.AdaptiveAvgPool3d) + assert sf_head.avg_pool.output_size == (1, 1, 1) + + input_shape = (3, 2048, 32, 7, 7) + feat_slow = torch.rand(input_shape) + + input_shape = (3, 256, 4, 7, 7) + feat_fast = torch.rand(input_shape) + + sf_head = SlowFastHead(num_classes=4, in_channels=2304) + cls_scores = sf_head((feat_slow, feat_fast)) + assert cls_scores.shape == torch.Size([3, 4]) diff --git a/tests/models/heads/test_timesformer_head.py b/tests/models/heads/test_timesformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d8f860339581a3c4ba5cd7fa7b7b31581ac5544d --- /dev/null +++ b/tests/models/heads/test_timesformer_head.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import TimeSformerHead + + +def test_timesformer_head(): + """Test loss method, layer construction, attributes and forward function in + timesformer head.""" + timesformer_head = TimeSformerHead(num_classes=4, in_channels=64) + timesformer_head.init_weights() + + assert timesformer_head.num_classes == 4 + assert timesformer_head.in_channels == 64 + assert timesformer_head.init_std == 0.02 + + input_shape = (2, 64) + feat = torch.rand(input_shape) + + cls_scores = timesformer_head(feat) + assert cls_scores.shape == torch.Size([2, 4]) diff --git a/tests/models/heads/test_tpn_head.py b/tests/models/heads/test_tpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5bde1f3ed5920b7cd8049c5dc936b4e5841686c0 --- /dev/null +++ b/tests/models/heads/test_tpn_head.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn + +from mmaction.models import TPNHead + + +def test_tpn_head(): + """Test loss method, layer construction, attributes and forward function in + tpn head.""" + tpn_head = TPNHead(num_classes=4, in_channels=2048) + tpn_head.init_weights() + + assert hasattr(tpn_head, 'avg_pool2d') + assert hasattr(tpn_head, 'avg_pool3d') + assert isinstance(tpn_head.avg_pool3d, nn.AdaptiveAvgPool3d) + assert tpn_head.avg_pool3d.output_size == (1, 1, 1) + assert tpn_head.avg_pool2d is None + + input_shape = (4, 2048, 7, 7) + feat = torch.rand(input_shape) + + # tpn head inference with num_segs + num_segs = 2 + cls_scores = tpn_head(feat, num_segs) + assert isinstance(tpn_head.avg_pool2d, nn.AvgPool3d) + assert tpn_head.avg_pool2d.kernel_size == (1, 7, 7) + assert cls_scores.shape == torch.Size([2, 4]) + + # tpn head inference with no num_segs + input_shape = (2, 2048, 3, 7, 7) + feat = torch.rand(input_shape) + cls_scores = tpn_head(feat) + assert isinstance(tpn_head.avg_pool2d, nn.AvgPool3d) + assert tpn_head.avg_pool2d.kernel_size == (1, 7, 7) + assert cls_scores.shape == torch.Size([2, 4]) diff --git a/tests/models/heads/test_trn_head.py b/tests/models/heads/test_trn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..90f9a48beabca35282723661a4839a5be1972290 --- /dev/null +++ b/tests/models/heads/test_trn_head.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +import torch.nn as nn + +from mmaction.models import TRNHead + + +def test_trn_head(): + """Test loss method, layer construction, attributes and forward function in + trn head.""" + from mmaction.models.heads.trn_head import (RelationModule, + RelationModuleMultiScale) + trn_head = TRNHead(num_classes=4, in_channels=2048, relation_type='TRN') + trn_head.init_weights() + + assert trn_head.num_classes == 4 + assert trn_head.dropout_ratio == 0.8 + assert trn_head.in_channels == 2048 + assert trn_head.init_std == 0.001 + assert trn_head.spatial_type == 'avg' + + relation_module = trn_head.consensus + assert isinstance(relation_module, RelationModule) + assert relation_module.hidden_dim == 256 + assert isinstance(relation_module.classifier[3], nn.Linear) + assert relation_module.classifier[3].out_features == trn_head.num_classes + + assert trn_head.dropout.p == trn_head.dropout_ratio + assert isinstance(trn_head.dropout, nn.Dropout) + assert isinstance(trn_head.fc_cls, nn.Linear) + assert trn_head.fc_cls.in_features == trn_head.in_channels + assert trn_head.fc_cls.out_features == trn_head.hidden_dim + + assert isinstance(trn_head.avg_pool, nn.AdaptiveAvgPool2d) + assert trn_head.avg_pool.output_size == 1 + + input_shape = (8, 2048, 7, 7) + feat = torch.rand(input_shape) + + # tsm head inference with no init + num_segs = input_shape[0] + cls_scores = trn_head(feat, num_segs) + assert cls_scores.shape == torch.Size([1, 4]) + + # tsm head inference with init + trn_head = TRNHead( + num_classes=4, + in_channels=2048, + num_segments=8, + relation_type='TRNMultiScale') + trn_head.init_weights() + assert isinstance(trn_head.consensus, RelationModuleMultiScale) + assert trn_head.consensus.scales == range(8, 1, -1) + cls_scores = trn_head(feat, num_segs) + assert cls_scores.shape == torch.Size([1, 4]) + + with pytest.raises(ValueError): + trn_head = TRNHead( + num_classes=4, + in_channels=2048, + num_segments=8, + relation_type='RelationModlue') diff --git a/tests/models/heads/test_tsm_head.py b/tests/models/heads/test_tsm_head.py new file mode 100644 
index 0000000000000000000000000000000000000000..70c2b8ed10106ed1c734798d19e27cdd103de707 --- /dev/null +++ b/tests/models/heads/test_tsm_head.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.models import TSMHead + + +def test_tsm_head(): + """Test loss method, layer construction, attributes and forward function in + tsm head.""" + tsm_head = TSMHead(num_classes=4, in_channels=2048) + tsm_head.init_weights() + + assert tsm_head.num_classes == 4 + assert tsm_head.dropout_ratio == 0.8 + assert tsm_head.in_channels == 2048 + assert tsm_head.init_std == 0.001 + assert tsm_head.consensus.dim == 1 + assert tsm_head.spatial_type == 'avg' + + assert isinstance(tsm_head.dropout, nn.Dropout) + assert tsm_head.dropout.p == tsm_head.dropout_ratio + + assert isinstance(tsm_head.fc_cls, nn.Linear) + assert tsm_head.fc_cls.in_features == tsm_head.in_channels + assert tsm_head.fc_cls.out_features == tsm_head.num_classes + + assert isinstance(tsm_head.avg_pool, nn.AdaptiveAvgPool2d) + assert tsm_head.avg_pool.output_size == 1 + + input_shape = (8, 2048, 7, 7) + feat = torch.rand(input_shape) + + # tsm head inference with no init + num_segs = input_shape[0] + cls_scores = tsm_head(feat, num_segs) + assert cls_scores.shape == torch.Size([1, 4]) + + # tsm head inference with init + tsm_head = TSMHead(num_classes=4, in_channels=2048, temporal_pool=True) + tsm_head.init_weights() + cls_scores = tsm_head(feat, num_segs) + assert cls_scores.shape == torch.Size([2, 4]) diff --git a/tests/models/heads/test_tsn_head.py b/tests/models/heads/test_tsn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..abedf4ba94189f1162413d104c098e2bc0fbd4d3 --- /dev/null +++ b/tests/models/heads/test_tsn_head.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch
+import torch.nn as nn
+
+from mmaction.models import TSNHead
+
+
+def test_tsn_head():
+    """Test loss method, layer construction, attributes and forward function in
+    tsn head."""
+    tsn_head = TSNHead(num_classes=4, in_channels=2048)
+    tsn_head.init_weights()
+
+    assert tsn_head.num_classes == 4
+    assert tsn_head.dropout_ratio == 0.4
+    assert tsn_head.in_channels == 2048
+    assert tsn_head.init_std == 0.01
+    assert tsn_head.consensus.dim == 1
+    assert tsn_head.spatial_type == 'avg'
+
+    assert isinstance(tsn_head.dropout, nn.Dropout)
+    assert tsn_head.dropout.p == tsn_head.dropout_ratio
+
+    assert isinstance(tsn_head.fc_cls, nn.Linear)
+    assert tsn_head.fc_cls.in_features == tsn_head.in_channels
+    assert tsn_head.fc_cls.out_features == tsn_head.num_classes
+
+    assert isinstance(tsn_head.avg_pool, nn.AdaptiveAvgPool2d)
+    assert tsn_head.avg_pool.output_size == (1, 1)
+
+    input_shape = (8, 2048, 7, 7)
+    feat = torch.rand(input_shape)
+
+    # tsn head inference
+    num_segs = input_shape[0]
+    cls_scores = tsn_head(feat, num_segs)
+    assert cls_scores.shape == torch.Size([1, 4])
+
+    # Test multi-class recognition
+    multi_tsn_head = TSNHead(
+        num_classes=4,
+        in_channels=2048,
+        loss_cls=dict(type='BCELossWithLogits', loss_weight=160.0),
+        multi_class=True,
+        label_smooth_eps=0.01)
+    multi_tsn_head.init_weights()
+    assert multi_tsn_head.num_classes == 4
+    assert multi_tsn_head.dropout_ratio == 0.4
+    assert multi_tsn_head.in_channels == 2048
+    assert multi_tsn_head.init_std == 0.01
+    assert multi_tsn_head.consensus.dim == 1
+
+    assert isinstance(multi_tsn_head.dropout, nn.Dropout)
+    assert multi_tsn_head.dropout.p == multi_tsn_head.dropout_ratio
+
+    assert isinstance(multi_tsn_head.fc_cls, nn.Linear)
+    assert multi_tsn_head.fc_cls.in_features == multi_tsn_head.in_channels
+    assert multi_tsn_head.fc_cls.out_features == multi_tsn_head.num_classes
+
+    assert isinstance(multi_tsn_head.avg_pool, nn.AdaptiveAvgPool2d)
+    assert multi_tsn_head.avg_pool.output_size == (1, 1)
+
+    input_shape = (8, 2048, 7, 7)
+    feat = torch.rand(input_shape)
+
+    # multi-class tsn head inference
+    num_segs = input_shape[0]
+    cls_scores = multi_tsn_head(feat, num_segs)
+    assert cls_scores.shape == torch.Size([1, 4])
diff --git a/tests/models/heads/test_x3d_head.py b/tests/models/heads/test_x3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..96866016f5c42474271c8b0b20f8aa05555dc66c
--- /dev/null
+++ b/tests/models/heads/test_x3d_head.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch +import torch.nn as nn + +from mmaction.models import X3DHead + + +def test_x3d_head(): + """Test loss method, layer construction, attributes and forward function in + x3d head.""" + x3d_head = X3DHead(in_channels=432, num_classes=4, fc1_bias=False) + x3d_head.init_weights() + + assert x3d_head.num_classes == 4 + assert x3d_head.dropout_ratio == 0.5 + assert x3d_head.in_channels == 432 + assert x3d_head.init_std == 0.01 + + assert isinstance(x3d_head.dropout, nn.Dropout) + assert x3d_head.dropout.p == x3d_head.dropout_ratio + + assert isinstance(x3d_head.fc1, nn.Linear) + assert x3d_head.fc1.in_features == x3d_head.in_channels + assert x3d_head.fc1.out_features == x3d_head.mid_channels + assert x3d_head.fc1.bias is None + + assert isinstance(x3d_head.fc2, nn.Linear) + assert x3d_head.fc2.in_features == x3d_head.mid_channels + assert x3d_head.fc2.out_features == x3d_head.num_classes + + assert isinstance(x3d_head.pool, nn.AdaptiveAvgPool3d) + assert x3d_head.pool.output_size == (1, 1, 1) + + input_shape = (3, 432, 4, 7, 7) + feat = torch.rand(input_shape) + + # i3d head inference + cls_scores = x3d_head(feat) + assert cls_scores.shape == torch.Size([3, 4]) diff --git a/tests/models/localizers/__init__.py b/tests/models/localizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/localizers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/tests/models/localizers/test_bmn.py b/tests/models/localizers/test_bmn.py new file mode 100644 index 0000000000000000000000000000000000000000..4b3348cb37c0d019838b14c40ba56bbd20c5b1fa --- /dev/null +++ b/tests/models/localizers/test_bmn.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import platform + +import numpy as np +import pytest +import torch +from mmcv.transforms import to_tensor +from mmengine.structures import InstanceData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_localizer_cfg +from mmaction.utils import register_all_modules + +register_all_modules() + + +def get_localization_data_sample(): + gt_bbox = np.array([[0.1, 0.3], [0.375, 0.625]]) + data_sample = ActionDataSample() + instance_data = InstanceData() + instance_data['gt_bbox'] = to_tensor(gt_bbox) + data_sample.gt_instances = instance_data + data_sample.set_metainfo( + dict( + video_name='v_test', + duration_second=100, + duration_frame=960, + feature_frame=960)) + return data_sample + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_bmn_loss(): + model_cfg = get_localizer_cfg( + 'bmn/bmn_2xb8-400x100-9e_activitynet-feature.py') + + if 0 and torch.cuda.is_available(): + raw_feature = [torch.rand(400, 100).cuda()] + data_samples = [get_localization_data_sample()] + localizer_bmn = MODELS.build(model_cfg.model).cuda() + losses = localizer_bmn(raw_feature, data_samples, mode='loss') + assert isinstance(losses, dict) + + else: + raw_feature = [torch.rand(400, 100)] + data_samples = [get_localization_data_sample()] + localizer_bmn = MODELS.build(model_cfg.model) + losses = localizer_bmn(raw_feature, data_samples, mode='loss') + assert isinstance(losses, dict) + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_bmn_predict(): + model_cfg = get_localizer_cfg( + 'bmn/bmn_2xb8-400x100-9e_activitynet-feature.py') + + if 0 and torch.cuda.is_available(): + localizer_bmn = MODELS.build(model_cfg.model).cuda() + data_samples = [get_localization_data_sample()] + + with torch.no_grad(): + one_raw_feature = [torch.rand(400, 100).cuda()] + localizer_bmn(one_raw_feature, data_samples, mode='predict') + else: + localizer_bmn = MODELS.build(model_cfg.model) + data_samples = [get_localization_data_sample()] + with torch.no_grad(): + one_raw_feature = [torch.rand(400, 100)] + localizer_bmn(one_raw_feature, data_samples, mode='predict') + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_bmn_tensor(): + model_cfg = get_localizer_cfg( + 'bmn/bmn_2xb8-400x100-9e_activitynet-feature.py') + + if 0 and torch.cuda.is_available(): + localizer_bmn = MODELS.build(model_cfg.model).cuda() + + with torch.no_grad(): + one_raw_feature = [torch.rand(400, 100).cuda()] + localizer_bmn(one_raw_feature, data_samples=None, mode='tensor') + else: + localizer_bmn = MODELS.build(model_cfg.model) + with torch.no_grad(): + one_raw_feature = [torch.rand(400, 100)] + localizer_bmn(one_raw_feature, data_samples=None, mode='tensor') diff --git a/tests/models/localizers/test_localization_utils.py b/tests/models/localizers/test_localization_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8377916480ea83eec95272ed69853469c35012eb --- /dev/null +++ b/tests/models/localizers/test_localization_utils.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from mmaction.models.localizers.utils import (generate_bsp_feature, + generate_candidate_proposals, + soft_nms, temporal_iop, + temporal_iou) + + +def test_temporal_iou(): + anchors_min = np.array([0.0, 0.5]) + anchors_max = np.array([1.0, 1.5]) + box_min = 0.5 + box_max = 1.0 + + iou = temporal_iou(anchors_min, anchors_max, box_min, box_max) + assert_array_equal(iou, np.array([0.5, 0.5])) + + +def test_temporal_iop(): + anchors_min = np.array([0.0, 0.5]) + anchors_max = np.array([1.0, 1.5]) + box_min = 0.4 + box_max = 1.1 + + ioa = temporal_iop(anchors_min, anchors_max, box_min, box_max) + assert_array_almost_equal(ioa, np.array([0.6, 0.6])) + + +def test_soft_nms(): + proposals = np.array([[0., 1., 1., 1., 0.5, 0.5], + [0., 0.4, 1., 1., 0.4, 0.4], + [0., 0.95, 1., 1., 0.6, 0.6]]) + proposal_list = soft_nms(proposals, 0.75, 0.65, 0.9, 1) + assert_array_equal(proposal_list, [[0., 0.95, 0.6], [0., 0.4, 0.4]]) + + +def test_generate_candidate_proposals(): + video_list = [0, 1] + video_infos = [ + dict( + video_name='v_test1', + duration_second=100, + duration_frame=1000, + annotations=[{ + 'segment': [30.0, 60.0], + 'label': 'Rock climbing' + }], + feature_frame=900), + dict( + video_name='v_test2', + duration_second=100, + duration_frame=1000, + annotations=[{ + 'segment': [6.0, 8.0], + 'label': 'Drinking beer' + }], + feature_frame=900) + ] + tem_results_dir = osp.normpath( + osp.join(osp.dirname(__file__), '../../data/tem_results')) + # test when tem_result_ext is not valid + with pytest.raises(NotImplementedError): + result_dict = generate_candidate_proposals( + video_list, + video_infos, + tem_results_dir, + 5, + 0.5, + tem_results_ext='unsupport_ext') + # test without result_dict + assert_result1 = np.array([ + [0.1, 0.7, 0.58390868, 0.35708317, 0.20850396, 0.55555556, 0.55555556], + [0.1, 0.5, 0.58390868, 0.32605207, 0.19038463, 0.29411765, 0.41666667], + [0.1, 0.3, 0.58390868, 0.26221931, 0.15311213, 0., 0.], + [0.3, 0.7, 0.30626667, 0.35708317, 0.10936267, 0.83333333, 0.83333333], + [0.3, 0.5, 0.30626667, 0.32605207, 0.09985888, 0.45454545, 0.83333333] + ]) + assert_result2 = np.array( + [[0.1, 0.3, 0.78390867, 0.3622193, 0.28394685, 0., 0.], + [0.1, 0.7, 0.78390867, 0.35708317, 0.27992059, 0., 0.], + [0.1, 0.5, 0.78390867, 0.32605207, 0.25559504, 0., 0.]]) + result_dict = generate_candidate_proposals(video_list, video_infos, + tem_results_dir, 5, 0.5) + + assert_array_almost_equal(result_dict['v_test1'], assert_result1) + assert_array_almost_equal(result_dict['v_test2'], assert_result2) + + # test with result_dict + result_dict = {} + generate_candidate_proposals( + video_list, + video_infos, + tem_results_dir, + 5, + 0.5, + result_dict=result_dict) + + assert_array_almost_equal(result_dict['v_test1'], assert_result1) + assert_array_almost_equal(result_dict['v_test2'], assert_result2) + + +def test_generate_bsp_feature(): + video_list = [0, 1] + video_infos = [ + dict( + video_name='v_test1', + duration_second=100, + duration_frame=1000, + annotations=[{ + 'segment': [30.0, 60.0], + 'label': 'Rock climbing' + }], + feature_frame=900), + dict( + video_name='v_test2', + duration_second=100, + duration_frame=1000, + annotations=[{ + 'segment': [6.0, 8.0], + 'label': 'Drinking beer' + }], + feature_frame=900) + ] + tem_results_dir = osp.normpath( + osp.join(osp.dirname(__file__), '../../data/tem_results')) + pgm_proposals_dir = osp.normpath( + 
osp.join(osp.dirname(__file__), '../../data/proposals')) + + # test when extension is not valid + with pytest.raises(NotImplementedError): + result_dict = generate_bsp_feature( + video_list, + video_infos, + tem_results_dir, + pgm_proposals_dir, + tem_results_ext='unsupport_ext') + + with pytest.raises(NotImplementedError): + result_dict = generate_bsp_feature( + video_list, + video_infos, + tem_results_dir, + pgm_proposals_dir, + pgm_proposal_ext='unsupport_ext') + + # test without result_dict + result_dict = generate_bsp_feature( + video_list, video_infos, tem_results_dir, pgm_proposals_dir, top_k=2) + assert_result1 = np.array( + [[ + 0.02633105, 0.02489364, 0.02345622, 0.0220188, 0.02058138, + 0.01914396, 0.01770654, 0.01626912, 0.01541432, 0.01514214, + 0.01486995, 0.01459776, 0.01432558, 0.01405339, 0.01378121, + 0.01350902, 0.03064331, 0.02941124, 0.02817916, 0.02694709, + 0.02571502, 0.02448295, 0.02325087, 0.0220188, 0.01432558, + 0.01409228, 0.01385897, 0.01362567, 0.01339237, 0.01315907, + 0.01292577, 0.01269246 + ], + [ + 0.01350902, 0.01323684, 0.01296465, 0.01269246, 0.01242028, + 0.01214809, 0.01187591, 0.01160372, 0.01154264, 0.01169266, + 0.01184269, 0.01199271, 0.01214273, 0.01229275, 0.01244278, + 0.0125928, 0.01432558, 0.01409228, 0.01385897, 0.01362567, + 0.01339237, 0.01315907, 0.01292577, 0.01269246, 0.01214273, + 0.01227132, 0.01239991, 0.0125285, 0.0126571, 0.01278569, + 0.01291428, 0.01304287 + ]]) + assert_result2 = np.array( + [[ + 0.04133105, 0.03922697, 0.03712288, 0.0350188, 0.03291471, + 0.03081063, 0.02870654, 0.02660246, 0.02541432, 0.02514214, + 0.02486995, 0.02459776, 0.02432558, 0.02405339, 0.02378121, + 0.02350902, 0.04764331, 0.04583981, 0.04403631, 0.04223281, + 0.0404293, 0.0386258, 0.0368223, 0.0350188, 0.02432558, 0.02409228, + 0.02385897, 0.02362567, 0.02339237, 0.02315907, 0.02292577, + 0.02269246 + ], + [ + 0.02350902, 0.02323684, 0.02296465, 0.02269246, 0.02242028, + 0.02214809, 0.02187591, 0.02160372, 0.02120931, 0.02069266, + 0.02017602, 0.01965937, 0.01914273, 0.01862609, 0.01810944, + 0.0175928, 0.02432558, 0.02409228, 0.02385897, 0.02362567, + 0.02339237, 0.02315907, 0.02292577, 0.02269246, 0.01914273, + 0.01869989, 0.01825706, 0.01781422, 0.01737138, 0.01692854, + 0.0164857, 0.01604287 + ]]) + assert_array_almost_equal(result_dict['v_test1'], assert_result1) + assert_array_almost_equal(result_dict['v_test2'], assert_result2) + + # test with result_dict + result_dict = {} + generate_bsp_feature( + video_list, + video_infos, + tem_results_dir, + pgm_proposals_dir, + top_k=2, + result_dict=result_dict) + assert_array_almost_equal(result_dict['v_test1'], assert_result1) + assert_array_almost_equal(result_dict['v_test2'], assert_result2) diff --git a/tests/models/localizers/test_localizers.py b/tests/models/localizers/test_localizers.py new file mode 100644 index 0000000000000000000000000000000000000000..a73f886cc7d7288497713ef3aabe7692cffaed84 --- /dev/null +++ b/tests/models/localizers/test_localizers.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np + +from mmaction.models.localizers.utils import post_processing + + +def test_post_processing(): + # test with multiple results + result = np.array([[0., 1., 1., 1., 0.5, 0.5], [0., 0.4, 1., 1., 0.4, 0.4], + [0., 0.95, 1., 1., 0.6, 0.6]]) + video_info = dict( + video_name='v_test', + duration_second=100, + duration_frame=960, + feature_frame=960) + proposal_list = post_processing(result, video_info, 0.75, 0.65, 0.9, 2, 16) + assert isinstance(proposal_list[0], dict) + assert proposal_list[0]['score'] == 0.6 + assert proposal_list[0]['segment'] == [0., 95.0] + assert isinstance(proposal_list[1], dict) + assert proposal_list[1]['score'] == 0.4 + assert proposal_list[1]['segment'] == [0., 40.0] + + # test with only result + result = np.array([[0., 1., 1., 1., 0.5, 0.5]]) + video_info = dict( + video_name='v_test', + duration_second=100, + duration_frame=960, + feature_frame=960) + proposal_list = post_processing(result, video_info, 0.75, 0.65, 0.9, 1, 16) + assert isinstance(proposal_list[0], dict) + assert proposal_list[0]['score'] == 0.5 + assert proposal_list[0]['segment'] == [0., 100.0] diff --git a/tests/models/localizers/test_pem.py b/tests/models/localizers/test_pem.py new file mode 100644 index 0000000000000000000000000000000000000000..82c8d1d7a6864c25dfbfcfa19dac0c3b5ac3e51e --- /dev/null +++ b/tests/models/localizers/test_pem.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) OpenMMLab. All rights reserved. +import platform + +import pytest +import torch +from mmengine.structures import InstanceData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_localizer_cfg +from mmaction.utils import register_all_modules + +register_all_modules() + + +def get_localization_data_sample(): + bsp_feature = torch.rand(100, 32) + reference_temporal_iou = torch.rand(100) + data_sample = ActionDataSample() + instance_data = InstanceData() + instance_data['bsp_feature'] = bsp_feature + instance_data['reference_temporal_iou'] = reference_temporal_iou + data_sample.gt_instances = instance_data + return data_sample + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_pem(): + model_cfg = get_localizer_cfg( + 'bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py') + + localizer_pem = MODELS.build(model_cfg.model) + raw_features = [torch.rand(100, 32)] * 8 + data_samples = [get_localization_data_sample()] * 8 + losses = localizer_pem(raw_features, data_samples, mode='loss') + assert isinstance(losses, dict) + + # Test forward predict + tmin = torch.rand(100) + tmax = torch.rand(100) + tmin_score = torch.rand(100) + tmax_score = torch.rand(100) + + video_meta = dict( + video_name='v_test', + duration_second=100, + duration_frame=1000, + annotations=[{ + 'segment': [0.3, 0.6], + 'label': 'Rock climbing' + }], + feature_frame=900) + + with torch.no_grad(): + raw_feature = [torch.rand(100, 32)] + data_sample = get_localization_data_sample() + data_sample.set_metainfo(video_meta) + gt_instances = data_sample.gt_instances + gt_instances['tmin'] = tmin + gt_instances['tmax'] = tmax + gt_instances['tmin_score'] = tmin_score + gt_instances['tmax_score'] = tmax_score + data_samples = [data_sample] + + localizer_pem(raw_feature, data_samples, mode='predict') + + # Test forward tensor + with torch.no_grad(): + raw_feature = [torch.rand(100, 32)] + localizer_pem(raw_feature, data_samples=None, mode='tensor') diff --git 
a/tests/models/localizers/test_tem.py b/tests/models/localizers/test_tem.py new file mode 100644 index 0000000000000000000000000000000000000000..720e63f46e76ce150539ac8b0977f8f2b9c6b6b3 --- /dev/null +++ b/tests/models/localizers/test_tem.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import platform + +import numpy as np +import pytest +import torch +from mmcv.transforms import to_tensor +from mmengine.structures import InstanceData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_localizer_cfg +from mmaction.utils import register_all_modules + +register_all_modules() + + +def get_localization_data_sample(): + gt_bbox = np.array([[0.1, 0.3], [0.375, 0.625]]) + data_sample = ActionDataSample() + instance_data = InstanceData() + instance_data['gt_bbox'] = to_tensor(gt_bbox) + data_sample.gt_instances = instance_data + data_sample.set_metainfo( + dict( + video_name='v_test', + duration_second=100, + duration_frame=960, + feature_frame=960)) + return data_sample + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_tem(): + model_cfg = get_localizer_cfg( + 'bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py') + + localizer_tem = MODELS.build(model_cfg.model) + raw_feature = torch.rand(8, 400, 100) + # gt_bbox = torch.Tensor([[[1.0, 3.0], [3.0, 5.0]]] * 8) + data_samples = [get_localization_data_sample()] * 8 + losses = localizer_tem(raw_feature, data_samples, mode='loss') + assert isinstance(losses, dict) + + # Test forward predict + with torch.no_grad(): + for one_raw_feature in raw_feature: + one_raw_feature = one_raw_feature.reshape(1, 400, 100) + data_samples = [get_localization_data_sample()] + localizer_tem(one_raw_feature, data_samples, mode='predict') diff --git a/tests/models/losses/test_binary_logistic_regression_loss.py b/tests/models/losses/test_binary_logistic_regression_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..2f0e835bfdffada1a6afab9805a04f4e52914e0f --- /dev/null +++ b/tests/models/losses/test_binary_logistic_regression_loss.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from numpy.testing import assert_array_almost_equal + +from mmaction.models import BinaryLogisticRegressionLoss, BMNLoss + + +def test_binary_logistic_regression_loss(): + bmn_loss = BMNLoss() + + # test tem_loss + pred_start = torch.tensor([0.9, 0.1]) + pred_end = torch.tensor([0.1, 0.9]) + gt_start = torch.tensor([1., 0.]) + gt_end = torch.tensor([0., 1.]) + output_tem_loss = bmn_loss.tem_loss(pred_start, pred_end, gt_start, gt_end) + binary_logistic_regression_loss = BinaryLogisticRegressionLoss() + assert_loss = ( + binary_logistic_regression_loss(pred_start, gt_start) + + binary_logistic_regression_loss(pred_end, gt_end)) + assert_array_almost_equal( + output_tem_loss.numpy(), assert_loss.numpy(), decimal=4) diff --git a/tests/models/losses/test_bmn_loss.py b/tests/models/losses/test_bmn_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..dcaee159f9adf72002eea6a37146d8ea677ed7a6 --- /dev/null +++ b/tests/models/losses/test_bmn_loss.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch +from numpy.testing import assert_array_almost_equal + +from mmaction.models import BMNLoss + + +def test_bmn_loss(): + bmn_loss = BMNLoss() + + # test tem_loss + pred_start = torch.tensor([0.9, 0.1]) + pred_end = torch.tensor([0.1, 0.9]) + gt_start = torch.tensor([1., 0.]) + gt_end = torch.tensor([0., 1.]) + output_tem_loss = bmn_loss.tem_loss(pred_start, pred_end, gt_start, gt_end) + + # test pem_reg_loss + seed = 1 + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + pred_bm_reg = torch.tensor([[0.1, 0.99], [0.5, 0.4]]) + gt_iou_map = torch.tensor([[0, 1.], [0, 1.]]) + mask = torch.tensor([[0.1, 0.4], [0.4, 0.1]]) + output_pem_reg_loss = bmn_loss.pem_reg_loss(pred_bm_reg, gt_iou_map, mask) + assert_array_almost_equal( + output_pem_reg_loss.numpy(), np.array([0.2140]), decimal=4) + + # test pem_cls_loss + pred_bm_cls = torch.tensor([[0.1, 0.99], [0.95, 0.2]]) + gt_iou_map = torch.tensor([[0., 1.], [0., 1.]]) + mask = torch.tensor([[0.1, 0.4], [0.4, 0.1]]) + output_pem_cls_loss = bmn_loss.pem_cls_loss(pred_bm_cls, gt_iou_map, mask) + assert_array_almost_equal( + output_pem_cls_loss.numpy(), np.array([1.6137]), decimal=4) + + # test bmn_loss + pred_bm = torch.tensor([[[[0.1, 0.99], [0.5, 0.4]], + [[0.1, 0.99], [0.95, 0.2]]]]) + pred_start = torch.tensor([[0.9, 0.1]]) + pred_end = torch.tensor([[0.1, 0.9]]) + gt_iou_map = torch.tensor([[[0., 2.5], [0., 10.]]]) + gt_start = torch.tensor([[1., 0.]]) + gt_end = torch.tensor([[0., 1.]]) + mask = torch.tensor([[0.1, 0.4], [0.4, 0.1]]) + output_loss = bmn_loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end, mask) + assert_array_almost_equal( + output_loss[0].numpy(), + output_tem_loss + 10 * output_pem_reg_loss + output_pem_cls_loss) + assert_array_almost_equal(output_loss[1].numpy(), output_tem_loss) + assert_array_almost_equal(output_loss[2].numpy(), output_pem_reg_loss) + assert_array_almost_equal(output_loss[3].numpy(), output_pem_cls_loss) diff --git a/tests/models/losses/test_cross_entropy_loss.py b/tests/models/losses/test_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..302a7c25852d92adbc4312940ef734da2d68d6b6 --- /dev/null +++ b/tests/models/losses/test_cross_entropy_loss.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn.functional as F +from numpy.testing import assert_almost_equal + +from mmaction.models import BCELossWithLogits, CrossEntropyLoss + + +def test_bce_loss_with_logits(): + cls_scores = torch.rand((3, 4)) + gt_labels = torch.rand((3, 4)) + + bce_loss_with_logits = BCELossWithLogits() + output_loss = bce_loss_with_logits(cls_scores, gt_labels) + assert torch.equal( + output_loss, F.binary_cross_entropy_with_logits(cls_scores, gt_labels)) + + weight = torch.rand(4) + class_weight = weight.numpy().tolist() + bce_loss_with_logits = BCELossWithLogits(class_weight=class_weight) + output_loss = bce_loss_with_logits(cls_scores, gt_labels) + assert torch.equal( + output_loss, + F.binary_cross_entropy_with_logits( + cls_scores, gt_labels, weight=weight)) + + +def test_cross_entropy_loss(): + cls_scores = torch.rand((3, 4)) + hard_gt_labels = torch.LongTensor([0, 1, 2]).squeeze() + soft_gt_labels = torch.FloatTensor([[1, 0, 0, 0], [0, 1, 0, 0], + [0, 0, 1, 0]]).squeeze() + + # hard label without weight + cross_entropy_loss = CrossEntropyLoss() + output_loss = cross_entropy_loss(cls_scores, hard_gt_labels) + assert torch.equal(output_loss, F.cross_entropy(cls_scores, + hard_gt_labels)) + + # hard label with class weight + weight = torch.rand(4) + class_weight = weight.numpy().tolist() + cross_entropy_loss = CrossEntropyLoss(class_weight=class_weight) + output_loss = cross_entropy_loss(cls_scores, hard_gt_labels) + assert torch.equal( + output_loss, + F.cross_entropy(cls_scores, hard_gt_labels, weight=weight)) + + # soft label without class weight + cross_entropy_loss = CrossEntropyLoss() + output_loss = cross_entropy_loss(cls_scores, soft_gt_labels) + assert_almost_equal( + output_loss.numpy(), + F.cross_entropy(cls_scores, hard_gt_labels).numpy(), + decimal=4) + + # soft label with class weight + cross_entropy_loss = CrossEntropyLoss(class_weight=class_weight) + output_loss = cross_entropy_loss(cls_scores, soft_gt_labels) + assert_almost_equal( + output_loss.numpy(), + F.cross_entropy(cls_scores, hard_gt_labels, weight=weight).numpy(), + decimal=4) diff --git a/tests/models/losses/test_hvu_loss.py b/tests/models/losses/test_hvu_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..63f577e5c4067184c66a095813fdf0679032fc94 --- /dev/null +++ b/tests/models/losses/test_hvu_loss.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn.functional as F + +from mmaction.models import HVULoss + + +def test_hvu_loss(): + pred = torch.tensor([[-1.0525, -0.7085, 0.1819, -0.8011], + [0.1555, -1.5550, 0.5586, 1.9746]]) + gt = torch.tensor([[1., 0., 0., 0.], [0., 0., 1., 1.]]) + mask = torch.tensor([[1., 1., 0., 0.], [0., 0., 1., 1.]]) + category_mask = torch.tensor([[1., 0.], [0., 1.]]) + categories = ['action', 'scene'] + category_nums = (2, 2) + category_loss_weights = (1, 1) + loss_all_nomask_sum = HVULoss( + categories=categories, + category_nums=category_nums, + category_loss_weights=category_loss_weights, + loss_type='all', + with_mask=False, + reduction='sum') + loss = loss_all_nomask_sum(pred, gt, mask, category_mask) + loss1 = F.binary_cross_entropy_with_logits(pred, gt, reduction='none') + loss1 = torch.sum(loss1, dim=1) + assert torch.eq(loss['loss_cls'], torch.mean(loss1)) + + loss_all_mask = HVULoss( + categories=categories, + category_nums=category_nums, + category_loss_weights=category_loss_weights, + loss_type='all', + with_mask=True) + loss = loss_all_mask(pred, gt, mask, category_mask) + loss1 = F.binary_cross_entropy_with_logits(pred, gt, reduction='none') + loss1 = torch.sum(loss1 * mask, dim=1) / torch.sum(mask, dim=1) + loss1 = torch.mean(loss1) + assert torch.eq(loss['loss_cls'], loss1) + + loss_ind_mask = HVULoss( + categories=categories, + category_nums=category_nums, + category_loss_weights=category_loss_weights, + loss_type='individual', + with_mask=True) + loss = loss_ind_mask(pred, gt, mask, category_mask) + action_loss = F.binary_cross_entropy_with_logits(pred[:1, :2], gt[:1, :2]) + scene_loss = F.binary_cross_entropy_with_logits(pred[1:, 2:], gt[1:, 2:]) + loss1 = (action_loss + scene_loss) / 2 + assert torch.eq(loss['loss_cls'], loss1) + + loss_ind_nomask_sum = HVULoss( + categories=categories, + category_nums=category_nums, + category_loss_weights=category_loss_weights, + loss_type='individual', + with_mask=False, + reduction='sum') + loss = loss_ind_nomask_sum(pred, gt, mask, category_mask) + action_loss = F.binary_cross_entropy_with_logits( + pred[:, :2], gt[:, :2], reduction='none') + action_loss = torch.sum(action_loss, dim=1) + action_loss = torch.mean(action_loss) + + scene_loss = F.binary_cross_entropy_with_logits( + pred[:, 2:], gt[:, 2:], reduction='none') + scene_loss = torch.sum(scene_loss, dim=1) + scene_loss = torch.mean(scene_loss) + + loss1 = (action_loss + scene_loss) / 2 + assert torch.eq(loss['loss_cls'], loss1) diff --git a/tests/models/losses/test_ohem_hinge_loss.py b/tests/models/losses/test_ohem_hinge_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ef6d0bb0d3293f62ae5ff8c8b775caf781a926ef --- /dev/null +++ b/tests/models/losses/test_ohem_hinge_loss.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import pytest +import torch +from numpy.testing import assert_array_almost_equal +from torch.autograd import Variable + +from mmaction.models import OHEMHingeLoss + + +def test_ohem_hinge_loss(): + # test normal case + pred = torch.tensor([[ + 0.5161, 0.5228, 0.7748, 0.0573, 0.1113, 0.8862, 0.1752, 0.9448, 0.0253, + 0.1009, 0.4371, 0.2232, 0.0412, 0.3487, 0.3350, 0.9294, 0.7122, 0.3072, + 0.2942, 0.7679 + ]], + requires_grad=True) + gt = torch.tensor([8]) + num_video = 1 + loss = OHEMHingeLoss.apply(pred, gt, 1, 1.0, num_video) + assert_array_almost_equal( + loss.detach().numpy(), np.array([0.0552]), decimal=4) + loss.backward(Variable(torch.ones([1]))) + assert_array_almost_equal( + np.array(pred.grad), + np.array([[ + 0., 0., 0., 0., 0., 0., 0., -1., 0., 0., 0., 0., 0., 0., 0., 0., + 0., 0., 0., 0. + ]]), + decimal=4) + + # test error case + with pytest.raises(ValueError): + gt = torch.tensor([8, 10]) + loss = OHEMHingeLoss.apply(pred, gt, 1, 1.0, num_video) diff --git a/tests/models/losses/test_ssn_loss.py b/tests/models/losses/test_ssn_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..68b5b670e462b1c50b78f1995815397501fb3c37 --- /dev/null +++ b/tests/models/losses/test_ssn_loss.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F +from mmengine import ConfigDict + +from mmaction.models import OHEMHingeLoss, SSNLoss + + +def test_ssn_loss(): + ssn_loss = SSNLoss() + + # test activity_loss + activity_score = torch.rand((8, 21)) + labels = torch.LongTensor([8] * 8).squeeze() + activity_indexer = torch.tensor([0, 7]) + output_activity_loss = ssn_loss.activity_loss(activity_score, labels, + activity_indexer) + assert torch.equal( + output_activity_loss, + F.cross_entropy(activity_score[activity_indexer, :], + labels[activity_indexer])) + + # test completeness_loss + completeness_score = torch.rand((8, 20), requires_grad=True) + labels = torch.LongTensor([8] * 8).squeeze() + completeness_indexer = torch.tensor([0, 1, 2, 3, 4, 5, 6]) + positive_per_video = 1 + incomplete_per_video = 6 + output_completeness_loss = ssn_loss.completeness_loss( + completeness_score, labels, completeness_indexer, positive_per_video, + incomplete_per_video) + + pred = completeness_score[completeness_indexer, :] + gt = labels[completeness_indexer] + pred_dim = pred.size(1) + pred = pred.view(-1, positive_per_video + incomplete_per_video, pred_dim) + gt = gt.view(-1, positive_per_video + incomplete_per_video) + # yapf:disable + positive_pred = pred[:, :positive_per_video, :].contiguous().view(-1, pred_dim) # noqa:E501 + incomplete_pred = pred[:, positive_per_video:, :].contiguous().view(-1, pred_dim) # noqa:E501 + # yapf:enable + ohem_ratio = 0.17 + positive_loss = OHEMHingeLoss.apply( + positive_pred, gt[:, :positive_per_video].contiguous().view(-1), 1, + 1.0, positive_per_video) + incomplete_loss = OHEMHingeLoss.apply( + incomplete_pred, gt[:, positive_per_video:].contiguous().view(-1), -1, + ohem_ratio, incomplete_per_video) + num_positives = positive_pred.size(0) + num_incompletes = int(incomplete_pred.size(0) * ohem_ratio) + assert_loss = ((positive_loss + incomplete_loss) / + float(num_positives + num_incompletes)) + assert torch.equal(output_completeness_loss, assert_loss) + + # test reg_loss + bbox_pred = torch.rand((8, 20, 2)) + labels = torch.LongTensor([8] * 8).squeeze() + bbox_targets = torch.rand((8, 2)) + regression_indexer = torch.tensor([0]) + output_reg_loss = 
ssn_loss.classwise_regression_loss( + bbox_pred, labels, bbox_targets, regression_indexer) + + pred = bbox_pred[regression_indexer, :, :] + gt = labels[regression_indexer] + reg_target = bbox_targets[regression_indexer, :] + class_idx = gt.data - 1 + classwise_pred = pred[:, class_idx, :] + classwise_reg_pred = torch.cat((torch.diag(classwise_pred[:, :, 0]).view( + -1, 1), torch.diag(classwise_pred[:, :, 1]).view(-1, 1)), + dim=1) + assert torch.equal( + output_reg_loss, + F.smooth_l1_loss(classwise_reg_pred.view(-1), reg_target.view(-1)) * 2) + + # test ssn_loss + proposal_type = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 2]]) + train_cfg = ConfigDict( + dict( + ssn=dict( + sampler=dict( + num_per_video=8, + positive_ratio=1, + background_ratio=1, + incomplete_ratio=6, + add_gt_as_proposals=True), + loss_weight=dict(comp_loss_weight=0.1, reg_loss_weight=0.1)))) + output_loss = ssn_loss(activity_score, completeness_score, bbox_pred, + proposal_type, labels, bbox_targets, train_cfg) + assert torch.equal(output_loss['loss_activity'], output_activity_loss) + assert torch.equal(output_loss['loss_completeness'], + output_completeness_loss * 0.1) + assert torch.equal(output_loss['loss_reg'], output_reg_loss * 0.1) diff --git a/tests/models/necks/__init__.py b/tests/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/necks/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/tests/models/necks/test_tpn.py b/tests/models/necks/test_tpn.py new file mode 100644 index 0000000000000000000000000000000000000000..de63d1dab33b7f92f7a90098e63993f20af54385 --- /dev/null +++ b/tests/models/necks/test_tpn.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy + +import pytest +import torch + +from mmaction.models import TPN +from mmaction.structures import ActionDataSample +from mmaction.testing import generate_backbone_demo_inputs + + +def get_label(label_): + label = [] + for idx, one_label in enumerate(label_): + data_sample = ActionDataSample() + data_sample.set_gt_label(label_[idx]) + label.append(data_sample) + return label + + +def test_tpn(): + """Test TPN backbone.""" + + tpn_cfg = dict( + in_channels=(1024, 2048), + out_channels=1024, + spatial_modulation_cfg=dict( + in_channels=(1024, 2048), out_channels=2048), + temporal_modulation_cfg=dict(downsample_scales=(8, 8)), + upsample_cfg=dict(scale_factor=(1, 1, 1)), + downsample_cfg=dict(downsample_scale=(1, 1, 1)), + level_fusion_cfg=dict( + in_channels=(1024, 1024), + mid_channels=(1024, 1024), + out_channels=2048, + downsample_scales=((1, 1, 1), (1, 1, 1))), + aux_head_cfg=dict(out_channels=400, loss_weight=0.5)) + + with pytest.raises(AssertionError): + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_cfg_['in_channels'] = list(tpn_cfg_['in_channels']) + TPN(**tpn_cfg_) + + with pytest.raises(AssertionError): + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_cfg_['out_channels'] = float(tpn_cfg_['out_channels']) + TPN(**tpn_cfg_) + + with pytest.raises(AssertionError): + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_cfg_['downsample_cfg']['downsample_position'] = 'unsupport' + TPN(**tpn_cfg_) + + for k in tpn_cfg: + if not k.endswith('_cfg'): + continue + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_cfg_[k] = list() + with pytest.raises(AssertionError): + TPN(**tpn_cfg_) + + with pytest.raises(ValueError): + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_cfg_['flow_type'] = 'unsupport' + TPN(**tpn_cfg_) + + target_shape = (32, 1) + target_ = generate_backbone_demo_inputs(target_shape).long().squeeze() + + x0_shape = (32, 1024, 1, 4, 4) + x1_shape = (32, 2048, 1, 2, 2) + x0 = generate_backbone_demo_inputs(x0_shape) + x1 = generate_backbone_demo_inputs(x1_shape) + x = [x0, x1] + + # ResNetTPN with 'cascade' flow_type + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_cascade = TPN(**tpn_cfg_) + target = get_label(target_) + feat, loss_aux = tpn_cascade(x, target) + assert feat.shape == torch.Size([32, 2048, 1, 2, 2]) + assert len(loss_aux) == 1 + + # ResNetTPN with 'parallel' flow_type + tpn_cfg_ = copy.deepcopy(tpn_cfg) + tpn_parallel = TPN(flow_type='parallel', **tpn_cfg_) + target = get_label(target_) + feat, loss_aux = tpn_parallel(x, target) + assert feat.shape == torch.Size([32, 2048, 1, 2, 2]) + assert len(loss_aux) == 1 + + # ResNetTPN with 'cascade' flow_type and target is None + feat, loss_aux = tpn_cascade(x, None) + assert feat.shape == torch.Size([32, 2048, 1, 2, 2]) + assert len(loss_aux) == 0 + + # ResNetTPN with 'parallel' flow_type and target is None + feat, loss_aux = tpn_parallel(x, None) + assert feat.shape == torch.Size([32, 2048, 1, 2, 2]) + assert len(loss_aux) == 0 diff --git a/tests/models/recognizers/__init__.py b/tests/models/recognizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/recognizers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
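The recognizer tests that follow exercise models through the same registry-driven flow already used by the localizer tests above: build the model from a config with `MODELS.build`, wrap raw tensors and `ActionDataSample` labels into a data batch, and call `train_step` / `test_step`. A minimal standalone sketch of that flow is shown below; the config path and input shape are borrowed from the TSN test in this patch, and the snippet is an illustration of the pattern only, not an addition to the test suite.

```python
# Sketch of the build-and-run pattern shared by the recognizer tests below.
# The config path and shapes mirror the TSN test in this patch; they are
# illustrative only.
import torch

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_recognizer_cfg
from mmaction.utils import register_all_modules

register_all_modules()

cfg = get_recognizer_cfg(
    'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
cfg.model['backbone']['pretrained'] = None  # skip downloading weights
recognizer = MODELS.build(cfg.model)

# one sample: 3 segments of 3x32x32 frames, ground-truth label 2
data_batch = {
    'inputs': [torch.randint(0, 256, (3, 3, 32, 32))],
    'data_samples': [ActionDataSample().set_gt_label(2)],
}

with torch.no_grad():
    predictions = recognizer.test_step(data_batch)

num_classes = cfg.model.cls_head.num_classes
assert predictions[0].pred_score.shape == torch.Size([num_classes])
```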
diff --git a/tests/models/recognizers/recognizer_omni.py b/tests/models/recognizers/recognizer_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..60ac6311a68ce7d0f2f226352edcb437a64f6516 --- /dev/null +++ b/tests/models/recognizers/recognizer_omni.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest.mock import MagicMock + +import torch + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_recognizer_cfg +from mmaction.utils import register_all_modules + + +def test_omni_resnet(): + register_all_modules() + config = get_recognizer_cfg( + 'omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py') + recognizer = MODELS.build(config.model) + + # test train_step + + video_sample = { + 'inputs': [ + torch.randint(0, 255, (1, 3, 8, 224, 224)), + torch.randint(0, 255, (1, 3, 8, 224, 224)) + ], + 'data_samples': [ + ActionDataSample().set_gt_label(2), + ActionDataSample().set_gt_label(2) + ] + } + + image_sample = { + 'inputs': [ + torch.randint(0, 255, (1, 3, 224, 224)), + torch.randint(0, 255, (1, 3, 224, 224)) + ], + 'data_samples': [ + ActionDataSample().set_gt_label(2), + ActionDataSample().set_gt_label(2) + ] + } + + optim_wrapper = MagicMock() + loss_vars = recognizer.train_step([video_sample, image_sample], + optim_wrapper) + assert 'loss_cls_0' in loss_vars + assert 'loss_cls_1' in loss_vars + + loss_vars = recognizer.train_step([image_sample, video_sample], + optim_wrapper) + assert 'loss_cls_0' in loss_vars + assert 'loss_cls_1' in loss_vars + + # test test_step + with torch.no_grad(): + predictions = recognizer.test_step(video_sample) + score = predictions[0].pred_score + assert len(predictions) == 2 + assert torch.min(score) >= 0 + assert torch.max(score) <= 1 diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py new file mode 100644 index 0000000000000000000000000000000000000000..675874ec320337aa20ec3c589a80fee51415a674 --- /dev/null +++ b/tests/models/recognizers/test_recognizer2d.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import platform +from unittest.mock import MagicMock + +import pytest +import torch +from mmengine.utils import digit_version + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_recognizer_cfg +from mmaction.utils import register_all_modules + + +def train_test_step(cfg, input_shape): + recognizer = MODELS.build(cfg.model) + num_classes = cfg.model.cls_head.num_classes + batch_size = input_shape[0] + input_shape = input_shape[1:] + data_batch = { + 'inputs': + [torch.randint(0, 256, input_shape) for i in range(batch_size)], + 'data_samples': + [ActionDataSample().set_gt_label(2) for i in range(batch_size)] + } + + # test train_step + optim_wrapper = MagicMock() + loss_vars = recognizer.train_step(data_batch, optim_wrapper) + assert 'loss' in loss_vars + assert 'loss_cls' in loss_vars + optim_wrapper.update_params.assert_called_once() + + # test test_step + with torch.no_grad(): + predictions = recognizer.test_step(data_batch) + score = predictions[0].pred_score + assert len(predictions) == batch_size + assert score.shape == torch.Size([num_classes]) + assert torch.min(score) >= 0 + assert torch.max(score) <= 1 + + # test twice sample + 3 crops + num_views = input_shape[0] * 2 * 3 + input_shape = (num_views, *input_shape[1:]) + data_batch['inputs'] = [torch.randint(0, 256, input_shape)] + with torch.no_grad(): + predictions = recognizer.test_step(data_batch) + score = predictions[0].pred_score + assert len(predictions) == batch_size + assert score.shape == torch.Size([num_classes]) + + return loss_vars, predictions + + +def test_tsn(): + register_all_modules() + config = get_recognizer_cfg( + 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + + +def test_tsn_mmcls_backbone(): + register_all_modules() + config = get_recognizer_cfg( + 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + # test mmcls backbone + mmcls_backbone = dict( + type='mmcls.ResNeXt', + depth=101, + num_stages=4, + out_indices=(3, ), + groups=32, + width_per_group=4, + style='pytorch') + config.model['backbone'] = mmcls_backbone + + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + + from mmcls.models import ResNeXt + mmcls_backbone['type'] = ResNeXt + config.model['backbone'] = mmcls_backbone + + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + + +def test_tsn_mobileone(): + register_all_modules() + config = get_recognizer_cfg( + 'tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['init_cfg'] = None + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + + +def test_tsn_timm_backbone(): + # test tsn from timm + register_all_modules() + config = get_recognizer_cfg( + 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py') + timm_backbone = dict(type='timm.efficientnet_b0', pretrained=False) + config.model['backbone'] = timm_backbone + config.model['cls_head']['in_channels'] = 1280 + + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + import timm + if digit_version(timm.__version__) <= digit_version('0.6.7'): + feature_shape = 'NLC' + else: + feature_shape = 'NHWC' + + timm_swin = dict( + type='timm.swin_base_patch4_window7_224', + pretrained=False, + 
feature_shape=feature_shape) + config.model['backbone'] = timm_swin + config.model['cls_head']['in_channels'] = 1024 + + input_shape = (1, 3, 3, 224, 224) + train_test_step(config, input_shape) + + +def test_tsn_tv_backbone(): + register_all_modules() + config = get_recognizer_cfg( + 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + # test tv backbone + tv_backbone = dict(type='torchvision.densenet161', pretrained=True) + config.model['backbone'] = tv_backbone + config.model['cls_head']['in_channels'] = 2208 + + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + + from torchvision.models import densenet161 + tv_backbone = dict(type=densenet161, pretrained=True) + config.model['backbone'] = tv_backbone + config.model['cls_head']['in_channels'] = 2208 + + input_shape = (1, 3, 3, 32, 32) + train_test_step(config, input_shape) + + +def test_tsm(): + register_all_modules() + # test tsm-mobilenetv2 + config = get_recognizer_cfg( + 'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['pretrained'] = None + config.model['backbone']['pretrained2d'] = None + + input_shape = (1, 8, 3, 32, 32) + train_test_step(config, input_shape) + + # test tsm-res50 + config = get_recognizer_cfg( + 'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + config.model['backbone']['pretrained2d'] = None + + input_shape = (1, 8, 3, 32, 32) + train_test_step(config, input_shape) + + # test tsm-mobileone + config = get_recognizer_cfg( + 'tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['init_cfg'] = None + config.model['backbone']['pretrained2d'] = None + + input_shape = (1, 16, 3, 32, 32) + train_test_step(config, input_shape) + + +def test_trn(): + register_all_modules() + config = get_recognizer_cfg( + 'trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py') + config.model['backbone']['pretrained'] = None + + input_shape = (1, 8, 3, 32, 32) + train_test_step(config, input_shape) + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_tpn(): + register_all_modules() + config = get_recognizer_cfg( + 'tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py') + config.model['backbone']['pretrained'] = None + + input_shape = (1, 8, 3, 64, 64) + train_test_step(config, input_shape) + + +def test_tanet(): + register_all_modules() + config = get_recognizer_cfg('tanet/tanet_imagenet-pretrained-r50_8xb8-' + 'dense-1x1x8-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + + input_shape = (1, 8, 3, 32, 32) + train_test_step(config, input_shape) diff --git a/tests/models/recognizers/test_recognizer3d.py b/tests/models/recognizers/test_recognizer3d.py new file mode 100644 index 0000000000000000000000000000000000000000..657e1038504af809c57943bf82f8b6acd6ab0929 --- /dev/null +++ b/tests/models/recognizers/test_recognizer3d.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest.mock import MagicMock + +import torch + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_recognizer_cfg +from mmaction.utils import register_all_modules + + +def train_test_step(cfg, input_shape): + recognizer = MODELS.build(cfg.model) + num_classes = cfg.model.cls_head.num_classes + data_batch = { + 'inputs': [torch.randint(0, 256, input_shape)], + 'data_samples': [ActionDataSample().set_gt_label(2)] + } + + # test train_step + optim_wrapper = MagicMock() + loss_vars = recognizer.train_step(data_batch, optim_wrapper) + assert 'loss' in loss_vars + assert 'loss_cls' in loss_vars + optim_wrapper.update_params.assert_called_once() + + # test test_step + with torch.no_grad(): + predictions = recognizer.test_step(data_batch) + score = predictions[0].pred_score + assert len(predictions) == 1 + assert score.shape == torch.Size([num_classes]) + assert torch.min(score) >= 0 + assert torch.max(score) <= 1 + + # test when average_clips is None + recognizer.cls_head.average_clips = None + num_views = 3 + input_shape = (num_views, *input_shape[1:]) + data_batch['inputs'] = [torch.randint(0, 256, input_shape)] + with torch.no_grad(): + predictions = recognizer.test_step(data_batch) + score = predictions[0].pred_score + assert len(predictions) == 1 + assert score.shape == torch.Size([num_views, num_classes]) + + return loss_vars, predictions + + +def test_i3d(): + register_all_modules() + config = get_recognizer_cfg( + 'i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + input_shape = (1, 3, 8, 64, 64) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_r2plus1d(): + register_all_modules() + config = get_recognizer_cfg( + 'r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + config.model['backbone']['norm_cfg'] = dict(type='BN3d') + input_shape = (1, 3, 8, 64, 64) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_slowfast(): + register_all_modules() + config = get_recognizer_cfg( + 'slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py') + input_shape = (1, 3, 16, 64, 64) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_csn(): + register_all_modules() + config = get_recognizer_cfg( + 'csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + input_shape = (1, 3, 8, 64, 64) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_timesformer(): + register_all_modules() + config = get_recognizer_cfg( + 'timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + config.model['backbone']['img_size'] = 32 + input_shape = (1, 3, 8, 32, 32) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_c3d(): + register_all_modules() + config = get_recognizer_cfg( + 'c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py') + config.model['backbone']['pretrained'] = None + config.model['backbone']['out_dim'] = 512 + input_shape = (1, 3, 16, 28, 28) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_slowonly(): + register_all_modules() + config = get_recognizer_cfg( + 
'slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + input_shape = (1, 3, 4, 32, 32) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_tpn_slowonly(): + register_all_modules() + config = get_recognizer_cfg('tpn/tpn-slowonly_imagenet-pretrained-r50_' + '8xb8-8x8x1-150e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + input_shape = (1, 3, 4, 48, 48) # M C T H W + loss_vars, _ = train_test_step(config, input_shape=input_shape) + assert 'loss_aux' in loss_vars + assert loss_vars['loss_cls'] + loss_vars['loss_aux'] == loss_vars['loss'] + + +def test_swin(): + register_all_modules() + config = get_recognizer_cfg('swin/swin-tiny-p244-w877_in1k-pre_' + '8xb8-amp-32x2x1-30e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + input_shape = (1, 3, 4, 64, 64) # M C T H W + train_test_step(config, input_shape=input_shape) + + +def test_c2d(): + register_all_modules() + config = get_recognizer_cfg( + 'c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + input_shape = (1, 3, 8, 64, 64) # M C T H W + train_test_step(config, input_shape=input_shape) diff --git a/tests/models/recognizers/test_recognizer_gcn.py b/tests/models/recognizers/test_recognizer_gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..70333a8dbb9f2a4c5d9135b3cda33e63cd00b190 --- /dev/null +++ b/tests/models/recognizers/test_recognizer_gcn.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest.mock import MagicMock + +import torch + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_skeletongcn_cfg +from mmaction.utils import register_all_modules + + +def train_test_step(cfg, input_shape): + recognizer = MODELS.build(cfg.model) + num_classes = cfg.model.cls_head.num_classes + data_batch = { + 'inputs': [torch.randn(input_shape)], + 'data_samples': [ActionDataSample().set_gt_label(2)] + } + + # test train_step + optim_wrapper = MagicMock() + loss_vars = recognizer.train_step(data_batch, optim_wrapper) + assert 'loss' in loss_vars + assert 'loss_cls' in loss_vars + optim_wrapper.update_params.assert_called_once() + + # test test_step + with torch.no_grad(): + predictions = recognizer.test_step(data_batch) + score = predictions[0].pred_score + assert len(predictions) == 1 + assert score.shape == torch.Size([num_classes]) + assert torch.min(score) >= 0 + assert torch.max(score) <= 1 + + # test when average_clips is None + recognizer.cls_head.average_clips = None + num_clips = 3 + input_shape = (num_clips, *input_shape[1:]) + data_batch['inputs'] = [torch.randn(input_shape)] + with torch.no_grad(): + predictions = recognizer.test_step(data_batch) + score = predictions[0].pred_score + assert len(predictions) == 1 + assert score.shape == torch.Size([num_clips, num_classes]) + + return loss_vars, predictions + + +def test_stgcn(): + register_all_modules() + config = get_skeletongcn_cfg( + 'stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py') + input_shape = (1, 2, 30, 17, 3) # N M T V C + train_test_step(config, input_shape=input_shape) + + +def test_agcn(): + register_all_modules() + config = get_skeletongcn_cfg( + '2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py') + 
input_shape = (1, 2, 30, 17, 3) # N M T V C + train_test_step(config, input_shape=input_shape) + + +def test_stgcn_plusplus(): + register_all_modules() + config = get_skeletongcn_cfg( + 'stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py') + input_shape = (1, 2, 30, 17, 3) # N M T V C + train_test_step(config, input_shape=input_shape) diff --git a/tests/models/roi_heads/test_bbox_heads.py b/tests/models/roi_heads/test_bbox_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..1eb6c86a1a62b56790bfecb4caf269849a21b347 --- /dev/null +++ b/tests/models/roi_heads/test_bbox_heads.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import BBoxHeadAVA + + +def test_bbox_head_ava(): + """Test loss method, layer construction, attributes and forward function in + bbox head.""" + with pytest.raises(TypeError): + # topk must be None, int or tuple[int] + BBoxHeadAVA(background_class=True, topk=0.1) + + with pytest.raises(AssertionError): + # topk should be smaller than num_classes + BBoxHeadAVA(background_class=True, num_classes=5, topk=(3, 5)) + + bbox_head = BBoxHeadAVA( + background_class=True, in_channels=10, num_classes=4, topk=1) + input = torch.randn([3, 10, 2, 2, 2]) + ret = bbox_head(input) + assert ret.shape == (3, 4) + + cls_score = torch.tensor( + [[0.568, -0.162, 0.273, -0.390, 0.447, 0.102, -0.409], + [2.388, 0.609, 0.369, 1.630, -0.808, -0.212, 0.296], + [0.252, -0.533, -0.644, -0.591, 0.148, 0.963, -0.525], + [0.134, -0.311, -0.764, -0.752, 0.656, -1.517, 0.185]]) + + # Test topk_to_matrix() + assert torch.equal( + BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 1), + torch.tensor([[0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0]], + dtype=bool)) + assert torch.equal( + BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 2), + torch.tensor([[0, 1, 0, 1, 0, 0], [1, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 0, 1]], + dtype=bool)) + assert torch.equal( + BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 3), + torch.tensor([[0, 1, 0, 1, 1, 0], [1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 1, 1], [1, 0, 0, 1, 0, 1]], + dtype=bool)) + assert torch.equal( + BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 6), + torch.ones([4, 6], dtype=bool)) + + # Test Multi-Label Loss + bbox_head = BBoxHeadAVA( + background_class=True) # Why is this here? isn't this redundant? 
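+    # Only init_weights() is exercised for this head and for the explicit
+    # max/avg pooling variant below; the forward pass was already checked
+    # with ``ret = bbox_head(input)`` above.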
+ bbox_head.init_weights() + bbox_head = BBoxHeadAVA( + background_class=True, + temporal_pool_type='max', + spatial_pool_type='avg') + bbox_head.init_weights() + + # test without background class + """ + losses = bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + rois=None, + labels=labels, + label_weights=label_weights) + assert torch.isclose(losses['loss_action_cls'], torch.tensor(0.7162495)) + assert torch.isclose(losses['recall@thr=0.5'], torch.tensor(0.6666666)) + assert torch.isclose(losses['prec@thr=0.5'], torch.tensor(0.4791665)) + assert torch.isclose(losses['recall@top3'], torch.tensor(0.75)) + assert torch.isclose(losses['prec@top3'], torch.tensor(0.5)) + assert torch.isclose(losses['recall@top5'], torch.tensor(1.0)) + assert torch.isclose(losses['prec@top5'], torch.tensor(0.45)) + + # Test Single-Label Loss + bbox_head = BBoxHeadAVA(multilabel=False) + losses = bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + rois=None, + labels=labels, + label_weights=label_weights) + assert torch.isclose(losses['loss_action_cls'], torch.tensor(1.639561)) + assert torch.isclose(losses['recall@thr=0.5'], torch.tensor(0.25)) + assert torch.isclose(losses['prec@thr=0.5'], torch.tensor(0.25)) + assert torch.isclose(losses['recall@top3'], torch.tensor(0.75)) + assert torch.isclose(losses['prec@top3'], torch.tensor(0.5)) + assert torch.isclose(losses['recall@top5'], torch.tensor(1.0)) + assert torch.isclose(losses['prec@top5'], torch.tensor(0.45)) + + # Test ROI + rois = torch.tensor([[0.0, 0.1, 0.2, 0.3, 0.4], [0.0, 0.5, 0.6, 0.7, 0.8]]) + rois[1::2] *= 380 + rois[2::2] *= 220 + crop_quadruple = np.array([0.1, 0.2, 0.8, 0.7]) + cls_score = torch.tensor([0.995, 0.728]) + img_shape = (320, 480) + flip = True + + bbox_head = BBoxHeadAVA(multilabel=True) + bboxes, scores = bbox_head.get_det_bboxes( + rois=rois, + cls_score=cls_score, + img_shape=img_shape, + flip=flip, + crop_quadruple=crop_quadruple) + assert torch.all( + torch.isclose( + bboxes, + torch.tensor([[0.89783341, 0.20043750, 0.89816672, 0.20087500], + [0.45499998, 0.69875002, 0.58166665, 0.86499995]]))) + assert torch.all( + torch.isclose(scores, torch.tensor([0.73007441, 0.67436624]))) + + bbox_head = BBoxHeadAVA(multilabel=False) + bboxes, scores = bbox_head.get_det_bboxes( + rois=rois, + cls_score=cls_score, + img_shape=img_shape, + flip=flip, + crop_quadruple=crop_quadruple) + assert torch.all( + torch.isclose( + bboxes, + torch.tensor([[0.89783341, 0.20043750, 0.89816672, 0.20087500], + [0.45499998, 0.69875002, 0.58166665, 0.86499995]]))) + assert torch.all(torch.isclose(scores, torch.tensor([0.56636, 0.43364]))) + """ diff --git a/tests/models/roi_heads/test_fbo_head.py b/tests/models/roi_heads/test_fbo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..725dfe56fae4c4993bfbba116f85b11e73d95121 --- /dev/null +++ b/tests/models/roi_heads/test_fbo_head.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
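+# FBOHead combines a short-term RoI feature with a long-term feature bank
+# (LFB) read from the small fixture in tests/data/lfb.  Only output shapes
+# are checked: the 'non_local' operator appears to append an 8-channel latent
+# response to the 16-channel input (hence (1, 24, 1, 1, 1)), while 'avg' and
+# 'max' append another 16 channels (hence (1, 32, 1, 1, 1)).
+# Run with:  pytest tests/models/roi_heads/test_fbo_head.py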
+import os.path as osp
+
+import torch
+
+from mmaction.models import FBOHead
+
+
+def test_fbo_head():
+    """Test layer construction, attributes and forward function in fbo head."""
+    lfb_prefix_path = osp.normpath(
+        osp.join(osp.dirname(__file__), '../../data/lfb'))
+
+    st_feat_shape = (1, 16, 1, 8, 8)
+    st_feat = torch.rand(st_feat_shape)
+    rois = torch.randn(1, 5)
+    rois[0][0] = 0
+    img_metas = [dict(img_key='video_1, 930')]
+
+    # non local fbo
+    fbo_head = FBOHead(
+        lfb_cfg=dict(
+            lfb_prefix_path=lfb_prefix_path,
+            max_num_sampled_feat=5,
+            window_size=60,
+            lfb_channels=16,
+            dataset_modes=('unittest'),
+            device='cpu'),
+        fbo_cfg=dict(
+            type='non_local',
+            st_feat_channels=16,
+            lt_feat_channels=16,
+            latent_channels=8,
+            num_st_feat=1,
+            num_lt_feat=5 * 60,
+        ))
+    fbo_head.init_weights()
+    out = fbo_head(st_feat, rois, img_metas)
+    assert out.shape == (1, 24, 1, 1, 1)
+
+    # avg fbo
+    fbo_head = FBOHead(
+        lfb_cfg=dict(
+            lfb_prefix_path=lfb_prefix_path,
+            max_num_sampled_feat=5,
+            window_size=60,
+            lfb_channels=16,
+            dataset_modes=('unittest'),
+            device='cpu'),
+        fbo_cfg=dict(type='avg'))
+    fbo_head.init_weights()
+    out = fbo_head(st_feat, rois, img_metas)
+    assert out.shape == (1, 32, 1, 1, 1)
+
+    # max fbo
+    fbo_head = FBOHead(
+        lfb_cfg=dict(
+            lfb_prefix_path=lfb_prefix_path,
+            max_num_sampled_feat=5,
+            window_size=60,
+            lfb_channels=16,
+            dataset_modes=('unittest'),
+            device='cpu'),
+        fbo_cfg=dict(type='max'))
+    fbo_head.init_weights()
+    out = fbo_head(st_feat, rois, img_metas)
+    assert out.shape == (1, 32, 1, 1, 1)
diff --git a/tests/models/roi_heads/test_roi_extractors.py b/tests/models/roi_heads/test_roi_extractors.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4733b7e38e1829d314c44154a674912e4b97f02
--- /dev/null
+++ b/tests/models/roi_heads/test_roi_extractors.py
@@ -0,0 +1,56 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""TODO: Re-enable this test.
+
+import torch
+
+from mmaction.models import SingleRoIExtractor3D
+ +def test_single_roi_extractor3d(): + roi_extractor = SingleRoIExtractor3D( + roi_layer_type='RoIAlign', + featmap_stride=16, + output_size=8, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + with_temporal_pool=True) + feat = torch.randn([4, 64, 8, 16, 16]) + rois = torch.tensor([[0., 1., 1., 6., 6.], [1., 2., 2., 7., 7.], + [3., 2., 2., 9., 9.], [2., 2., 0., 10., 9.]]) + roi_feat, feat = roi_extractor(feat, rois) + assert roi_feat.shape == (4, 64, 1, 8, 8) + assert feat.shape == (4, 64, 1, 16, 16) + + feat = (torch.randn([4, 64, 8, 16, 16]), torch.randn([4, 32, 16, 16, 16])) + roi_feat, feat = roi_extractor(feat, rois) + assert roi_feat.shape == (4, 96, 1, 8, 8) + assert feat.shape == (4, 96, 1, 16, 16) + + feat = torch.randn([4, 64, 8, 16, 16]) + roi_extractor = SingleRoIExtractor3D( + roi_layer_type='RoIAlign', + featmap_stride=16, + output_size=8, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + with_temporal_pool=False) + roi_feat, feat = roi_extractor(feat, rois) + assert roi_feat.shape == (4, 64, 8, 8, 8) + assert feat.shape == (4, 64, 8, 16, 16) + + feat = (torch.randn([4, 64, 8, 16, 16]), torch.randn([4, 32, 16, 16, 16])) + roi_feat, feat = roi_extractor(feat, rois) + assert roi_feat.shape == (4, 96, 16, 8, 8) + assert feat.shape == (4, 96, 16, 16, 16) + + feat = torch.randn([4, 64, 8, 16, 16]) + roi_extractor = SingleRoIExtractor3D( + roi_layer_type='RoIAlign', + featmap_stride=16, + output_size=8, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + with_temporal_pool=True, + with_global=True) + roi_feat, feat = roi_extractor(feat, rois) + assert roi_feat.shape == (4, 128, 1, 8, 8) + assert feat.shape == (4, 64, 1, 16, 16) +""" diff --git a/tests/models/roi_heads/test_shared_heads.py b/tests/models/roi_heads/test_shared_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..76fe05594cb925f084606814787a227c0800d7a1 --- /dev/null +++ b/tests/models/roi_heads/test_shared_heads.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import ACRNHead + + +def test_acrn_head(): + roi_feat = torch.randn(4, 16, 1, 7, 7) + feat = torch.randn(2, 16, 1, 16, 16) + rois = torch.Tensor([[0, 2.2268, 0.5926, 10.6142, 8.0029], + [0, 2.2577, 0.1519, 11.6451, 8.9282], + [1, 1.9874, 1.0000, 11.1585, 8.2840], + [1, 3.3338, 3.7166, 8.4174, 11.2785]]) + + acrn_head = ACRNHead(32, 16) + acrn_head.init_weights() + new_feat = acrn_head(roi_feat, feat, rois) + assert new_feat.shape == (4, 16, 1, 16, 16) + + acrn_head = ACRNHead(32, 16, stride=2) + new_feat = acrn_head(roi_feat, feat, rois) + assert new_feat.shape == (4, 16, 1, 8, 8) + + acrn_head = ACRNHead(32, 16, stride=2, num_convs=2) + new_feat = acrn_head(roi_feat, feat, rois) + assert new_feat.shape == (4, 16, 1, 8, 8) diff --git a/tests/models/similarity/test_adapters.py b/tests/models/similarity/test_adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..dfa58c5efcbf13021cde7a85c1482b972c664712 --- /dev/null +++ b/tests/models/similarity/test_adapters.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
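+# Adapters pool per-segment features into a single video-level embedding for
+# the similarity models (e.g. CLIP4Clip, cf. test_clip_similarity.py).  The
+# shape contract checked below is (B, num_segs, C) -> (B, C):
+# TransformerAdapter requires num_segs to match its construction argument,
+# while SimpleMeanAdapter simply averages over the given dim(s).  A minimal
+# usage sketch:
+#
+#   adapter = SimpleMeanAdapter(dim=1)
+#   video_feat = adapter(torch.randn(2, 8, 64))  # -> shape (2, 64)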
+import pytest +import torch + +from mmaction.models import SimpleMeanAdapter, TransformerAdapter + + +def test_transformer_adapter(): + """Test transformer adapter.""" + with pytest.raises(RuntimeError): + num_segs_model = 8 + num_segs_features = 9 + adapter = TransformerAdapter( + num_segs=num_segs_model, + transformer_width=64, + transformer_heads=8, + transformer_layers=2) + features = torch.randn(2, num_segs_features, 64) + adapter(features) + + num_segs = 8 + adapter = TransformerAdapter( + num_segs=num_segs, + transformer_width=64, + transformer_heads=8, + transformer_layers=2) + adapter.init_weights() + features = torch.randn(2, num_segs, 64) + adapted_features = adapter(features) + assert adapted_features.shape == torch.Size([2, 64]) + + +def test_simple_mean_adapter(): + """Test simple mean adapter.""" + + adapter = SimpleMeanAdapter(dim=1) + features = torch.randn(2, 8, 64) + adapted_features = adapter(features) + assert adapted_features.shape == torch.Size([2, 64]) + + adapter = SimpleMeanAdapter(dim=(1, 2)) + features = torch.randn(2, 8, 2, 64) + adapted_features = adapter(features) + assert adapted_features.shape == torch.Size([2, 64]) diff --git a/tests/models/similarity/test_clip_similarity.py b/tests/models/similarity/test_clip_similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..5d5f7f34bdd4df0db4d297693663dc79c338deef --- /dev/null +++ b/tests/models/similarity/test_clip_similarity.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import platform +from unittest.mock import MagicMock + +import pytest +import torch + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_similarity_cfg +from mmaction.utils import register_all_modules + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_clip_similarity(): + register_all_modules() + cfg = get_similarity_cfg( + 'clip4clip/' + 'clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py') + cfg.model.frozen_layers = -1 # no frozen layers + model = MODELS.build(cfg.model) + model.train() + + data_batch = { + 'inputs': { + 'imgs': [torch.randint(0, 256, (2, 3, 224, 224))], + 'text': [torch.randint(0, 49408, (77, ))] + }, + 'data_samples': [ActionDataSample()] + } + + # test train_step + optim_wrapper = MagicMock() + loss_vars = model.train_step(data_batch, optim_wrapper) + assert 'loss' in loss_vars + assert 'sim_loss_v2t' in loss_vars + assert 'sim_loss_t2v' in loss_vars + optim_wrapper.update_params.assert_called_once() + + # test test_step + with torch.no_grad(): + predictions = model.test_step(data_batch) + features = predictions[0].features + assert len(predictions) == 1 + assert features.video_feature.size() == (512, ) + assert features.text_feature.size() == (512, ) + + # test frozen layers + def check_frozen_layers(mdl, frozen_layers): + if frozen_layers >= 0: + top_layers = [ + 'ln_final', 'text_projection', 'logit_scale', 'visual.ln_post', + 'visual.proj' + ] + mid_layers = [ + 'visual.transformer.resblocks', 'transformer.resblocks' + ] + + for name, param in mdl.clip.named_parameters(): + if any(name.find(n) == 0 for n in top_layers): + assert param.requires_grad is True + elif any(name.find(n) == 0 for n in mid_layers): + layer_n = int(name.split('.resblocks.')[1].split('.')[0]) + if layer_n >= frozen_layers: + assert param.requires_grad is True + else: + assert param.requires_grad is False + else: + assert param.requires_grad is False + else: + assert 
all([p.requires_grad for p in mdl.clip.parameters()]) + + check_frozen_layers(model, -1) + + model.frozen_layers = 0 + model.train() + check_frozen_layers(model, 0) + + model.frozen_layers = 6 + model.train() + check_frozen_layers(model, 6) + + model.frozen_layers = 12 + model.train() + check_frozen_layers(model, 12) diff --git a/tests/models/utils/__init__.py b/tests/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/tests/models/utils/test_blending_utils.py b/tests/models/utils/test_blending_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..568a958fa9be7e0629a8e9cf61b284465098754e --- /dev/null +++ b/tests/models/utils/test_blending_utils.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pytest +import torch +import torch.nn.functional as F +from mmcv.transforms import to_tensor + +from mmaction.models import CutmixBlending, MixupBlending, RandomBatchAugment +from mmaction.structures import ActionDataSample + + +def get_label(label_): + label = [] + for idx, one_label in enumerate(label_): + data_sample = ActionDataSample() + data_sample.set_gt_label(label_[idx]) + label.append(data_sample) + return label + + +def test_mixup(): + alpha = 0.2 + num_classes = 10 + label = get_label([to_tensor(x) for x in range(4)]) + mixup = MixupBlending(num_classes, alpha) + + # NCHW imgs + imgs = torch.randn(4, 4, 3, 32, 32) + mixed_imgs, mixed_label = mixup(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32)) + assert len(mixed_label) == 4 + + # NCTHW imgs + imgs = torch.randn(4, 4, 2, 3, 32, 32) + label = get_label([to_tensor(x) for x in range(4)]) + mixed_imgs, mixed_label = mixup(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 + + # multi-label with one-hot tensor as label + imgs = torch.randn(4, 4, 2, 3, 32, 32) + label = get_label(F.one_hot(torch.arange(4), num_classes=num_classes)) + mixed_imgs, mixed_label = mixup(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 + + +def test_cutmix(): + alpha = 0.2 + num_classes = 10 + label = get_label([to_tensor(x) for x in range(4)]) + cutmix = CutmixBlending(num_classes, alpha) + + # NCHW imgs + imgs = torch.randn(4, 4, 3, 32, 32) + mixed_imgs, mixed_label = cutmix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32)) + assert len(mixed_label) == 4 + + # NCTHW imgs + imgs = torch.randn(4, 4, 2, 3, 32, 32) + label = get_label([to_tensor(x) for x in range(4)]) + mixed_imgs, mixed_label = cutmix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 + + # multi-label with one-hot tensor as label + imgs = torch.randn(4, 4, 2, 3, 32, 32) + label = get_label(F.one_hot(torch.arange(4), num_classes=num_classes)) + mixed_imgs, mixed_label = cutmix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 + + +def test_rand_blend(): + alpha_mixup = 0.2 + alpha_cutmix = 0.2 + num_classes = 10 + label = get_label([to_tensor(x) for x in range(4)]) + blending_augs = [ + dict(type='MixupBlending', alpha=alpha_mixup, num_classes=num_classes), + dict( + type='CutmixBlending', alpha=alpha_cutmix, num_classes=num_classes) + ] + + # test assertion + 
with pytest.raises(AssertionError): + rand_mix = RandomBatchAugment(blending_augs, [0.5, 0.6]) + + # mixup, cutmix + rand_mix = RandomBatchAugment(blending_augs, probs=None) + assert rand_mix.probs is None + + # mixup, cutmix and None + probs = [0.5, 0.4] + rand_mix = RandomBatchAugment(blending_augs, probs) + + np.testing.assert_allclose(rand_mix.probs[-1], 0.1) + + # test call + imgs = torch.randn(4, 4, 3, 32, 32) # NCHW imgs + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32)) + assert len(mixed_label) == 4 + + imgs = torch.randn(4, 4, 2, 3, 32, 32) # NCTHW imgs + label = get_label([to_tensor(x) for x in range(4)]) + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 + + # multi-label with one-hot tensor as label + imgs = torch.randn(4, 4, 2, 3, 32, 32) + label = get_label(F.one_hot(torch.arange(4), num_classes=num_classes)) + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py new file mode 100644 index 0000000000000000000000000000000000000000..8a2e78c3bb2e2283c2a9defd386d4b63099a95b0 --- /dev/null +++ b/tests/models/utils/test_gradcam.py @@ -0,0 +1,238 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import platform + +import pytest +import torch + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.testing import get_recognizer_cfg +from mmaction.utils import register_all_modules +from mmaction.utils.gradcam_utils import GradCAM + +register_all_modules() + + +def _get_target_shapes(input_shape, num_classes=400, model_type='2D'): + if model_type not in ['2D', '3D']: + raise ValueError(f'Data type {model_type} is not available') + + preds_target_shape = (input_shape[0], num_classes) + if model_type == '3D': + # input shape (batch_size, num_crops*num_clips, C, clip_len, H, W) + # target shape (batch_size*num_crops*num_clips, clip_len, H, W, C) + blended_imgs_target_shape = (input_shape[0] * input_shape[1], + input_shape[3], input_shape[4], + input_shape[5], input_shape[2]) + else: + # input shape (batch_size, num_segments, C, H, W) + # target shape (batch_size, num_segments, H, W, C) + blended_imgs_target_shape = (input_shape[0], input_shape[1], + input_shape[3], input_shape[4], + input_shape[2]) + + return blended_imgs_target_shape, preds_target_shape + + +def _do_test_2D_models(recognizer, + target_layer_name, + input_shape, + num_classes=400, + device='cpu'): + demo_data = { + 'inputs': [torch.randint(0, 256, input_shape[1:])], + 'data_samples': [ActionDataSample().set_gt_label(2)] + } + + recognizer = recognizer.to(device) + gradcam = GradCAM(recognizer, target_layer_name) + + blended_imgs_target_shape, preds_target_shape = _get_target_shapes( + input_shape, num_classes=num_classes, model_type='2D') + + blended_imgs, preds = gradcam(demo_data) + assert blended_imgs.size() == blended_imgs_target_shape + assert preds.size() == preds_target_shape + + blended_imgs, preds = gradcam(demo_data, True) + assert blended_imgs.size() == blended_imgs_target_shape + assert preds.size() == preds_target_shape + + +def _do_test_3D_models(recognizer, + target_layer_name, + input_shape, + num_classes=400): + blended_imgs_target_shape, preds_target_shape = _get_target_shapes( + input_shape, num_classes=num_classes, model_type='3D') + 
demo_data = { + 'inputs': [torch.randint(0, 256, input_shape[1:])], + 'data_samples': [ActionDataSample().set_gt_label(2)] + } + + gradcam = GradCAM(recognizer, target_layer_name) + + blended_imgs, preds = gradcam(demo_data) + assert blended_imgs.size() == blended_imgs_target_shape + assert preds.size() == preds_target_shape + + blended_imgs, preds = gradcam(demo_data, True) + assert blended_imgs.size() == blended_imgs_target_shape + assert preds.size() == preds_target_shape + + +def test_tsn(): + config = get_recognizer_cfg( + 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + recognizer = MODELS.build(config.model) + recognizer.cfg = config + + input_shape = (1, 25, 3, 32, 32) + target_layer_name = 'backbone/layer4/1/relu' + + _do_test_2D_models(recognizer, target_layer_name, input_shape) + + +def test_i3d(): + config = get_recognizer_cfg( + 'i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + + recognizer = MODELS.build(config.model) + recognizer.cfg = config + + input_shape = (1, 1, 3, 32, 32, 32) + target_layer_name = 'backbone/layer4/1/relu' + + _do_test_3D_models(recognizer, target_layer_name, input_shape) + + +def test_r2plus1d(): + config = get_recognizer_cfg( + 'r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py') + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + config.model['backbone']['norm_cfg'] = dict(type='BN3d') + + recognizer = MODELS.build(config.model) + recognizer.cfg = config + + input_shape = (1, 3, 3, 8, 16, 16) + target_layer_name = 'backbone/layer4/1/relu' + + _do_test_3D_models(recognizer, target_layer_name, input_shape) + + +def test_slowfast(): + config = get_recognizer_cfg( + 'slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py') + + recognizer = MODELS.build(config.model) + recognizer.cfg = config + + input_shape = (1, 1, 3, 32, 32, 32) + target_layer_name = 'backbone/slow_path/layer4/1/relu' + + _do_test_3D_models(recognizer, target_layer_name, input_shape) + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_tsm(): + config = get_recognizer_cfg( + 'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + target_layer_name = 'backbone/layer4/1/relu' + + # base config + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 8, 3, 32, 32) + _do_test_2D_models(recognizer, target_layer_name, input_shape) + + # test twice sample + 3 crops, 2*3*8=48 + config.model.test_cfg = dict(average_clips='prob') + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 48, 3, 32, 32) + _do_test_2D_models(recognizer, target_layer_name, input_shape) + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_csn(): + config = get_recognizer_cfg( + 'csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['pretrained2d'] = False + config.model['backbone']['pretrained'] = None + + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 1, 3, 32, 16, 16) + target_layer_name = 'backbone/layer4/1/relu' + + _do_test_3D_models(recognizer, target_layer_name, input_shape) + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') 
+def test_tpn(): + target_layer_name = 'backbone/layer4/1/relu' + + config = get_recognizer_cfg( + 'tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py') + config.model['backbone']['pretrained'] = None + config.model['backbone']['num_segments'] = 4 + config.model.test_cfg['fcn_test'] = False + recognizer = MODELS.build(config.model) + recognizer.cfg = config + + input_shape = (1, 4, 3, 16, 16) + _do_test_2D_models(recognizer, target_layer_name, input_shape, 174) + + config = get_recognizer_cfg( + 'tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + config.model.test_cfg['fcn_test'] = False + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 3, 3, 4, 16, 16) + _do_test_3D_models(recognizer, target_layer_name, input_shape) + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_c3d(): + config = get_recognizer_cfg( + 'c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py') + config.model['backbone']['pretrained'] = None + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 1, 3, 16, 112, 112) + target_layer_name = 'backbone/conv5a/activate' + _do_test_3D_models(recognizer, target_layer_name, input_shape, 101) + + +@pytest.mark.skipif( + not torch.cuda.is_available(), reason='requires CUDA support') +def test_tin(): + config = get_recognizer_cfg( + 'tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + target_layer_name = 'backbone/layer4/1/relu' + + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 8, 3, 64, 64) + _do_test_2D_models( + recognizer, target_layer_name, input_shape, device='cuda:0') + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_x3d(): + config = get_recognizer_cfg('x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py') + config.model['backbone']['pretrained'] = None + recognizer = MODELS.build(config.model) + recognizer.cfg = config + input_shape = (1, 1, 3, 13, 16, 16) + target_layer_name = 'backbone/layer4/1/relu' + _do_test_3D_models(recognizer, target_layer_name, input_shape) diff --git a/tests/structures/bbox/assigners/test_max_iou_assigner_ava.py b/tests/structures/bbox/assigners/test_max_iou_assigner_ava.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2c198d09639554490a00fea478e6bb4576bcc8 --- /dev/null +++ b/tests/structures/bbox/assigners/test_max_iou_assigner_ava.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""import os.path as osp. + +import torch + +from mmaction.datasets import AVADataset + + +def test_assigner_sampler(): + try: + from mmdet.core.bbox import build_assigner, build_sampler + except (ImportError, ModuleNotFoundError): + raise ImportError( + 'Failed to import `build_assigner` and `build_sampler` ' + 'from `mmdet.core.bbox`. The two APIs are required for ' + 'the testing in `test_bbox.py`! 
') + + data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../../data/eval_detection')) + ann_file = osp.join(data_prefix, 'gt.csv') + label_file = osp.join(data_prefix, 'action_list.txt') + proposal_file = osp.join(data_prefix, 'proposal.pkl') + dataset = AVADataset( + ann_file=ann_file, + exclude_file=None, + pipeline=[], + label_file=label_file, + proposal_file=proposal_file, + num_classes=4) + + assigner = dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5) + assigner = build_assigner(assigner) + proposal = torch.tensor(dataset[0]['proposals']) + + gt_bboxes = torch.tensor(dataset[0]['gt_bboxes']) + gt_labels = torch.tensor(dataset[0]['gt_labels']) + assign_result = assigner.assign( + bboxes=proposal, + gt_bboxes=gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=gt_labels) + assert assign_result.num_gts == 4 + assert torch.all( + assign_result.gt_inds == torch.tensor([0, 0, 3, 3, 0, 0, 0, 1, 0, 0])) + assert torch.all( + torch.isclose( + assign_result.max_overlaps, + torch.tensor([ + 0.40386841, 0.47127257, 0.53544776, 0.58797631, 0.29281288, + 0.40979504, 0.45902917, 0.50093938, 0.21560125, 0.32948171 + ], + dtype=torch.float64))) + assert torch.all( + torch.isclose( + assign_result.labels, + torch.tensor([[0., 0., 0., 0.], [0., 0., 0., 0.], [0., 1., 0., 0.], + [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.], + [0., 0., 0., 0.], [0., 0., 0., 1.], [0., 0., 0., 0.], + [0., 0., 0., 0.]]))) + sampler = dict(type='RandomSampler', num=32, pos_fraction=1) + sampler = build_sampler(sampler) + sampling_result = sampler.sample(assign_result, proposal, gt_bboxes, + gt_labels) + assert (sampling_result.pos_inds.shape[0] == + sampling_result.pos_bboxes.shape[0]) + assert (sampling_result.neg_inds.shape[0] == + sampling_result.neg_bboxes.shape[0]) + return sampling_result +""" diff --git a/tests/structures/bbox/test_bbox_target.py b/tests/structures/bbox/test_bbox_target.py new file mode 100644 index 0000000000000000000000000000000000000000..d78853102e319f1cbde005136854e2290d60e37b --- /dev/null +++ b/tests/structures/bbox/test_bbox_target.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractproperty + +import torch + +from mmaction.structures import bbox_target + + +def test_bbox_target(): + pos_bboxes = torch.tensor([[0.072, 0.47, 0.84, 0.898], + [0.23, 0.215, 0.781, 0.534], + [0.195, 0.128, 0.643, 0.944], + [0.236, 0.189, 0.689, 0.74]]) + neg_bboxes = torch.tensor([[0.375, 0.371, 0.726, 0.804], + [0.024, 0.398, 0.776, 0.719]]) + pos_gt_labels = torch.tensor([[0., 0., 1., 0.], [0., 0., 0., 1.], + [0., 1., 0., 0.], [0., 1., 0., 0.]]) + cfg = abstractproperty() + cfg.pos_weight = 0.8 + labels, label_weights = bbox_target([pos_bboxes], [neg_bboxes], + [pos_gt_labels], cfg) + assert torch.all( + torch.isclose( + labels, + torch.tensor([[0., 0., 1., 0.], [0., 0., 0., 1.], [0., 1., 0., 0.], + [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., + 0.]]))) + assert torch.all( + torch.isclose(label_weights, torch.tensor([0.8] * 4 + [1.0] * 2))) diff --git a/tests/structures/bbox/test_transforms.py b/tests/structures/bbox/test_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..7690bd5657e7e3f1c6285e4d01d4fcb79220f760 --- /dev/null +++ b/tests/structures/bbox/test_transforms.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
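+# bbox2result converts per-proposal action scores into detection-style
+# output: a list with one (n, 5) array of [x1, y1, x2, y2, score] rows per
+# action class, with the background class 0 dropped (so num_classes=4 yields
+# three arrays below).  The extra -1.0 argument in the second half of the
+# test (presumably a score threshold) triggers the single-label behaviour.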
+import numpy as np +import torch + +from mmaction.structures import bbox2result + + +def test_bbox2result(): + bboxes = torch.tensor([[0.072, 0.47, 0.84, 0.898], + [0.23, 0.215, 0.781, 0.534], + [0.195, 0.128, 0.643, 0.944], + [0.236, 0.189, 0.689, 0.74], + [0.375, 0.371, 0.726, 0.804], + [0.024, 0.398, 0.776, 0.719]]) + labels = torch.tensor([[-1.650, 0.515, 0.798, 1.240], + [1.368, -1.128, 0.037, -1.087], + [0.481, -1.303, 0.501, -0.463], + [-0.356, 0.126, -0.840, 0.438], + [0.079, 1.269, -0.263, -0.538], + [-0.853, 0.391, 0.103, 0.398]]) + num_classes = 4 + # Test for multi-label + result = bbox2result(bboxes, labels, num_classes) + assert np.all( + np.isclose( + result[0], + np.array([[0.072, 0.47, 0.84, 0.898, 0.515], + [0.236, 0.189, 0.689, 0.74, 0.126], + [0.375, 0.371, 0.726, 0.804, 1.269], + [0.024, 0.398, 0.776, 0.719, 0.391]]))) + assert np.all( + np.isclose( + result[1], + np.array([[0.072, 0.47, 0.84, 0.898, 0.798], + [0.23, 0.215, 0.781, 0.534, 0.037], + [0.195, 0.128, 0.643, 0.944, 0.501], + [0.024, 0.398, 0.776, 0.719, 0.103]]))) + assert np.all( + np.isclose( + result[2], + np.array([[0.072, 0.47, 0.84, 0.898, 1.24], + [0.236, 0.189, 0.689, 0.74, 0.438], + [0.024, 0.398, 0.776, 0.719, 0.398]]))) + + # Test for single-label + result = bbox2result(bboxes, labels, num_classes, -1.0) + assert np.all( + np.isclose(result[0], np.array([[0.375, 0.371, 0.726, 0.804, 1.269]]))) + assert np.all( + np.isclose( + result[1], + np.array([[0.23, 0.215, 0.781, 0.534, 0.037], + [0.195, 0.128, 0.643, 0.944, 0.501]]))) + assert np.all( + np.isclose( + result[2], + np.array([[0.072, 0.47, 0.84, 0.898, 1.240], + [0.236, 0.189, 0.689, 0.74, 0.438], + [0.024, 0.398, 0.776, 0.719, 0.398]]))) diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py new file mode 100644 index 0000000000000000000000000000000000000000..c8102f0dd6f61089d337b48d5affc260091d2ef4 --- /dev/null +++ b/tests/utils/test_misc.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import platform +from tempfile import TemporaryDirectory + +import pytest + +from mmaction.utils import frame_extract + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_frame_extract(): + data_prefix = osp.normpath(osp.join(osp.dirname(__file__), '../data')) + video_path = osp.join(data_prefix, 'test.mp4') + with TemporaryDirectory() as tmp_dir: + # assign short_side + frame_paths, frames = frame_extract( + video_path, short_side=100, out_dir=tmp_dir) + assert osp.exists(tmp_dir) and \ + len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths) + assert min(frames[0].shape[:2]) == 100 + # default short_side + frame_paths, frames = frame_extract(video_path, out_dir=tmp_dir) + assert osp.exists(tmp_dir) and \ + len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths) diff --git a/tests/visualization/test_action_visualizer.py b/tests/visualization/test_action_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..46de15c7703b6d44f0a28ec9944008c76e027ff6 --- /dev/null +++ b/tests/visualization/test_action_visualizer.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
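+# ActionVisualizer draws a video (a T x H x W x C uint8 array, here decoded
+# with decord) together with an optional ActionDataSample.  The test expects
+# ./demo/demo.mp4, so it should be run from the repository root.  A minimal
+# sketch of the pattern exercised here and in test_video_backend.py:
+#
+#   vis = ActionVisualizer(save_dir=tmp_dir,
+#                          vis_backends=[dict(type='LocalVisBackend')])
+#   vis.add_datasample('demo', video, data_sample, step=1)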
+import platform + +import decord +import pytest + +from mmaction.structures import ActionDataSample +from mmaction.visualization import ActionVisualizer + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_visualizer(): + video = decord.VideoReader('./demo/demo.mp4') + video = video.get_batch(range(32)).asnumpy() + + data_sample = ActionDataSample() + data_sample.set_gt_label(2) + + vis = ActionVisualizer() + vis.add_datasample('demo', video) + vis.add_datasample('demo', video, data_sample) + vis.add_datasample('demo', video, data_sample, step=1) + return diff --git a/tests/visualization/test_video_backend.py b/tests/visualization/test_video_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..fc86a0ed60edca8417560ac769ec009ade340f1f --- /dev/null +++ b/tests/visualization/test_video_backend.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import platform +import time +from pathlib import Path +from tempfile import TemporaryDirectory + +import decord +import pytest + +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules +from mmaction.visualization import ActionVisualizer + +register_all_modules() + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_local_visbackend(): + video = decord.VideoReader('./demo/demo.mp4') + video = video.get_batch(range(32)).asnumpy() + + data_sample = ActionDataSample() + data_sample.set_gt_label(2) + with TemporaryDirectory() as tmp_dir: + vis = ActionVisualizer( + save_dir=tmp_dir, vis_backends=[dict(type='LocalVisBackend')]) + vis.add_datasample('demo', video, data_sample) + for k in range(32): + frame_path = osp.join(tmp_dir, 'vis_data/demo/frames_0/%d.png' % k) + assert Path(frame_path).exists() + + vis.add_datasample('demo', video, data_sample, step=1) + for k in range(32): + frame_path = osp.join(tmp_dir, 'vis_data/demo/frames_1/%d.png' % k) + assert Path(frame_path).exists() + return + + +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') +def test_tensorboard_visbackend(): + video = decord.VideoReader('./demo/demo.mp4') + video = video.get_batch(range(32)).asnumpy() + + data_sample = ActionDataSample() + data_sample.set_gt_label(2) + with TemporaryDirectory() as tmp_dir: + vis = ActionVisualizer( + save_dir=tmp_dir, + vis_backends=[dict(type='TensorboardVisBackend')]) + vis.add_datasample('demo', video, data_sample, step=1) + + assert Path(osp.join(tmp_dir, 'vis_data')).exists() + flag = False + for item in os.listdir(osp.join(tmp_dir, 'vis_data')): + if item.startswith('events.out.tfevents.'): + flag = True + break + assert flag, 'Cannot find tensorboard file!' + # wait tensorboard store asynchronously + time.sleep(1) + return diff --git a/tools/analysis_tools/analyze_logs.py b/tools/analysis_tools/analyze_logs.py new file mode 100644 index 0000000000000000000000000000000000000000..9021721062c37c74e9daffcab80acb128c3564c4 --- /dev/null +++ b/tools/analysis_tools/analyze_logs.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
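+# Example usage (the .json paths are placeholders for training log files):
+#
+#   Plot the top-1 accuracy curve of a run and save the figure:
+#     python tools/analysis_tools/analyze_logs.py plot_curve \
+#         log1.json --keys top1_acc --out acc_curve.png
+#
+#   Compare the classification loss of two runs in one figure:
+#     python tools/analysis_tools/analyze_logs.py plot_curve \
+#         log1.json log2.json --keys loss_cls --legend run1 run2
+#
+#   Compute the average time per training iteration:
+#     python tools/analysis_tools/analyze_logs.py cal_train_time log1.json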
+import argparse +import json +from collections import defaultdict + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns + + +def cal_train_time(log_dicts, args): + for i, log_dict in enumerate(log_dicts): + print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') + all_times = [] + for epoch in log_dict.keys(): + if args.include_outliers: + all_times.append(log_dict[epoch]['time']) + else: + all_times.append(log_dict[epoch]['time'][1:]) + all_times = np.array(all_times) + epoch_ave_time = all_times.mean(-1) + slowest_epoch = epoch_ave_time.argmax() + fastest_epoch = epoch_ave_time.argmin() + std_over_epoch = epoch_ave_time.std() + print(f'slowest epoch {slowest_epoch + 1}, ' + f'average time is {epoch_ave_time[slowest_epoch]:.4f}') + print(f'fastest epoch {fastest_epoch + 1}, ' + f'average time is {epoch_ave_time[fastest_epoch]:.4f}') + print(f'time std over epochs is {std_over_epoch:.4f}') + print(f'average iter time: {np.mean(all_times):.4f} s/iter') + print() + + +def plot_curve(log_dicts, args): + if args.backend is not None: + plt.switch_backend(args.backend) + sns.set_style(args.style) + # if legend is None, use {filename}_{key} as legend + legend = args.legend + if legend is None: + legend = [] + for json_log in args.json_logs: + for metric in args.keys: + legend.append(f'{json_log}_{metric}') + assert len(legend) == (len(args.json_logs) * len(args.keys)) + metrics = args.keys + + num_metrics = len(metrics) + for i, log_dict in enumerate(log_dicts): + epochs = list(log_dict.keys()) + for j, metric in enumerate(metrics): + print(f'plot curve of {args.json_logs[i]}, metric is {metric}') + if metric not in log_dict[epochs[0]]: + raise KeyError( + f'{args.json_logs[i]} does not contain metric {metric}') + xs = [] + ys = [] + for epoch in epochs: + iters = log_dict[epoch]['iter'] + if log_dict[epoch]['mode'][-1] == 'val': + iters = iters[:-1] + num_iters_per_epoch = iters[-1] + xs.append(np.array(iters) + (epoch - 1) * num_iters_per_epoch) + ys.append(np.array(log_dict[epoch][metric][:len(iters)])) + xs = np.concatenate(xs) + ys = np.concatenate(ys) + plt.xlabel('iter') + plt.plot(xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) + plt.legend() + if args.title is not None: + plt.title(args.title) + if args.out is None: + plt.show() + else: + print(f'save curve to: {args.out}') + plt.savefig(args.out) + plt.cla() + + +def add_plot_parser(subparsers): + parser_plt = subparsers.add_parser( + 'plot_curve', help='parser for plotting curves') + parser_plt.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_plt.add_argument( + '--keys', + type=str, + nargs='+', + default=['top1_acc'], + help='the metric that you want to plot') + parser_plt.add_argument('--title', type=str, help='title of figure') + parser_plt.add_argument( + '--legend', + type=str, + nargs='+', + default=None, + help='legend of each plot') + parser_plt.add_argument( + '--backend', type=str, default=None, help='backend of plt') + parser_plt.add_argument( + '--style', type=str, default='dark', help='style of plt') + parser_plt.add_argument('--out', type=str, default=None) + + +def add_time_parser(subparsers): + parser_time = subparsers.add_parser( + 'cal_train_time', + help='parser for computing the average time per training iteration') + parser_time.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_time.add_argument( + '--include-outliers', + action='store_true', + 
help='include the first value of every epoch when computing ' + 'the average time') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Analyze Json Log') + # currently only support plot curve and calculate average train time + subparsers = parser.add_subparsers(dest='task', help='task parser') + add_plot_parser(subparsers) + add_time_parser(subparsers) + args = parser.parse_args() + return args + + +def load_json_logs(json_logs): + # load and convert json_logs to log_dict, key is epoch, value is a sub dict + # keys of sub dict is different metrics, e.g. memory, top1_acc + # value of sub dict is a list of corresponding values of all iterations + log_dicts = [dict() for _ in json_logs] + for json_log, log_dict in zip(json_logs, log_dicts): + with open(json_log, 'r') as log_file: + for line in log_file: + log = json.loads(line.strip()) + # skip lines without `epoch` field + if 'epoch' not in log: + continue + epoch = log.pop('epoch') + if epoch not in log_dict: + log_dict[epoch] = defaultdict(list) + for k, v in log.items(): + log_dict[epoch][k].append(v) + return log_dicts + + +def main(): + args = parse_args() + + json_logs = args.json_logs + for json_log in json_logs: + assert json_log.endswith('.json') + + log_dicts = load_json_logs(json_logs) + + eval(args.task)(log_dicts, args) + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/bench_processing.py b/tools/analysis_tools/bench_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..e77c1956cffcc78c2c773000d1910afd701accf9 --- /dev/null +++ b/tools/analysis_tools/bench_processing.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This file is for benchmark dataloading process. The command line to run this +file is: + +$ python -m cProfile -o program.prof tools/analysis/bench_processing.py +configs/task/method/[config filename] + +It use cProfile to record cpu running time and output to program.prof +To visualize cProfile output program.prof, use Snakeviz and run: +$ snakeviz program.prof +""" +import argparse +import os + +import mmcv +from mmcv import Config + +from mmaction import __version__ +from mmaction.datasets import build_dataloader, build_dataset +from mmaction.utils import get_root_logger + + +def main(): + parser = argparse.ArgumentParser(description='Benchmark dataloading') + parser.add_argument('config', help='train config file path') + args = parser.parse_args() + cfg = Config.fromfile(args.config) + + # init logger before other steps + logger = get_root_logger() + logger.info(f'MMAction2 Version: {__version__}') + logger.info(f'Config: {cfg.text}') + + # create bench data list + ann_file_bench = 'benchlist.txt' + if not os.path.exists(ann_file_bench): + with open(cfg.ann_file_train) as f: + lines = f.readlines()[:256] + with open(ann_file_bench, 'w') as f1: + f1.writelines(lines) + cfg.data.train.ann_file = ann_file_bench + + dataset = build_dataset(cfg.data.train) + data_loader = build_dataloader( + dataset, + videos_per_gpu=cfg.data.videos_per_gpu, + workers_per_gpu=0, + persistent_workers=False, + num_gpus=1, + dist=False) + + # Start progress bar after first 5 batches + prog_bar = mmcv.ProgressBar( + len(dataset) - 5 * cfg.data.videos_per_gpu, start=False) + for i, data in enumerate(data_loader): + if i == 5: + prog_bar.start() + for _ in data['imgs']: + if i < 5: + continue + prog_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/benchmark.py b/tools/analysis_tools/benchmark.py new file 
mode 100644 index 0000000000000000000000000000000000000000..dacf78f731375ea25e7938e28b8dbfd2ab8e2ea5 --- /dev/null +++ b/tools/analysis_tools/benchmark.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import time + +import torch +from mmcv import Config +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel +from mmcv.runner.fp16_utils import wrap_fp16_model + +from mmaction.datasets import build_dataloader, build_dataset +from mmaction.models import build_model + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 benchmark a recognizer') + parser.add_argument('config', help='test config file path') + parser.add_argument( + '--log-interval', default=10, help='interval of logging') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.backbone.pretrained = None + cfg.data.test.test_mode = True + + # build the dataloader + dataset = build_dataset(cfg.data.test, dict(test_mode=True)) + data_loader = build_dataloader( + dataset, + videos_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + persistent_workers=cfg.data.get('persistent_workers', False), + dist=False, + shuffle=False) + + # build the model and load checkpoint + model = build_model( + cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + + model = MMDataParallel(model, device_ids=[0]) + + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + + # benchmark with 2000 video and take the average + for i, data in enumerate(data_loader): + + torch.cuda.synchronize() + start_time = time.perf_counter() + + with torch.no_grad(): + model(return_loss=False, **data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % args.log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print( + f'Done video [{i + 1:<3}/ 2000], fps: {fps:.1f} video / s') + + if (i + 1) == 200: + pure_inf_time += elapsed + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Overall fps: {fps:.1f} video / s') + break + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/check_videos.py b/tools/analysis_tools/check_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..acc0210bdce759e1d57d02cb2c2f36d5872f08f1 --- /dev/null +++ b/tools/analysis_tools/check_videos.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
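+# Example usage (the config path is a placeholder for any VideoDataset
+# config):
+#
+#   Check the training split with the decord decoder and record unreadable
+#   files in invalid-video.txt:
+#     python tools/analysis_tools/check_videos.py path/to/video_config.py \
+#         --split train --decoder decord --output-file invalid-video.txt
+#
+#   Add --remove-corrupted-videos to also delete the files that fail to
+#   decode.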
+import argparse +import os +import warnings +from functools import partial +from multiprocessing import Manager, cpu_count + +import numpy as np +from mmengine import Config, DictAction, track_parallel_progress +from mmengine.registry import init_default_scope + +from mmaction.registry import DATASETS, TRANSFORMS + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 check datasets') + parser.add_argument('config', help='test config file path') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + default={}, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--output-file', + default='invalid-video.txt', + help='Output file path which keeps corrupted/missing video file paths') + parser.add_argument( + '--split', + default='train', + choices=['train', 'val', 'test'], + help='Dataset split') + parser.add_argument( + '--decoder', + default='decord', + choices=['decord', 'opencv', 'pyav'], + help='Video decoder type, should be one of [decord, opencv, pyav]') + parser.add_argument( + '--nproc', + type=int, + default=(cpu_count() - 1 or 1), + help='Number of processes to check videos') + parser.add_argument( + '--remove-corrupted-videos', + action='store_true', + help='Whether to delete all corrupted videos') + args = parser.parse_args() + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both ' + 'specified, --options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +@TRANSFORMS.register_module() +class RandomSampleFrames: + + def __call__(self, results): + """Select frames to verify. + + Select the first, last and three random frames, Required key is + "total_frames", added or modified key is "frame_inds". + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + assert results['total_frames'] > 0 + + # first and last frames + results['frame_inds'] = np.array([0, results['total_frames'] - 1]) + + # choose 3 random frames + if results['total_frames'] > 2: + results['frame_inds'] = np.concatenate([ + results['frame_inds'], + np.random.randint(1, results['total_frames'] - 1, 3) + ]) + + return results + + +def _do_check_videos(lock, pipeline, output_file, data_info): + try: + pipeline(data_info) + except: # noqa + # save invalid video path to output file + lock.acquire() + with open(output_file, 'a') as f: + f.write(data_info['filename'] + '\n') + lock.release() + + +if __name__ == '__main__': + args = parse_args() + + decoder_to_pipeline_prefix = dict( + decord='Decord', opencv='OpenCV', pyav='PyAV') + + # read config file + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + init_default_scope(cfg.get('default_scope', 'mmaction')) + + # build dataset + dataset_cfg = cfg.get(f'{args.split}_dataloader').dataset + dataset_type = dataset_cfg.type + assert dataset_type == 'VideoDataset' + dataset_cfg.pipeline = [ + dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Init'), + dict(type='RandomSampleFrames'), + dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Decode') + ] + + dataset = DATASETS.build(dataset_cfg) + dataset_cfg.pop('type') + pipeline = dataset.pipeline + + # prepare for checking + if os.path.exists(args.output_file): + # remove existing output file + os.remove(args.output_file) + + lock = Manager().Lock() + worker_fn = partial(_do_check_videos, lock, pipeline, args.output_file) + # avoid copy dataset for multiprocess + data_info_list = [ + dataset.get_data_info(idx) for idx in range(len(dataset)) + ] + + # start checking + track_parallel_progress(worker_fn, data_info_list, nproc=args.nproc) + + if os.path.exists(args.output_file): + num_lines = sum(1 for _ in open(args.output_file)) + print(f'Checked {len(dataset)} videos, ' + f'{num_lines} are corrupted/missing.') + if args.remove_corrupted_videos: + print('Start deleting corrupted videos') + cnt = 0 + with open(args.output_file, 'r') as f: + for line in f: + if os.path.exists(line.strip()): + os.remove(line.strip()) + cnt += 1 + print(f'Deleted {cnt} corrupted videos.') + else: + print(f'Checked {len(dataset)} videos, none are corrupted/missing') diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..218a227bdb03c3dd105e4b2a2c64b58e42396073 --- /dev/null +++ b/tools/analysis_tools/confusion_matrix.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
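+# Example usage (illustrative; ${CONFIG_FILE} and ${CKPT_OR_PKL} are
+# placeholders for a config file and a checkpoint or prediction pickle file):
+#   python tools/analysis_tools/confusion_matrix.py ${CONFIG_FILE} ${CKPT_OR_PKL} \
+#       --show-path confusion_matrix.png --include-values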
+import argparse +import logging +import tempfile + +import torch +from mmengine import dump, list_from_file, load +from mmengine.config import Config, DictAction +from mmengine.evaluator import Evaluator +from mmengine.runner import Runner + +from mmaction.evaluation import ConfusionMatrix +from mmaction.registry import DATASETS +from mmaction.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Eval a checkpoint and draw the confusion matrix.') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'ckpt_or_result', + type=str, + help='The checkpoint file (.pth) or ' + 'dumpped predictions pickle file (.pkl).') + parser.add_argument('--out', help='the file to save the confusion matrix.') + parser.add_argument( + '--show', + action='store_true', + help='whether to display the metric result by matplotlib if supports.') + parser.add_argument( + '--show-path', type=str, help='Path to save the visualization image.') + parser.add_argument( + '--include-values', + action='store_true', + help='To draw the values in the figure.') + parser.add_argument('--label-file', default=None, help='Labelmap file') + parser.add_argument( + '--target-classes', + type=int, + nargs='+', + default=[], + help='Selected classes to evaluate, and remains will be neglected') + parser.add_argument( + '--cmap', + type=str, + default='viridis', + help='The color map to use. Defaults to "viridis".') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # register all modules in mmaction into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if args.ckpt_or_result.endswith('.pth'): + # Set confusion matrix as the metric. + cfg.test_evaluator = dict(type='ConfusionMatrix') + + cfg.load_from = str(args.ckpt_or_result) + + with tempfile.TemporaryDirectory() as tmpdir: + cfg.work_dir = tmpdir + runner = Runner.from_cfg(cfg) + classes = runner.test_loop.dataloader.dataset.metainfo.get( + 'classes') + cm = runner.test()['confusion_matrix/result'] + logging.shutdown() + else: + predictions = load(args.ckpt_or_result) + evaluator = Evaluator(ConfusionMatrix()) + metrics = evaluator.offline_evaluate(predictions, None) + cm = metrics['confusion_matrix/result'] + try: + # Try to build the dataset. 
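+            # Build the dataset with an empty pipeline: only its metainfo
+            # (the class names) is needed here, not the decoded samples.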
+ dataset = DATASETS.build({ + **cfg.test_dataloader.dataset, 'pipeline': [] + }) + classes = dataset.metainfo.get('classes') + except Exception: + classes = None + + if args.label_file is not None: + classes = list_from_file(args.label_file) + if classes is None: + num_classes = cm.shape[0] + classes = list(range(num_classes)) + + if args.target_classes: + assert len(args.target_classes) > 1, \ + 'please ensure select more than one class' + target_idx = torch.tensor(args.target_classes) + cm = cm[target_idx][:, target_idx] + classes = [classes[idx] for idx in target_idx] + + if args.out is not None: + dump(cm, args.out) + + if args.show or args.show_path is not None: + fig = ConfusionMatrix.plot( + cm, + show=args.show, + classes=classes, + include_values=args.include_values, + cmap=args.cmap) + if args.show_path is not None: + fig.savefig(args.show_path) + print(f'The confusion matrix is saved at {args.show_path}.') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/eval_metric.py b/tools/analysis_tools/eval_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..51b5156c37e557fd8d06dce67dddb7d8e30dc992 --- /dev/null +++ b/tools/analysis_tools/eval_metric.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import mmengine +from mmengine import Config, DictAction +from mmengine.evaluator import Evaluator +from mmengine.registry import init_default_scope +from rich import print + + +def parse_args(): + parser = argparse.ArgumentParser(description='Evaluate metric of the ' + 'results saved in pkl format') + parser.add_argument('config', help='Config of the model') + parser.add_argument('pkl_results', help='Results in pickle format') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + init_default_scope(cfg.get('default_scope', 'mmaction')) + + data_samples = mmengine.load(args.pkl_results) + + evaluator = Evaluator(cfg.test_evaluator) + eval_results = evaluator.offline_evaluate(data_samples) + print(eval_results) + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py new file mode 100644 index 0000000000000000000000000000000000000000..cdb8b03658f519fb5f124cb4560e6eddf3e1b45d --- /dev/null +++ b/tools/analysis_tools/get_flops.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
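+# Example usage (illustrative; the 5-value shape below, i.e. n c t h w,
+# assumes a 3D recognizer config):
+#   python tools/analysis_tools/get_flops.py ${CONFIG_FILE} --shape 1 3 32 224 224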
+import argparse + +from mmengine import Config +from mmengine.registry import init_default_scope + +from mmaction.registry import MODELS + +try: + from mmengine.analysis import get_model_complexity_info +except ImportError: + raise ImportError('Please upgrade mmcv to >0.6.2') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[224, 224], + help='input image size') + args = parser.parse_args() + return args + + +def main(): + + args = parse_args() + + if len(args.shape) == 1: + input_shape = (1, 3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (1, 3) + tuple(args.shape) + elif len(args.shape) == 4: + # n, c, h, w = args.shape for 2D recognizer + input_shape = tuple(args.shape) + elif len(args.shape) == 5: + # n, c, t, h, w = args.shape for 3D recognizer or + # n, m, t, v, c = args.shape for GCN-based recognizer + input_shape = tuple(args.shape) + else: + raise ValueError('invalid input shape') + + cfg = Config.fromfile(args.config) + init_default_scope(cfg.get('default_scope', 'mmaction')) + model = MODELS.build(cfg.model) + model.eval() + + if hasattr(model, 'extract_feat'): + model.forward = model.extract_feat + else: + raise NotImplementedError( + 'FLOPs counter is currently not currently supported with {}'. + format(model.__class__.__name__)) + + analysis_results = get_model_complexity_info(model, input_shape) + flops = analysis_results['flops_str'] + params = analysis_results['params_str'] + table = analysis_results['out_table'] + print(table) + split_line = '=' * 30 + print(f'\n{split_line}\nInput shape: {input_shape}\n' + f'Flops: {flops}\nParams: {params}\n{split_line}') + print('!!!Please be cautious if you use the results in papers. ' + 'You may need to check if all ops are supported and verify that the ' + 'flops computation is correct.') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/print_config.py b/tools/analysis_tools/print_config.py new file mode 100644 index 0000000000000000000000000000000000000000..661d4aca561bb466496296c6d6d842b604b89dea --- /dev/null +++ b/tools/analysis_tools/print_config.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +from mmengine import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + print(f'Config:\n{cfg.pretty_text}') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/report_accuracy.py b/tools/analysis_tools/report_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..1360cb3bf64658887498edc566282ee43227365e --- /dev/null +++ b/tools/analysis_tools/report_accuracy.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
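+# Example usage (the pickle paths below are this script's argument defaults):
+#   python tools/analysis_tools/report_accuracy.py \
+#       --preds demo/fuse/joint.pkl demo/fuse/bone.pkl \
+#       --coefficients 1.0 1.0 --apply-softmax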
+import argparse + +import numpy as np +from mmengine import load +from scipy.special import softmax + +from mmaction.evaluation.functional import (get_weighted_score, + mean_class_accuracy, + mmit_mean_average_precision, + top_k_accuracy) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Fusing multiple scores') + parser.add_argument( + '--preds', + nargs='+', + help='list of predict result', + default=['demo/fuse/joint.pkl', 'demo/fuse/bone.pkl']) + parser.add_argument( + '--coefficients', + nargs='+', + type=float, + help='coefficients of each score file', + default=[1.0, 1.0]) + parser.add_argument('--apply-softmax', action='store_true') + parser.add_argument( + '--multi-label', + action='store_true', + help='whether the task is multi label classification') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert len(args.preds) == len(args.coefficients) + data_sample_list = [load(f) for f in args.preds] + score_list = [] + for data_samples in data_sample_list: + scores = [sample['pred_score'].numpy() for sample in data_samples] + score_list.append(scores) + + if args.multi_label: + labels = [sample['gt_label'] for sample in data_sample_list[0]] + else: + labels = [sample['gt_label'].item() for sample in data_sample_list[0]] + + if args.apply_softmax: + + def apply_softmax(scores): + return [softmax(score) for score in scores] + + score_list = [apply_softmax(scores) for scores in score_list] + + weighted_scores = get_weighted_score(score_list, args.coefficients) + if args.multi_label: + mean_avg_prec = mmit_mean_average_precision( + np.array(weighted_scores), np.stack([t.numpy() for t in labels])) + print(f'MMit Average Precision: {mean_avg_prec:.04f}') + else: + mean_class_acc = mean_class_accuracy(weighted_scores, labels) + top_1_acc, top_5_acc = top_k_accuracy(weighted_scores, labels, (1, 5)) + print(f'Mean Class Accuracy: {mean_class_acc:.04f}') + print(f'Top 1 Accuracy: {top_1_acc:.04f}') + print(f'Top 5 Accuracy: {top_5_acc:.04f}') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/report_map.py b/tools/analysis_tools/report_map.py new file mode 100644 index 0000000000000000000000000000000000000000..49206ff84be87d9d17f04b8e898260a12e478e0f --- /dev/null +++ b/tools/analysis_tools/report_map.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
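+# Example usage (paths are illustrative; the defaults are defined below):
+#   python tools/analysis_tools/report_map.py --proposal results.json \
+#       --gt data/ActivityNet/anet_anno_val.json --det-output det_result.json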
+import argparse +import os +import os.path as osp + +import mmengine +import numpy as np + +from mmaction.evaluation import ActivityNetLocalization + +args = None + + +def cuhk17_top1(): + """Assign label for each proposal with the cuhk17 result, which is the #2 + entry in http://activity-net.org/challenges/2017/evaluation.html.""" + if not osp.exists('cuhk_anet17_pred.json'): + os.system('wget https://download.openmmlab.com/' + 'mmaction/localization/cuhk_anet17_pred.json') + proposal = mmengine.load(args.proposal) + results = proposal['results'] + cuhk_pred = mmengine.load('cuhk_anet17_pred.json')['results'] + + def get_topk(preds, k): + preds.sort(key=lambda x: x['score']) + return preds[-k:] + + for k, v in results.items(): + action_pred = cuhk_pred[k] + top1 = get_topk(action_pred, 1) + top1_label = top1[0]['label'] + new_value = [] + for item in v: + x = dict(label=top1_label) + x.update(item) + new_value.append(x) + results[k] = new_value + proposal['results'] = results + mmengine.dump(proposal, args.det_output) + + +cls_funcs = {'cuhk17_top1': cuhk17_top1} + + +def parse_args(): + parser = argparse.ArgumentParser(description='Report detection mAP for' + 'ActivityNet proposal file') + parser.add_argument('--proposal', type=str, help='proposal file') + parser.add_argument( + '--gt', + type=str, + default='data/ActivityNet/' + 'anet_anno_val.json', + help='groundtruth file') + parser.add_argument( + '--cls', + type=str, + default='cuhk17_top1', + choices=['cuhk17_top1'], + help='the way to assign label for each ' + 'proposal') + parser.add_argument( + '--det-output', + type=str, + default='det_result.json', + help='the path to store detection results') + args = parser.parse_args() + return args + + +def main(): + global args, cls_funcs + args = parse_args() + func = cls_funcs[args.cls] + func() + anet_detection = ActivityNetLocalization( + args.gt, + args.det_output, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + verbose=True) + mAP, average_mAP = anet_detection.evaluate() + print('[RESULTS] Performance on ActivityNet detection task.\n' + f'mAP: {mAP}\nAverage-mAP: {average_mAP}') + + +if __name__ == '__main__': + main() diff --git a/tools/argparse.bash b/tools/argparse.bash new file mode 100644 index 0000000000000000000000000000000000000000..6182e393007568f8d12b7cca1ac3a146f968339a --- /dev/null +++ b/tools/argparse.bash @@ -0,0 +1,103 @@ +#!/usr/bin/env bash + +# Use python's argparse module in shell scripts +# +# The function `argparse` parses its arguments using +# argparse.ArgumentParser; the parser is defined in the function's +# stdin. +# +# Executing ``argparse.bash`` (as opposed to sourcing it) prints a +# script template. +# +# https://github.com/nhoffman/argparse-bash +# MIT License - Copyright (c) 2015 Noah Hoffman +# +# The MIT License (MIT) +# +# Copyright (c) 2015 Noah Hoffman +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +argparse(){ + argparser=$(mktemp 2>/dev/null || mktemp -t argparser) + cat > "$argparser" <> "$argparser" + + cat >> "$argparser" < /dev/null; then + eval $(python "$argparser" "$@") + retval=0 + else + python "$argparser" "$@" + retval=1 + fi + + rm "$argparser" + return $retval +} + +# print a script template when this script is executed +if [[ $0 == *argparse.bash ]]; then + cat < + +```BibTeX +@article{Heilbron2015ActivityNetAL, + title={ActivityNet: A large-scale video benchmark for human activity understanding}, + author={Fabian Caba Heilbron and Victor Escorcia and Bernard Ghanem and Juan Carlos Niebles}, + journal={2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2015}, + pages={961-970} +} +``` + +For basic dataset information, please refer to the official [website](http://activity-net.org/). +For action detection, you can either use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) or extract feature with mmaction2 (which has better performance). +We release both pipeline. +Before we start, please make sure that current working directory is `$MMACTION2/tools/data/activitynet/`. + +## Option 1: Use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) + +### Step 1. Download Annotations + +First of all, you can run the following script to download annotation files. + +```shell +bash download_feature_annotations.sh +``` + +### Step 2. Prepare Videos Features + +Then, you can run the following script to download activitynet features. + +```shell +bash download_features.sh +``` + +### Step 3. Process Annotation Files + +Next, you can run the following script to process the downloaded annotation files for training and testing. +It first merges the two annotation files together and then separates the annoations by `train`, `val` and `test`. + +```shell +python process_annotations.py +``` + +## Option 2: Extract ActivityNet feature using MMAction2 with all videos provided in official [website](http://activity-net.org/) + +### Step 1. Download Annotations + +First of all, you can run the following script to download annotation files. + +```shell +bash download_annotations.sh +``` + +### Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. +The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +Since some videos in the ActivityNet dataset might be no longer available on YouTube, official [website](http://activity-net.org/) has made the full dataset available on Google and Baidu drives. 
+To accommodate missing data requests, you can fill in this [request form](https://docs.google.com/forms/d/e/1FAIpQLSeKaFq9ZfcmZ7W0B0PbEhfbTHY41GeEgwsa7WobJgGUhn4DTQ/viewform) provided on the official [download page](http://activity-net.org/download.html) to get 7-day access for downloading the videos from the drive folders.
+
+We also provide download steps based on the annotations from the [BSN repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation).
+
+```shell
+bash download_bsn_videos.sh
+```
+
+In this case, the download script updates the annotation file after downloading to make sure every video listed in it exists.
+
+### Step 3. Extract RGB and Flow
+
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+
+Use the following script to extract both RGB frames and optical flow.
+
+```shell
+bash extract_frames.sh
+```
+
+The command above generates images whose short edge is 256. If you want to generate images with a short edge of 320 (320p), or with a fixed size of 340x256, you can change the args `--new-short 256` to `--new-short 320` or `--new-width 340 --new-height 256`.
+More details can be found in [prepare dataset](/docs/en/user_guides/prepare_dataset.md).
+
+### Step 4. Generate File List for ActivityNet Finetuning
+
+With the extracted frames, you can generate video-level or clip-level lists of rawframes, which can be used for ActivityNet finetuning.
+
+```shell
+python generate_rawframes_filelist.py
+```
+
+### Step 5. Finetune TSN models on ActivityNet
+
+You can use the ActivityNet configs in `configs/recognition/tsn` to finetune TSN models on ActivityNet.
+You need to use Kinetics-pretrained checkpoints for initialization.
+Both RGB models and Flow models are supported.
+
+### Step 6. Extract ActivityNet Features with Finetuned Checkpoints
+
+After finetuning TSN on ActivityNet, you can use the finetuned checkpoints to extract both RGB and Flow features.
+
+```shell
+python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \
+    /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_train_feat.pkl \
+    --video-list ../../../data/ActivityNet/anet_train_video.txt \
+    --video-root ../../../data/ActivityNet/rawframes \
+    --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \
+    /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_val_feat.pkl \
+    --video-list ../../../data/ActivityNet/anet_val_video.txt \
+    --video-root ../../../data/ActivityNet/rawframes \
+    --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \
+    /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_train_feat.pkl \
+    --video-list ../../../data/ActivityNet/anet_train_video.txt \
+    --video-root ../../../data/ActivityNet/rawframes \
+    --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \
+    /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_val_feat.pkl \
+    --video-list ../../../data/ActivityNet/anet_val_video.txt \
+    --video-root ../../../data/ActivityNet/rawframes \
+    --dump-score
+```
+
+After feature extraction, you can use our post-processing script to concatenate the RGB and Flow features and generate the `100-t X 400-d` features for action detection.
+
+```shell
+python activitynet_feature_postprocessing.py --rgb ../../../data/ActivityNet/rgb_feat --flow ../../../data/ActivityNet/flow_feat --dest ../../../data/ActivityNet/mmaction_feat
+```
+
+## Final Step.
Check Directory Structure + +After the whole data pipeline for ActivityNet preparation, +you will get the features, videos, frames and annotation files. + +In the context of the whole project (for ActivityNet only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ActivityNet + +(if Option 1 used) +โ”‚ โ”‚ โ”œโ”€โ”€ anet_anno_{train,val,test,full}.json +โ”‚ โ”‚ โ”œโ”€โ”€ anet_anno_action.json +โ”‚ โ”‚ โ”œโ”€โ”€ video_info_new.csv +โ”‚ โ”‚ โ”œโ”€โ”€ activitynet_feature_cuhk +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ csv_mean_100 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___c8enCfzqw.csv +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___dXUJsj3yo.csv +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ .. + +(if Option 2 used) +โ”‚ โ”‚ โ”œโ”€โ”€ anet_train_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ anet_val_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ anet_train_clip.txt +โ”‚ โ”‚ โ”œโ”€โ”€ anet_val_clip.txt +โ”‚ โ”‚ โ”œโ”€โ”€ activity_net.v1-3.min.json +โ”‚ โ”‚ โ”œโ”€โ”€ mmaction_feat +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___c8enCfzqw.csv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___dXUJsj3yo.csv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___c8enCfzqw +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00000.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00000.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00000.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. + +``` + +For training and evaluating on ActivityNet, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/activitynet/README_zh-CN.md b/tools/data/activitynet/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..3c852febcb6f7786a3816cf6f11e51bd785dcd0d --- /dev/null +++ b/tools/data/activitynet/README_zh-CN.md @@ -0,0 +1,185 @@ +# ๅ‡†ๅค‡ ActivityNet + +## ็ฎ€ไป‹ + + + +```BibTeX +@article{Heilbron2015ActivityNetAL, + title={ActivityNet: A large-scale video benchmark for human activity understanding}, + author={Fabian Caba Heilbron and Victor Escorcia and Bernard Ghanem and Juan Carlos Niebles}, + journal={2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2015}, + pages={961-970} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](http://activity-net.org/)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅฏนไบŽๆ—ถๅบๅŠจไฝœๆฃ€ๆต‹ไปปๅŠก๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จ่ฟ™ไธช [ไปฃ็ ๅบ“](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) ๆไพ›็š„็ผฉๆ”พ่ฟ‡๏ผˆrescaled๏ผ‰็š„ ActivityNet ็‰นๅพ๏ผŒ +ๆˆ–่€…ไฝฟ็”จ MMAction2 ่ฟ›่กŒ็‰นๅพๆๅ–๏ผˆ่ฟ™ๅฐ†ๅ…ทๆœ‰ๆ›ด้ซ˜็š„็ฒพๅบฆ๏ผ‰ใ€‚MMAction2 ๅŒๆ—ถๆไพ›ไบ†ไปฅไธŠๆ‰€่ฟฐ็š„ไธค็งๆ•ฐๆฎไฝฟ็”จๆต็จ‹ใ€‚ +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/activitynet/`ใ€‚ + +## ้€‰้กน 1๏ผš็”จๆˆทๅฏไปฅไฝฟ็”จ่ฟ™ไธช [ไปฃ็ ๅบ“](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) ๆไพ›็š„็‰นๅพ + +### ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ + +```shell +bash download_feature_annotations.sh +``` + +### ๆญฅ้ชค 2. ๅ‡†ๅค‡่ง†้ข‘็‰นๅพ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝ ActivityNet ็‰นๅพใ€‚ + +```shell +bash download_features.sh +``` + +### ๆญฅ้ชค 3. 
ๅค„็†ๆ ‡ๆณจๆ–‡ไปถ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๅค„็†ไธ‹่ฝฝ็š„ๆ ‡ๆณจๆ–‡ไปถ๏ผŒไปฅไพฟไบŽ่ฎญ็ปƒๅ’Œๆต‹่ฏ•ใ€‚ +่ฏฅ่„šๆœฌไผš้ฆ–ๅ…ˆๅˆๅนถไธคไธชๆ ‡ๆณจๆ–‡ไปถ๏ผŒ็„ถๅŽๅ†ๅฐ†ๅ…ถๅˆ†ไธบ `train`, `val` ๅ’Œ `test` ไธ‰ไธช้ƒจๅˆ†ใ€‚ + +```shell +python process_annotations.py +``` + +## ้€‰้กน 2๏ผšไฝฟ็”จ MMAction2 ๅฏน [ๅฎ˜็ฝ‘](http://activity-net.org/) ๆไพ›็š„่ง†้ข‘่ฟ›่กŒ็‰นๅพๆŠฝๅ– + +### ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ + +```shell +bash download_annotations.sh +``` + +### ๆญฅ้ชค 2. ๅ‡†ๅค‡่ง†้ข‘ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌๅ‡†ๅค‡่ง†้ข‘ๆ•ฐๆฎใ€‚ +่ฏฅไปฃ็ ๅ‚่€ƒ่‡ช [ๅฎ˜ๆ–น็ˆฌ่™ซ](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)๏ผŒ่ฏฅ่ฟ‡็จ‹ๅฐ†ไผš่€—่ดน่พƒๅคšๆ—ถ้—ดใ€‚ + +```shell +bash download_videos.sh +``` + +็”ฑไบŽ ActivityNet ๆ•ฐๆฎ้›†ไธญ็š„ไธ€ไบ›่ง†้ข‘ๅทฒ็ปๅœจ YouTube ๅคฑๆ•ˆ๏ผŒ[ๅฎ˜็ฝ‘](http://activity-net.org/) ๅœจ่ฐทๆญŒ็ฝ‘็›˜ๅ’Œ็™พๅบฆ็ฝ‘็›˜ๆไพ›ไบ†ๅฎŒๆ•ด็š„ๆ•ฐๆฎ้›†ๆ•ฐๆฎใ€‚ +ๅฆ‚ๆžœ็”จๆˆทๆƒณ่ฆ่Žทๅ–ๅคฑๆ•ˆ็š„ๆ•ฐๆฎ้›†๏ผŒๅˆ™้œ€่ฆๅกซๅ†™ [ไธ‹่ฝฝ้กต้ข](http://activity-net.org/download.html) ไธญๆไพ›็š„ [้œ€ๆฑ‚่กจๆ ผ](https://docs.google.com/forms/d/e/1FAIpQLSeKaFq9ZfcmZ7W0B0PbEhfbTHY41GeEgwsa7WobJgGUhn4DTQ/viewform) ไปฅ่Žทๅ– 7 ๅคฉ็š„ไธ‹่ฝฝๆƒ้™ใ€‚ + +MMAction2 ๅŒๆ—ถไนŸๆไพ›ไบ† [BSN ไปฃ็ ๅบ“](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) ็š„ๆ ‡ๆณจๆ–‡ไปถ็š„ไธ‹่ฝฝๆญฅ้ชคใ€‚ + +```shell +bash download_bsn_videos.sh +``` + +ๅฏนไบŽ่ฟ™็งๆƒ…ๅ†ต๏ผŒ่ฏฅไธ‹่ฝฝ่„šๆœฌๅฐ†ๅœจไธ‹่ฝฝๅŽๆ›ดๆ–ฐๆญคๆ ‡ๆณจๆ–‡ไปถ๏ผŒไปฅ็กฎไฟๆฏไธช่ง†้ข‘้ƒฝๅญ˜ๅœจใ€‚ + +### ๆญฅ้ชค 3. ๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตใ€‚ + +```shell +bash extract_frames.sh +``` + +ไปฅไธŠ่„šๆœฌๅฐ†ไผš็”Ÿๆˆ็Ÿญ่พน 256 ๅˆ†่พจ็އ็š„่ง†้ข‘ใ€‚ๅฆ‚ๆžœ็”จๆˆทๆƒณ็”Ÿๆˆ็Ÿญ่พน 320 ๅˆ†่พจ็އ็š„่ง†้ข‘๏ผˆๅณ 320p๏ผ‰๏ผŒๆˆ–่€… 340x256 ็š„ๅ›บๅฎšๅˆ†่พจ็އ๏ผŒ็”จๆˆทๅฏไปฅ้€š่ฟ‡ๆ”นๅ˜ๅ‚ๆ•ฐ็”ฑ `--new-short 256` ่‡ณ `--new-short 320`๏ผŒๆˆ–่€… `--new-width 340 --new-height 256` ่ฟ›่กŒ่ฎพ็ฝฎ +ๆ›ดๅคš็ป†่Š‚ๅฏๅ‚่€ƒ [ๆ•ฐๆฎๅ‡†ๅค‡ๆŒ‡ๅ—](/docs/zh_cn/user_guides/prepare_dataset.md) + +### ๆญฅ้ชค 4. ็”Ÿๆˆ็”จไบŽ ActivityNet ๅพฎ่ฐƒ็š„ๆ–‡ไปถๅˆ—่กจ + +ๆ นๆฎๆŠฝๅ–็š„ๅธง๏ผŒ็”จๆˆทๅฏไปฅ็”Ÿๆˆ่ง†้ข‘็บงๅˆซ๏ผˆvideo-level๏ผ‰ๆˆ–่€…็‰‡ๆฎต็บงๅˆซ๏ผˆclip-level๏ผ‰็š„ๆ–‡ไปถๅˆ—่กจ๏ผŒๅ…ถๅฏ็”จไบŽๅพฎ่ฐƒ ActivityNetใ€‚ + +```shell +python generate_rawframes_filelist.py +``` + +### ๆญฅ้ชค 5. ๅœจ ActivityNet ไธŠๅพฎ่ฐƒ TSN ๆจกๅž‹ + +็”จๆˆทๅฏไฝฟ็”จ `configs/recognition/tsn` ็›ฎๅฝ•ไธญ็š„ ActivityNet ้…็ฝฎๆ–‡ไปถ่ฟ›่กŒ TSN ๆจกๅž‹ๅพฎ่ฐƒใ€‚ +็”จๆˆท้œ€่ฆไฝฟ็”จ Kinetics ็›ธๅ…ณๆจกๅž‹๏ผˆๅŒๆ—ถๆ”ฏๆŒ RGB ๆจกๅž‹ไธŽๅ…‰ๆตๆจกๅž‹๏ผ‰่ฟ›่กŒ้ข„่ฎญ็ปƒใ€‚ + +### ๆญฅ้ชค 6. 
ไฝฟ็”จ้ข„่ฎญ็ปƒๆจกๅž‹่ฟ›่กŒ ActivityNet ็‰นๅพๆŠฝๅ– + +ๅœจ ActivityNet ไธŠๅพฎ่ฐƒ TSN ๆจกๅž‹ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จ่ฏฅๆจกๅž‹่ฟ›่กŒ RGB ็‰นๅพๅ’Œๅ…‰ๆต็‰นๅพ็š„ๆๅ–ใ€‚ + +```shell +python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \ + /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_tarin_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_train_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \ + path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_val_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_val_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \ + /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_tarin_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_train_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \ + /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_val_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_val_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score +``` + +ๅœจๆๅ–ๅฎŒ็‰นๅพๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จๅŽๅค„็†่„šๆœฌๆ•ดๅˆ RGB ็‰นๅพๅ’Œๅ…‰ๆต็‰นๅพ๏ผŒ็”Ÿๆˆ `100-t X 400-d` ็ปดๅบฆ็š„็‰นๅพ็”จไบŽๆ—ถๅบๅŠจไฝœๆฃ€ๆต‹ใ€‚ + +```shell +python activitynet_feature_postprocessing.py --rgb ../../../data/ActivityNet/rgb_feat --flow ../../../data/ActivityNet/flow_feat --dest ../../../data/ActivityNet/mmaction_feat +``` + +## ๆœ€ๅŽไธ€ๆญฅ๏ผšๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅœจๅฎŒๆˆๆ‰€ๆœ‰ ActivityNet ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅ่Žทๅพ—ๅฏนๅบ”็š„็‰นๅพๆ–‡ไปถ๏ผŒRGB + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒActivityNet ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ActivityNet + +(่‹ฅๆ นๆฎ้€‰้กน 1 ่ฟ›่กŒๆ•ฐๆฎๅค„็†) +โ”‚ โ”‚ โ”œโ”€โ”€ anet_anno_{train,val,test,full}.json +โ”‚ โ”‚ โ”œโ”€โ”€ anet_anno_action.json +โ”‚ โ”‚ โ”œโ”€โ”€ video_info_new.csv +โ”‚ โ”‚ โ”œโ”€โ”€ activitynet_feature_cuhk +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ csv_mean_100 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___c8enCfzqw.csv +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___dXUJsj3yo.csv +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ .. + +(่‹ฅๆ นๆฎ้€‰้กน 2 ่ฟ›่กŒๆ•ฐๆฎๅค„็†) +โ”‚ โ”‚ โ”œโ”€โ”€ anet_train_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ anet_val_video.txt +โ”‚ โ”‚ โ”œโ”€โ”€ anet_train_clip.txt +โ”‚ โ”‚ โ”œโ”€โ”€ anet_val_clip.txt +โ”‚ โ”‚ โ”œโ”€โ”€ activity_net.v1-3.min.json +โ”‚ โ”‚ โ”œโ”€โ”€ mmaction_feat +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___c8enCfzqw.csv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___dXUJsj3yo.csv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v___c8enCfzqw +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00000.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00000.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00000.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. + +``` + +ๅ…ณไบŽๅฏน ActivityNet ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒๅฏไปฅๅ‚่€ƒ [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md). 
diff --git a/tools/data/activitynet/action_name.csv b/tools/data/activitynet/action_name.csv new file mode 100644 index 0000000000000000000000000000000000000000..ff639a9ff7e3e157ce8440b13f58d23d159b6c10 --- /dev/null +++ b/tools/data/activitynet/action_name.csv @@ -0,0 +1,201 @@ +action +Applying sunscreen +Arm wrestling +Assembling bicycle +BMX +Baking cookies +Baton twirling +Beach soccer +Beer pong +Blow-drying hair +Blowing leaves +Playing ten pins +Braiding hair +Building sandcastles +Bullfighting +Calf roping +Camel ride +Canoeing +Capoeira +Carving jack-o-lanterns +Changing car wheel +Cleaning sink +Clipping cat claws +Croquet +Curling +Cutting the grass +Decorating the Christmas tree +Disc dog +Doing a powerbomb +Doing crunches +Drum corps +Elliptical trainer +Doing fencing +Fixing the roof +Fun sliding down +Futsal +Gargling mouthwash +Grooming dog +Hand car wash +Hanging wallpaper +Having an ice cream +Hitting a pinata +Hula hoop +Hurling +Ice fishing +Installing carpet +Kite flying +Kneeling +Knitting +Laying tile +Longboarding +Making a cake +Making a lemonade +Making an omelette +Mooping floor +Painting fence +Painting furniture +Peeling potatoes +Plastering +Playing beach volleyball +Playing blackjack +Playing congas +Playing drums +Playing ice hockey +Playing pool +Playing rubik cube +Powerbocking +Putting in contact lenses +Putting on shoes +Rafting +Raking leaves +Removing ice from car +Riding bumper cars +River tubing +Rock-paper-scissors +Rollerblading +Roof shingle removal +Rope skipping +Running a marathon +Scuba diving +Sharpening knives +Shuffleboard +Skiing +Slacklining +Snow tubing +Snowboarding +Spread mulch +Sumo +Surfing +Swimming +Swinging at the playground +Table soccer +Throwing darts +Trimming branches or hedges +Tug of war +Using the monkey bar +Using the rowing machine +Wakeboarding +Waterskiing +Waxing skis +Welding +Drinking coffee +Zumba +Doing kickboxing +Doing karate +Tango +Putting on makeup +High jump +Playing bagpipes +Cheerleading +Wrapping presents +Cricket +Clean and jerk +Preparing pasta +Bathing dog +Discus throw +Playing field hockey +Grooming horse +Preparing salad +Playing harmonica +Playing saxophone +Chopping wood +Washing face +Using the pommel horse +Javelin throw +Spinning +Ping-pong +Making a sandwich +Brushing hair +Playing guitarra +Doing step aerobics +Drinking beer +Playing polo +Snatch +Paintball +Long jump +Cleaning windows +Brushing teeth +Playing flauta +Tennis serve with ball bouncing +Bungee jumping +Triple jump +Horseback riding +Layup drill in basketball +Vacuuming floor +Cleaning shoes +Doing nails +Shot put +Fixing bicycle +Washing hands +Ironing clothes +Using the balance beam +Shoveling snow +Tumbling +Using parallel bars +Getting a tattoo +Rock climbing +Smoking hookah +Shaving +Getting a piercing +Springboard diving +Playing squash +Playing piano +Dodgeball +Smoking a cigarette +Sailing +Getting a haircut +Playing lacrosse +Cumbia +Tai chi +Painting +Mowing the lawn +Shaving legs +Walking the dog +Hammer throw +Skateboarding +Polishing shoes +Ballet +Hand washing clothes +Plataform diving +Playing violin +Breakdancing +Windsurfing +Hopscotch +Doing motocross +Mixing drinks +Starting a campfire +Belly dance +Removing curlers +Archery +Volleyball +Playing water polo +Playing racquetball +Kayaking +Polishing forniture +Playing kickball +Using uneven bars +Washing dishes +Pole vault +Playing accordion +Playing badminton diff --git a/tools/data/activitynet/activitynet_feature_postprocessing.py 
b/tools/data/activitynet/activitynet_feature_postprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..789d8583c3e5817076e433a9c18b22781e4ca41c --- /dev/null +++ b/tools/data/activitynet/activitynet_feature_postprocessing.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import multiprocessing +import os +import os.path as osp + +import numpy as np +import scipy.interpolate +from mmengine import dump, load + +args = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='ANet Feature Prepare') + parser.add_argument('--rgb', default='', help='rgb feature root') + parser.add_argument('--flow', default='', help='flow feature root') + parser.add_argument('--dest', default='', help='dest root') + parser.add_argument('--output-format', default='csv') + args = parser.parse_args() + return args + + +def pool_feature(data, num_proposals=100, num_sample_bins=3, pool_type='mean'): + """Pool features with arbitrary temporal length. + + Args: + data (list[np.ndarray] | np.ndarray): Features of an untrimmed video, + with arbitrary temporal length. + num_proposals (int): The temporal dim of pooled feature. Default: 100. + num_sample_bins (int): How many points to sample to get the feature + vector at one timestamp. Default: 3. + pool_type (str): Type of pooling to pool features. Choices are + ['mean', 'max']. Default: 'mean'. + + Returns: + np.ndarray: The pooled feature with shape num_proposals x feature_dim. + """ + if len(data) == 1: + return np.concatenate([data] * num_proposals) + x_range = list(range(len(data))) + f = scipy.interpolate.interp1d(x_range, data, axis=0) + eps = 1e-4 + start, end = eps, len(data) - 1 - eps + anchor_size = (end - start) / num_proposals + ptr = start + feature = [] + for _ in range(num_proposals): + x_new = [ + ptr + i / num_sample_bins * anchor_size + for i in range(num_sample_bins) + ] + y_new = f(x_new) + if pool_type == 'mean': + y_new = np.mean(y_new, axis=0) + elif pool_type == 'max': + y_new = np.max(y_new, axis=0) + else: + raise NotImplementedError('Unsupported pool type') + feature.append(y_new) + ptr += anchor_size + feature = np.stack(feature) + return feature + + +def merge_feat(name): + # concatenate rgb feat and flow feat for a single sample + rgb_feat = load(osp.join(args.rgb, name)) + flow_feat = load(osp.join(args.flow, name)) + rgb_feat = pool_feature(rgb_feat) + flow_feat = pool_feature(flow_feat) + feat = np.concatenate([rgb_feat, flow_feat], axis=-1) + if not osp.exists(args.dest): + os.system(f'mkdir -p {args.dest}') + if args.output_format == 'pkl': + dump(feat, osp.join(args.dest, name)) + elif args.output_format == 'csv': + feat = feat.tolist() + lines = [] + line0 = ','.join([f'f{i}' for i in range(400)]) + lines.append(line0) + for line in feat: + lines.append(','.join([f'{x:.4f}' for x in line])) + with open(osp.join(args.dest, name.replace('.pkl', '.csv')), 'w') as f: + f.write('\n'.join(lines)) + + +def main(): + global args + args = parse_args() + rgb_feat = [file for file in os.listdir(args.rgb) if file.endswith('.pkl')] + flow_feat = [ + file for file in os.listdir(args.flow) if file.endswith('.pkl') + ] + assert set(rgb_feat) == set(flow_feat) + # for feat in rgb_feat: + # merge_feat(feat) + pool = multiprocessing.Pool(32) + pool.map(merge_feat, rgb_feat) + + +if __name__ == '__main__': + main() diff --git a/tools/data/activitynet/convert_proposal_format.py b/tools/data/activitynet/convert_proposal_format.py new file mode 100644 index 
0000000000000000000000000000000000000000..b6f69fe66531444e7163c7aa15ecdedd5b39e8da --- /dev/null +++ b/tools/data/activitynet/convert_proposal_format.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This file converts the output proposal file of proposal generator (BSN, BMN) +into the input proposal file of action classifier (Currently supports SSN and +P-GCN, not including TSN, I3D etc.).""" +import argparse + +import mmengine +import numpy as np + +from mmaction.evaluation import pairwise_temporal_iou + + +def load_annotations(ann_file): + """Load the annotation according to ann_file into video_infos.""" + video_infos = [] + anno_database = mmengine.load(ann_file) + for video_name in anno_database: + video_info = anno_database[video_name] + video_info['video_name'] = video_name + video_infos.append(video_info) + return video_infos + + +def import_ground_truth(video_infos, activity_index): + """Read ground truth data from video_infos.""" + ground_truth = {} + for video_info in video_infos: + video_id = video_info['video_name'][2:] + this_video_ground_truths = [] + for ann in video_info['annotations']: + t_start, t_end = ann['segment'] + label = activity_index[ann['label']] + this_video_ground_truths.append([t_start, t_end, label]) + ground_truth[video_id] = np.array(this_video_ground_truths) + return ground_truth + + +def import_proposals(result_dict): + """Read predictions from result dict.""" + proposals = {} + num_proposals = 0 + for video_id in result_dict: + result = result_dict[video_id] + this_video_proposals = [] + for proposal in result: + t_start, t_end = proposal['segment'] + score = proposal['score'] + this_video_proposals.append([t_start, t_end, score]) + num_proposals += 1 + proposals[video_id] = np.array(this_video_proposals) + return proposals, num_proposals + + +def dump_formatted_proposal(video_idx, video_id, num_frames, fps, gts, + proposals, tiou, t_overlap_self, + formatted_proposal_file): + """dump the formatted proposal file, which is the input proposal file of + action classifier (e.g: SSN). + + Args: + video_idx (int): Index of video. + video_id (str): ID of video. + num_frames (int): Total frames of the video. + fps (float): Fps of the video. + gts (np.ndarray[float]): t_start, t_end and label of groundtruths. + proposals (np.ndarray[float]): t_start, t_end and score of proposals. + tiou (np.ndarray[float]): 2-dim array with IoU ratio. + t_overlap_self (np.ndarray[float]): 2-dim array with overlap_self + (union / self_len) ratio. + formatted_proposal_file (open file object): Open file object of + formatted_proposal_file. 
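+
+    The block written for one video looks roughly like this (values are
+    illustrative)::
+
+        #0
+        video_id
+        4500
+        30.0
+        1
+        7 12.5 48.0
+        2
+        7 0.95 0.98 12.0 47.5
+        0 0 0 50.0 60.0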
+ """ + + formatted_proposal_file.write( + f'#{video_idx}\n{video_id}\n{num_frames}\n{fps}\n{gts.shape[0]}\n') + for gt in gts: + formatted_proposal_file.write(f'{int(gt[2])} {gt[0]} {gt[1]}\n') + formatted_proposal_file.write(f'{proposals.shape[0]}\n') + + best_iou = np.amax(tiou, axis=0) + best_iou_index = np.argmax(tiou, axis=0) + best_overlap = np.amax(t_overlap_self, axis=0) + best_overlap_index = np.argmax(t_overlap_self, axis=0) + + for i in range(proposals.shape[0]): + index_iou = best_iou_index[i] + index_overlap = best_overlap_index[i] + label_iou = gts[index_iou][2] + label_overlap = gts[index_overlap][2] + if label_iou != label_overlap: + label = label_iou if label_iou != 0 else label_overlap + else: + label = label_iou + if best_iou[i] == 0 and best_overlap[i] == 0: + formatted_proposal_file.write( + f'0 0 0 {proposals[i][0]} {proposals[i][1]}\n') + else: + formatted_proposal_file.write( + f'{int(label)} {best_iou[i]} {best_overlap[i]} ' + f'{proposals[i][0]} {proposals[i][1]}\n') + + +def parse_args(): + parser = argparse.ArgumentParser(description='convert proposal format') + parser.add_argument( + '--ann-file', + type=str, + default='../../../data/ActivityNet/anet_anno_val.json', + help='name of annotation file') + parser.add_argument( + '--activity-index-file', + type=str, + default='../../../data/ActivityNet/anet_activity_indexes_val.txt', + help='name of activity index file') + parser.add_argument( + '--proposal-file', + type=str, + default='../../../results.json', + help='name of proposal file, which is the' + 'output of proposal generator (BMN)') + parser.add_argument( + '--formatted-proposal-file', + type=str, + default='../../../anet_val_formatted_proposal.txt', + help='name of formatted proposal file, which is the' + 'input of action classifier (SSN)') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + formatted_proposal_file = open(args.formatted_proposal_file, 'w') + + # The activity index file is constructed according to + # 'https://github.com/activitynet/ActivityNet/blob/master/Evaluation/eval_classification.py' + activity_index, class_idx = {}, 0 + for line in open(args.activity_index_file).readlines(): + activity_index[line.strip()] = class_idx + class_idx += 1 + + video_infos = load_annotations(args.ann_file) + ground_truth = import_ground_truth(video_infos, activity_index) + proposal, num_proposals = import_proposals( + mmengine.load(args.proposal_file)['results']) + video_idx = 0 + + for video_info in video_infos: + video_id = video_info['video_name'][2:] + num_frames = video_info['duration_frame'] + fps = video_info['fps'] + tiou, t_overlap = pairwise_temporal_iou( + proposal[video_id][:, :2].astype(float), + ground_truth[video_id][:, :2].astype(float), + calculate_overlap_self=True) + + dump_formatted_proposal(video_idx, video_id, num_frames, fps, + ground_truth[video_id], proposal[video_id], + tiou, t_overlap, formatted_proposal_file) + video_idx += 1 + formatted_proposal_file.close() diff --git a/tools/data/activitynet/download.py b/tools/data/activitynet/download.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5ea866256e622742857d8979780e9bee2584f9 --- /dev/null +++ b/tools/data/activitynet/download.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This scripts is copied from +# https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501 +# The code is licensed under the MIT licence. 
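+# Example usage (assumes youtube-dl is installed; normally invoked through
+# download_videos.sh or download_bsn_videos.sh in tools/data/activitynet/):
+#   python download.py          # use the official annotation file
+#   python download.py --bsn    # use the BSN-style annotations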
+import argparse +import os +import ssl +import subprocess + +import mmengine +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context +data_file = '../../../data/ActivityNet' +output_dir = f'{data_file}/videos' + + +def parse_args(): + parser = argparse.ArgumentParser(description='ActivityNet downloader') + parser.add_argument( + '--bsn', + action='store_true', + help='download for BSN annotation or official one') + args = parser.parse_args() + return args + + +def download_clip(video_identifier, + output_filename, + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + """ + # Defensive argument checking. + assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + + if not os.path.exists(output_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate', + '-f', 'mp4', '-o', + '"%s"' % output_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Fail' + else: + break + # Check if the video was successfully saved. + status = os.path.exists(output_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(youtube_id, output_dir): + """Wrapper for parallel processing purposes.""" + # we do this to align with names in annotations + output_filename = os.path.join(output_dir, 'v_' + youtube_id + '.mp4') + if os.path.exists(output_filename): + status = tuple(['v_' + youtube_id, True, 'Exists']) + return status + + downloaded, log = download_clip(youtube_id, output_filename) + status = tuple(['v_' + youtube_id, downloaded, log]) + return status + + +def parse_activitynet_annotations(input_csv, is_bsn_case=False): + """Returns a list of YoutubeID. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'video,numFrame,seconds,fps,rfps,subset,featureFrame' + returns: + ------- + youtube_ids: list + List of all YoutubeIDs in ActivityNet. + + """ + if is_bsn_case: + lines = open(input_csv).readlines() + lines = lines[1:] + # YoutubeIDs do not have prefix `v_` + youtube_ids = [x.split(',')[0][2:] for x in lines] + else: + data = mmengine.load(anno_file)['database'] + youtube_ids = list(data.keys()) + + return youtube_ids + + +def main(input_csv, output_dir, anno_file, num_jobs=24, is_bsn_case=False): + # Reading and parsing ActivityNet. + youtube_ids = parse_activitynet_annotations(input_csv, is_bsn_case) + + # Creates folders where videos will be saved later. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Download all clips. + if num_jobs == 1: + status_list = [] + for index in youtube_ids: + status_list.append(download_clip_wrapper(index, output_dir)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_clip_wrapper)(index, output_dir) + for index in youtube_ids) + + # Save download report. 
+ mmengine.dump(status_list, 'download_report.json') + annotation = mmengine.load(anno_file) + downloaded = {status[0]: status[1] for status in status_list} + annotation = {k: v for k, v in annotation.items() if downloaded[k]} + + if is_bsn_case: + anno_file_bak = anno_file.replace('.json', '_bak.json') + os.rename(anno_file, anno_file_bak) + mmengine.dump(annotation, anno_file) + + +if __name__ == '__main__': + args = parse_args() + is_bsn_case = args.bsn + if is_bsn_case: + video_list = f'{data_file}/video_info_new.csv' + anno_file = f'{data_file}/anet_anno_action.json' + else: + video_list = f'{data_file}/activity_net.v1-3.min.json' + anno_file = video_list + main(video_list, output_dir, anno_file, 24, is_bsn_case) diff --git a/tools/data/activitynet/download_annotations.sh b/tools/data/activitynet/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..8af7b938a7325cd6418df81908c9fa7aee5880bf --- /dev/null +++ b/tools/data/activitynet/download_annotations.sh @@ -0,0 +1,12 @@ +DATA_DIR="../../../data/ActivityNet/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/files/activity_net.v1-3.min.json + +cd - diff --git a/tools/data/activitynet/download_bsn_videos.sh b/tools/data/activitynet/download_bsn_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..b0b187959bcdf1d861081ff2a3fecc364475132f --- /dev/null +++ b/tools/data/activitynet/download_bsn_videos.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate activitynet +pip install --upgrade youtube-dl +pip install mmcv + +DATA_DIR="../../../data/ActivityNet" +python download.py --bsn + +source deactivate activitynet +conda remove -n activitynet --all diff --git a/tools/data/activitynet/download_feature_annotations.sh b/tools/data/activitynet/download_feature_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..f59452165ac5c0f0c99e1dd2a1d84d8f7fc4e5b8 --- /dev/null +++ b/tools/data/activitynet/download_feature_annotations.sh @@ -0,0 +1,16 @@ +DATA_DIR="../../../data/ActivityNet/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget https://raw.githubusercontent.com/wzmsltw/BSN-boundary-sensitive-network/master/data/activitynet_annotations/anet_anno_action.json + +wget https://raw.githubusercontent.com/wzmsltw/BSN-boundary-sensitive-network/master/data/activitynet_annotations/video_info_new.csv + +wget https://download.openmmlab.com/mmaction/localization/anet_activity_indexes_val.txt + +cd - diff --git a/tools/data/activitynet/download_features.sh b/tools/data/activitynet/download_features.sh new file mode 100644 index 0000000000000000000000000000000000000000..c668d77c35e19b66c4cccc9bffdb2c76fbce9813 --- /dev/null +++ b/tools/data/activitynet/download_features.sh @@ -0,0 +1,11 @@ +DATA_DIR="../../../data/ActivityNet/activitynet_feature_cuhk/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ISemndlSDS2FtqQOKL0t3Cjj9yk2yznF' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1ISemndlSDS2FtqQOKL0t3Cjj9yk2yznF" -O "csv_mean_100.zip" && rm -rf /tmp/cookies.txt + +unzip csv_mean_100.zip -d ${DATA_DIR}/ +rm csv_mean_100.zip diff --git a/tools/data/activitynet/download_videos.sh b/tools/data/activitynet/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..047c0f199a71d47d781768f66c1d42495814cb6d --- /dev/null +++ b/tools/data/activitynet/download_videos.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate activitynet +pip install --upgrade youtube-dl +pip install mmcv + +DATA_DIR="../../../data/ActivityNet" +python download.py + +source deactivate activitynet +conda remove -n activitynet --all diff --git a/tools/data/activitynet/environment.yml b/tools/data/activitynet/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..050d6e6a78397564a694fea90f374886da2c2914 --- /dev/null +++ b/tools/data/activitynet/environment.yml @@ -0,0 +1,36 @@ +name: activitynet +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/activitynet/extract_frames.sh b/tools/data/activitynet/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..a3496ac9645774afa828c2918b79e5426290de92 --- /dev/null +++ b/tools/data/activitynet/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +cd ../ +python build_rawframes.py ../../data/ActivityNet/videos/ ../../data/ActivityNet/rawframes/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for train set" + +cd activitynet/ diff --git a/tools/data/activitynet/generate_rawframes_filelist.py b/tools/data/activitynet/generate_rawframes_filelist.py new file mode 100644 index 0000000000000000000000000000000000000000..7a08130d75cd9fe9e32be2377e508b21cf87108e --- /dev/null +++ b/tools/data/activitynet/generate_rawframes_filelist.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
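+# Run from tools/data/activitynet/ after frame extraction; takes no arguments.
+# Each line of the generated video-level lists has the (illustrative) form
+#   <frame_dir> <num_frames> <label>
+# and each line of the clip-level lists has the form
+#   <frame_dir> <start_frame> <num_frames> <label>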
+import json +import os +import os.path as osp + +data_file = '../../../data/ActivityNet' +video_list = f'{data_file}/video_info_new.csv' +anno_file = f'{data_file}/anet_anno_action.json' +rawframe_dir = f'{data_file}/rawframes' +action_name_list = 'action_name.csv' + +train_rawframe_dir = rawframe_dir +val_rawframe_dir = rawframe_dir + +json_file = f'{data_file}/activity_net.v1-3.min.json' + + +def generate_rawframes_filelist(): + load_dict = json.load(open(json_file)) + + anet_labels = open(action_name_list).readlines() + anet_labels = [x.strip() for x in anet_labels[1:]] + + train_dir_list = [ + osp.join(train_rawframe_dir, x) for x in os.listdir(train_rawframe_dir) + ] + val_dir_list = [ + osp.join(val_rawframe_dir, x) for x in os.listdir(val_rawframe_dir) + ] + + def simple_label(anno): + label = anno[0]['label'] + return anet_labels.index(label) + + def count_frames(dir_list, video): + for dir_name in dir_list: + if video in dir_name: + return osp.basename(dir_name), len(os.listdir(dir_name)) + return None, None + + database = load_dict['database'] + training = {} + validation = {} + key_dict = {} + + for k in database: + data = database[k] + subset = data['subset'] + + if subset in ['training', 'validation']: + annotations = data['annotations'] + label = simple_label(annotations) + if subset == 'training': + dir_list = train_dir_list + data_dict = training + else: + dir_list = val_dir_list + data_dict = validation + + else: + continue + + gt_dir_name, num_frames = count_frames(dir_list, k) + if gt_dir_name is None: + continue + data_dict[gt_dir_name] = [num_frames, label] + key_dict[gt_dir_name] = k + + train_lines = [ + k + ' ' + str(training[k][0]) + ' ' + str(training[k][1]) + for k in training + ] + val_lines = [ + k + ' ' + str(validation[k][0]) + ' ' + str(validation[k][1]) + for k in validation + ] + + with open(osp.join(data_file, 'anet_train_video.txt'), 'w') as fout: + fout.write('\n'.join(train_lines)) + with open(osp.join(data_file, 'anet_val_video.txt'), 'w') as fout: + fout.write('\n'.join(val_lines)) + + def clip_list(k, anno, video_anno): + duration = anno['duration'] + num_frames = video_anno[0] + fps = num_frames / duration + segs = anno['annotations'] + lines = [] + for seg in segs: + segment = seg['segment'] + label = seg['label'] + label = anet_labels.index(label) + start, end = int(segment[0] * fps), int(segment[1] * fps) + if end > num_frames - 1: + end = num_frames - 1 + newline = f'{k} {start} {end - start + 1} {label}' + lines.append(newline) + return lines + + train_clips, val_clips = [], [] + for k in training: + train_clips.extend(clip_list(k, database[key_dict[k]], training[k])) + for k in validation: + val_clips.extend(clip_list(k, database[key_dict[k]], validation[k])) + + with open(osp.join(data_file, 'anet_train_clip.txt'), 'w') as fout: + fout.write('\n'.join(train_clips)) + with open(osp.join(data_file, 'anet_val_clip.txt'), 'w') as fout: + fout.write('\n'.join(val_clips)) + + +if __name__ == '__main__': + generate_rawframes_filelist() diff --git a/tools/data/activitynet/label_map.txt b/tools/data/activitynet/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c3646c2665359bacb823bb1d6ede1b1e2f973 --- /dev/null +++ b/tools/data/activitynet/label_map.txt @@ -0,0 +1,200 @@ +Applying sunscreen +Arm wrestling +Assembling bicycle +BMX +Baking cookies +Baton twirling +Beach soccer +Beer pong +Blow-drying hair +Blowing leaves +Playing ten pins +Braiding hair +Building sandcastles +Bullfighting +Calf roping +Camel ride 
+Canoeing +Capoeira +Carving jack-o-lanterns +Changing car wheel +Cleaning sink +Clipping cat claws +Croquet +Curling +Cutting the grass +Decorating the Christmas tree +Disc dog +Doing a powerbomb +Doing crunches +Drum corps +Elliptical trainer +Doing fencing +Fixing the roof +Fun sliding down +Futsal +Gargling mouthwash +Grooming dog +Hand car wash +Hanging wallpaper +Having an ice cream +Hitting a pinata +Hula hoop +Hurling +Ice fishing +Installing carpet +Kite flying +Kneeling +Knitting +Laying tile +Longboarding +Making a cake +Making a lemonade +Making an omelette +Mooping floor +Painting fence +Painting furniture +Peeling potatoes +Plastering +Playing beach volleyball +Playing blackjack +Playing congas +Playing drums +Playing ice hockey +Playing pool +Playing rubik cube +Powerbocking +Putting in contact lenses +Putting on shoes +Rafting +Raking leaves +Removing ice from car +Riding bumper cars +River tubing +Rock-paper-scissors +Rollerblading +Roof shingle removal +Rope skipping +Running a marathon +Scuba diving +Sharpening knives +Shuffleboard +Skiing +Slacklining +Snow tubing +Snowboarding +Spread mulch +Sumo +Surfing +Swimming +Swinging at the playground +Table soccer +Throwing darts +Trimming branches or hedges +Tug of war +Using the monkey bar +Using the rowing machine +Wakeboarding +Waterskiing +Waxing skis +Welding +Drinking coffee +Zumba +Doing kickboxing +Doing karate +Tango +Putting on makeup +High jump +Playing bagpipes +Cheerleading +Wrapping presents +Cricket +Clean and jerk +Preparing pasta +Bathing dog +Discus throw +Playing field hockey +Grooming horse +Preparing salad +Playing harmonica +Playing saxophone +Chopping wood +Washing face +Using the pommel horse +Javelin throw +Spinning +Ping-pong +Making a sandwich +Brushing hair +Playing guitarra +Doing step aerobics +Drinking beer +Playing polo +Snatch +Paintball +Long jump +Cleaning windows +Brushing teeth +Playing flauta +Tennis serve with ball bouncing +Bungee jumping +Triple jump +Horseback riding +Layup drill in basketball +Vacuuming floor +Cleaning shoes +Doing nails +Shot put +Fixing bicycle +Washing hands +Ironing clothes +Using the balance beam +Shoveling snow +Tumbling +Using parallel bars +Getting a tattoo +Rock climbing +Smoking hookah +Shaving +Getting a piercing +Springboard diving +Playing squash +Playing piano +Dodgeball +Smoking a cigarette +Sailing +Getting a haircut +Playing lacrosse +Cumbia +Tai chi +Painting +Mowing the lawn +Shaving legs +Walking the dog +Hammer throw +Skateboarding +Polishing shoes +Ballet +Hand washing clothes +Plataform diving +Playing violin +Breakdancing +Windsurfing +Hopscotch +Doing motocross +Mixing drinks +Starting a campfire +Belly dance +Removing curlers +Archery +Volleyball +Playing water polo +Playing racquetball +Kayaking +Polishing forniture +Playing kickball +Using uneven bars +Washing dishes +Pole vault +Playing accordion +Playing badminton diff --git a/tools/data/activitynet/process_annotations.py b/tools/data/activitynet/process_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..dbe3e91e6ad67b33b3f484aa498e0ccf949efbe1 --- /dev/null +++ b/tools/data/activitynet/process_annotations.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
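+# Splits anet_anno_action.json into per-subset annotation files
+# (anet_anno_{train,val,test,full}.json) according to the subset column of
+# video_info_new.csv, attaching the fps/rfps values recorded in that csv.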
+"""This file processes the annotation files and generates proper annotation +files for localizers.""" +import json + +import numpy as np + + +def load_json(file): + with open(file) as json_file: + data = json.load(json_file) + return data + + +data_file = '../../../data/ActivityNet' +info_file = f'{data_file}/video_info_new.csv' +ann_file = f'{data_file}/anet_anno_action.json' + +anno_database = load_json(ann_file) + +video_record = np.loadtxt(info_file, dtype=str, delimiter=',', skiprows=1) + +video_dict_train = {} +video_dict_val = {} +video_dict_test = {} +video_dict_full = {} + +for _, video_item in enumerate(video_record): + video_name = video_item[0] + video_info = anno_database[video_name] + video_subset = video_item[5] + video_info['fps'] = video_item[3].astype(np.float64) + video_info['rfps'] = video_item[4].astype(np.float64) + video_dict_full[video_name] = video_info + if video_subset == 'training': + video_dict_train[video_name] = video_info + elif video_subset == 'testing': + video_dict_test[video_name] = video_info + elif video_subset == 'validation': + video_dict_val[video_name] = video_info + +print(f'full subset video numbers: {len(video_record)}') + +with open(f'{data_file}/anet_anno_train.json', 'w') as result_file: + json.dump(video_dict_train, result_file) + +with open(f'{data_file}/anet_anno_val.json', 'w') as result_file: + json.dump(video_dict_val, result_file) + +with open(f'{data_file}/anet_anno_test.json', 'w') as result_file: + json.dump(video_dict_test, result_file) + +with open(f'{data_file}/anet_anno_full.json', 'w') as result_file: + json.dump(video_dict_full, result_file) diff --git a/tools/data/activitynet/tsn_extract_flow_feat_config.py b/tools/data/activitynet/tsn_extract_flow_feat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3305a17501221307efe27df9bd472d8ac0bb9542 --- /dev/null +++ b/tools/data/activitynet/tsn_extract_flow_feat_config.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = [ + 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py' +] + +clip_len = 5 +model = dict( + backbone=dict(in_channels=2 * clip_len), + data_preprocessor=dict(mean=[128], std=[128])) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='UntrimmedSampleFrames', clip_len=clip_len, clip_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW_Flow'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + test_mode=True)) + +test_evaluator = [] + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/activitynet/tsn_extract_rgb_feat_config.py b/tools/data/activitynet/tsn_extract_rgb_feat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f1cd53a4f42ddf95cd789c5a8cdba0170c617d75 --- /dev/null +++ b/tools/data/activitynet/tsn_extract_rgb_feat_config.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+_base_ = [ + 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'RawframeDataset' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='UntrimmedSampleFrames', clip_len=1, clip_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = [] + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/activitynet/tsn_extract_video_feat_config.py b/tools/data/activitynet/tsn_extract_video_feat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..8e323262f27d51cf4dc12c6a3a7231f79576c6f8 --- /dev/null +++ b/tools/data/activitynet/tsn_extract_video_feat_config.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = [ + 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UntrimmedSampleFrames', clip_len=1, clip_interval=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = [] + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/anno_txt2json.py b/tools/data/anno_txt2json.py new file mode 100644 index 0000000000000000000000000000000000000000..6f9790641c4594551fc179b01665c60366926a16 --- /dev/null +++ b/tools/data/anno_txt2json.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import mmengine + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert txt annotation list to json') + parser.add_argument( + 'annofile', type=str, help='the txt annotation file to convert') + parser.add_argument( + '--format', + type=str, + default='rawframes', + choices=['rawframes', 'videos'], + help='the format of the txt annotation file') + parser.add_argument( + '--output', + type=str, + default=None, + help=( + 'the output file name, use annofile.replace(\'.txt\', \'.json\') ' + 'if the arg value is None')) + args = parser.parse_args() + + return args + + +def lines2dictlist(lines, format): + """Convert lines in 'txt' format to dictionaries in 'json' format. + Currently support single-label and multi-label. + + Example of a single-label rawframes annotation txt file: + + .. 
code-block:: txt + + (frame_dir num_frames label) + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + + Example of a multi-label rawframes annotation txt file: + + .. code-block:: txt + + (frame_dir num_frames label1 label2 ...) + some/directory-1 163 1 3 5 + some/directory-2 122 1 2 + some/directory-3 258 2 + + Example of a single-label videos annotation txt file: + + .. code-block:: txt + + (filename label) + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + + Example of a multi-label videos annotation txt file: + + .. code-block:: txt + + (filename label1 label2 ...) + some/path/000.mp4 1 3 5 + some/path/001.mp4 1 4 8 + some/path/002.mp4 2 4 9 + + Args: + lines (list): List of lines in 'txt' label format. + format (str): Data format, choices are 'rawframes' and 'videos'. + + Returns: + list[dict]: For rawframes format, each dict has keys: frame_dir, + total_frames, label; for videos format, each diction has keys: + filename, label. + """ + lines = [x.split() for x in lines] + if format == 'rawframes': + data = [ + dict( + frame_dir=line[0], + total_frames=int(line[1]), + label=[int(x) for x in line[2:]]) for line in lines + ] + elif format == 'videos': + data = [ + dict(filename=line[0], label=[int(x) for x in line[1:]]) + for line in lines + ] + return data + + +if __name__ == '__main__': + # convert txt anno list to json + args = parse_args() + lines = open(args.annofile).readlines() + lines = [x.strip() for x in lines] + result = lines2dictlist(lines, args.format) + if args.output is None: + args.output = args.annofile.replace('.txt', '.json') + mmengine.dump(result, args.output) diff --git a/tools/data/ava/AVA_annotation_explained.md b/tools/data/ava/AVA_annotation_explained.md new file mode 100644 index 0000000000000000000000000000000000000000..dceab5bf55c666d9fa5d21e561e08ef80a6edc10 --- /dev/null +++ b/tools/data/ava/AVA_annotation_explained.md @@ -0,0 +1,34 @@ +# AVA Annotation Explained + +In this section, we explain the annotation format of AVA in details: + +``` +mmaction2 +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ava +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_train.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_val.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_test.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_train_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_val_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_train_excluded_timestamps_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_val_excluded_timestamps_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_action_list_v2.1_for_activitynet_2018.pbtxt +``` + +## The proposals generated by human detectors + +In the annotation folder, `ava_dense_proposals_[train/val/test].FAIR.recall_93.9.pkl` are human proposals generated by a human detector. They are used in training, validation and testing respectively. Take `ava_dense_proposals_train.FAIR.recall_93.9.pkl` as an example. It is a dictionary of size 203626. The key consists of the `videoID` and the `timestamp`. For example, the key `-5KQ66BBWC4,0902` means the values are the detection results for the frame at the 902-nd second in the video `-5KQ66BBWC4`. The values in the dictionary are numpy arrays with shape $$N \\times 5$$ , $$N$$ is the number of detected human bounding boxes in the corresponding frame. The format of bounding box is $$\[x_1, y_1, x_2, y_2, score\], 0 \\le x_1, y_1, x_2, w_2, score \\le 1$$. 
$$(x_1, y_1)$$ indicates the top-left corner of the bounding box, $$(x_2, y_2)$$ indicates the bottom-right corner of the bounding box; $$(0, 0)$$ indicates the top-left corner of the image, while $$(1, 1)$$ indicates the bottom-right corner of the image. + +## The ground-truth labels for spatio-temporal action detection + +In the annotation folder, `ava_[train/val]_v[2.1/2.2].csv` are ground-truth labels for spatio-temporal action detection, which are used during training & validation. Take `ava_train_v2.1.csv` as an example, it is a csv file with 837318 lines, each line is the annotation for a human instance in one frame. For example, the first line in `ava_train_v2.1.csv` is `'-5KQ66BBWC4,0902,0.077,0.151,0.283,0.811,80,1'`: the first two items `-5KQ66BBWC4` and `0902` indicate that it corresponds to the 902-nd second in the video `-5KQ66BBWC4`. The next four items ($$\[0.077(x_1), 0.151(y_1), 0.283(x_2), 0.811(y_2)\]$$) indicates the location of the bounding box, the bbox format is the same as human proposals. The next item `80` is the action label. The last item `1` is the ID of this bounding box. + +## Excluded timestamps + +`ava_[train/val]_excludes_timestamps_v[2.1/2.2].csv` contains excluded timestamps which are not used during training or validation. The format is `video_id, second_idx` . + +## Label map + +`ava_action_list_v[2.1/2.2]_for_activitynet_[2018/2019].pbtxt` contains the label map of the AVA dataset, which maps the action name to the label index. diff --git a/tools/data/ava/README.md b/tools/data/ava/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b8667620bbe4bbeada903c14be88e2c70b09ab57 --- /dev/null +++ b/tools/data/ava/README.md @@ -0,0 +1,148 @@ +# Preparing AVA + +## Introduction + + + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={6047--6056}, + year={2018} +} +``` + +For basic dataset information, please refer to the official [website](https://research.google.com/ava/index.html). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ava/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +This command will download `ava_v2.1.zip` for AVA `v2.1` annotation. If you need the AVA `v2.2` annotation, you can try the following script. + +```shell +VERSION=2.2 bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, use the following script to prepare videos. The codes are adapted from the [official crawler](https://github.com/cvdfoundation/ava-dataset). +Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +Or you can use the following command to downloading AVA videos in parallel using a python script. + +```shell +bash download_videos_parallel.sh +``` + +Note that if you happen to have sudoer or have [GNU parallel](https://www.gnu.org/software/parallel/) on your machine, +you can speed up the procedure by downloading in parallel. + +```shell +# sudo apt-get install parallel +bash download_videos_gnu_parallel.sh +``` + +## Step 3. 
Cut Videos + +Cut each video from its 15th to 30th minute and make them at 30 fps. + +```shell +bash cut_videos.sh +``` + +## Step 4. Extract RGB and Flow + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/ava_extracted/ +ln -s /mnt/SSD/ava_extracted/ ../data/ava/rawframes/ +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using ffmpeg by the following script. + +```shell +bash extract_rgb_frames_ffmpeg.sh +``` + +If both are required, run the following script to extract frames. + +```shell +bash extract_frames.sh +``` + +## Step 5. Fetch Proposal Files + +The scripts are adapted from FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks). + +Run the following scripts to fetch the pre-computed proposal list. + +```shell +bash fetch_ava_proposals.sh +``` + +## Step 6. Folder Structure + +After the whole data pipeline for AVA preparation. +you can get the rawframes (RGB + Flow), videos and annotation files for AVA. + +In the context of the whole project (for AVA only), the *minimal* folder structure will look like: +(*minimal* means that some data are not necessary: for example, you may want to evaluate AVA using the original video format.) + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ava +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_train.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_val.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_test.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_train_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_val_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_train_excluded_timestamps_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_val_excluded_timestamps_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_action_list_v2.1_for_activitynet_2018.pbtxt +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 053oq2xB3oU.mkv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 0f39OWEqJ24.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ videos_15min +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 053oq2xB3oU.mkv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 0f39OWEqJ24.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 053oq2xB3oU +| โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00001.jpg +| โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00002.jpg +| โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +``` + +For training and evaluating on AVA, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Reference + +1. O. 
Tange (2018): GNU Parallel 2018, March 2018, https://doi.org/10.5281/zenodo.1146014 diff --git a/tools/data/ava/README_zh-CN.md b/tools/data/ava/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ad6d69478372c8f85c186ef636d6b13e42988754 --- /dev/null +++ b/tools/data/ava/README_zh-CN.md @@ -0,0 +1,134 @@ +# ๅ‡†ๅค‡ AVA + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={6047--6056}, + year={2018} +} +``` + +่ฏทๅ‚็…ง [ๅฎ˜ๆ–น็ฝ‘็ซ™](https://research.google.com/ava/index.html) ไปฅ่Žทๅ–ๆ•ฐๆฎ้›†ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅผ€ๅง‹ไน‹ๅ‰๏ผŒ็”จๆˆท้œ€็กฎไฟๅฝ“ๅ‰็›ฎๅฝ•ไธบ `$MMACTION2/tools/data/ava/`ใ€‚ + +## 1. ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถๅนถ่ฟ›่กŒ้ข„ๅค„็†๏ผš + +```shell +bash download_annotations.sh +``` + +่ฟ™ไธ€ๅ‘ฝไปคๅฐ†ไธ‹่ฝฝ `ava_v2.1.zip` ไปฅๅพ—ๅˆฐ AVA v2.1 ๆ ‡ๆณจๆ–‡ไปถใ€‚ๅฆ‚็”จๆˆท้œ€่ฆ AVA v2.2 ๆ ‡ๆณจๆ–‡ไปถ๏ผŒๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌ๏ผš + +```shell +VERSION=2.2 bash download_annotations.sh +``` + +## 2. ไธ‹่ฝฝ่ง†้ข‘ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌๅ‡†ๅค‡่ง†้ข‘๏ผŒ่ง†้ข‘ๅ‡†ๅค‡ไปฃ็ ไฟฎๆ”น่‡ช [ๅฎ˜ๆ–น็ˆฌ่™ซ](https://github.com/cvdfoundation/ava-dataset)ใ€‚ +ๆณจๆ„่ฟ™ไธ€ๆญฅ้ชคๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ดใ€‚ + +```shell +bash download_videos.sh +``` + +ไบฆๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌ๏ผŒไฝฟ็”จ python ๅนถ่กŒไธ‹่ฝฝ AVA ๆ•ฐๆฎ้›†่ง†้ข‘๏ผš + +```shell +bash download_videos_parallel.sh +``` + +## 3. ๆˆชๅ–่ง†้ข‘ + +ๆˆชๅ–ๆฏไธช่ง†้ข‘ไธญ็š„ 15 ๅˆฐ 30 ๅˆ†้’Ÿ๏ผŒ่ฎพๅฎšๅธง็އไธบ 30ใ€‚ + +```shell +bash cut_videos.sh +``` + +## 4. ๆๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅœจๆๅ–ไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทๆœ‰่ถณๅคŸ็š„ SSD ็ฉบ้—ด๏ผŒ้‚ฃไนˆๅปบ่ฎฎๅฐ†่ง†้ข‘ๆŠฝๅ–ไธบ RGB ๅธงไปฅๆๅ‡ I/O ๆ€ง่ƒฝใ€‚็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌไธบๆŠฝๅ–ๅพ—ๅˆฐ็š„ๅธงๆ–‡ไปถๅคนๅปบ็ซ‹่ฝฏ่ฟžๆŽฅ๏ผš + +```shell +# ๆ‰ง่กŒไปฅไธ‹่„šๆœฌ (ๅ‡่ฎพ SSD ่ขซๆŒ‚่ฝฝๅœจ "/mnt/SSD/") +mkdir /mnt/SSD/ava_extracted/ +ln -s /mnt/SSD/ava_extracted/ ../data/ava/rawframes/ +``` + +ๅฆ‚ๆžœ็”จๆˆทๅชไฝฟ็”จ RGB ๅธง๏ผˆ็”ฑไบŽๅ…‰ๆตๆๅ–้žๅธธ่€—ๆ—ถ๏ผ‰๏ผŒๅฏๆ‰ง่กŒไปฅไธ‹่„šๆœฌไฝฟ็”จ denseflow ๆๅ– RGB ๅธง๏ผš + +```shell +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆœชๅฎ‰่ฃ… denseflow๏ผŒๅฏๆ‰ง่กŒไปฅไธ‹่„šๆœฌไฝฟ็”จ ffmpeg ๆๅ– RGB ๅธง๏ผš + +```shell +bash extract_rgb_frames_ffmpeg.sh +``` + +ๅฆ‚ๆžœๅŒๆ—ถ้œ€่ฆ RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅฏไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌๆŠฝๅธง๏ผš + +```shell +bash extract_frames.sh +``` + +## 5. ไธ‹่ฝฝ AVA ไธŠไบบไฝ“ๆฃ€ๆต‹็ป“ๆžœ + +ไปฅไธ‹่„šๆœฌไฟฎๆ”น่‡ช [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks)ใ€‚ + +ๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌไธ‹่ฝฝ AVA ไธŠ้ข„ๅ…ˆ่ฎก็ฎ—็š„ไบบไฝ“ๆฃ€ๆต‹็ป“ๆžœ๏ผš + +```shell +bash fetch_ava_proposals.sh +``` + +## 6. 
็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆ•ดๅฎŒๆˆ AVA ็š„ๆ•ฐๆฎๅค„็†ๅŽ๏ผŒๅฐ†ๅพ—ๅˆฐๅธงๆ–‡ไปถๅคน๏ผˆRGB ๅธงๅ’Œๅ…‰ๆตๅธง๏ผ‰๏ผŒ่ง†้ข‘ไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช้กน็›ฎ็›ฎๅฝ•ไธ‹๏ผˆไป…้’ˆๅฏน AVA๏ผ‰๏ผŒ*ๆœ€็ฎ€* ็›ฎๅฝ•็ป“ๆž„ๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ava +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_train.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_val.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_dense_proposals_test.FAIR.recall_93.9.pkl +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_train_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_val_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_train_excluded_timestamps_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_val_excluded_timestamps_v2.1.csv +โ”‚ โ”‚ | โ”œโ”€โ”€ ava_action_list_v2.1_for_activitynet_2018.pbtxt +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 053oq2xB3oU.mkv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 0f39OWEqJ24.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ videos_15min +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 053oq2xB3oU.mkv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 0f39OWEqJ24.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 053oq2xB3oU +| โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00001.jpg +| โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00002.jpg +| โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +``` + +ๅ…ณไบŽ AVA ๆ•ฐๆฎ้›†ไธŠ็š„่ฎญ็ปƒไธŽๆต‹่ฏ•๏ผŒ่ฏทๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/ava/cut_videos.sh b/tools/data/ava/cut_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..6912216543c1c9bc3cb1fe691aa9561ed03fa050 --- /dev/null +++ b/tools/data/ava/cut_videos.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Cut each video from its 15th to 30th minute. + +IN_DATA_DIR="../../../data/ava/videos" +OUT_DATA_DIR="../../../data/ava/videos_15min" + +if [[ ! -d "${OUT_DATA_DIR}" ]]; then + echo "${OUT_DATA_DIR} doesn't exist. Creating it."; + mkdir -p ${OUT_DATA_DIR} +fi + +for video in $(ls -A1 -U ${IN_DATA_DIR}/*) +do + out_name="${OUT_DATA_DIR}/${video##*/}" + if [ ! -f "${out_name}" ]; then + ffmpeg -ss 900 -t 901 -i "${video}" -r 30 -strict experimental "${out_name}" + fi +done diff --git a/tools/data/ava/download_annotations.sh b/tools/data/ava/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..3839d6016819fcc689c13f27b849d5b4cf9c8fea --- /dev/null +++ b/tools/data/ava/download_annotations.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +VERSION=${VERSION:-"2.1"} +DATA_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://research.google.com/ava/download/ava_v${VERSION}.zip +unzip -j ava_v${VERSION}.zip -d ${DATA_DIR}/ +rm ava_v${VERSION}.zip diff --git a/tools/data/ava/download_videos.sh b/tools/data/ava/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..a26d19be793d1f394ddaef6bd17f398cbe63c432 --- /dev/null +++ b/tools/data/ava/download_videos.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/videos" +ANNO_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt -P ${ANNO_DIR} + +cat ${ANNO_DIR}/ava_file_names_trainval_v2.1.txt | +while read vid; + do wget -c "https://s3.amazonaws.com/ava-dataset/trainval/${vid}" -P ${DATA_DIR}; done + +echo "Downloading finished." diff --git a/tools/data/ava/download_videos_gnu_parallel.sh b/tools/data/ava/download_videos_gnu_parallel.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e4d37d19336123dfcd80bc4c308fad6ad0cdd78 --- /dev/null +++ b/tools/data/ava/download_videos_gnu_parallel.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/videos" +ANNO_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt -P ${ANNO_DIR} + +# sudo apt-get install parallel +# parallel downloading to speed up +awk '{print "https://s3.amazonaws.com/ava-dataset/trainval/"$0}' ${ANNO_DIR}/ava_file_names_trainval_v2.1.txt | +parallel -j 8 wget -c -q {} -P ${DATA_DIR} + +echo "Downloading finished." diff --git a/tools/data/ava/download_videos_parallel.py b/tools/data/ava/download_videos_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..9ed622f7b202b79ff78e03b3b340177cdd13294f --- /dev/null +++ b/tools/data/ava/download_videos_parallel.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os.path as osp +import subprocess + +import mmengine +from joblib import Parallel, delayed + +URL_PREFIX = 'https://s3.amazonaws.com/ava-dataset/trainval/' + + +def download_video(video_url, output_dir, num_attempts=5): + video_file = osp.basename(video_url) + output_file = osp.join(output_dir, video_file) + + status = False + + if not osp.exists(output_file): + command = ['wget', '-c', video_url, '-P', output_dir] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Downloading Failed' + else: + break + + status = osp.exists(output_file) + return status, 'Downloaded' + + +def main(source_file, output_dir, num_jobs=24, num_attempts=5): + mmengine.mkdir_or_exist(output_dir) + video_list = open(source_file).read().strip().split('\n') + video_list = [osp.join(URL_PREFIX, video) for video in video_list] + + if num_jobs == 1: + status_list = [] + for video in video_list: + video_list.append(download_video(video, output_dir, num_attempts)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_video)(video, output_dir, num_attempts) + for video in video_list) + + mmengine.dump(status_list, 'download_report.json') + + +if __name__ == '__main__': + description = 'Helper script for downloading AVA videos' + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + 'source_file', type=str, help='TXT file containing the video filename') + parser.add_argument( + 'output_dir', + type=str, + help='Output directory where videos will be saved') + parser.add_argument('-n', '--num-jobs', type=int, default=24) + parser.add_argument('--num-attempts', type=int, default=5) + main(**vars(parser.parse_args())) diff --git a/tools/data/ava/download_videos_parallel.sh b/tools/data/ava/download_videos_parallel.sh new file mode 100644 index 0000000000000000000000000000000000000000..56810a72a2c73f8b4226f95b41ae9fdda4898cc5 --- /dev/null +++ b/tools/data/ava/download_videos_parallel.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/videos" +ANNO_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt -P ${ANNO_DIR} + +python download_videos_parallel.py ${ANNO_DIR}/ava_file_names_trainval_v2.1.txt ${DATA_DIR} diff --git a/tools/data/ava/extract_frames.sh b/tools/data/ava/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..31be7ff066e36b006370eb4a84b7e9b822ef835c --- /dev/null +++ b/tools/data/ava/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ava/videos_15min/ ../../data/ava/rawframes/ --task both --level 1 --flow-type tvl1 --mixed-ext +echo "Raw frames (RGB and Flow) Generated" +cd ava/ diff --git a/tools/data/ava/extract_rgb_frames.sh b/tools/data/ava/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..afcd8fd7651369e7ca6d98f6c7c750d2f095947d --- /dev/null +++ b/tools/data/ava/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ava/videos_15min/ ../../data/ava/rawframes/ --task rgb --level 1 --mixed-ext +echo "Genearte raw frames (RGB only)" + +cd ava/ diff --git a/tools/data/ava/extract_rgb_frames_ffmpeg.sh b/tools/data/ava/extract_rgb_frames_ffmpeg.sh new file mode 100644 index 0000000000000000000000000000000000000000..1a3af335e83b50a93646c49835a9f415d886574e --- /dev/null +++ b/tools/data/ava/extract_rgb_frames_ffmpeg.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Extract frames from videos. + +IN_DATA_DIR="../../../data/ava/videos_15min" +OUT_DATA_DIR="../../../data/ava/rawframes" + +if [[ ! -d "${OUT_DATA_DIR}" ]]; then + echo "${OUT_DATA_DIR} doesn't exist. 
Creating it."; + mkdir -p ${OUT_DATA_DIR} +fi + +for video in $(ls -A1 -U ${IN_DATA_DIR}/*) +do + video_name=${video##*/} + + if [[ $video_name = *".webm" ]]; then + video_name=${video_name::-5} + else + video_name=${video_name::-4} + fi + + out_video_dir=${OUT_DATA_DIR}/${video_name} + mkdir -p "${out_video_dir}" + + out_name="${out_video_dir}/img_%05d.jpg" + + ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}" +done diff --git a/tools/data/ava/fetch_ava_proposals.sh b/tools/data/ava/fetch_ava_proposals.sh new file mode 100644 index 0000000000000000000000000000000000000000..18fdb67be4d216a2bcdf8e2f12ff40507b001921 --- /dev/null +++ b/tools/data/ava/fetch_ava_proposals.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/annotations" + +wget https://download.openmmlab.com/mmaction/dataset/ava/ava_dense_proposals_train.FAIR.recall_93.9.pkl -P ${DATA_DIR} +wget https://download.openmmlab.com/mmaction/dataset/ava/ava_dense_proposals_val.FAIR.recall_93.9.pkl -P ${DATA_DIR} +wget https://download.openmmlab.com/mmaction/dataset/ava/ava_dense_proposals_test.FAIR.recall_93.9.pkl -P ${DATA_DIR} diff --git a/tools/data/ava/label_map.txt b/tools/data/ava/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0547b81309d9aa2538e88321b3fc5ecdeb9096 --- /dev/null +++ b/tools/data/ava/label_map.txt @@ -0,0 +1,60 @@ +1: bend/bow (at the waist) +3: crouch/kneel +4: dance +5: fall down +6: get up +7: jump/leap +8: lie/sleep +9: martial art +10: run/jog +11: sit +12: stand +13: swim +14: walk +15: answer phone +17: carry/hold (an object) +20: climb (e.g., a mountain) +22: close (e.g., a door, a box) +24: cut +26: dress/put on clothing +27: drink +28: drive (e.g., a car, a truck) +29: eat +30: enter +34: hit (an object) +36: lift/pick up +37: listen (e.g., to music) +38: open (e.g., a window, a car door) +41: play musical instrument +43: point to (an object) +45: pull (an object) +46: push (an object) +47: put down +48: read +49: ride (e.g., a bike, a car, a horse) +51: sail boat +52: shoot +54: smoke +56: take a photo +57: text on/look at a cellphone +58: throw +59: touch (an object) +60: turn (e.g., a screwdriver) +61: watch (e.g., TV) +62: work on a computer +63: write +64: fight/hit (a person) +65: give/serve (an object) to (a person) +66: grab (a person) +67: hand clap +68: hand shake +69: hand wave +70: hug (a person) +72: kiss (a person) +73: lift (a person) +74: listen to (a person) +76: push (another person) +77: sing to (e.g., self, a person, a group) +78: take (an object) from (a person) +79: talk to (e.g., self, a person, a group) +80: watch (a person) diff --git a/tools/data/ava_kinetics/README.md b/tools/data/ava_kinetics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e636b5dd39b99cbdefe865c15baaac9efe1dba05 --- /dev/null +++ b/tools/data/ava_kinetics/README.md @@ -0,0 +1,173 @@ +# Preparing AVA-Kinetics + +## Introduction + + + +```BibTeX +@article{li2020ava, + title={The ava-kinetics localized human actions video dataset}, + author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, + journal={arXiv preprint arXiv:2005.00214}, + year={2020} +} +``` + +For basic dataset information, please refer to the official [website](https://research.google.com/ava/index.html). +AVA-Kinetics dataset is a crossover between the AVA Actions and Kinetics datasets. You may want to first prepare the AVA datasets. 
In this file, we provide commands to prepare the Kinetics part and merge the two parts together. + +For model training, we will keep reading from raw frames for the AVA part, but read from videos using `decord` for the Kinetics part to accelerate training. + +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ava_kinetics/`. + +## Step 1. Prepare the Kinetics700 dataset + +The Kinetics part of the AVA-Kinetics dataset are sampled from the Kinetics-700 dataset. + +It is best if you have prepared the Kinetics-700 dataset (only videos required) following +[Preparing Kinetics](https://github.com/open-mmlab/mmaction2/tree/master/tools/data/kinetics). We will also have alternative method to prepare these videos if you do not have enough storage (coming soon). + +We will need the videos of this dataset (`$MMACTION2/data/kinetics700/videos_train`) and the videos file list (`$MMACTION2/data/kinetics700/kinetics700_train_list_videos.txt`), which is generated by [Step 4 in Preparing Kinetics](https://github.com/open-mmlab/mmaction2/tree/master/tools/data/kinetics#step-4-generate-file-list) + +The format of the file list should be: + +``` +Path_to_video_1 label_1\n +Path_to_video_2 label_2\n +... +Path_to_video_n label_n\n +``` + +The timestamp (start and end of the video) must be contained. For example: + +``` +class602/o3lCwWyyc_s_000012_000022.mp4 602\n +``` + +It means that this video clip is the 12th to 22nd seconds of the original video. It is okay if some videos are missing, and we will ignore them in the next steps. + +## Step 2. Download Annotations + +Download the annotation tar file (recall that the directory should be located at `$MMACTION2/tools/data/ava_kinetics/`). + +```shell +wget https://storage.googleapis.com/deepmind-media/Datasets/ava_kinetics_v1_0.tar.gz +tar xf ava_kinetics_v1_0.tar.gz && rm ava_kinetics_v1_0.tar.gz +``` + +You should have the `ava_kinetics_v1_0` folder at `$MMACTION2/tools/data/ava_kinetics/`. + +## Step 3. Cut Videos + +Use `cut_kinetics.py` to find the desired videos from the Kinetics-700 dataset and trim them to contain only annotated clips. Currently we only use the train set of the Kinetics part to improve training. Validation on the Kinetics part will come soon. + +Here is the script: + +```shell +python3 cut_kinetics.py --avakinetics_anotation=$AVAKINETICS_ANOTATION \ + --kinetics_list=$KINETICS_LIST \ + --avakinetics_root=$AVAKINETICS_ROOT \ + [--num_workers=$NUM_WORKERS ] +``` + +Arguments: + +- `avakinetics_anotation`: the directory to ava-kinetics anotations. Defaults to `./ava_kinetics_v1_0`. +- `kinetics_list`: the path to the videos file list as mentioned in Step 1. If you have prepared the Kinetics700 dataset following `mmaction2`, it should be `$MMACTION2/data/kinetics700/kinetics700_train_list_videos.txt`. +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `num_workers`: number of workers used to cut videos. Defaults to -1 and use all available cpus. + +There should be about 100k videos. It is OK if some videos are missing and we will ignore them in the next steps. + +## Step 4. Extract RGB Frames + +This step is similar to Step 4 in [Preparing AVA](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/ava#step-4-extract-rgb-and-flow). 
+ +Here we provide a script to extract RGB frames using ffmpeg: + +```shell +python3 extract_rgb_frames.py --avakinetics_root=$AVAKINETICS_ROOT \ + [--num_workers=$NUM_WORKERS ] +``` + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `num_workers`: number of workers used to extract frames. Defaults to -1 and use all available cpus. + +If you have installed denseflow, you can also use `build_rawframes.py` to extract RGB frames: + +```shell +python3 ../build_rawframes.py ../../../data/ava_kinetics/videos/ ../../../data/ava_kinetics/rawframes/ --task rgb --level 1 --mixed-ext +``` + +## Step 5. Prepare Annotations + +Use `prepare_annotation.py` to prepare the training annotations. It will generate a `kinetics_train.csv` file containning the spatial-temporal annotations for the Kinetics part, localting at `$AVAKINETICS_ROOT`. + +Here is the script: + +```shell +python3 prepare_annotation.py --avakinetics_anotation=$AVAKINETICS_ANOTATION \ + --avakinetics_root=$AVAKINETICS_ROOT \ + [--num_workers=$NUM_WORKERS] +``` + +Arguments: + +- `avakinetics_anotation`: the directory to ava-kinetics anotations. Defaults to `./ava_kinetics_v1_0`. +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `num_workers`: number of workers used to prepare annotations. Defaults to -1 and use all available cpus. + +## Step 6. Fetch Proposal Files + +The pre-computed proposals for AVA dataset are provided by FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks). For the Kinetics part, we use `Cascade R-CNN X-101-64x4d-FPN` from [mmdetection](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth) to fetch the proposals. Here is the script: + +```shell +python3 fetch_proposal.py --avakinetics_root=$AVAKINETICS_ROOT \ + --datalist=$DATALIST \ + --picklepath=$PICKLEPATH \ + [--config=$CONFIG ] \ + [--checkpoint=$CHECKPOINT ] + +``` + +It will generate a `kinetics_proposal.pkl` file at `$MMACTION2/data/ava_kinetics/`. + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `datalist`: path to the `kinetics_train.csv` file generated at Step 3. +- `picklepath`: path to save the extracted proposal pickle file. +- `config`: the config file for the human detection model. Defaults to `X-101-64x4d-FPN.py`. +- `checkpoint`: the checkpoint for the human detection model. Defaults to the `mmdetection` pretraining checkpoint. + +## Step 7. Merge AVA to AVA-Kinetics + +Now we are done with the preparations for the Kinetics part. We need to merge the AVA part into the `ava_kinetics` folder (assuming you have AVA dataset ready at `$MMACTION2/data/ava`). First we make a copy of the AVA anotation to the `ava_kinetics` folder (recall that you are at `$MMACTION2/tools/data/ava_kinetics/`): + +```shell +cp -r ../../../data/ava/annotations/ ../../../data/ava_kinetics/ +``` + +Next we merge the generated anotation files of the Kinetics part to AVA. Please check: you should have two files `kinetics_train.csv` and `kinetics_proposal.pkl` at `$MMACTION2/data/ava_kinetics/` generated from Step 5 and Step 6. 
Run the following script to merge these two files into `$MMACTION2/data/ava_kinetics/annotations/ava_train_v2.2.csv` and `$MMACTION2/data/ava_kinetics/annotations/ava_dense_proposals_train.FAIR.recall_93.9.pkl` respectively. + +```shell +python3 merge_annotations.py --avakinetics_root=$AVAKINETICS_ROOT +``` + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. + +Finally, we need to merge the rawframes of AVA part. You can either copy/move them or generate soft links. The following script is an example to use soft links: + +```shell +python3 softlink_ava.py --avakinetics_root=$AVAKINETICS_ROOT \ + --ava_root=$AVA_ROOT +``` + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `ava_root`: the directory to save the ava dataset. Defaults to `$MMACTION2/data/ava`. diff --git a/tools/data/ava_kinetics/X-101-64x4d-FPN.py b/tools/data/ava_kinetics/X-101-64x4d-FPN.py new file mode 100644 index 0000000000000000000000000000000000000000..6a27ac7aa90e58e24a916432e692d3e371527b02 --- /dev/null +++ b/tools/data/ava_kinetics/X-101-64x4d-FPN.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNeXt', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'), + groups=64, + base_width=4), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + 
fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict( + type='LoadImageFromFile', + file_client_args=dict(backend='disk')), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ])) + +test_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox', + format_only=False) + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/ava_kinetics/cut_kinetics.py b/tools/data/ava_kinetics/cut_kinetics.py new file mode 100644 index 0000000000000000000000000000000000000000..f06459bf75827f5afa0f3a26e48d9e815a8e1854 --- /dev/null +++ b/tools/data/ava_kinetics/cut_kinetics.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import multiprocessing +import os +from collections import defaultdict +from typing import List + +import decord + + +def get_kinetics_frames(kinetics_anotation_file: str) -> dict: + """Given the AVA-kinetics anotation file, return a lookup to map the video + id and the the set of timestamps involved of this video id. + + Args: + kinetics_anotation_file (str): Path to the AVA-like anotation file for + the kinetics subset. + Returns: + dict: the dict keys are the kinetics videos' video id. The values are + the set of timestamps involved. + """ + with open(kinetics_anotation_file) as f: + anotated_frames = [i.split(',') for i in f.readlines()] + anotated_frames = [i for i in anotated_frames if len(i) == 7] + anotated_frames = [(i[0], int(float(i[1]))) for i in anotated_frames] + + frame_lookup = defaultdict(set) + for video_id, timestamp in anotated_frames: + frame_lookup[video_id].add(timestamp) + return frame_lookup + + +def filter_missing_videos(kinetics_list: str, frame_lookup: dict) -> dict: + """Given the kinetics700 dataset list, remove the video ids from the lookup + that are missing videos or frames. + + Args: + kinetics_list (str): Path to the kinetics700 dataset list. + The content of the list should be: + ``` + Path_to_video1 label_1\n + Path_to_video2 label_2\n + ... + Path_to_videon label_n\n + ``` + The start and end of the video must be contained in the filename. + For example: + ``` + class602/o3lCwWyyc_s_000012_000022.mp4\n + ``` + frame_lookup (dict): the dict from `get_kinetics_frames`. + Returns: + dict: the dict keys are the kinetics videos' video id. 
The values are + the a list of tuples: + (start_of_the_video, end_of_the_video, video_path) + """ + video_lookup = defaultdict(set) + with open(kinetics_list) as f: + for line in f.readlines(): + video_path = line.split(' ')[0] # remove label information + video_name = video_path.split('/')[-1] # get the file name + video_name = video_name.split('.')[0] # remove file extensions + video_name = video_name.split('_') + video_id = '_'.join(video_name[:-2]) + if video_id not in frame_lookup: + continue + + start, end = int(video_name[-2]), int(video_name[-1]) + frames = frame_lookup[video_id] + frames = [frame for frame in frames if start < frame < end] + if len(frames) == 0: + continue + + start, end = max(start, min(frames) - 2), min(end, max(frames) + 2) + video_lookup[video_id].add((start, end, video_path)) + + # Some frame ids exist in multiple videos in the Kinetics dataset. + # The reason is the part of one video may fall into different categories. + # Remove the duplicated records + for video in video_lookup: + if len(video_lookup[video]) == 1: + continue + info_list = list(video_lookup[video]) + removed_list = [] + for i, info_i in enumerate(info_list): + start_i, end_i, _ = info_i + for j in range(i + 1, len(info_list)): + start_j, end_j, _ = info_list[j] + if start_i <= start_j and end_j <= end_i: + removed_list.append(j) + elif start_j <= start_i and end_i <= end_j: + removed_list.append(i) + new_list = [] + for i, info in enumerate(info_list): + if i not in removed_list: + new_list.append(info) + video_lookup[video] = set(new_list) + return video_lookup + + +template = ('ffmpeg -ss %d -t %d -accurate_seek -i' + ' %s -r 30 -avoid_negative_ts 1 %s') + + +def generate_cut_cmds(video_lookup: dict, data_root: str) -> List[str]: + cmds = [] + for video_id in video_lookup: + for start, end, video_path in video_lookup[video_id]: + start0 = int(video_path.split('_')[-2]) + new_path = '%s/%s_%06d_%06d.mp4' % (data_root, video_id, start, + end) + cmd = template % (start - start0, end - start, video_path, + new_path) + cmds.append(cmd) + return cmds + + +def run_cmd(cmd): + os.system(cmd) + return + + +def remove_failed_video(video_path: str) -> None: + """Given the path to the video, delete the video if it cannot be read or if + the actual length of the video is 0.75 seconds shorter than expected.""" + try: + v = decord.VideoReader(video_path) + fps = v.get_avg_fps() + num_frames = len(v) + x = video_path.split('.')[0].split('_') + time = int(x[-1]) - int(x[-2]) + if num_frames < (time - 3 / 4) * fps: + os.remove(video_path) + except: # noqa: E722 + os.remove(video_path) + return + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_anotation', + type=str, + default='./ava_kinetics_v1_0', + help='the directory to ava-kinetics anotations') + p.add_argument( + '--kinetics_list', + type=str, + help='the datalist of the kinetics700 training videos') + p.add_argument( + '--num_workers', + type=int, + default=-1, + help='number of workers used for multiprocessing') + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') + args = p.parse_args() + + if args.num_workers > 0: + num_workers = args.num_workers + else: + num_workers = max(multiprocessing.cpu_count() - 1, 1) + + # Find videos from the Kinetics700 dataset required for AVA-Kinetics + kinetics_train = args.avakinetics_anotation + '/kinetics_train_v1.0.csv' + frame_lookup = get_kinetics_frames(kinetics_train) + 
video_lookup = filter_missing_videos(args.kinetics_list, frame_lookup) + + root = args.avakinetics_root + os.makedirs(root, exist_ok=True) + video_path = root + '/videos/' + os.makedirs(video_path, exist_ok=True) + all_cmds = generate_cut_cmds(video_lookup, video_path) + + # Cut and save the videos for AVA-Kinetics + pool = multiprocessing.Pool(num_workers) + _ = pool.map(run_cmd, all_cmds) + + # Remove failed videos + videos = os.listdir(video_path) + videos = ['%s/%s' % (video_path, video) for video in videos] + _ = pool.map(remove_failed_video, videos) diff --git a/tools/data/ava_kinetics/extract_rgb_frames.py b/tools/data/ava_kinetics/extract_rgb_frames.py new file mode 100644 index 0000000000000000000000000000000000000000..fa2d83b472f5773b8d5517b009b733703e50fe28 --- /dev/null +++ b/tools/data/ava_kinetics/extract_rgb_frames.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import multiprocessing +import os + + +def extract_rgb(video_name, frame_path, video_path): + video_id = video_name.split('.')[0] + os.makedirs('%s/%s' % (frame_path, video_id), exist_ok=True) + cmd = 'ffmpeg -i %s/%s -r 30 -q:v 1 %s/%s' % (video_path, video_name, + frame_path, video_id) + cmd += '/img_%05d.jpg' + return cmd + + +def run_cmd(cmd): + os.system(cmd) + return + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') + p.add_argument( + '--num_workers', + type=int, + default=-1, + help='number of workers used for multiprocessing') + args = p.parse_args() + + if args.num_workers > 0: + num_workers = args.num_workers + else: + num_workers = max(multiprocessing.cpu_count() - 1, 1) + + root = args.avakinetics_root + video_path = root + '/videos/' + frame_path = root + '/rawframes/' + os.makedirs(frame_path, exist_ok=True) + + all_cmds = [ + extract_rgb(video_name, frame_path, video_path) + for video_name in os.listdir(video_path) + ] + + pool = multiprocessing.Pool(num_workers) + out = pool.map(run_cmd, all_cmds) diff --git a/tools/data/ava_kinetics/fetch_proposal.py b/tools/data/ava_kinetics/fetch_proposal.py new file mode 100644 index 0000000000000000000000000000000000000000..a0dd932faf8698d0152246b0ca0e34bef11bafec --- /dev/null +++ b/tools/data/ava_kinetics/fetch_proposal.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
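+# Runs an mmdetection human detector (Cascade R-CNN X-101-64x4d-FPN by
+# default) over the AVA-Kinetics keyframes, spawning one worker process per
+# GPU. Person detections (label 0) with score > 0.7 are kept, their boxes
+# normalised by the image size, and the per-GPU results are merged into a
+# single pickle keyed by "video_id,timestamp", matching the format of the
+# AVA dense proposal files.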
+import argparse +import multiprocessing as mp +import os +import pickle + +import numpy as np +from mmdet.apis import inference_detector, init_detector +from mmdet.utils import register_all_modules +from PIL import Image + + +def get_vid_from_path(path): + video_id = path.split('/')[-1] + video_id = video_id.split('_')[:-2] + return '_'.join(video_id) + + +def prepare_det_lookup(datalist, frame_root): + with open(datalist) as f: + records = f.readlines() + det_lookup = {} + for record in records: + record = record.split(',') + folder_path = record[0] + video_id = get_vid_from_path(folder_path) + frame_id = int(record[1]) + for idx in range(frame_id - 1, frame_id + 2): + proposal_id = '%s,%04d' % (video_id, idx) + det_lookup[proposal_id] = '%s/%s' % (frame_root, folder_path) + return det_lookup + + +def single_worker(rank, det_lookup, args): + detect_list = list(det_lookup) + detect_sublist = [ + detect_list[i] for i in range(len(detect_list)) + if i % args.num_gpus == rank + ] + + # register all modules in mmdet into the registries + register_all_modules() + model = init_detector( + args.config, args.checkpoint, device='cuda:%d' % rank) + + lookup = {} + for count, key in enumerate(detect_sublist): + try: + folder_path = det_lookup[key] + start = int(folder_path.split('/')[-1].split('_')[-2]) + time = int(key.split(',')[1]) + frame_id = (time - start) * 30 + 1 + frame_path = '%s/img_%05d.jpg' % (folder_path, frame_id) + img = Image.open(frame_path) + result = inference_detector(model, frame_path) + bboxes = result._pred_instances.bboxes.cpu() + scores = result._pred_instances.scores.cpu() + labels = result._pred_instances.labels.cpu() + + bboxes = bboxes[labels == 0] + scores = scores[labels == 0] + + bboxes = bboxes[scores > 0.7].numpy() + scores = scores[scores > 0.7] + if scores.numel() > 0: + result_ = [] + for idx, (h1, w1, h2, w2) in enumerate(bboxes): + h1 /= img.size[0] + h2 /= img.size[0] + w1 /= img.size[1] + w2 /= img.size[1] + score = scores[idx].item() + result_.append((h1, w1, h2, w2, score)) + lookup[key] = np.array(result_) + except: # noqa: E722 + pass + + with open('tmp_person_%d.pkl' % rank, 'wb') as f: + pickle.dump(lookup, f) + return + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') + p.add_argument( + '--datalist', + type=str, + default='../../../data/ava_kinetics/kinetics_train.csv', + help='the list for kinetics videos') + p.add_argument( + '--config', + type=str, + default='X-101-64x4d-FPN.py', + help='the human detector') + p.add_argument( + '--checkpoint', + type=str, + default='https://download.openmmlab.com/mmdetection/v2.0/' + 'cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/' + 'cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_' + '075702-43ce6a30.pth', + help='the human detector checkpoint') + p.add_argument( + '--picklepath', + type=str, + default='../../../data/ava_kinetics/kinetics_proposal.pkl') + p.add_argument('--num_gpus', type=int, default=8) + + args = p.parse_args() + + frame_root = args.avakinetics_root + '/rawframes/' + det_lookup = prepare_det_lookup(args.datalist, frame_root) + + processes = [] + for rank in range(args.num_gpus): + ctx = mp.get_context('spawn') + p = ctx.Process(target=single_worker, args=(rank, det_lookup, args)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + lookup = {} + for k in range(args.num_gpus): + one_lookup = 
pickle.load(open('tmp_person_%d.pkl' % k, 'rb')) + os.remove('tmp_person_%d.pkl' % k) + for key in one_lookup: + lookup[key] = one_lookup[key] + + with open(args.picklepath, 'wb') as f: + pickle.dump(lookup, f) diff --git a/tools/data/ava_kinetics/merge_annotations.py b/tools/data/ava_kinetics/merge_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..51771ea3aaf6665dfbc53c36b6f6956f2a178cee --- /dev/null +++ b/tools/data/ava_kinetics/merge_annotations.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import pickle + + +def check_file(path): + if os.path.isfile(path): + return + else: + path = path.split('/') + folder = '/'.join(path[:-1]) + filename = path[-1] + info = '%s not found at %s' % (filename, folder) + raise FileNotFoundError(info) + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') + root = p.parse_args().avakinetics_root + + kinetics_annot = root + '/kinetics_train.csv' + ava_annot = root + '/annotations/ava_train_v2.2.csv' + + check_file(kinetics_annot) + check_file(ava_annot) + + with open(kinetics_annot) as f: + record = f.readlines() + + with open(ava_annot) as f: + record += f.readlines() + + with open(ava_annot, 'w') as f: + for line in record: + f.write(line) + + kinetics_proposal = root + '/kinetics_proposal.pkl' + ava_proposal = root + '/annotations/' \ + 'ava_dense_proposals_train.FAIR.recall_93.9.pkl' + + check_file(kinetics_proposal) + check_file(ava_proposal) + + lookup = pickle.load(open(kinetics_proposal, 'rb')) + lookup.update(pickle.load(open(ava_proposal, 'rb'))) + + with open(ava_proposal, 'wb') as f: + pickle.dump(lookup, f) diff --git a/tools/data/ava_kinetics/prepare_annotation.py b/tools/data/ava_kinetics/prepare_annotation.py new file mode 100644 index 0000000000000000000000000000000000000000..49e6874beec36303b5898bb682a74358915d6429 --- /dev/null +++ b/tools/data/ava_kinetics/prepare_annotation.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
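+# This script keeps only the AVA-Kinetics training annotations whose clips
+# were actually downloaded and extracted to raw frames, and rewrites them in
+# the AVA-style csv format (video, frame, x1, y1, x2, y2, label, -1). A rough
+# usage sketch with the defaults declared below:
+#
+#   python prepare_annotation.py \
+#       --avakinetics_anotation ./ava_kinetics_v1_0 \
+#       --avakinetics_root ../../../data/ava_kinetics \
+#       --num_workers 8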
+import argparse +import multiprocessing +import os +from collections import defaultdict + +FPS = 30 + + +def get_video_info(frame_folder): + folder_name = frame_folder.split('/')[-1] + filename = folder_name.split('_') + video_id = '_'.join(filename[:-2]) + start = int(filename[-2]) + length = len(os.listdir(frame_folder)) // FPS + return (video_id, start, start + length, folder_name) + + +def get_avaialble_clips(frame_root, num_cpus): + folders = os.listdir(frame_root) + folders = ['%s/%s' % (frame_root, folder) for folder in folders] + pool = multiprocessing.Pool(num_cpus) + outputs = pool.map(get_video_info, folders) + lookup = defaultdict(list) + for record in outputs: + lookup[record[0]].append(record[1:]) + return lookup + + +def filter_train_list(kinetics_anotation_file, lookup): + with open(kinetics_anotation_file) as f: + anotated_frames = [i.split(',') for i in f.readlines()] + anotated_frames = [i for i in anotated_frames if len(i) == 7] + + filtered = [] + for line in anotated_frames: + if line[0] not in lookup: + continue + flag = False + for start, end, video_path in lookup[line[0]]: + if start < float(line[1]) < end: + flag = True + break + if flag is False: + continue + + frame_idx, x1, y1, x2, y2, label = list(map(float, line[1:7])) + frame_idx, label = int(frame_idx), int(label) + + string = (f'{video_path},{frame_idx},' + f'{x1:.3f},{y1:.3f},{x2:.3f},{y2:.3f},{label},-1\n') + + filtered.append(string) + return filtered + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_anotation', + type=str, + default='./ava_kinetics_v1_0', + help='the directory to ava-kinetics annotations') + p.add_argument( + '--num_workers', + type=int, + default=-1, + help='number of workers used for multiprocessing') + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics videos') + args = p.parse_args() + + if args.num_workers > 0: + num_workers = args.num_workers + else: + num_workers = max(multiprocessing.cpu_count() - 1, 1) + + frame_root = args.avakinetics_root + '/rawframes/' + frame_root = os.path.abspath(frame_root) + lookup = get_avaialble_clips(frame_root, num_workers) + + kinetics_train = args.avakinetics_anotation + '/kinetics_train_v1.0.csv' + filtered_list = filter_train_list(kinetics_train, lookup) + + with open('%s/kinetics_train.csv' % args.avakinetics_root, 'w') as f: + for line in filtered_list: + f.write(line) diff --git a/tools/data/ava_kinetics/softlink_ava.py b/tools/data/ava_kinetics/softlink_ava.py new file mode 100644 index 0000000000000000000000000000000000000000..a377c5f672e9d765130122e455333665a1c9ff8f --- /dev/null +++ b/tools/data/ava_kinetics/softlink_ava.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
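+# This script symlinks every AVA rawframe folder into the AVA-Kinetics
+# rawframes directory so that both datasets can be read from a single data
+# root. A rough usage sketch with the defaults declared below:
+#
+#   python softlink_ava.py \
+#       --ava_root ../../../data/ava \
+#       --avakinetics_root ../../../data/ava_kinetics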
+import argparse +import os + +p = argparse.ArgumentParser() +p.add_argument( + '--ava_root', + type=str, + default='../../../data/ava', + help='the path to save ava dataset') +p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') +args = p.parse_args() + +ava_frames = os.path.abspath(args.ava_root + '/rawframes/') +kinetics_frames = os.path.abspath(args.avakinetics_root + '/rawframes/') + +ava_folders = os.listdir(ava_frames) +for folder in ava_folders: + cmd = 'ln -s %s/%s %s/%s' % (ava_frames, folder, kinetics_frames, folder) + os.system(cmd) diff --git a/tools/data/build_audio_features.py b/tools/data/build_audio_features.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1a3b4dcd8b486cbcbc232b300a0d806f7c3788 --- /dev/null +++ b/tools/data/build_audio_features.py @@ -0,0 +1,320 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import sys +from multiprocessing import Pool + +import mmengine +import numpy as np +from scipy.io import wavfile + +try: + import librosa + import lws +except ImportError: + print('Please import librosa, lws first.') + +sys.path.append('..') + +SILENCE_THRESHOLD = 2 +FMIN = 125 +FMAX = 7600 +FRAME_SHIFT_MS = None +MIN_LEVEL_DB = -100 +REF_LEVEL_DB = 20 +RESCALING = True +RESCALING_MAX = 0.999 +ALLOW_CLIPPING_IN_NORMALIZATION = True +LOG_SCALE_MIN = -32.23619130191664 +NORM_AUDIO = True + + +class AudioTools: + """All methods related to audio feature extraction. Code Reference: + + `_, + `_. + + Args: + frame_rate (int): The frame rate per second of the video. + Defaults to 30. + sample_rate (int): The sample rate for audio sampling. + Defaults to 16000. + num_mels (int): Number of channels of the melspectrogram. + Defaults to 80. + fft_size (int): fft_size / sample_rate is window size. + Defaults to 1280. + hop_size (int): hop_size / sample_rate is step size. + Defaults to 320. + """ + + def __init__(self, + frame_rate=30, + sample_rate=16000, + num_mels=80, + fft_size=1280, + hop_size=320, + spectrogram_type='lws'): + self.frame_rate = frame_rate + self.sample_rate = sample_rate + self.silence_threshold = SILENCE_THRESHOLD + self.num_mels = num_mels + self.fmin = FMIN + self.fmax = FMAX + self.fft_size = fft_size + self.hop_size = hop_size + self.frame_shift_ms = FRAME_SHIFT_MS + self.min_level_db = MIN_LEVEL_DB + self.ref_level_db = REF_LEVEL_DB + self.rescaling = RESCALING + self.rescaling_max = RESCALING_MAX + self.allow_clipping_in_normalization = ALLOW_CLIPPING_IN_NORMALIZATION + self.log_scale_min = LOG_SCALE_MIN + self.norm_audio = NORM_AUDIO + self.spectrogram_type = spectrogram_type + assert spectrogram_type in ['lws', 'librosa'] + + def load_wav(self, path): + """Load an audio file into numpy array.""" + return librosa.core.load(path, sr=self.sample_rate)[0] + + @staticmethod + def audio_normalize(samples, desired_rms=0.1, eps=1e-4): + """RMS normalize the audio data.""" + rms = np.maximum(eps, np.sqrt(np.mean(samples**2))) + samples = samples * (desired_rms / rms) + return samples + + def generate_spectrogram_magphase(self, audio, with_phase=False): + """Separate a complex-valued spectrogram D into its magnitude (S) + + and phase (P) components, so that D = S * P. + + Args: + audio (np.ndarray): The input audio signal. + with_phase (bool): Determines whether to output the + phase components. Default: False. 
+ + Returns: + np.ndarray: magnitude and phase component of the complex-valued + spectrogram. + """ + spectro = librosa.core.stft( + audio, + hop_length=self.get_hop_size(), + n_fft=self.fft_size, + center=True) + spectro_mag, spectro_phase = librosa.core.magphase(spectro) + spectro_mag = np.expand_dims(spectro_mag, axis=0) + if with_phase: + spectro_phase = np.expand_dims(np.angle(spectro_phase), axis=0) + return spectro_mag, spectro_phase + + return spectro_mag + + def save_wav(self, wav, path): + """Save the wav to disk.""" + # 32767 = (2 ^ 15 - 1) maximum of int16 + wav *= 32767 / max(0.01, np.max(np.abs(wav))) + wavfile.write(path, self.sample_rate, wav.astype(np.int16)) + + def trim(self, quantized): + """Trim the audio wavfile.""" + start, end = self.start_and_end_indices(quantized, + self.silence_threshold) + return quantized[start:end] + + def adjust_time_resolution(self, quantized, mel): + """Adjust time resolution by repeating features. + + Args: + quantized (np.ndarray): (T,) + mel (np.ndarray): (N, D) + + Returns: + tuple: Tuple of (T,) and (T, D) + """ + assert quantized.ndim == 1 + assert mel.ndim == 2 + + upsample_factor = quantized.size // mel.shape[0] + mel = np.repeat(mel, upsample_factor, axis=0) + n_pad = quantized.size - mel.shape[0] + if n_pad != 0: + assert n_pad > 0 + mel = np.pad( + mel, [(0, n_pad), (0, 0)], mode='constant', constant_values=0) + + # trim + start, end = self.start_and_end_indices(quantized, + self.silence_threshold) + + return quantized[start:end], mel[start:end, :] + + @staticmethod + def start_and_end_indices(quantized, silence_threshold=2): + """Trim the audio file when reaches the silence threshold.""" + for start in range(quantized.size): + if abs(quantized[start] - 127) > silence_threshold: + break + for end in range(quantized.size - 1, 1, -1): + if abs(quantized[end] - 127) > silence_threshold: + break + + assert abs(quantized[start] - 127) > silence_threshold + assert abs(quantized[end] - 127) > silence_threshold + + return start, end + + def melspectrogram(self, y): + """Generate the melspectrogram.""" + D = self._lws_processor().stft(y).T + S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db + if not self.allow_clipping_in_normalization: + assert S.max() <= 0 and S.min() - self.min_level_db >= 0 + return self._normalize(S) + + def get_hop_size(self): + """Calculate the hop size.""" + hop_size = self.hop_size + if hop_size is None: + assert self.frame_shift_ms is not None + hop_size = int(self.frame_shift_ms / 1000 * self.sample_rate) + return hop_size + + def _lws_processor(self): + """Perform local weighted sum. + + Please refer to `_. + """ + return lws.lws(self.fft_size, self.get_hop_size(), mode='speech') + + @staticmethod + def lws_num_frames(length, fsize, fshift): + """Compute number of time frames of lws spectrogram. + + Please refer to `_. + """ + pad = (fsize - fshift) + if length % fshift == 0: + M = (length + pad * 2 - fsize) // fshift + 1 + else: + M = (length + pad * 2 - fsize) // fshift + 2 + return M + + def lws_pad_lr(self, x, fsize, fshift): + """Compute left and right padding lws internally uses. + + Please refer to `_. + """ + M = self.lws_num_frames(len(x), fsize, fshift) + pad = (fsize - fshift) + T = len(x) + 2 * pad + r = (M - 1) * fshift + fsize - T + return pad, pad + r + + def _linear_to_mel(self, spectrogram): + """Warp linear scale spectrograms to the mel scale. 
+ + Please refer to `_ + """ + global _mel_basis + _mel_basis = self._build_mel_basis() + return np.dot(_mel_basis, spectrogram) + + def _build_mel_basis(self): + """Build mel filters. + + Please refer to `_ + """ + assert self.fmax <= self.sample_rate // 2 + return librosa.filters.mel( + self.sample_rate, + self.fft_size, + fmin=self.fmin, + fmax=self.fmax, + n_mels=self.num_mels) + + def _amp_to_db(self, x): + min_level = np.exp(self.min_level_db / 20 * np.log(10)) + return 20 * np.log10(np.maximum(min_level, x)) + + @staticmethod + def _db_to_amp(x): + return np.power(10.0, x * 0.05) + + def _normalize(self, S): + return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1) + + def _denormalize(self, S): + return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db + + def read_audio(self, audio_path): + wav = self.load_wav(audio_path) + if self.norm_audio: + wav = self.audio_normalize(wav) + else: + wav = wav / np.abs(wav).max() + + return wav + + def audio_to_spectrogram(self, wav): + if self.spectrogram_type == 'lws': + spectrogram = self.melspectrogram(wav).astype(np.float32).T + elif self.spectrogram_type == 'librosa': + spectrogram = self.generate_spectrogram_magphase(wav) + return spectrogram + + +def extract_audio_feature(wav_path, audio_tools, mel_out_dir): + file_name, _ = osp.splitext(osp.basename(wav_path)) + # Write the spectrograms to disk: + mel_filename = os.path.join(mel_out_dir, file_name + '.npy') + if not os.path.exists(mel_filename): + try: + wav = audio_tools.read_audio(wav_path) + + spectrogram = audio_tools.audio_to_spectrogram(wav) + + np.save( + mel_filename, + spectrogram.astype(np.float32), + allow_pickle=False) + + except BaseException: + print(f'Read audio [{wav_path}] failed.') + + +if __name__ == '__main__': + audio_tools = AudioTools( + fft_size=512, hop_size=256) # window_size:32ms hop_size:16ms + + parser = argparse.ArgumentParser() + parser.add_argument('audio_home_path', type=str) + parser.add_argument('spectrogram_save_path', type=str) + parser.add_argument('--level', type=int, default=1) + parser.add_argument('--ext', default='m4a') + parser.add_argument('--num-workers', type=int, default=4) + parser.add_argument('--part', type=str, default='1/1') + args = parser.parse_args() + + mmengine.mkdir_or_exist(args.spectrogram_save_path) + + files = glob.glob(args.audio_home_path + '/*' * args.level + '.' + + args.ext) + print(f'found {len(files)} files.') + files = sorted(files) + if args.part is not None: + [this_part, num_parts] = [int(i) for i in args.part.split('/')] + part_len = len(files) // num_parts + + p = Pool(args.num_workers) + for file in files[part_len * (this_part - 1):( + part_len * this_part) if this_part != num_parts else len(files)]: + p.apply_async( + extract_audio_feature, + args=(file, audio_tools, args.spectrogram_save_path)) + p.close() + p.join() diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..e00fd382c9891c2b33ca17c6bee26b3e8874de4d --- /dev/null +++ b/tools/data/build_file_list.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
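+# This script generates train/val (or test) file lists for the supported
+# datasets from either rawframe folders or video files. A rough usage sketch
+# (the dataset name and paths are illustrative; see the helper scripts under
+# tools/data/<dataset>/ for concrete invocations):
+#
+#   PYTHONPATH=. python tools/data/build_file_list.py diving48 \
+#       data/diving48/videos/ --num-split 1 --level 1 --subset train \
+#       --format videos --shuffle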
+import argparse +import glob +import json +import os.path as osp +import random + +from mmengine.runner import set_random_seed + +from tools.data.anno_txt2json import lines2dictlist +from tools.data.parse_file_list import (parse_directory, parse_diving48_splits, + parse_hmdb51_split, + parse_jester_splits, + parse_kinetics_splits, + parse_mit_splits, parse_mmit_splits, + parse_sthv1_splits, parse_sthv2_splits, + parse_ucf101_splits) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Build file list') + parser.add_argument( + 'dataset', + type=str, + choices=[ + 'ucf101', 'kinetics400', 'kinetics600', 'kinetics700', 'thumos14', + 'sthv1', 'sthv2', 'mit', 'mmit', 'activitynet', 'hmdb51', 'jester', + 'diving48' + ], + help='dataset to be built file list') + parser.add_argument( + 'src_folder', type=str, help='root directory for the frames or videos') + parser.add_argument( + '--rgb-prefix', type=str, default='img_', help='prefix of rgb frames') + parser.add_argument( + '--flow-x-prefix', + type=str, + default='flow_x_', + help='prefix of flow x frames') + parser.add_argument( + '--flow-y-prefix', + type=str, + default='flow_y_', + help='prefix of flow y frames') + parser.add_argument( + '--num-split', + type=int, + default=3, + help='number of split to file list') + parser.add_argument( + '--subset', + type=str, + default='train', + choices=['train', 'val', 'test'], + help='subset to generate file list') + parser.add_argument( + '--level', + type=int, + default=2, + choices=[1, 2], + help='directory level of data') + parser.add_argument( + '--format', + type=str, + default='rawframes', + choices=['rawframes', 'videos'], + help='data format') + parser.add_argument( + '--out-root-path', + type=str, + default='data/', + help='root path for output') + parser.add_argument( + '--output-format', + type=str, + default='txt', + choices=['txt', 'json'], + help='built file list format') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--shuffle', + action='store_true', + default=False, + help='whether to shuffle the file list') + args = parser.parse_args() + + return args + + +def build_file_list(splits, frame_info, shuffle=False): + """Build file list for a certain data split. + + Args: + splits (tuple): Data split to generate file list. + frame_info (dict): Dict mapping from frames to path. e.g., + 'Skiing/v_Skiing_g18_c02': ('data/ucf101/rawframes/Skiing/v_Skiing_g18_c02', 0, 0). # noqa: E501 + shuffle (bool): Whether to shuffle the file list. + + Returns: + tuple: RGB file list for training and testing, together with + Flow file list for training and testing. + """ + + def build_list(split): + """Build RGB and Flow file list with a given split. + + Args: + split (list): Split to be generate file list. + + Returns: + tuple[list, list]: (rgb_list, flow_list), rgb_list is the + generated file list for rgb, flow_list is the generated + file list for flow. 
+ """ + rgb_list, flow_list = list(), list() + for item in split: + if item[0] not in frame_info: + continue + if frame_info[item[0]][1] > 0: + # rawframes + rgb_cnt = frame_info[item[0]][1] + flow_cnt = frame_info[item[0]][2] + if isinstance(item[1], int): + rgb_list.append(f'{item[0]} {rgb_cnt} {item[1]}\n') + flow_list.append(f'{item[0]} {flow_cnt} {item[1]}\n') + elif isinstance(item[1], list): + # only for multi-label datasets like mmit + rgb_list.append(f'{item[0]} {rgb_cnt} ' + + ' '.join([str(digit) + for digit in item[1]]) + '\n') + rgb_list.append(f'{item[0]} {flow_cnt} ' + + ' '.join([str(digit) + for digit in item[1]]) + '\n') + else: + raise ValueError( + 'frame_info should be ' + + '[`video`(str), `label`(int)|`labels(list[int])`') + else: + # videos + if isinstance(item[1], int): + rgb_list.append(f'{frame_info[item[0]][0]} {item[1]}\n') + flow_list.append(f'{frame_info[item[0]][0]} {item[1]}\n') + elif isinstance(item[1], list): + # only for multi-label datasets like mmit + rgb_list.append(f'{frame_info[item[0]][0]} ' + + ' '.join([str(digit) + for digit in item[1]]) + '\n') + flow_list.append( + f'{frame_info[item[0]][0]} ' + + ' '.join([str(digit) for digit in item[1]]) + '\n') + else: + raise ValueError( + 'frame_info should be ' + + '[`video`(str), `label`(int)|`labels(list[int])`') + if shuffle: + random.shuffle(rgb_list) + random.shuffle(flow_list) + return rgb_list, flow_list + + train_rgb_list, train_flow_list = build_list(splits[0]) + test_rgb_list, test_flow_list = build_list(splits[1]) + return (train_rgb_list, test_rgb_list), (train_flow_list, test_flow_list) + + +def main(): + args = parse_args() + + if args.seed is not None: + print(f'Set random seed to {args.seed}') + set_random_seed(args.seed) + + if args.format == 'rawframes': + frame_info = parse_directory( + args.src_folder, + rgb_prefix=args.rgb_prefix, + flow_x_prefix=args.flow_x_prefix, + flow_y_prefix=args.flow_y_prefix, + level=args.level) + elif args.format == 'videos': + if args.level == 1: + # search for one-level directory + video_list = glob.glob(osp.join(args.src_folder, '*')) + elif args.level == 2: + # search for two-level directory + video_list = glob.glob(osp.join(args.src_folder, '*', '*')) + else: + raise ValueError(f'level must be 1 or 2, but got {args.level}') + frame_info = {} + for video in video_list: + video_path = osp.relpath(video, args.src_folder) + # video_id: (video_relative_path, -1, -1) + frame_info[osp.splitext(video_path)[0]] = (video_path, -1, -1) + else: + raise NotImplementedError('only rawframes and videos are supported') + + if args.dataset == 'ucf101': + splits = parse_ucf101_splits(args.level) + elif args.dataset == 'sthv1': + splits = parse_sthv1_splits(args.level) + elif args.dataset == 'sthv2': + splits = parse_sthv2_splits(args.level) + elif args.dataset == 'mit': + splits = parse_mit_splits() + elif args.dataset == 'mmit': + splits = parse_mmit_splits() + elif args.dataset in ['kinetics400', 'kinetics600', 'kinetics700']: + splits = parse_kinetics_splits(args.level, args.dataset) + elif args.dataset == 'hmdb51': + splits = parse_hmdb51_split(args.level) + elif args.dataset == 'jester': + splits = parse_jester_splits(args.level) + elif args.dataset == 'diving48': + splits = parse_diving48_splits() + else: + raise ValueError( + f"Supported datasets are 'ucf101, sthv1, sthv2', 'jester', " + f"'mmit', 'mit', 'kinetics400', 'kinetics600', 'kinetics700', but " + f'got {args.dataset}') + + assert len(splits) == args.num_split + + out_path = args.out_root_path + 
args.dataset + + if len(splits) > 1: + for i, split in enumerate(splits): + file_lists = build_file_list( + split, frame_info, shuffle=args.shuffle) + train_name = f'{args.dataset}_train_split_{i+1}_{args.format}.txt' + val_name = f'{args.dataset}_val_split_{i+1}_{args.format}.txt' + if args.output_format == 'txt': + with open(osp.join(out_path, train_name), 'w') as f: + f.writelines(file_lists[0][0]) + with open(osp.join(out_path, val_name), 'w') as f: + f.writelines(file_lists[0][1]) + elif args.output_format == 'json': + train_list = lines2dictlist(file_lists[0][0], args.format) + val_list = lines2dictlist(file_lists[0][1], args.format) + train_name = train_name.replace('.txt', '.json') + val_name = val_name.replace('.txt', '.json') + with open(osp.join(out_path, train_name), 'w') as f: + json.dump(train_list, f) + with open(osp.join(out_path, val_name), 'w') as f: + json.dump(val_list, f) + else: + lists = build_file_list(splits[0], frame_info, shuffle=args.shuffle) + + if args.subset == 'train': + ind = 0 + elif args.subset == 'val': + ind = 1 + elif args.subset == 'test': + ind = 2 + else: + raise ValueError(f"subset must be in ['train', 'val', 'test'], " + f'but got {args.subset}.') + + filename = f'{args.dataset}_{args.subset}_list_{args.format}.txt' + if args.output_format == 'txt': + with open(osp.join(out_path, filename), 'w') as f: + f.writelines(lists[0][ind]) + elif args.output_format == 'json': + data_list = lines2dictlist(lists[0][ind], args.format) + filename = filename.replace('.txt', '.json') + with open(osp.join(out_path, filename), 'w') as f: + json.dump(data_list, f) + + +if __name__ == '__main__': + main() diff --git a/tools/data/build_rawframes.py b/tools/data/build_rawframes.py new file mode 100644 index 0000000000000000000000000000000000000000..0613fa0cb90425c62a2eca336267a72bcb742d08 --- /dev/null +++ b/tools/data/build_rawframes.py @@ -0,0 +1,278 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import sys +import warnings +from multiprocessing import Lock, Pool + +import mmcv +import numpy as np + + +def extract_frame(vid_item): + """Generate optical flow using dense flow. + + Args: + vid_item (list): Video item containing video full path, + video (short) path, video id. + + Returns: + bool: Whether generate optical flow successfully. + """ + full_path, vid_path, vid_id, method, task, report_file = vid_item + if '/' in vid_path: + act_name = osp.basename(osp.dirname(vid_path)) + out_full_path = osp.join(args.out_dir, act_name) + else: + out_full_path = args.out_dir + + run_success = -1 + + if task == 'rgb': + if args.use_opencv: + # Not like using denseflow, + # Use OpenCV will not make a sub directory with the video name + try: + video_name = osp.splitext(osp.basename(vid_path))[0] + out_full_path = osp.join(out_full_path, video_name) + + vr = mmcv.VideoReader(full_path) + for i, vr_frame in enumerate(vr): + if vr_frame is not None: + w, h, _ = np.shape(vr_frame) + if args.new_short == 0: + if args.new_width == 0 or args.new_height == 0: + # Keep original shape + out_img = vr_frame + else: + out_img = mmcv.imresize( + vr_frame, + (args.new_width, args.new_height)) + else: + if min(h, w) == h: + new_h = args.new_short + new_w = int((new_h / h) * w) + else: + new_w = args.new_short + new_h = int((new_w / w) * h) + out_img = mmcv.imresize(vr_frame, (new_h, new_w)) + mmcv.imwrite(out_img, + f'{out_full_path}/img_{i + 1:05d}.jpg') + else: + warnings.warn( + 'Length inconsistent!' 
+ f'Early stop with {i + 1} out of {len(vr)} frames.' + ) + break + run_success = 0 + except Exception: + run_success = -1 + else: + if args.new_short == 0: + cmd = osp.join( + f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'" + f' -nw={args.new_width} -nh={args.new_height} -v') + else: + cmd = osp.join( + f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'" + f' -ns={args.new_short} -v') + run_success = os.system(cmd) + elif task == 'flow': + if args.input_frames: + if args.new_short == 0: + cmd = osp.join( + f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'" # noqa: E501 + f' -nw={args.new_width} --nh={args.new_height} -v --if') + else: + cmd = osp.join( + f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'" # noqa: E501 + f' -ns={args.new_short} -v --if') + else: + if args.new_short == 0: + cmd = osp.join( + f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'" # noqa: E501 + f' -nw={args.new_width} --nh={args.new_height} -v') + else: + cmd = osp.join( + f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'" # noqa: E501 + f' -ns={args.new_short} -v') + run_success = os.system(cmd) + else: + if args.new_short == 0: + cmd_rgb = osp.join( + f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'" + f' -nw={args.new_width} -nh={args.new_height} -v') + cmd_flow = osp.join( + f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'" # noqa: E501 + f' -nw={args.new_width} -nh={args.new_height} -v') + else: + cmd_rgb = osp.join( + f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'" + f' -ns={args.new_short} -v') + cmd_flow = osp.join( + f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'" # noqa: E501 + f' -ns={args.new_short} -v') + run_success_rgb = os.system(cmd_rgb) + run_success_flow = os.system(cmd_flow) + if run_success_flow == 0 and run_success_rgb == 0: + run_success = 0 + + if run_success == 0: + print(f'{task} {vid_id} {vid_path} {method} done') + sys.stdout.flush() + + lock.acquire() + with open(report_file, 'a') as f: + line = full_path + '\n' + f.write(line) + lock.release() + else: + print(f'{task} {vid_id} {vid_path} {method} got something wrong') + sys.stdout.flush() + + return True + + +def parse_args(): + parser = argparse.ArgumentParser(description='extract optical flows') + parser.add_argument('src_dir', type=str, help='source video directory') + parser.add_argument('out_dir', type=str, help='output rawframe directory') + parser.add_argument( + '--task', + type=str, + default='flow', + choices=['rgb', 'flow', 'both'], + help='which type of frames to be extracted') + parser.add_argument( + '--level', + type=int, + choices=[1, 2], + default=2, + help='directory level of data') + parser.add_argument( + '--num-worker', + type=int, + default=8, + help='number of workers to build rawframes') + parser.add_argument( + '--flow-type', + type=str, + default=None, + choices=[None, 'tvl1', 'warp_tvl1', 'farn', 'brox'], + help='flow type to be generated') + parser.add_argument( + '--out-format', + type=str, + default='jpg', + choices=['jpg', 'h5', 'png'], + help='output format') + parser.add_argument( + '--ext', + type=str, + default='avi', + choices=['avi', 'mp4', 'webm'], + help='video file extensions') + parser.add_argument( + '--mixed-ext', + action='store_true', + help='process video files with mixed extensions') + parser.add_argument( + '--new-width', type=int, default=0, help='resize image width') + parser.add_argument( + '--new-height', type=int, 
default=0, help='resize image height') + parser.add_argument( + '--new-short', + type=int, + default=0, + help='resize image short side length keeping ratio') + parser.add_argument('--num-gpu', type=int, default=8, help='number of GPU') + parser.add_argument( + '--resume', + action='store_true', + default=False, + help='resume optical flow extraction instead of overwriting') + parser.add_argument( + '--use-opencv', + action='store_true', + help='Whether to use opencv to extract rgb frames') + parser.add_argument( + '--input-frames', + action='store_true', + help='Whether to extract flow frames based on rgb frames') + parser.add_argument( + '--report-file', + type=str, + default='build_report.txt', + help='report to record files which have been successfully processed') + args = parser.parse_args() + + return args + + +def init(lock_): + global lock + lock = lock_ + + +if __name__ == '__main__': + args = parse_args() + + if not osp.isdir(args.out_dir): + print(f'Creating folder: {args.out_dir}') + os.makedirs(args.out_dir) + + if args.level == 2: + classes = os.listdir(args.src_dir) + for classname in classes: + new_dir = osp.join(args.out_dir, classname) + if not osp.isdir(new_dir): + print(f'Creating folder: {new_dir}') + os.makedirs(new_dir) + + if args.input_frames: + print('Reading rgb frames from folder: ', args.src_dir) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level) + print('Total number of rgb frame folders found: ', len(fullpath_list)) + else: + print('Reading videos from folder: ', args.src_dir) + if args.mixed_ext: + print('Extension of videos is mixed') + fullpath_list = glob.glob(args.src_dir + '/*' * args.level) + else: + print('Extension of videos: ', args.ext) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level + '.' + + args.ext) + print('Total number of videos found: ', len(fullpath_list)) + + if args.resume: + done_fullpath_list = [] + with open(args.report_file) as f: + for line in f: + if line == '\n': + continue + done_full_path = line.strip().split()[0] + done_fullpath_list.append(done_full_path) + done_fullpath_list = set(done_fullpath_list) + fullpath_list = list(set(fullpath_list).difference(done_fullpath_list)) + + if args.level == 2: + vid_list = list( + map( + lambda p: osp.join( + osp.basename(osp.dirname(p)), osp.basename(p)), + fullpath_list)) + elif args.level == 1: + vid_list = list(map(osp.basename, fullpath_list)) + + lock = Lock() + pool = Pool(args.num_worker, initializer=init, initargs=(lock, )) + pool.map( + extract_frame, + zip(fullpath_list, vid_list, range(len(vid_list)), + len(vid_list) * [args.flow_type], + len(vid_list) * [args.task], + len(vid_list) * [args.report_file])) + pool.close() + pool.join() diff --git a/tools/data/build_videos.py b/tools/data/build_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..6540157dda9fd4d4a27c0981985ffd8728289807 --- /dev/null +++ b/tools/data/build_videos.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import sys +from multiprocessing import Pool + + +def encode_video(frame_dir_item): + """Encode frames to video using ffmpeg. + + Args: + frame_dir_item (list): Rawframe item containing raw frame directory + full path, rawframe directory (short) path, rawframe directory id. + + Returns: + bool: Whether synthesize video successfully. + """ + full_path, frame_dir_path, frame_dir_id = frame_dir_item + out_full_path = args.out_dir + + img_name_tmpl = args.filename_tmpl + '.' 
+ args.in_format + img_path = osp.join(full_path, img_name_tmpl) + + out_vid_name = frame_dir_path + '.' + args.ext + out_vid_path = osp.join(out_full_path, out_vid_name) + + cmd = osp.join( + f"ffmpeg -start_number {args.start_idx} -r {args.fps} -i '{img_path}' " + f"-vcodec {args.vcodec} '{out_vid_path}'") + os.system(cmd) + + print(f'{frame_dir_id} {frame_dir_path} done') + sys.stdout.flush() + return True + + +def parse_args(): + parser = argparse.ArgumentParser(description='synthesize videos') + parser.add_argument('src_dir', type=str, help='source rawframe directory') + parser.add_argument('out_dir', type=str, help='output video directory') + parser.add_argument( + '--fps', type=int, default=30, help='fps of videos to be synthesized') + parser.add_argument( + '--level', + type=int, + choices=[1, 2], + default=2, + help='directory level of data') + parser.add_argument( + '--num-worker', + type=int, + default=8, + help='number of workers to build videos') + parser.add_argument( + '--in-format', + type=str, + default='jpg', + choices=['jpg', 'png'], + help='input format') + parser.add_argument( + '--start-idx', type=int, default=0, help='starting index of rawframes') + parser.add_argument( + '--filename-tmpl', + type=str, + default='img_%05d', + help='filename template of rawframes') + parser.add_argument( + '--vcodec', type=str, default='mpeg4', help='coding method of videos') + parser.add_argument( + '--ext', + type=str, + default='mp4', + choices=['mp4', 'avi'], + help='video file extensions') + parser.add_argument('--num-gpu', type=int, default=8, help='number of GPU') + parser.add_argument( + '--resume', + action='store_true', + default=False, + help='resume optical flow extraction instead of overwriting') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + if not osp.isdir(args.out_dir): + print(f'Creating folder: {args.out_dir}') + os.makedirs(args.out_dir) + + if args.level == 2: + classes = os.listdir(args.src_dir) + for classname in classes: + new_dir = osp.join(args.out_dir, classname) + if not osp.isdir(new_dir): + print(f'Creating folder: {new_dir}') + os.makedirs(new_dir) + + print('Reading rgb frames from folder: ', args.src_dir) + print('Input format of rgb frames: ', args.in_format) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level) + done_fullpath_list = glob.glob(args.src_dir + '/*' * args.level + '.' + + args.ext) + print('Total number of rgb frame folders found: ', len(fullpath_list)) + + if args.resume: + fullpath_list = set(fullpath_list).difference(set(done_fullpath_list)) + fullpath_list = list(fullpath_list) + print('Resuming. 
number of videos to be synthesized: ',
+              len(fullpath_list))
+
+    if args.level == 2:
+        frame_dir_list = list(
+            map(
+                lambda p: osp.join(
+                    osp.basename(osp.dirname(p)), osp.basename(p)),
+                fullpath_list))
+    elif args.level == 1:
+        frame_dir_list = list(map(osp.basename, fullpath_list))
+
+    pool = Pool(args.num_worker)
+    pool.map(encode_video,
+             zip(fullpath_list, frame_dir_list, range(len(frame_dir_list))))
diff --git a/tools/data/charades-sta/README.md b/tools/data/charades-sta/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..788dd51472f9ed663e7964ec1ad96878f597d1e1
--- /dev/null
+++ b/tools/data/charades-sta/README.md
@@ -0,0 +1,59 @@
+# Preparing Charades-STA
+
+## Introduction
+
+
+
+```BibTeX
+@inproceedings{gao2017tall,
+  title={Tall: Temporal activity localization via language query},
+  author={Gao, Jiyang and Sun, Chen and Yang, Zhenheng and Nevatia, Ram},
+  booktitle={Proceedings of the IEEE international conference on computer vision},
+  pages={5267--5275},
+  year={2017}
+}
+
+@inproceedings{DRN2020CVPR,
+  author = {Runhao, Zeng and Haoming, Xu and Wenbing, Huang and Peihao, Chen and Mingkui, Tan and Chuang Gan},
+  title = {Dense Regression Network for Video Grounding},
+  booktitle = {CVPR},
+  year = {2020},
+}
+```
+
+Charades-STA is a dataset built on top of Charades by adding sentence temporal annotations. It was introduced by Gao et al. in `TALL: Temporal Activity Localization via Language Query`. Currently, we only support the C3D features from `Dense Regression Network for Video Grounding`.
+
+## Step 1. Prepare Annotations
+
+First of all, you can run the following script to prepare annotations from the official repository of DRN:
+
+```shell
+bash download_annotations.sh
+```
+
+## Step 2. Prepare C3D Features
+
+After the first step, you should be at `${MMACTION2}/data/CharadesSTA/`. Download the C3D features following the [official command](https://github.com/Alvin-Zeng/DRN/tree/master#download-features) to the current directory `${MMACTION2}/data/CharadesSTA/`.
+
+After finishing the two steps, the folder structure will look like:
+
+```
+mmaction2
+โ”œโ”€โ”€ mmaction
+โ”œโ”€โ”€ tools
+โ”œโ”€โ”€ configs
+โ”œโ”€โ”€ data
+โ”‚ โ”œโ”€โ”€ CharadesSTA
+โ”‚ โ”‚ โ”œโ”€โ”€ C3D_unit16_overlap0.5_merged
+โ”‚ โ”‚ | โ”œโ”€โ”€ 001YG.pt
+โ”‚ โ”‚ | โ”œโ”€โ”€ 003WS.pt
+โ”‚ โ”‚ | โ”œโ”€โ”€ 004QE.pt
+โ”‚ โ”‚ | โ”œโ”€โ”€ 00607.pt
+โ”‚ โ”‚ | โ”œโ”€โ”€ ...
+โ”‚ โ”‚ โ”œโ”€โ”€ Charades_duration.json
+โ”‚ โ”‚ โ”œโ”€โ”€ Charades_fps_dict.json
+โ”‚ โ”‚ โ”œโ”€โ”€ Charades_frames_info.json
+โ”‚ โ”‚ โ”œโ”€โ”€ Charades_sta_test.txt
+โ”‚ โ”‚ โ”œโ”€โ”€ Charades_sta_train.txt
+โ”‚ โ”‚ โ”œโ”€โ”€ Charades_word2id.json
+```
diff --git a/tools/data/charades-sta/download_annotations.sh b/tools/data/charades-sta/download_annotations.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9755f81a958f77cff4408cbb591d0369d0e0477b
--- /dev/null
+++ b/tools/data/charades-sta/download_annotations.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+DATA_DIR="../../../data/CharadesSTA/"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} does not exist.
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +URL="https://raw.githubusercontent.com/Alvin-Zeng/DRN/master/data/dataset/Charades" +wget ${URL}/Charades_frames_info.json +wget ${URL}/Charades_duration.json +wget ${URL}/Charades_fps_dict.json +wget ${URL}/Charades_sta_test.txt +wget ${URL}/Charades_sta_train.txt +wget ${URL}/Charades_word2id.json diff --git a/tools/data/denormalize_proposal_file.py b/tools/data/denormalize_proposal_file.py new file mode 100644 index 0000000000000000000000000000000000000000..7e07832765e92a8ed04b1328ad5e333e5257091d --- /dev/null +++ b/tools/data/denormalize_proposal_file.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from mmaction.localization import load_localize_proposal_file +from tools.data.parse_file_list import parse_directory + + +def process_norm_proposal_file(norm_proposal_file, frame_dict): + """Process the normalized proposal file and denormalize it. + + Args: + norm_proposal_file (str): Name of normalized proposal file. + frame_dict (dict): Information of frame folders. + """ + proposal_file = norm_proposal_file.replace('normalized_', '') + norm_proposals = load_localize_proposal_file(norm_proposal_file) + + processed_proposal_list = [] + for idx, norm_proposal in enumerate(norm_proposals): + video_id = norm_proposal[0] + frame_info = frame_dict[video_id] + num_frames = frame_info[1] + frame_path = osp.basename(frame_info[0]) + + gt = [[ + int(x[0]), + int(float(x[1]) * num_frames), + int(float(x[2]) * num_frames) + ] for x in norm_proposal[2]] + + proposal = [[ + int(x[0]), + float(x[1]), + float(x[2]), + int(float(x[3]) * num_frames), + int(float(x[4]) * num_frames) + ] for x in norm_proposal[3]] + + gt_dump = '\n'.join(['{} {} {}'.format(*x) for x in gt]) + gt_dump += '\n' if len(gt) else '' + proposal_dump = '\n'.join( + ['{} {:.04f} {:.04f} {} {}'.format(*x) for x in proposal]) + proposal_dump += '\n' if len(proposal) else '' + + processed_proposal_list.append( + f'# {idx}\n{frame_path}\n{num_frames}\n1' + f'\n{len(gt)}\n{gt_dump}{len(proposal)}\n{proposal_dump}') + + with open(proposal_file, 'w') as f: + f.writelines(processed_proposal_list) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Denormalize proposal file') + parser.add_argument( + 'dataset', + type=str, + choices=['thumos14'], + help='dataset to be denormalize proposal file') + parser.add_argument( + '--norm-proposal-file', + type=str, + help='normalized proposal file to be denormalize') + parser.add_argument( + '--data-prefix', + type=str, + help='path to a directory where rawframes are held') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + print(f'Converting from {args.norm_proposal_file}.') + frame_dict = parse_directory(args.data_prefix) + process_norm_proposal_file(args.norm_proposal_file, frame_dict) + + +if __name__ == '__main__': + main() diff --git a/tools/data/diving48/README.md b/tools/data/diving48/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a441be78eb072de81d956c8d663ddf85189d4c14 --- /dev/null +++ b/tools/data/diving48/README.md @@ -0,0 +1,143 @@ +# Preparing Diving48 + +## Introduction + + + +```BibTeX +@inproceedings{li2018resound, + title={Resound: Towards action recognition without representation bias}, + author={Li, Yingwei and Li, Yi and Vasconcelos, Nuno}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={513--528}, + year={2018} +} +``` + +For basic 
dataset information, you can refer to the official dataset [website](http://www.svcl.ucsd.edu/projects/resound/dataset.html).
+
+`````{tabs}
+
+````{group-tab} Download by MIM
+MIM supports downloading the Diving48 dataset from OpenDataLab and preprocessing it with a single command line.
+```Bash
+# install OpenXlab CLI tools
+pip install -U openxlab
+# log in OpenXLab
+openxlab login
+# download and preprocess by MIM
+mim download mmaction2 --dataset diving48
+```
+
+````
+
+````{group-tab} Download from Official Source
+
+## Step 1. Prepare Annotations
+
+You can run the following script to download annotations (considering the correctness of annotation files, we only download the V2 version here).
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/diving48/`.
+
+```shell
+bash download_annotations.sh
+```
+
+## Step 2. Prepare Videos
+
+You can run the following script to download videos.
+
+```shell
+bash download_videos.sh
+```
+
+## Step 3. Prepare RGB and Flow
+
+This part is **optional** if you only want to use the video loader.
+
+The frames provided in the official compressed file are not complete. You may need to go through the following extraction steps to get the complete frames.
+
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+
+If you have plenty of SSD space, we recommend extracting frames there for better I/O performance.
+
+You can run the following script to soft link the SSD.
+
+```shell
+# execute these two lines (assuming the SSD is mounted at "/mnt/SSD/")
+mkdir /mnt/SSD/diving48_extracted/
+ln -s /mnt/SSD/diving48_extracted/ ../../../data/diving48/rawframes
+```
+
+If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow.
+
+```shell
+cd $MMACTION2/tools/data/diving48/
+bash extract_rgb_frames.sh
+```
+
+If you didn't install denseflow, you can still extract RGB frames using OpenCV with the following script, but it will keep the original size of the images.
+
+```shell
+cd $MMACTION2/tools/data/diving48/
+bash extract_rgb_frames_opencv.sh
+```
+
+If both are required, run the following script to extract frames.
+
+```shell
+cd $MMACTION2/tools/data/diving48/
+bash extract_frames.sh
+```
+
+## Step 4. Generate File List
+
+You can run the following script to generate file lists in the rawframes and videos formats.
+
+```shell
+bash generate_videos_filelist.sh
+bash generate_rawframes_filelist.sh
+```
+
+````
+`````
+
+### Check Directory Structure
+
+After the whole data process for Diving48 preparation,
+you will get the rawframes (RGB + Flow), videos and annotation files for Diving48.
+
+In the context of the whole project (for Diving48 only), the folder structure will look like:
+
+```
+mmaction2
+โ”œโ”€โ”€ mmaction
+โ”œโ”€โ”€ tools
+โ”œโ”€โ”€ configs
+โ”œโ”€โ”€ data
+โ”‚ โ”œโ”€โ”€ diving48
+โ”‚ โ”‚ โ”œโ”€โ”€ diving48_{train,val}_list_rawframes.txt
+โ”‚ โ”‚ โ”œโ”€โ”€ diving48_{train,val}_list_videos.txt
+โ”‚ โ”‚ โ”œโ”€โ”€ annotations (optional)
+โ”‚ | | โ”œโ”€โ”€ Diving48_V2_train.json
+โ”‚ | | โ”œโ”€โ”€ Diving48_V2_test.json
+โ”‚ | | โ”œโ”€โ”€ Diving48_vocab.json
+โ”‚ | โ”œโ”€โ”€ videos
+โ”‚ | | โ”œโ”€โ”€ _8Vy3dlHg2w_00000.mp4
+โ”‚ | | โ”œโ”€โ”€ _8Vy3dlHg2w_00001.mp4
+โ”‚ | | โ”œโ”€โ”€ ...
+โ”‚ | โ”œโ”€โ”€ rawframes (optional) +โ”‚ | | โ”œโ”€โ”€ 2x00lRzlTVQ_00000 +โ”‚ | | | โ”œโ”€โ”€ img_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ img_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2x00lRzlTVQ_00001 +โ”‚ | | โ”œโ”€โ”€ ... +``` + +For training and evaluating on Diving48, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/diving48/README_zh-CN.md b/tools/data/diving48/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..9f3d85a757b8e5b507af638aecaed593726d6e14 --- /dev/null +++ b/tools/data/diving48/README_zh-CN.md @@ -0,0 +1,141 @@ +# ๅ‡†ๅค‡ Diving48 + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{li2018resound, + title={Resound: Towards action recognition without representation bias}, + author={Li, Yingwei and Li, Yi and Vasconcelos, Nuno}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={513--528}, + year={2018} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](http://www.svcl.ucsd.edu/projects/resound/dataset.html)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ + +`````{tabs} + +````{group-tab} ไฝฟ็”จ MIM ไธ‹่ฝฝ +# MIM ๆ”ฏๆŒไธ‹่ฝฝ Diving48 ๆ•ฐๆฎ้›†ใ€‚็”จๆˆทๅฏไปฅ้€š่ฟ‡ไธ€่กŒๅ‘ฝไปค๏ผŒไปŽ OpenDataLab ่ฟ›่กŒไธ‹่ฝฝ๏ผŒๅนถ่ฟ›่กŒ้ข„ๅค„็†ใ€‚ +```Bash +# ๅฎ‰่ฃ… OpenXLab CLI ๅทฅๅ…ท +pip install -U openxlab +# ็™ปๅฝ• OpenXLab +openxlab login +# ้€š่ฟ‡ MIM ่ฟ›่กŒๆ•ฐๆฎ้›†ไธ‹่ฝฝ๏ผŒ้ข„ๅค„็†ใ€‚ๆณจๆ„่ฟ™ๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ด +mim download mmaction2 --dataset diving48 +``` + +```` + +````{group-tab} ไปŽๅฎ˜ๆ–นๆบไธ‹่ฝฝ +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ๏ผˆ่€ƒ่™‘ๅˆฐๆ ‡ๆณจ็š„ๅ‡†็กฎๆ€ง๏ผŒ่ฟ™้‡Œไป…ไธ‹่ฝฝ V2 ็‰ˆๆœฌ๏ผ‰ใ€‚ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/diving48/`ใ€‚ + +```shell +bash download_annotations.sh +``` + +## ๆญฅ้ชค 2. ๅ‡†ๅค‡่ง†้ข‘ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝ่ง†้ข‘ใ€‚ + +```shell +bash download_videos.sh +``` + +## Step 3. 
ๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅฎ˜็ฝ‘ๆไพ›็š„ๅธงๅŽ‹็ผฉๅŒ…ๅนถไธๅฎŒๆ•ดใ€‚่‹ฅๆƒณ่Žทๅ–ๅฎŒๆ•ด็š„ๆ•ฐๆฎ๏ผŒๅฏไปฅไฝฟ็”จไปฅไธ‹ๆญฅ้ชค่งฃๅธงใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœๆ‹ฅๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธญใ€‚ + +ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒ่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"๏ผ‰ +mkdir /mnt/SSD/diving48_extracted/ +ln -s /mnt/SSD/diving48_extracted/ ../../../data/diving48/rawframes +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌ่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_frames.sh +``` + +## ๆญฅ้ชค 4. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +```` +````` + +### ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅœจๅฎŒๆˆๆ‰€ๆœ‰ Diving48 ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ +็”จๆˆทๅฏไปฅ่Žทๅพ—ๅฏนๅบ”็š„ RGB + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒDiving48 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ diving48 +โ”‚ โ”‚ โ”œโ”€โ”€ diving48_{train,val}_list_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ diving48_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ | | โ”œโ”€โ”€ Diving48_V2_train.json +โ”‚ | | โ”œโ”€โ”€ Diving48_V2_test.json +โ”‚ | | โ”œโ”€โ”€ Diving48_vocab.json +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ _8Vy3dlHg2w_00000.mp4 +โ”‚ | | โ”œโ”€โ”€ _8Vy3dlHg2w_00001.mp4 +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | โ”œโ”€โ”€ rawframes๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ | | โ”œโ”€โ”€ 2x00lRzlTVQ_00000 +โ”‚ | | | โ”œโ”€โ”€ img_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ img_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2x00lRzlTVQ_00001 +โ”‚ | | โ”œโ”€โ”€ ... +``` + +ๅ…ณไบŽๅฏน Diving48 ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/diving48/download_annotations.sh b/tools/data/diving48/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..19b8bfb9d4ebde5b86ba0ae9da95a3568b88f97a --- /dev/null +++ b/tools/data/diving48/download_annotations.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/diving48/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_vocab.json +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_V2_train.json +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_V2_test.json + +cd - diff --git a/tools/data/diving48/download_videos.sh b/tools/data/diving48/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..210cbb7b945b74e53fe207338115d2c1d5f6b6c2 --- /dev/null +++ b/tools/data/diving48/download_videos.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/diving48/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_rgb.tar.gz --no-check-certificate +tar -zxvf Diving48_rgb.tar.gz +mv ./rgb ./videos + +cd - diff --git a/tools/data/diving48/extract_frames.sh b/tools/data/diving48/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..704d53a13509200adac714e452655556451be342 --- /dev/null +++ b/tools/data/diving48/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/diving48/videos/ ../../data/diving48/rawframes/ --task both --level 1 --flow-type tvl1 --ext mp4 +echo "Raw frames (RGB and tv-l1) Generated" +cd - diff --git a/tools/data/diving48/extract_rgb_frames.sh b/tools/data/diving48/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..13990a464a5297e3e0891867960242a0575fdee3 --- /dev/null +++ b/tools/data/diving48/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/diving48/videos/ ../../data/diving48/rawframes/ --task rgb --level 1 --ext mp4 +echo "Genearte raw frames (RGB only)" + +cd - diff --git a/tools/data/diving48/extract_rgb_frames_opencv.sh b/tools/data/diving48/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..bec75ef2fd8ad32240f1a97f407ce002768ee277 --- /dev/null +++ b/tools/data/diving48/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/diving48/videos/ ../../data/diving48/rawframes/ --task rgb --level 1 --ext mp4 --use-opencv +echo "Genearte raw frames (RGB only)" + +cd - diff --git a/tools/data/diving48/generate_rawframes_filelist.sh b/tools/data/diving48/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..48043324b0118773264912d2a195aa6793d8b2da --- /dev/null +++ b/tools/data/diving48/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/rawframes/ --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/rawframes/ --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/diving48/ diff --git a/tools/data/diving48/generate_videos_filelist.sh b/tools/data/diving48/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..fb170470acbd827fb3c9444ce9b80d69b0a24372 --- /dev/null +++ b/tools/data/diving48/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. 
python tools/data/build_file_list.py diving48 data/diving48/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." + +cd tools/data/diving48/ diff --git a/tools/data/diving48/label_map.txt b/tools/data/diving48/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b7569da5675bae6693d147d7701896bf774b4d9 --- /dev/null +++ b/tools/data/diving48/label_map.txt @@ -0,0 +1,48 @@ +Back+15som+05Twis+FREE +Back+15som+15Twis+FREE +Back+15som+25Twis+FREE +Back+15som+NoTwis+PIKE +Back+15som+NoTwis+TUCK +Back+25som+15Twis+PIKE +Back+25som+25Twis+PIKE +Back+25som+NoTwis+PIKE +Back+25som+NoTwis+TUCK +Back+2som+15Twis+FREE +Back+2som+25Twis+FREE +Back+35som+NoTwis+PIKE +Back+35som+NoTwis+TUCK +Back+3som+NoTwis+PIKE +Back+3som+NoTwis+TUCK +Back+Dive+NoTwis+PIKE +Back+Dive+NoTwis+TUCK +Forward+15som+1Twis+FREE +Forward+15som+2Twis+FREE +Forward+15som+NoTwis+PIKE +Forward+1som+NoTwis+PIKE +Forward+25som+1Twis+PIKE +Forward+25som+2Twis+PIKE +Forward+25som+3Twis+PIKE +Forward+25som+NoTwis+PIKE +Forward+25som+NoTwis+TUCK +Forward+35som+NoTwis+PIKE +Forward+35som+NoTwis+TUCK +Forward+45som+NoTwis+TUCK +Forward+Dive+NoTwis+PIKE +Forward+Dive+NoTwis+STR +Inward+15som+NoTwis+PIKE +Inward+15som+NoTwis+TUCK +Inward+25som+NoTwis+PIKE +Inward+25som+NoTwis+TUCK +Inward+35som+NoTwis+TUCK +Inward+Dive+NoTwis+PIKE +Reverse+15som+05Twis+FREE +Reverse+15som+15Twis+FREE +Reverse+15som+25Twis+FREE +Reverse+15som+35Twis+FREE +Reverse+15som+NoTwis+PIKE +Reverse+25som+15Twis+PIKE +Reverse+25som+NoTwis+PIKE +Reverse+25som+NoTwis+TUCK +Reverse+35som+NoTwis+TUCK +Reverse+Dive+NoTwis+PIKE +Reverse+Dive+NoTwis+TUCK diff --git a/tools/data/diving48/preprocess.sh b/tools/data/diving48/preprocess.sh new file mode 100644 index 0000000000000000000000000000000000000000..75b649b50a978ddf9a4f2e10bf54abe8bcb27f1d --- /dev/null +++ b/tools/data/diving48/preprocess.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenDataLab___diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT) +tar -xvf $DATA_ROOT/diving48.tar -C $(dirname $DATA_ROOT) +rm $DATA_ROOT/diving48.tar diff --git a/tools/data/extract_audio.py b/tools/data/extract_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..8b754dbad38deb04cb2fcffaf15f959fd0ec92ce --- /dev/null +++ b/tools/data/extract_audio.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
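+# This script extracts the audio track of every video found under a directory
+# tree into .wav files with FFmpeg. A rough usage sketch (the Kinetics-400
+# paths are only illustrative):
+#
+#   python tools/data/extract_audio.py data/kinetics400/videos_train \
+#       data/kinetics400/audios_train --level 2 --ext mp4 --num-worker 8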
+import argparse +import glob +import os +import os.path as osp +from multiprocessing import Pool + +import mmengine + + +def extract_audio_wav(line): + """Extract the audio wave from video streams using FFMPEG.""" + video_id, _ = osp.splitext(osp.basename(line)) + video_dir = osp.dirname(line) + video_rel_dir = osp.relpath(video_dir, args.root) + dst_dir = osp.join(args.dst_root, video_rel_dir) + os.popen(f'mkdir -p {dst_dir}') + try: + if osp.exists(f'{dst_dir}/{video_id}.wav'): + return + cmd = f'ffmpeg -i ./{line} -map 0:a -y {dst_dir}/{video_id}.wav' + os.popen(cmd) + except BaseException: + with open('extract_wav_err_file.txt', 'a+') as f: + f.write(f'{line}\n') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract audios') + parser.add_argument('root', type=str, help='source video directory') + parser.add_argument('dst_root', type=str, help='output audio directory') + parser.add_argument( + '--level', type=int, default=2, help='directory level of data') + parser.add_argument( + '--ext', + type=str, + default='mp4', + choices=['avi', 'mp4', 'webm'], + help='video file extensions') + parser.add_argument( + '--num-worker', type=int, default=8, help='number of workers') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + mmengine.mkdir_or_exist(args.dst_root) + + print('Reading videos from folder: ', args.root) + print('Extension of videos: ', args.ext) + fullpath_list = glob.glob(args.root + '/*' * args.level + '.' + args.ext) + done_fullpath_list = glob.glob(args.dst_root + '/*' * args.level + '.wav') + print('Total number of videos found: ', len(fullpath_list)) + print('Total number of videos extracted finished: ', + len(done_fullpath_list)) + + pool = Pool(args.num_worker) + pool.map(extract_audio_wav, fullpath_list) diff --git a/tools/data/gym/README.md b/tools/data/gym/README.md new file mode 100644 index 0000000000000000000000000000000000000000..56e5e7693f6d6232e5dcd9c394b67122d0eb12c0 --- /dev/null +++ b/tools/data/gym/README.md @@ -0,0 +1,109 @@ +# Preparing GYM + +## Introduction + + + +```BibTeX +@inproceedings{shao2020finegym, + title={Finegym: A hierarchical video dataset for fine-grained action understanding}, + author={Shao, Dian and Zhao, Yue and Dai, Bo and Lin, Dahua}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2616--2625}, + year={2020} +} +``` + +For basic dataset information, please refer to the official [project](https://sdolivia.github.io/FineGym/) and the [paper](https://arxiv.org/abs/2004.06704). +We currently provide the data pre-processing pipeline for GYM99. +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/gym/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. +The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +## Step 3. Trim Videos into Events + +First, you need to trim long videos into events based on the annotation of GYM with the following scripts. + +```shell +python trim_event.py +``` + +## Step 4. 
Trim Events into Subactions + +Then, you need to trim events into subactions based on the annotation of GYM with the following scripts. We use the two stage trimming for better efficiency (trimming multiple short clips from a long video can be extremely inefficient, since you need to go over the video many times). + +```shell +python trim_subaction.py +``` + +## Step 5. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader for RGB model training. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +Run the following script to extract both rgb and flow using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 6. Generate file list for GYM99 based on extracted subactions + +You can use the following script to generate train / val lists for GYM99. + +```shell +python generate_file_list.py +``` + +## Step 7. Folder Structure + +After the whole data pipeline for GYM preparation. You can get the subaction clips, event clips, raw videos and GYM99 train/val lists. + +In the context of the whole project (for GYM only), the full folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ gym +| | โ”œโ”€โ”€ annotations +| | | โ”œโ”€โ”€ gym99_train_org.txt +| | | โ”œโ”€โ”€ gym99_val_org.txt +| | | โ”œโ”€โ”€ gym99_train.txt +| | | โ”œโ”€โ”€ gym99_val.txt +| | | โ”œโ”€โ”€ annotation.json +| | | โ””โ”€โ”€ event_annotation.json +โ”‚ โ”‚ โ”œโ”€โ”€ videos +| | | โ”œโ”€โ”€ 0LtLS9wROrk.mp4 +| | | โ”œโ”€โ”€ ... +| | | โ””โ”€โ”€ zfqS-wCJSsw.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ events +| | | โ”œโ”€โ”€ 0LtLS9wROrk_E_002407_002435.mp4 +| | | โ”œโ”€โ”€ ... +| | | โ””โ”€โ”€ zfqS-wCJSsw_E_006732_006824.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ subactions +| | | โ”œโ”€โ”€ 0LtLS9wROrk_E_002407_002435_A_0003_0005.mp4 +| | | โ”œโ”€โ”€ ... +| | | โ””โ”€โ”€ zfqS-wCJSsw_E_006244_006252_A_0000_0007.mp4 +| | โ””โ”€โ”€ subaction_frames +``` + +For training and evaluating on GYM, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/gym/README_zh-CN.md b/tools/data/gym/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..626b46fc11c55e06972b4a6f29600b7440072a68 --- /dev/null +++ b/tools/data/gym/README_zh-CN.md @@ -0,0 +1,109 @@ +# ๅ‡†ๅค‡ GYM + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{shao2020finegym, + title={Finegym: A hierarchical video dataset for fine-grained action understanding}, + author={Shao, Dian and Zhao, Yue and Dai, Bo and Lin, Dahua}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2616--2625}, + year={2020} +} +``` + +่ฏทๅ‚็…ง [้กน็›ฎไธป้กต](https://sdolivia.github.io/FineGym/) ๅŠ [ๅŽŸ่ฎบๆ–‡](https://sdolivia.github.io/FineGym/) ไปฅ่Žทๅ–ๆ•ฐๆฎ้›†ๅŸบๆœฌไฟกๆฏใ€‚ +MMAction2 ๅฝ“ๅ‰ๆ”ฏๆŒ GYM99 ็š„ๆ•ฐๆฎ้›†้ข„ๅค„็†ใ€‚ +ๅœจๅผ€ๅง‹ไน‹ๅ‰๏ผŒ็”จๆˆท้œ€็กฎไฟๅฝ“ๅ‰็›ฎๅฝ•ไธบ `$MMACTION2/tools/data/gym/`ใ€‚ + +## 1. ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถๅนถ่ฟ›่กŒ้ข„ๅค„็†๏ผš + +```shell +bash download_annotations.sh +``` + +## 2. ๅ‡†ๅค‡่ง†้ข‘ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌๅ‡†ๅค‡่ง†้ข‘๏ผŒ่ง†้ข‘ๅ‡†ๅค‡ไปฃ็ ไฟฎๆ”น่‡ช [ActivityNet ็ˆฌ่™ซ](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)ใ€‚ +ๆณจๆ„่ฟ™ไธ€ๆญฅ้ชคๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ดใ€‚ + +```shell +bash download_videos.sh +``` + +## 3. 
่ฃๅ‰ช้•ฟ่ง†้ข‘่‡ณๅŠจไฝœ็บงๅˆซ + +็”จๆˆท้ฆ–ๅ…ˆ้œ€่ฆไฝฟ็”จไปฅไธ‹่„šๆœฌๅฐ† GYM ไธญ็š„้•ฟ่ง†้ข‘ไพๆฎๆ ‡ๆณจๆ–‡ไปถ่ฃๅ‰ช่‡ณๅŠจไฝœ็บงๅˆซใ€‚ + +```shell +python trim_event.py +``` + +## 4. ่ฃๅ‰ชๅŠจไฝœ่ง†้ข‘่‡ณๅˆ†ๅŠจไฝœ็บงๅˆซ + +้šๅŽ๏ผŒ็”จๆˆท้œ€่ฆไฝฟ็”จไปฅไธ‹่„šๆœฌๅฐ† GYM ไธญ็š„ๅŠจไฝœ่ง†้ข‘ไพๆฎๆ ‡ๆณจๆ–‡ไปถ่ฃๅ‰ช่‡ณๅˆ†ๅŠจไฝœ็บงๅˆซใ€‚ๅฐ†่ง†้ข‘็š„่ฃๅ‰ชๅˆ†ๆˆไธคไธช็บงๅˆซๅฏไปฅๅธฆๆฅๆ›ด้ซ˜็š„ๆ•ˆ็އ๏ผˆๅœจ้•ฟ่ง†้ข‘ไธญ่ฃๅ‰ชๅคšไธชๆž็Ÿญ็‰‡ๆฎตๅผ‚ๅธธ่€—ๆ—ถ๏ผ‰ใ€‚ + +```shell +python trim_subaction.py +``` + +## 5. ๆๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทไป…ไฝฟ็”จ video loader๏ผŒๅˆ™ๅฏไปฅ่ทณ่ฟ‡ๆœฌๆญฅใ€‚ + +ๅœจๆๅ–ไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +็”จๆˆทๅฏไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌๅŒๆ—ถๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผˆๆๅ–ๅ…‰ๆตๆ—ถไฝฟ็”จ tvl1 ็ฎ—ๆณ•๏ผ‰๏ผš + +```shell +bash extract_frames.sh +``` + +## 6. ๅŸบไบŽๆๅ–ๅ‡บ็š„ๅˆ†ๅŠจไฝœ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌไธบ GYM99 ็”Ÿๆˆ่ฎญ็ปƒๅŠๆต‹่ฏ•็š„ๆ–‡ไปถๅˆ—่กจ๏ผš + +```shell +python generate_file_list.py +``` + +## 7. ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆ•ดๅฎŒๆˆ GYM ็š„ๆ•ฐๆฎๅค„็†ๅŽ๏ผŒๅฐ†ๅพ—ๅˆฐๅธงๆ–‡ไปถๅคน๏ผˆRGB ๅธงๅ’Œๅ…‰ๆตๅธง๏ผ‰๏ผŒๅŠจไฝœ่ง†้ข‘็‰‡ๆฎต๏ผŒๅˆ†ๅŠจไฝœ่ง†้ข‘็‰‡ๆฎตไปฅๅŠ่ฎญ็ปƒๆต‹่ฏ•ๆ‰€็”จๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช้กน็›ฎ็›ฎๅฝ•ไธ‹๏ผˆไป…้’ˆๅฏน GYM๏ผ‰๏ผŒๅฎŒๆ•ด็›ฎๅฝ•็ป“ๆž„ๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ gym +| | โ”œโ”€โ”€ annotations +| | | โ”œโ”€โ”€ gym99_train_org.txt +| | | โ”œโ”€โ”€ gym99_val_org.txt +| | | โ”œโ”€โ”€ gym99_train.txt +| | | โ”œโ”€โ”€ gym99_val.txt +| | | โ”œโ”€โ”€ annotation.json +| | | โ””โ”€โ”€ event_annotation.json +โ”‚ โ”‚ โ”œโ”€โ”€ videos +| | | โ”œโ”€โ”€ 0LtLS9wROrk.mp4 +| | | โ”œโ”€โ”€ ... +| | | โ””โ”€โ”€ zfqS-wCJSsw.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ events +| | | โ”œโ”€โ”€ 0LtLS9wROrk_E_002407_002435.mp4 +| | | โ”œโ”€โ”€ ... +| | | โ””โ”€โ”€ zfqS-wCJSsw_E_006732_006824.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ subactions +| | | โ”œโ”€โ”€ 0LtLS9wROrk_E_002407_002435_A_0003_0005.mp4 +| | | โ”œโ”€โ”€ ... +| | | โ””โ”€โ”€ zfqS-wCJSsw_E_006244_006252_A_0000_0007.mp4 +| | โ””โ”€โ”€ subaction_frames +``` + +ๅ…ณไบŽ GYM ๆ•ฐๆฎ้›†ไธŠ็š„่ฎญ็ปƒไธŽๆต‹่ฏ•๏ผŒ่ฏทๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/gym/download.py b/tools/data/gym/download.py new file mode 100644 index 0000000000000000000000000000000000000000..9b89d38539c9b4668b1a086a1d844ea6dc8801ff --- /dev/null +++ b/tools/data/gym/download.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This scripts is copied from +# https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501 +# The code is licensed under the MIT licence. +import argparse +import os +import ssl +import subprocess + +import mmengine +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context + + +def download(video_identifier, + output_filename, + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + """ + # Defensive argument checking. 
+ assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + + if not os.path.exists(output_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate', + '-f', 'mp4', '-o', + '"%s"' % output_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Fail' + else: + break + # Check if the video was successfully saved. + status = os.path.exists(output_filename) + return status, 'Downloaded' + + +def download_wrapper(youtube_id, output_dir): + """Wrapper for parallel processing purposes.""" + # we do this to align with names in annotations + output_filename = os.path.join(output_dir, youtube_id + '.mp4') + if os.path.exists(output_filename): + status = tuple([youtube_id, True, 'Exists']) + return status + + downloaded, log = download(youtube_id, output_filename) + status = tuple([youtube_id, downloaded, log]) + return status + + +def main(input, output_dir, num_jobs=24): + # Reading and parsing ActivityNet. + youtube_ids = mmengine.load(input).keys() + # Creates folders where videos will be saved later. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Download all clips. + if num_jobs == 1: + status_list = [] + for index in youtube_ids: + status_list.append(download_wrapper(index, output_dir)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_wrapper)(index, output_dir) + for index in youtube_ids) + + # Save download report. + mmengine.dump(status_list, 'download_report.json') + + +if __name__ == '__main__': + description = 'Helper script for downloading GYM videos.' + p = argparse.ArgumentParser(description=description) + p.add_argument('input', type=str, help='The gym annotation file') + p.add_argument( + 'output_dir', type=str, help='Output directory to save videos.') + p.add_argument('-n', '--num-jobs', type=int, default=24) + main(**vars(p.parse_args())) diff --git a/tools/data/gym/download_annotations.sh b/tools/data/gym/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0a1bd728a2fddaa98797307b9da4d53509c5945 --- /dev/null +++ b/tools/data/gym/download_annotations.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/gym/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://sdolivia.github.io/FineGym/resources/dataset/finegym_annotation_info_v1.0.json -O $DATA_DIR/annotation.json +wget https://sdolivia.github.io/FineGym/resources/dataset/gym99_train_element_v1.0.txt -O $DATA_DIR/gym99_train_org.txt +wget https://sdolivia.github.io/FineGym/resources/dataset/gym99_val_element.txt -O $DATA_DIR/gym99_val_org.txt diff --git a/tools/data/gym/download_videos.sh b/tools/data/gym/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f6fbce6e83255756502b280fa7a34939839b469 --- /dev/null +++ b/tools/data/gym/download_videos.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate gym +pip install mmengine +pip install --upgrade youtube-dl + +DATA_DIR="../../../data/gym" +ANNO_DIR="../../../data/gym/annotations" +python download.py ${ANNO_DIR}/annotation.json ${DATA_DIR}/videos + +source deactivate gym +conda remove -n gym --all diff --git a/tools/data/gym/environment.yml b/tools/data/gym/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..95469fd8be53f80450ac99a41eac5d79202f5d49 --- /dev/null +++ b/tools/data/gym/environment.yml @@ -0,0 +1,36 @@ +name: gym +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/gym/extract_frames.sh b/tools/data/gym/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..594bc898e804c1e7d31495f1a5c5695eeedf6341 --- /dev/null +++ b/tools/data/gym/extract_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/gym/subactions/ ../../data/gym/subaction_frames/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated" + +cd gym/ diff --git a/tools/data/gym/generate_file_list.py b/tools/data/gym/generate_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..17ea121a803dae88b1cfe399cc9b6b410033ecb3 --- /dev/null +++ b/tools/data/gym/generate_file_list.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
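+# Build the GYM99 file lists: keep only the entries from the official
+# gym99_{train,val}_org.txt lists whose subaction clip actually exists under
+# data/gym/subactions, then write video-based lists (gym99_train.txt and
+# gym99_val.txt). When the raw-frame directory is present, frame-based lists
+# are written as well; the frame count is divided by 3 because each clip
+# folder holds RGB, flow-x and flow-y images together.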
+import os +import os.path as osp + +annotation_root = '../../../data/gym/annotations' +data_root = '../../../data/gym/subactions' +frame_data_root = '../../../data/gym/subaction_frames' + +videos = os.listdir(data_root) +videos = set(videos) + +train_file_org = osp.join(annotation_root, 'gym99_train_org.txt') +val_file_org = osp.join(annotation_root, 'gym99_val_org.txt') +train_file = osp.join(annotation_root, 'gym99_train.txt') +val_file = osp.join(annotation_root, 'gym99_val.txt') +train_frame_file = osp.join(annotation_root, 'gym99_train_frame.txt') +val_frame_file = osp.join(annotation_root, 'gym99_val_frame.txt') + +train_org = open(train_file_org).readlines() +train_org = [x.strip().split() for x in train_org] +train = [x for x in train_org if x[0] + '.mp4' in videos] +if osp.exists(frame_data_root): + train_frames = [] + for line in train: + length = len(os.listdir(osp.join(frame_data_root, line[0]))) + train_frames.append([line[0], str(length // 3), line[1]]) + train_frames = [' '.join(x) for x in train_frames] + with open(train_frame_file, 'w') as fout: + fout.write('\n'.join(train_frames)) + +train = [x[0] + '.mp4 ' + x[1] for x in train] +with open(train_file, 'w') as fout: + fout.write('\n'.join(train)) + +val_org = open(val_file_org).readlines() +val_org = [x.strip().split() for x in val_org] +val = [x for x in val_org if x[0] + '.mp4' in videos] +if osp.exists(frame_data_root): + val_frames = [] + for line in val: + length = len(os.listdir(osp.join(frame_data_root, line[0]))) + val_frames.append([line[0], str(length // 3), line[1]]) + val_frames = [' '.join(x) for x in val_frames] + with open(val_frame_file, 'w') as fout: + fout.write('\n'.join(val_frames)) + +val = [x[0] + '.mp4 ' + x[1] for x in val] +with open(val_file, 'w') as fout: + fout.write('\n'.join(val)) diff --git a/tools/data/gym/label_map.txt b/tools/data/gym/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..79f4010816b30047a74b1b71786c7d94770ebdbb --- /dev/null +++ b/tools/data/gym/label_map.txt @@ -0,0 +1,99 @@ +(VT) round-off, flic-flac with 0.5 turn on, stretched salto forward with 0.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2.5 turn off +(VT) round-off, flic-flac on, stretched salto backward off +(FX) switch leap with 0.5 turn +(FX) switch leap with 1 turn +(FX) split leap with 1 turn +(FX) split leap with 1.5 turn or more +(FX) switch leap (leap forward with leg change to cross split) +(FX) split jump with 1 turn +(FX) split jump (leg separation 180 degree parallel to the floor) +(FX) johnson with additional 0.5 turn +(FX) straddle pike or side split jump with 1 turn +(FX) switch leap to ring position +(FX) stag jump +(FX) 2 turn with free leg held upward in 180 split position throughout turn +(FX) 2 turn in tuck stand on one leg, free leg straight throughout turn +(FX) 3 turn on one leg, free leg optional below horizontal +(FX) 2 turn on one leg, free leg optional below horizontal +(FX) 1 turn on one leg, free leg optional below horizontal +(FX) 2 turn or more with heel of free leg forward at horizontal throughout turn +(FX) 1 turn with heel of free leg forward at horizontal throughout turn +(FX) arabian double salto tucked +(FX) salto forward tucked +(FX) aerial walkover forward +(FX) salto forward stretched with 2 twist +(FX) 
salto forward stretched with 1 twist +(FX) salto forward stretched with 1.5 twist +(FX) salto forward stretched, feet land together +(FX) double salto backward stretched +(FX) salto backward stretched with 3 twist +(FX) salto backward stretched with 2 twist +(FX) salto backward stretched with 2.5 twist +(FX) salto backward stretched with 1.5 twist +(FX) double salto backward tucked with 2 twist +(FX) double salto backward tucked with 1 twist +(FX) double salto backward tucked +(FX) double salto backward piked with 1 twist +(FX) double salto backward piked +(BB) sissone (leg separation 180 degree on the diagonal to the floor, take off two feet, land on one foot) +(BB) split jump with 0.5 turn in side position +(BB) split jump +(BB) straddle pike jump or side split jump +(BB) split ring jump (ring jump with front leg horizontal to the floor) +(BB) switch leap with 0.5 turn +(BB) switch leap (leap forward with leg change) +(BB) split leap forward +(BB) johnson (leap forward with leg change and 0.25 turn to side split or straddle pike position) +(BB) switch leap to ring position +(BB) sheep jump (jump with upper back arch and head release with feet to head height/closed Ring) +(BB) wolf hop or jump (hip angle at 45, knees together) +(BB) 1 turn with heel of free leg forward at horizontal throughout turn +(BB) 2 turn on one leg, free leg optional below horizontal +(BB) 1 turn on one leg, free leg optional below horizontal +(BB) 2 turn in tuck stand on one leg, free leg optional +(BB) salto backward tucked with 1 twist +(BB) salto backward tucked +(BB) salto backward stretched-step out (feet land successively) +(BB) salto backward stretched with legs together +(BB) salto sideward tucked, take off from one leg to side stand +(BB) free aerial cartwheel landing in cross position +(BB) salto forward tucked to cross stand +(BB) free aerial walkover forward, landing on one or both feet +(BB) jump backward, flic-flac take-off with 0.5 twist through handstand to walkover forward, also with support on one arm +(BB) flic-flac to land on both feet +(BB) flic-flac with step-out, also with support on one arm +(BB) round-off +(BB) double salto backward tucked +(BB) salto backward tucked +(BB) double salto backward piked +(BB) salto backward stretched with 2 twist +(BB) salto backward stretched with 2.5 twist +(UB) pike sole circle backward with 1 turn to handstand +(UB) pike sole circle backward with 0.5 turn to handstand +(UB) pike sole circle backward to handstand +(UB) giant circle backward with 1 turn to handstand +(UB) giant circle backward with 0.5 turn to handstand +(UB) giant circle backward +(UB) giant circle forward with 1 turn on one arm before handstand phase +(UB) giant circle forward with 0.5 turn to handstand +(UB) giant circle forward +(UB) clear hip circle backward to handstand +(UB) clear pike circle backward with 1 turn to handstand +(UB) clear pike circle backward with 0.5 turn to handstand +(UB) clear pike circle backward to handstand +(UB) stalder backward with 1 turn to handstand +(UB) stalder backward to handstand +(UB) counter straddle over high bar to hang +(UB) counter piked over high bar to hang +(UB) (swing backward or front support) salto forward straddled to hang on high bar +(UB) (swing backward) salto forward piked to hang on high bar +(UB) (swing forward or hip circle backward) salto backward with 0.5 turn piked to hang on high bar +(UB) transition flight from high bar to low bar +(UB) transition flight from low bar to high bar +(UB) (swing forward) double salto backward 
tucked with 1 turn +(UB) (swing backward) double salto forward tucked +(UB) (swing forward) double salto backward stretched diff --git a/tools/data/gym/trim_event.py b/tools/data/gym/trim_event.py new file mode 100644 index 0000000000000000000000000000000000000000..006b0646391c73c42da37cb74b458e5d486f5d0f --- /dev/null +++ b/tools/data/gym/trim_event.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import subprocess + +import mmengine + +data_root = '../../../data/gym' +video_root = f'{data_root}/videos' +anno_root = f'{data_root}/annotations' +anno_file = f'{anno_root}/annotation.json' + +event_anno_file = f'{anno_root}/event_annotation.json' +event_root = f'{data_root}/events' + +videos = os.listdir(video_root) +videos = set(videos) +annotation = mmengine.load(anno_file) +event_annotation = {} + +mmengine.mkdir_or_exist(event_root) + +for k, v in annotation.items(): + if k + '.mp4' not in videos: + print(f'video {k} has not been downloaded') + continue + + video_path = osp.join(video_root, k + '.mp4') + + for event_id, event_anno in v.items(): + timestamps = event_anno['timestamps'][0] + start_time, end_time = timestamps + event_name = k + '_' + event_id + + output_filename = event_name + '.mp4' + + command = [ + 'ffmpeg', '-i', + '"%s"' % video_path, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '8', '-loglevel', 'panic', + '"%s"' % osp.join(event_root, output_filename) + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + print( + f'Trimming of the Event {event_name} of Video {k} Failed', + flush=True) + + segments = event_anno['segments'] + if segments is not None: + event_annotation[event_name] = segments + +mmengine.dump(event_annotation, event_anno_file) diff --git a/tools/data/gym/trim_subaction.py b/tools/data/gym/trim_subaction.py new file mode 100644 index 0000000000000000000000000000000000000000..7cecceaf2b5d12a5f667073a855980e30fddf80c --- /dev/null +++ b/tools/data/gym/trim_subaction.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
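+# Cut every event clip produced by trim_event.py into subaction clips, using
+# the timestamps stored in event_annotation.json. Each subaction is trimmed
+# and re-encoded with ffmpeg (libx264) into data/gym/subactions/.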
+import os +import os.path as osp +import subprocess + +import mmengine + +data_root = '../../../data/gym' +anno_root = f'{data_root}/annotations' + +event_anno_file = f'{anno_root}/event_annotation.json' +event_root = f'{data_root}/events' +subaction_root = f'{data_root}/subactions' + +events = os.listdir(event_root) +events = set(events) +annotation = mmengine.load(event_anno_file) + +mmengine.mkdir_or_exist(subaction_root) + +for k, v in annotation.items(): + if k + '.mp4' not in events: + print(f'video {k[:11]} has not been downloaded ' + f'or the event clip {k} not generated') + continue + + video_path = osp.join(event_root, k + '.mp4') + + for subaction_id, subaction_anno in v.items(): + timestamps = subaction_anno['timestamps'] + start_time, end_time = timestamps[0][0], timestamps[-1][1] + subaction_name = k + '_' + subaction_id + + output_filename = subaction_name + '.mp4' + + command = [ + 'ffmpeg', '-i', + '"%s"' % video_path, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '8', '-loglevel', 'panic', + '"%s"' % osp.join(subaction_root, output_filename) + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + print( + f'Trimming of the Subaction {subaction_name} of Event ' + f'{k} Failed', + flush=True) diff --git a/tools/data/hacs/README-CN.md b/tools/data/hacs/README-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..cb1ab76727a36d0805f3ab6598b4e536aac03292 --- /dev/null +++ b/tools/data/hacs/README-CN.md @@ -0,0 +1,119 @@ +# ๅ‡†ๅค‡ HACS Segments + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{zhao2019hacs, + title={Hacs: Human action clips and segments dataset for recognition and temporal localization}, + author={Zhao, Hang and Torralba, Antonio and Torresani, Lorenzo and Yan, Zhicheng}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={8668--8678}, + year={2019} +} +``` + +### 0. ไธ‹่ฝฝ่ง†้ข‘ + +ๅœจๆˆ‘ไปฌๅผ€ๅง‹ๅ‡†ๅค‡ๆ•ฐๆฎ้›†ไน‹ๅ‰๏ผŒ่ฏทๆŒ‰็…ง[ๅฎ˜ๆ–นไปฃ็ ๅบ“](https://github.com/hangzhaomit/HACS-dataset)็š„ๆŒ‡ไปคไธ‹่ฝฝHACS Segmentsๆ•ฐๆฎ้›†ไธญ็š„่ง†้ข‘ใ€‚ๅฆ‚ๆžœๆœ‰่ง†้ข‘็ผบๅคฑ๏ผŒๆ‚จๅฏไปฅๅ‘HACSๆ•ฐๆฎ้›†ๅญ˜ๅ‚จๅบ“็š„็ปดๆŠค่€…ๆไบค่ฏทๆฑ‚ไปฅ่Žทๅ–็ผบๅคฑ็š„่ง†้ข‘ใ€‚ไฝ†ๆ˜ฏๅฆ‚ๆžœไธ€ไบ›่ง†้ข‘็ผบๅคฑ๏ผŒๆ‚จไป็„ถๅฏไปฅไธบMMAction2ๅ‡†ๅค‡ๆ•ฐๆฎ้›†ใ€‚ + +ๅœจไธ‹่ฝฝๅฎŒๆ•ฐๆฎ้›†ๅŽ๏ผŒ่ฏทๅฐ†ๆ•ฐๆฎ้›†ๆ–‡ไปถๅคน็งปๅŠจๅˆฐ(ๆˆ–่€…ไฝฟ็”จ่ฝฏ้“พๆŽฅ)`$MMACTION2/tools/data/hacs/`ใ€‚ๆ–‡ไปถๅคน็ป“ๆž„ๅบ”่ฏฅๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ data +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ tools +โ”‚ โ”œโ”€โ”€ hacs +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_feature_infer.py +โ”‚ โ”‚ โ”œโ”€โ”€ .. +โ”‚ โ”‚ โ”œโ”€โ”€ data +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ Applying_sunscreen +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_0Ch__DqMPwA.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_9CTDjFHl8WE.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. + + +``` + +ๅœจๅผ€ๅง‹ไน‹ๅ‰๏ผŒ่ฏท็กฎไฟๆ‚จไฝไบŽ`$MMACTION2/tools/data/hacs/`่ทฏๅพ„ไธ‹ใ€‚ + +### 1. 
ๆๅ–็‰นๅพ + +ไปฅไธ‹ๆ˜ฏไฝฟ็”จ[SlowOnly ResNet50 8x8](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py)ๅœจKinetics700ๆ•ฐๆฎ้›†ไธŠ้ข„่ฎญ็ปƒ็š„ๆจกๅž‹๏ผŒไปŽHACS่ง†้ข‘ไธญๆๅ–็‰นๅพใ€‚ๅฏนไบŽๆฏไธช่ง†้ข‘๏ผŒๆˆ‘ไปฌๅ‡ๅŒ€้‡‡ๆ ท100ไธช่ง†้ข‘็‰‡ๆฎต๏ผŒๅนถๆๅ–700็ปด่พ“ๅ‡บ๏ผˆsoftmaxไน‹ๅ‰๏ผ‰ไฝœไธบ็‰นๅพ๏ผŒๅณ็‰นๅพๅฝข็Šถไธบ100x700ใ€‚ + +้ฆ–ๅ…ˆ๏ผŒๆˆ‘ไปฌไฝฟ็”จๅฆ‚ไธ‹ๅ‘ฝไปค็”Ÿๆˆๆ•ฐๆฎ้›†็š„่ง†้ข‘ๅˆ—่กจ๏ผš + +``` +python generate_list.py +``` + +่ฟ™ๅฐ†็”Ÿๆˆไธ€ไธชไฝไบŽ`$MMACTION2/tools/data/hacs/`็š„`hacs_data.txt`ๆ–‡ไปถ๏ผŒๅ…ถๅ†…ๅฎนๆ ผๅผๅฆ‚ไธ‹๏ผš + +``` +Horseback_riding/v_Sr2BSq_8FMw.mp4 0 +Horseback_riding/v_EQb6OKoqz3Q.mp4 1 +Horseback_riding/v_vYKUV8TRngg.mp4 2 +Horseback_riding/v_Y8U0X1F-0ck.mp4 3 +Horseback_riding/v_hnspbB7wNh0.mp4 4 +Horseback_riding/v_HPhlhrT9IOk.mp4 5 +``` + +ๆŽฅไธ‹ๆฅ๏ผŒๆˆ‘ไปฌไฝฟ็”จ[slowonly_feature_infer.py](/tools/data/hacs/slowonly_feature_infer.py) ้…็ฝฎๆ–‡ไปถๆฅๆๅ–็‰นๅพ๏ผš + +``` +# ๆŒ‡ๅฎšๆๅ–็‰นๅพ็š„GPUๆ•ฐ้‡ +NUM_GPUS=8 + +# ไธ‹่ฝฝ้ข„่ฎญ็ปƒๆจกๅž‹ๆƒ้‡ +wget https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth + +bash ../mmaction2/tools/dist_test.sh \ + slowonly_feature_infer.py \ + slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth \ + $NUM_GPUS --dump result.pkl +``` + +ๆˆ‘ไปฌๅฐ†ๅพ—ๅˆฐไธ€ไธชๅไธบ `result.pkl` ็š„ๆ–‡ไปถ๏ผŒๅ…ถไธญๅŒ…ๅซๆฏไธช่ง†้ข‘็š„ๅคงๅฐไธบ100x700็š„็‰นๅพใ€‚ๆˆ‘ไปฌๅฐ†็‰นๅพ้‡ๅ†™ไธบcsvๆ ผๅผ๏ผŒๅนถไฟๅญ˜ๅœจ `$MMACTION2/data/HACS/` ็›ฎๅฝ•ไธ‹ใ€‚ + +``` +๏ผƒ็กฎไฟๆ‚จไฝไบŽ$ $MMACTION2/tools/data/hacs/ +python write_feature_csv.py +``` + +### 2. ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถ + +ๆˆ‘ไปฌ้ฆ–ๅ…ˆไปŽๅฎ˜ๆ–นไป“ๅบ“ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ๏ผš + +``` +wget https://github.com/hangzhaomit/HACS-dataset/raw/master/HACS_v1.1.1.zip +unzip HACS_v1.1.1.zip +``` + +่งฃๅŽ‹็ผฉๅŽ๏ผŒๅบ”่ฏฅๆœ‰ไธ€ไธชๅไธบ`HACS_v1.1.1`็š„ๆ–‡ไปถๅคน๏ผŒๅ…ถไธญๅŒ…ๅซไธ€ไธชๅไธบ`HACS_segments_v1.1.1.json`็š„ๆ–‡ไปถใ€‚ + +ๆˆ‘ไปฌๅœจ`$MMACTION2/data/HACS/`็›ฎๅฝ•ไธ‹็”Ÿๆˆ`hacs_anno_train.json`ใ€`hacs_anno_val.json`ๅ’Œ`hacs_anno_test.json`ๆ–‡ไปถ๏ผš + +``` +python3 generate_anotations.py +``` + +ๅฎŒๆˆ่ฟ™ไธคไธชๆญฅ้ชคๅŽ๏ผŒHACS Segmentsๆ•ฐๆฎ้›†็š„ๆ–‡ไปถๅคน็ป“ๆž„ๅบ”่ฏฅๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ HACS +โ”‚ โ”‚ โ”œโ”€โ”€ hacs_anno_train.json +โ”‚ โ”‚ โ”œโ”€โ”€ hacs_anno_val.json +โ”‚ โ”‚ โ”œโ”€โ”€ hacs_anno_test.json +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_feature +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_008gY2B8Pf4.csv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_0095rqic1n8.csv +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ tools + +``` diff --git a/tools/data/hacs/README.md b/tools/data/hacs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ea82c4e858ff217c23399489658a43787e41687f --- /dev/null +++ b/tools/data/hacs/README.md @@ -0,0 +1,119 @@ +# Preparing HACS Segments + +## Introduction + + + +```BibTeX +@inproceedings{zhao2019hacs, + title={Hacs: Human action clips and segments dataset for recognition and temporal localization}, + author={Zhao, Hang and Torralba, Antonio and Torresani, Lorenzo and Yan, Zhicheng}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={8668--8678}, + year={2019} +} +``` + +### Step 0. 
Download Videos + +Before we start preparing the dataset, please following the official [repository](https://github.com/hangzhaomit/HACS-dataset) to download videos from the HACS Segments dataset. You can submit a request for missing videos to the maintainer of the HACS dataset repository. But you can still prepare the dataset for MMAction2 if some videos are missing. + +After you finish downloading the dataset, please move the dataset folder to `$MMACTION2/tools/data/hacs/` or use a soft link. The the folder structure should look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ data +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ tools +โ”‚ โ”œโ”€โ”€ hacs +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_feature_infer.py +โ”‚ โ”‚ โ”œโ”€โ”€ .. +โ”‚ โ”‚ โ”œโ”€โ”€ data +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ Applying_sunscreen +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_0Ch__DqMPwA.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_9CTDjFHl8WE.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ .. + + +``` + +Before we start, make sure you are at `$MMACTION2/tools/data/hacs/`. + +### Step 1. Extract Features + +We extract features from the HACS videos using [SlowOnly ResNet50 8x8](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py) pretrained on Kinetics700 dataset. For each video, we uniformly sample 100 video clips and extract the 700-dimensional output (before softmax) as the feature, i.e., the feature shape is 100x700. + +First, we generate a video list of the dataset: + +``` +python generate_list.py +``` + +It will generate an `hacs_data.txt` file located at `$MMACTION2/tools/data/hacs/` which looks like: + +``` +Horseback_riding/v_Sr2BSq_8FMw.mp4 0 +Horseback_riding/v_EQb6OKoqz3Q.mp4 1 +Horseback_riding/v_vYKUV8TRngg.mp4 2 +Horseback_riding/v_Y8U0X1F-0ck.mp4 3 +Horseback_riding/v_hnspbB7wNh0.mp4 4 +Horseback_riding/v_HPhlhrT9IOk.mp4 5 +``` + +Next we use the [slowonly_feature_infer.py](/tools/data/hacs/slowonly_feature_infer.py) config to extract features: + +``` +# number of GPUs to extract feature +NUM_GPUS=8 + +# download the pretraining checkpoint +wget https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth + +bash ../mmaction2/tools/dist_test.sh \ + slowonly_feature_infer.py \ + slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth \ + $NUM_GPUS --dump result.pkl +``` + +We will get a `result.pkl` that contains the 100x700 feature for each video. We re-write the features into csv format at `$MMACTION2/data/HACS/`: + +``` +# Make sure you are at $MMACTION2/tools/data/hacs/ +python write_feature_csv.py +``` + +### Step 2. Prepare Annotations + +We first download the original annotations from the official repository: + +``` +wget https://github.com/hangzhaomit/HACS-dataset/raw/master/HACS_v1.1.1.zip +unzip HACS_v1.1.1.zip +``` + +After unzipping, there should be an `HACS_v1.1.1` folder with an `HACS_segments_v1.1.1.json` file in it. 
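+If you want a quick sanity check of the unzipped annotation file before converting it, the short snippet below may help (it is not part of the provided scripts; it only touches the fields that `generate_anotations.py` reads). It counts videos per subset and prints one example entry:
+
+```
+# Illustrative only: peek at the official HACS annotation file.
+import json
+from collections import Counter
+
+with open('HACS_v1.1.1/HACS_segments_v1.1.1.json') as f:
+    database = json.load(f)['database']
+
+# Number of videos in each subset.
+print(Counter(info['subset'] for info in database.values()))
+
+# One example entry: duration in seconds and its segment-level labels.
+youtube_id, info = next(iter(database.items()))
+print(youtube_id, info['duration'],
+      sorted({seg['label'] for seg in info['annotations']}))
+```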
+ +We generate `hacs_anno_train.json`, `hacs_anno_val.json` and `hacs_anno_test.json` files at `$MMACTION2/data/HACS/`: + +``` +python3 generate_anotations.py +``` + +After the two steps finished, the folder structure of the HACS Segments dataset should look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ HACS +โ”‚ โ”‚ โ”œโ”€โ”€ hacs_anno_train.json +โ”‚ โ”‚ โ”œโ”€โ”€ hacs_anno_val.json +โ”‚ โ”‚ โ”œโ”€โ”€ hacs_anno_test.json +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_feature +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_008gY2B8Pf4.csv +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_0095rqic1n8.csv +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ tools + +``` diff --git a/tools/data/hacs/generate_anotations.py b/tools/data/hacs/generate_anotations.py new file mode 100644 index 0000000000000000000000000000000000000000..206a6362036675a9ad4d18c58da6330e37df6971 --- /dev/null +++ b/tools/data/hacs/generate_anotations.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import multiprocessing +import os + +import decord + +with open('HACS_v1.1.1/HACS_segments_v1.1.1.json') as f: + all_annotations = json.load(f)['database'] + + +def parse_anno(key): + anno = {} + anno['duration_second'] = float(all_annotations[key]['duration']) + anno['annotations'] = all_annotations[key]['annotations'] + anno['subset'] = all_annotations[key]['subset'] + + labels = set([i['label'] for i in anno['annotations']]) + num_frames = int(anno['duration_second'] * 30) + for label in labels: + path = f'data/{label}/v_{key}.mp4' + if os.path.isfile(path): + vr = decord.VideoReader(path) + num_frames = len(vr) + break + + anno['feature_frame'] = anno['duration_frame'] = num_frames + anno['key'] = f'v_{key}' + return anno + + +pool = multiprocessing.Pool(16) +video_list = list(all_annotations) +outputs = pool.map(parse_anno, video_list) + +train_anno = {} +val_anno = {} +test_anno = {} + +for anno in outputs: + key = anno.pop('key') + subset = anno.pop('subset') + if subset == 'training': + train_anno[key] = anno + elif subset == 'validation': + val_anno[key] = anno + else: + test_anno[key] = anno + +outdir = '../../../data/HACS' +with open(f'{outdir}/hacs_anno_train.json', 'w') as f: + json.dump(train_anno, f) + +with open(f'{outdir}/hacs_anno_val.json', 'w') as f: + json.dump(val_anno, f) + +with open(f'{outdir}/hacs_anno_test.json', 'w') as f: + json.dump(test_anno, f) diff --git a/tools/data/hacs/generate_list.py b/tools/data/hacs/generate_list.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b25d55d11bb9f5115b86c32dc02c11fef46dac --- /dev/null +++ b/tools/data/hacs/generate_list.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os + +data_root = './data' + +video_list = [] +idx = 0 +for folder in os.listdir(data_root): + path = f'{data_root}/{folder}' + for video in os.listdir(path): + line = f'{folder}/{video} {idx}\n' + idx += 1 + video_list.append(line) + +with open('hacs_data.txt', 'w') as f: + for line in video_list: + f.write(line) diff --git a/tools/data/hacs/slowonly_feature_infer.py b/tools/data/hacs/slowonly_feature_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..3b52b04312077690012996747d19f4a149f7fc7b --- /dev/null +++ b/tools/data/hacs/slowonly_feature_infer.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
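+# Inference-only config used for HACS feature extraction: a SlowOnly-R50
+# recognizer with a 700-class (Kinetics-700) head samples 100 clips of
+# 8 frames per video and, because `average_clips=None`, dumps the raw
+# pre-softmax scores (a 100x700 array per video) into result.pkl via the
+# DumpResults evaluator.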
+default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=4, save_best='auto'), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')]) +log_level = 'INFO' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + lateral=False, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=700, + spatial_type='avg', + dropout_ratio=0.5, + average_clips=None), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +data_root = './data' +ann_file = 'hacs_data.txt' + +test_pipeline = [ + dict(type='DecordInit', io_backend='disk'), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=100, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='VideoDataset', + ann_file=ann_file, + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='DumpResults', out_file_path='result.pkl') +test_cfg = dict(type='TestLoop') diff --git a/tools/data/hacs/write_feature_csv.py b/tools/data/hacs/write_feature_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..18b0f34a907a49c3f149d4eb0168d735a937307f --- /dev/null +++ b/tools/data/hacs/write_feature_csv.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine + +features = mmengine.load('result.pkl') +video_list = mmengine.list_from_file('hacs_data.txt') +feature_dir = '../../../data/HACS/slowonly_feature' +mmengine.mkdir_or_exist(feature_dir) + +head = ','.join([f'f{i}' for i in range(700)]) + '\n' + +for feature, video in zip(features, video_list): + video_id = video.split()[0].split('/')[1] + csv_file = video_id.replace('mp4', 'csv') + feat = feature['pred_scores']['item'].numpy() + feat = feat.tolist() + csv_path = f'{feature_dir}/{csv_file}' + with open(csv_path, 'w') as f: + f.write(head) + for line in feat: + f.write(str(line)[1:-1] + '\n') diff --git a/tools/data/hmdb51/README.md b/tools/data/hmdb51/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3417f22944d8728c998b10f8e04091136dd8fea6 --- /dev/null +++ b/tools/data/hmdb51/README.md @@ -0,0 +1,125 @@ +# Preparing HMDB51 + +## Introduction + + + +```BibTeX +@article{Kuehne2011HMDBAL, + title={HMDB: A large video database for human motion recognition}, + author={Hilde Kuehne and Hueihan Jhuang and E. Garrote and T. 
Poggio and Thomas Serre}, + journal={2011 International Conference on Computer Vision}, + year={2011}, + pages={2556-2563} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/hmdb51/`. + +To run the bash scripts below, you need to install `unrar`. you can install it by `sudo apt-get install unrar`, +or refer to [this repo](https://github.com/innerlee/setup) by following the usage and taking [`zzunrar.sh`](https://github.com/innerlee/setup/blob/master/zzunrar.sh) +script for easy installation without sudo. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. + +```shell +bash download_videos.sh +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/hmdb51_extracted/ +ln -s /mnt/SSD/hmdb51_extracted/ ../../../data/hmdb51/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 4. Generate File List + +you can run the follow script to generate file list in the format of rawframes and videos. + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## Step 5. Check Directory Structure + +After the whole data process for HMDB51 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for HMDB51. + +In the context of the whole project (for HMDB51 only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ hmdb51 +โ”‚ โ”‚ โ”œโ”€โ”€ hmdb51_{train,val}_split_{1,2,3}_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ hmdb51_{train,val}_split_{1,2,3}_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ brush_hair +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0.avi + +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ wave +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0.avi +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ brush_hair +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0 +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... 
+โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ wave +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ winKen_wave_u_cm_np1_ri_bad_1 + +``` + +For training and evaluating on HMDB51, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/hmdb51/README_zh-CN.md b/tools/data/hmdb51/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..60d01d3c6e6ec9c8b2e23abb40d00da06be5fc9a --- /dev/null +++ b/tools/data/hmdb51/README_zh-CN.md @@ -0,0 +1,121 @@ +# ๅ‡†ๅค‡ HMDB51 + +## ็ฎ€ไป‹ + + + +```BibTeX +@article{Kuehne2011HMDBAL, + title={HMDB: A large video database for human motion recognition}, + author={Hilde Kuehne and Hueihan Jhuang and E. Garrote and T. Poggio and Thomas Serre}, + journal={2011 International Conference on Computer Vision}, + year={2011}, + pages={2556-2563} +} +``` + +็”จๆˆทๅฏไปฅๅ‚็…งๆ•ฐๆฎ้›† [ๅฎ˜็ฝ‘](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/)๏ผŒ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅ‡†ๅค‡ๆ•ฐๆฎ้›†ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/hmdb51/`ใ€‚ + +ไธบ่ฟ่กŒไธ‹้ข็š„ bash ่„šๆœฌ๏ผŒ้œ€่ฆๅฎ‰่ฃ… `unrar`ใ€‚็”จๆˆทๅฏ่ฟ่กŒ `sudo apt-get install unrar` ๅฎ‰่ฃ…๏ผŒๆˆ–ๅ‚็…ง [setup](https://github.com/innerlee/setup)๏ผŒ่ฟ่กŒ [`zzunrar.sh`](https://github.com/innerlee/setup/blob/master/zzunrar.sh) ่„šๆœฌๅฎž็Žฐๆ— ็ฎก็†ๅ‘˜ๆƒ้™ไธ‹็š„็ฎ€ๆ˜“ๅฎ‰่ฃ…ใ€‚ + +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ + +```shell +bash download_annotations.sh +``` + +## ๆญฅ้ชค 2. ไธ‹่ฝฝ่ง†้ข‘ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๆŒ‡ไปคไธ‹่ฝฝ่ง†้ข‘ + +```shell +bash download_videos.sh +``` + +## ๆญฅ้ชค 3. ๆŠฝๅ–ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธŠใ€‚ +็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒๆŒ‡ไปค่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"ไธŠ๏ผ‰ +mkdir /mnt/SSD/hmdb51_extracted/ +ln -s /mnt/SSD/hmdb51_extracted/ ../../../data/hmdb51/rawframes +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌ๏ผŒไฝฟ็”จ "tvl1" ็ฎ—ๆณ•่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +bash extract_frames.sh +``` + +## ๆญฅ้ชค 4. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## ๆญฅ้ชค 5. 
ๆฃ€ๆŸฅ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆˆ HMDB51 ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅๅพ—ๅˆฐ HMDB51 ็š„ RGB ๅธง + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒHMDB51 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ hmdb51 +โ”‚ โ”‚ โ”œโ”€โ”€ hmdb51_{train,val}_split_{1,2,3}_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ hmdb51_{train,val}_split_{1,2,3}_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ brush_hair +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0.avi + +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ wave +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0.avi +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ brush_hair +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0 +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ wave +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ winKen_wave_u_cm_np1_ri_bad_1 + +``` + +ๅ…ณไบŽๅฏน HMDB51 ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒๅฏไปฅๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/hmdb51/download_annotations.sh b/tools/data/hmdb51/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..bab3a4b9394eaa479dbbd43586c7068cc581ca60 --- /dev/null +++ b/tools/data/hmdb51/download_annotations.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/hmdb51/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} +wget http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/test_train_splits.rar --no-check-certificate + +# sudo apt-get install unrar +unrar x test_train_splits.rar +rm test_train_splits.rar + +mv testTrainMulti_7030_splits/*.txt ./ +rmdir testTrainMulti_7030_splits + +cd - diff --git a/tools/data/hmdb51/download_videos.sh b/tools/data/hmdb51/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..76dbede5f8ec1d648e0a2142b151f55d336e648e --- /dev/null +++ b/tools/data/hmdb51/download_videos.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/hmdb51/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +mkdir -p ./videos +cd ./videos + +wget http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar --no-check-certificate + +# sudo apt-get install unrar +unrar x ./hmdb51_org.rar +rm ./hmdb51_org.rar + +# extract all rar files with full path +for file in *.rar; do unrar x $file; done + +rm ./*.rar +cd "../../../tools/data/hmdb51" diff --git a/tools/data/hmdb51/extract_frames.sh b/tools/data/hmdb51/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..95df7125543b5d3a5ca1ce955b5f1ae739dfe0bd --- /dev/null +++ b/tools/data/hmdb51/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hmdb51/videos/ ../../data/hmdb51/rawframes/ --task both --level 2 --flow-type tvl1 +echo "Raw frames (RGB and Flow) Generated" +cd hmdb51/ diff --git a/tools/data/hmdb51/extract_rgb_frames.sh b/tools/data/hmdb51/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c8865e40451a2b20f52d7fabd09c9cc42581ea2 --- /dev/null +++ b/tools/data/hmdb51/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hmdb51/videos/ ../../data/hmdb51/rawframes/ --task rgb --level 2 --ext avi +echo "Genearte raw frames (RGB only)" + +cd hmdb51/ diff --git a/tools/data/hmdb51/extract_rgb_frames_opencv.sh b/tools/data/hmdb51/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..9e257635608a1871e6e4af9d8d7d615540d22da5 --- /dev/null +++ b/tools/data/hmdb51/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hmdb51/videos/ ../../data/hmdb51/rawframes/ --task rgb --level 2 --ext avi --use-opencv +echo "Genearte raw frames (RGB only)" + +cd hmdb51/ diff --git a/tools/data/hmdb51/generate_rawframes_filelist.sh b/tools/data/hmdb51/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..4fc16d55d96522162f145d442bfda1c98d676264 --- /dev/null +++ b/tools/data/hmdb51/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py hmdb51 data/hmdb51/rawframes/ --level 2 --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/hmdb51/ diff --git a/tools/data/hmdb51/generate_videos_filelist.sh b/tools/data/hmdb51/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d6fe7dcfb91a9baa62695892f1bd183b39cbd7b --- /dev/null +++ b/tools/data/hmdb51/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py hmdb51 data/hmdb51/videos/ --level 2 --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/hmdb51/ diff --git a/tools/data/hmdb51/label_map.txt b/tools/data/hmdb51/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..852af0f99d13744a041bd2fe50adc6437c86c4ed --- /dev/null +++ b/tools/data/hmdb51/label_map.txt @@ -0,0 +1,51 @@ +brush_hair +cartwheel +catch +chew +clap +climb +climb_stairs +dive +draw_sword +dribble +drink +eat +fall_floor +fencing +flic_flac +golf +handstand +hit +hug +jump +kick +kick_ball +kiss +laugh +pick +pour +pullup +punch +push +pushup +ride_bike +ride_horse +run +shake_hands +shoot_ball +shoot_bow +shoot_gun +sit +situp +smile +smoke +somersault +stand +swing_baseball +sword +sword_exercise +talk +throw +turn +walk +wave diff --git a/tools/data/hvu/README.md b/tools/data/hvu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d31a8725340f39aade2589da6b26e3af9503773d --- /dev/null +++ b/tools/data/hvu/README.md @@ -0,0 +1,123 @@ +# Preparing HVU + +## Introduction + + + +```BibTeX +@article{Diba2019LargeSH, + title={Large Scale Holistic Video Understanding}, + author={Ali Diba and M. Fayyaz and Vivek Sharma and Manohar Paluri and Jurgen Gall and R. Stiefelhagen and L. Gool}, + journal={arXiv: Computer Vision and Pattern Recognition}, + year={2019} +} +``` + +For basic dataset information, please refer to the official [project](https://github.com/holistic-video-understanding/HVU-Dataset/) and the [paper](https://arxiv.org/abs/1904.11451). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/hvu/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +Besides, you need to run the following command to parse the tag list of HVU. + +```shell +python parse_tag_list.py +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. +The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +You can use the following script to extract both RGB and Flow frames. + +```shell +bash extract_frames.sh +``` + +By default, we generate frames with short edge resized to 256. +More details can be found in [prepare_dataset](/docs/en/user_guides/prepare_dataset.md) + +## Step 4. Generate File List + +You can run the follow scripts to generate file list in the format of videos and rawframes, respectively. + +```shell +bash generate_videos_filelist.sh +# execute the command below when rawframes are ready +bash generate_rawframes_filelist.sh +``` + +## Step 5. Generate File List for Each Individual Tag Categories + +This part is **optional** if you don't want to train models on HVU for a specific tag category. + +The file list generated in step 4 contains labels of different categories. These file lists can only be +handled with HVUDataset and used for multi-task learning of different tag categories. The component +`LoadHVULabel` is needed to load the multi-category tags, and the `HVULoss` should be used to train +the model. + +If you only want to train video recognition models for a specific tag category, i.e. 
you want to train +a recognition model on HVU which only handles tags in the category `action`, we recommend you to use +the following command to generate file lists for the specific tag category. The new list, which only +contains tags of a specific category, can be handled with `VideoDataset` or `RawframeDataset`. The +recognition models can be trained with `BCELossWithLogits`. + +The following command generates file list for the tag category ${category}, note that the tag category you +specified should be in the 6 tag categories available in HVU: \['action', 'attribute', 'concept', 'event', +'object', 'scene'\]. + +```shell +python generate_sub_file_list.py path/to/filelist.json ${category} +``` + +The filename of the generated file list for ${category} is generated by replacing `hvu` in the original +filename with `hvu_${category}`. For example, if the original filename is `hvu_train.json`, the filename +of the file list for action is `hvu_action_train.json`. + +## Step 6. Folder Structure + +After the whole data pipeline for HVU preparation. +you can get the rawframes (RGB + Flow), videos and annotation files for HVU. + +In the context of the whole project (for HVU only), the full folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ hvu +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_train_video.json +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_val_video.json +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_train.json +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_val.json +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos_train +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ OLpWTpTC4P8_000570_000670.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ xsPKW4tZZBc_002330_002430.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ videos_val +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_train +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_val + +``` + +For training and evaluating on HVU, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/hvu/README_zh-CN.md b/tools/data/hvu/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..5bb1185c9c87db52a448dfea590bad20dc6534a3 --- /dev/null +++ b/tools/data/hvu/README_zh-CN.md @@ -0,0 +1,110 @@ +# ๅ‡†ๅค‡ HVU + +## ็ฎ€ไป‹ + + + +```BibTeX +@article{Diba2019LargeSH, + title={Large Scale Holistic Video Understanding}, + author={Ali Diba and M. Fayyaz and Vivek Sharma and Manohar Paluri and Jurgen Gall and R. Stiefelhagen and L. Gool}, + journal={arXiv: Computer Vision and Pattern Recognition}, + year={2019} +} +``` + +่ฏทๅ‚็…ง [ๅฎ˜ๆ–น้กน็›ฎ](https://github.com/holistic-video-understanding/HVU-Dataset/) ๅŠ [ๅŽŸ่ฎบๆ–‡](https://arxiv.org/abs/1904.11451) ไปฅ่Žทๅ–ๆ•ฐๆฎ้›†ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅผ€ๅง‹ไน‹ๅ‰๏ผŒ็”จๆˆท้œ€็กฎไฟๅฝ“ๅ‰็›ฎๅฝ•ไธบ `$MMACTION2/tools/data/hvu/`ใ€‚ + +## 1. ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถๅนถ่ฟ›่กŒ้ข„ๅค„็†๏ผš + +```shell +bash download_annotations.sh +``` + +ๆญคๅค–๏ผŒ็”จๆˆทๅฏไฝฟ็”จๅฆ‚ไธ‹ๅ‘ฝไปค่งฃๆž HVU ็š„ๆ ‡็ญพๅˆ—่กจ๏ผš + +```shell +python parse_tag_list.py +``` + +## 2. ๅ‡†ๅค‡่ง†้ข‘ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌๅ‡†ๅค‡่ง†้ข‘๏ผŒ่ง†้ข‘ๅ‡†ๅค‡ไปฃ็ ไฟฎๆ”น่‡ช [ActivityNet ็ˆฌ่™ซ](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)ใ€‚ +ๆณจๆ„่ฟ™ไธ€ๆญฅ้ชคๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ดใ€‚ + +```shell +bash download_videos.sh +``` + +## 3. 
ๆๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทไป…ไฝฟ็”จ video loader๏ผŒๅˆ™ๅฏไปฅ่ทณ่ฟ‡ๆœฌๆญฅใ€‚ + +ๅœจๆๅ–ไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +็”จๆˆทๅฏไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌๅŒๆ—ถๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผš + +```shell +bash extract_frames.sh +``` + +่ฏฅ่„šๆœฌ้ป˜่ฎค็”Ÿๆˆ็Ÿญ่พน้•ฟๅบฆไธบ 256 ็š„ๅธง๏ผŒๅฏๅ‚่€ƒ [ๆ•ฐๆฎๅ‡†ๅค‡](/docs/zh_cn/user_guides/prepare_dataset.md) ่Žทๅพ—ๆ›ดๅคš็ป†่Š‚ใ€‚ + +## 4. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ไธคไธช่„šๆœฌๅˆ†ๅˆซไธบ่ง†้ข‘ๅ’Œๅธงๆ–‡ไปถๅคน็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ๏ผš + +```shell +bash generate_videos_filelist.sh +# ไธบๅธงๆ–‡ไปถๅคน็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ +bash generate_rawframes_filelist.sh +``` + +## 5. ไธบๆฏไธช tag ็ง็ฑป็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +่‹ฅ็”จๆˆท้œ€่ฆไธบ HVU ๆ•ฐๆฎ้›†็š„ๆฏไธช tag ็ง็ฑป่ฎญ็ปƒ่ฏ†ๅˆซๆจกๅž‹๏ผŒๅˆ™้œ€่ฆ่ฟ›่กŒๆญคๆญฅ้ชคใ€‚ + +ๆญฅ้ชค 4 ไธญ็”Ÿๆˆ็š„ๆ–‡ไปถๅˆ—่กจๅŒ…ๅซไธๅŒ็ฑปๅž‹็š„ๆ ‡็ญพ๏ผŒไป…ๆ”ฏๆŒไฝฟ็”จ HVUDataset ่ฟ›่กŒๆถ‰ๅŠๅคšไธชๆ ‡็ญพ็ง็ฑป็š„ๅคšไปปๅŠกๅญฆไน ใ€‚ๅŠ ่ฝฝๆ•ฐๆฎ็š„่ฟ‡็จ‹ไธญ้œ€่ฆไฝฟ็”จ `LoadHVULabel` ็ฑป่ฟ›่กŒๅคš็ฑปๅˆซๆ ‡็ญพ็š„ๅŠ ่ฝฝ๏ผŒ่ฎญ็ปƒ่ฟ‡็จ‹ไธญไฝฟ็”จ `HVULoss` ไฝœไธบๆŸๅคฑๅ‡ฝๆ•ฐใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทไป…้œ€่ฎญ็ปƒๆŸไธ€็‰นๅฎš็ฑปๅˆซ็š„ๆ ‡็ญพ๏ผŒไพ‹ๅฆ‚่ฎญ็ปƒไธ€่ฏ†ๅˆซๆจกๅž‹็”จไบŽ่ฏ†ๅˆซ HVU ไธญ `action` ็ฑปๅˆซ็š„ๆ ‡็ญพ๏ผŒๅˆ™ๅปบ่ฎฎไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌไธบ็‰นๅฎšๆ ‡็ญพ็ง็ฑป็”Ÿๆˆๆ–‡ไปถๅˆ—่กจใ€‚ๆ–ฐ็”Ÿๆˆ็š„ๅˆ—่กจๅฐ†ๅชๅซๆœ‰็‰นๅฎš็ฑปๅˆซ็š„ๆ ‡็ญพ๏ผŒๅ› ๆญคๅฏไฝฟ็”จ `VideoDataset` ๆˆ– `RawframeDataset` ่ฟ›่กŒๅŠ ่ฝฝใ€‚่ฎญ่ฎญ็ปƒ่ฟ‡็จ‹ไธญไฝฟ็”จ `BCELossWithLogits` ไฝœไธบๆŸๅคฑๅ‡ฝๆ•ฐใ€‚ + +ไปฅไธ‹่„šๆœฌไธบ็ฑปๅˆซไธบ ${category} ็š„ๆ ‡็ญพ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ๏ผŒๆณจๆ„ไป…ๆ”ฏๆŒ HVU ๆ•ฐๆฎ้›†ๅŒ…ๅซ็š„ 6 ็งๆ ‡็ญพ็ฑปๅˆซ: action, attribute, concept, event, object, sceneใ€‚ + +```shell +python generate_sub_file_list.py path/to/filelist.json ${category} +``` + +ๅฏนไบŽ็ฑปๅˆซ ${category}๏ผŒ็”Ÿๆˆ็š„ๆ ‡็ญพๅˆ—่กจๆ–‡ไปถๅไธญๅฐ†ไฝฟ็”จ `hvu_${category}` ๆ›ฟไปฃ `hvu`ใ€‚ไพ‹ๅฆ‚๏ผŒ่‹ฅๅŽŸๆŒ‡ๅฎšๆ–‡ไปถๅไธบ `hvu_train.json`๏ผŒๅˆ™ๅฏนไบŽ็ฑปๅˆซ action๏ผŒ็”Ÿๆˆ็š„ๆ–‡ไปถๅˆ—่กจๅไธบ `hvu_action_train.json`ใ€‚ + +## 6. ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆ•ดๅฎŒๆˆ HVU ็š„ๆ•ฐๆฎๅค„็†ๅŽ๏ผŒๅฐ†ๅพ—ๅˆฐๅธงๆ–‡ไปถๅคน๏ผˆRGB ๅธงๅ’Œๅ…‰ๆตๅธง๏ผ‰๏ผŒ่ง†้ข‘ไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช้กน็›ฎ็›ฎๅฝ•ไธ‹๏ผˆไป…้’ˆๅฏน HVU๏ผ‰๏ผŒๅฎŒๆ•ด็›ฎๅฝ•็ป“ๆž„ๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ hvu +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_train_video.json +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_val_video.json +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_train.json +โ”‚ โ”‚ โ”œโ”€โ”€ hvu_val.json +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos_train +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ OLpWTpTC4P8_000570_000670.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ xsPKW4tZZBc_002330_002430.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ videos_val +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_train +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_val + +``` + +ๅ…ณไบŽ HVU ๆ•ฐๆฎ้›†ไธŠ็š„่ฎญ็ปƒไธŽๆต‹่ฏ•๏ผŒ่ฏทๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/hvu/download.py b/tools/data/hvu/download.py new file mode 100644 index 0000000000000000000000000000000000000000..c86b4da6cac0311bf0ea560151ad5ecbb18b3d48 --- /dev/null +++ b/tools/data/hvu/download.py @@ -0,0 +1,203 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/activitynet/ActivityNet/ +# Original licence: Copyright (c) Microsoft, under the MIT License. 
+# ------------------------------------------------------------------------------ + +import argparse +import glob +import os +import shutil +import ssl +import subprocess +import uuid + +import mmengine +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context +args = None + + +def create_video_folders(output_dir, tmp_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + +def construct_video_filename(item, trim_format, output_dir): + """Given a dataset row, this function constructs the output filename for a + given video.""" + youtube_id, start_time, end_time = item + start_time, end_time = int(start_time * 10), int(end_time * 10) + basename = '%s_%s_%s.mp4' % (youtube_id, trim_format % start_time, + trim_format % end_time) + output_filename = os.path.join(output_dir, basename) + return output_filename + + +def download_clip(video_identifier, + output_filename, + start_time, + end_time, + tmp_dir='/tmp/hvu/.tmp_dir', + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + start_time: float + Indicates the beginning time in seconds from where the video + will be trimmed. + end_time: float + Indicates the ending time in seconds of the trimmed video. + """ + # Defensive argument checking. + assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4()) + + if not os.path.exists(output_filename): + if not os.path.exists(tmp_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', + '--no-check-certificate', '-f', 'mp4', '-o', + '"%s"' % tmp_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Downloading Failed' + else: + break + + tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] + # Construct command to trim the videos (ffmpeg required). + command = [ + 'ffmpeg', '-i', + '"%s"' % tmp_filename, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '1', '-loglevel', 'panic', + '"%s"' % output_filename + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + return status, 'Trimming Failed' + + # Check if the video was successfully saved. 
+ status = os.path.exists(output_filename) + os.remove(tmp_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(item, trim_format, tmp_dir, output_dir): + """Wrapper for parallel processing purposes.""" + output_filename = construct_video_filename(item, trim_format, output_dir) + clip_id = os.path.basename(output_filename).split('.mp4')[0] + if os.path.exists(output_filename): + status = tuple([clip_id, True, 'Exists']) + return status + + youtube_id, start_time, end_time = item + downloaded, log = download_clip( + youtube_id, output_filename, start_time, end_time, tmp_dir=tmp_dir) + + status = tuple([clip_id, downloaded, log]) + return status + + +def parse_hvu_annotations(input_csv): + """Returns a parsed DataFrame. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'Tags, youtube_id, time_start, time_end' + returns: + ------- + dataset: List of tuples. Each tuple consists of + (youtube_id, time_start, time_end). The type of time is float. + """ + lines = open(input_csv).readlines() + lines = [x.strip().split(',')[1:] for x in lines[1:]] + + lines = [(x[0], float(x[1]), float(x[2])) for x in lines] + + return lines + + +def main(input_csv, + output_dir, + trim_format='%06d', + num_jobs=24, + tmp_dir='/tmp/hvu'): + + tmp_dir = os.path.join(tmp_dir, '.tmp_dir') + + # Reading and parsing HVU. + dataset = parse_hvu_annotations(input_csv) + + # Creates folders where videos will be saved later. + create_video_folders(output_dir, tmp_dir) + + # Download all clips. + if num_jobs == 1: + status_lst = [] + for item in dataset: + status_lst.append( + download_clip_wrapper(item, trim_format, tmp_dir, output_dir)) + else: + status_lst = Parallel(n_jobs=num_jobs)( + delayed(download_clip_wrapper)(item, trim_format, tmp_dir, + output_dir) for item in dataset) + + # Clean tmp dir. + shutil.rmtree(tmp_dir) + # Save download report. + mmengine.dump(status_lst, 'download_report.json') + + +if __name__ == '__main__': + description = 'Helper script for downloading and trimming HVU videos.' + p = argparse.ArgumentParser(description=description) + p.add_argument( + 'input_csv', + type=str, + help=('CSV file containing the following format: ' + 'Tags, youtube_id, time_start, time_end')) + p.add_argument( + 'output_dir', + type=str, + help='Output directory where videos will be saved.') + p.add_argument( + '-f', + '--trim-format', + type=str, + default='%06d', + help=('This will be the format for the ' + 'filename of trimmed videos: ' + 'videoid_%0xd(start_time)_%0xd(end_time).mp4. ' + 'Note that the start_time is multiplied by 10 since ' + 'decimal exists somewhere. ')) + p.add_argument('-n', '--num-jobs', type=int, default=24) + p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/hvu') + main(**vars(p.parse_args())) diff --git a/tools/data/hvu/download_annotations.sh b/tools/data/hvu/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..a247cc12c6d7faa634c609fb7a566353883bda19 --- /dev/null +++ b/tools/data/hvu/download_annotations.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/hvu/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +git clone https://github.com/holistic-video-understanding/HVU-Dataset.git + +cd HVU-Dataset +unzip -o HVU_Train_V1.0.zip +unzip -o HVU_Val_V1.0.zip +cd .. 
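+# Move the extracted CSVs into ${DATA_DIR}, where the download and file-list scripts expect them.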
+mv HVU-Dataset/HVU_Train_V1.0.csv ${DATA_DIR}/hvu_train.csv +mv HVU-Dataset/HVU_Val_V1.0.csv ${DATA_DIR}/hvu_val.csv +mv HVU-Dataset/HVU_Tags_Categories_V1.0.csv ${DATA_DIR}/hvu_categories.csv + +rm -rf HVU-Dataset diff --git a/tools/data/hvu/download_videos.sh b/tools/data/hvu/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..5d2b7167f27398de908f5e7fe10b5fe2e4be88ea --- /dev/null +++ b/tools/data/hvu/download_videos.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate hvu +pip install mmcv +pip install --upgrade youtube-dl + +DATA_DIR="../../../data/hvu" +ANNO_DIR="../../../data/hvu/annotations" +python download.py ${ANNO_DIR}/hvu_train.csv ${DATA_DIR}/videos_train +python download.py ${ANNO_DIR}/hvu_val.csv ${DATA_DIR}/videos_val + +source deactivate hvu +conda remove -n hvu --all diff --git a/tools/data/hvu/environment.yml b/tools/data/hvu/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d9959e88a91881de1be2d38928c63e9aa79938 --- /dev/null +++ b/tools/data/hvu/environment.yml @@ -0,0 +1,36 @@ +name: kinetics +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/hvu/extract_frames.sh b/tools/data/hvu/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..c81814ccd39219b3836cc0fc2bfe4a6ce929a57d --- /dev/null +++ b/tools/data/hvu/extract_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hvu/videos_train/ ../../data/hvu/rawframes_train/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for train set" + +python build_rawframes.py ../../data/hvu/videos_val/ ../../data/hvu/rawframes_val/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for val set" + +cd hvu/ diff --git a/tools/data/hvu/generate_file_list.py b/tools/data/hvu/generate_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..3cbfba8ace63dd717747b54124cb624c8c4e921e --- /dev/null +++ b/tools/data/hvu/generate_file_list.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import fnmatch +import glob +import os +import os.path as osp + +import mmengine + +annotation_root = '../../data/hvu/annotations' +tag_file = 'hvu_tags.json' +args = None + + +def parse_directory(path, + rgb_prefix='img_', + flow_x_prefix='flow_x_', + flow_y_prefix='flow_y_', + level=1): + """Parse directories holding extracted frames from standard benchmarks. + + Args: + path (str): Directory path to parse frames. + rgb_prefix (str): Prefix of generated rgb frames name. + default: 'img_'. + flow_x_prefix (str): Prefix of generated flow x name. + default: `flow_x_`. + flow_y_prefix (str): Prefix of generated flow y name. + default: `flow_y_`. + level (int): Directory level for glob searching. 
Options are 1 and 2.
+            default: 1.
+
+    Returns:
+        dict: frame info dict with video id as key and tuple(path(str),
+            rgb_num(int), flow_x_num(int)) as value.
+    """
+    print(f'parse frames under directory {path}')
+    if level == 1:
+        # Only search for one-level directory
+        def locate_directory(x):
+            return osp.basename(x)
+
+        frame_dirs = glob.glob(osp.join(path, '*'))
+
+    elif level == 2:
+        # search for two-level directory
+        def locate_directory(x):
+            return osp.join(osp.basename(osp.dirname(x)), osp.basename(x))
+
+        frame_dirs = glob.glob(osp.join(path, '*', '*'))
+
+    else:
+        raise ValueError('level can be only 1 or 2')
+
+    def count_files(directory, prefix_list):
+        """Count file number with a given directory and prefix.
+
+        Args:
+            directory (str): Data directory to be searched.
+            prefix_list (list): List of prefixes.
+
+        Returns:
+            list: Number of files with each prefix.
+        """
+        lst = os.listdir(directory)
+        cnt_list = [len(fnmatch.filter(lst, x + '*')) for x in prefix_list]
+        return cnt_list
+
+    # check RGB
+    frame_dict = {}
+    for i, frame_dir in enumerate(frame_dirs):
+        total_num = count_files(frame_dir,
+                                (rgb_prefix, flow_x_prefix, flow_y_prefix))
+        dir_name = locate_directory(frame_dir)
+
+        num_x = total_num[1]
+        num_y = total_num[2]
+        if num_x != num_y:
+            raise ValueError(f'x and y direction have different number '
+                             f'of flow images in video directory: {frame_dir}')
+        if i % 200 == 0:
+            print(f'{i} videos parsed')
+
+        frame_dict[dir_name] = (frame_dir, total_num[0], num_x)
+
+    print('frame directory analysis done')
+    return frame_dict
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='build file list for HVU')
+    parser.add_argument('--input_csv', type=str, help='path of input csv file')
+    parser.add_argument(
+        '--src_dir', type=str, help='source video / frames directory')
+    parser.add_argument(
+        '--output', type=str, help='output filename, should end with .json')
+    parser.add_argument(
+        '--mode',
+        type=str,
+        choices=['frames', 'videos'],
+        help='generate file list for frames or videos')
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    tag_cates = mmengine.load(tag_file)
+    tag2category = {}
+    for k in tag_cates:
+        for tag in tag_cates[k]:
+            tag2category[tag] = k
+
+    data_list = open(args.input_csv).readlines()
+    data_list = [x.strip().split(',') for x in data_list[1:]]
+
+    if args.mode == 'videos':
+        downloaded = os.listdir(args.src_dir)
+        downloaded = [x.split('.')[0] for x in downloaded]
+        downloaded_set = set(downloaded)
+    else:
+        parse_result = parse_directory(args.src_dir)
+        downloaded_set = set(parse_result)
+
+    def parse_line(line):
+        tags, youtube_id, start, end = line
+        start, end = int(float(start) * 10), int(float(end) * 10)
+        newname = f'{youtube_id}_{start:06d}_{end:06d}'
+        tags = tags.split('|')
+        all_tags = {}
+        for tag in tags:
+            category = tag2category[tag]
+            all_tags.setdefault(category,
+                                []).append(tag_cates[category].index(tag))
+        return newname, all_tags
+
+    data_list = [parse_line(line) for line in data_list]
+    data_list = [line for line in data_list if line[0] in downloaded_set]
+
+    if args.mode == 'frames':
+        result = [
+            dict(
+                frame_dir=k[0], total_frames=parse_result[k[0]][1], label=k[1])
+            for k in data_list
+        ]
+    elif args.mode == 'videos':
+        result = [dict(filename=k[0] + '.mp4', label=k[1]) for k in data_list]
+    mmengine.dump(result, args.output)
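To make the conversion in `parse_line` above concrete, here is a small self-contained sketch of how one CSV row
becomes a (clip name, label dict) pair; the two-category tag vocabulary is a toy stand-in for the real
`hvu_tags.json` tag list used above:

```python
# Toy tag vocabulary standing in for hvu_tags.json (category -> ordered tag list).
tag_cates = {'action': ['abseiling', 'archery'], 'scene': ['beach', 'forest']}
tag2category = {tag: cate for cate, tags in tag_cates.items() for tag in tags}

# One parsed CSV row: 'Tags, youtube_id, time_start, time_end'.
tags, youtube_id, start, end = 'archery|forest', 'OLpWTpTC4P8', '57.0', '67.0'
start, end = int(float(start) * 10), int(float(end) * 10)
newname = f'{youtube_id}_{start:06d}_{end:06d}'  # 'OLpWTpTC4P8_000570_000670'

label = {}
for tag in tags.split('|'):
    cate = tag2category[tag]
    label.setdefault(cate, []).append(tag_cates[cate].index(tag))
# label == {'action': [1], 'scene': [1]}
```

The real script additionally drops clips that were not successfully downloaded (or whose frames were not
extracted) before dumping the list with `mmengine.dump`.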
diff --git a/tools/data/hvu/generate_rawframes_filelist.sh b/tools/data/hvu/generate_rawframes_filelist.sh
new file mode 100644
index 0000000000000000000000000000000000000000..68c33b258817cf2db3104df6be34d3003d04ce7d
--- /dev/null
+++ b/tools/data/hvu/generate_rawframes_filelist.sh
@@ -0,0 +1,5 @@
+# to generate file lists of frames
+python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_train.csv --src_dir ../../../data/hvu/rawframes_train \
+    --output ../../../data/hvu/hvu_train.json --mode frames
+python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_val.csv --src_dir ../../../data/hvu/rawframes_val \
+    --output ../../../data/hvu/hvu_val.json --mode frames
diff --git a/tools/data/hvu/generate_sub_file_list.py b/tools/data/hvu/generate_sub_file_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..f304c7f264b4e981c5af2335b9a690543d3e3350
--- /dev/null
+++ b/tools/data/hvu/generate_sub_file_list.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+import mmengine
+
+
+def main(annotation_file, category):
+    assert category in [
+        'action', 'attribute', 'concept', 'event', 'object', 'scene'
+    ]
+
+    data = mmengine.load(annotation_file)
+    basename = osp.basename(annotation_file)
+    dirname = osp.dirname(annotation_file)
+    basename = basename.replace('hvu', f'hvu_{category}')
+
+    target_file = osp.join(dirname, basename)
+
+    result = []
+    for item in data:
+        label = item['label']
+        if category in label:
+            item['label'] = label[category]
+            result.append(item)
+
+    # Dump only the filtered entries that carry the requested category.
+    mmengine.dump(result, target_file)
+
+
+if __name__ == '__main__':
+    description = 'Helper script for generating HVU per-category file list.'
+    p = argparse.ArgumentParser(description=description)
+    p.add_argument(
+        'annotation_file',
+        type=str,
+        help='The annotation file which contains tags of all categories.')
+    p.add_argument(
+        'category',
+        type=str,
+        choices=['action', 'attribute', 'concept', 'event', 'object', 'scene'],
+        help='The tag category that you want to generate file list for.')
+    main(**vars(p.parse_args()))
diff --git a/tools/data/hvu/generate_videos_filelist.sh b/tools/data/hvu/generate_videos_filelist.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8bcbd03a7822d9e5fa181d031b610ac2eaa400c0
--- /dev/null
+++ b/tools/data/hvu/generate_videos_filelist.sh
@@ -0,0 +1,5 @@
+# to generate file lists of videos
+python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_train.csv --src_dir ../../../data/hvu/videos_train \
+    --output ../../../data/hvu/hvu_train_video.json --mode videos
+python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_val.csv --src_dir ../../../data/hvu/videos_val \
+    --output ../../../data/hvu/hvu_val_video.json --mode videos
diff --git a/tools/data/hvu/label_map.json b/tools/data/hvu/label_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6525d473f307e5ad58b48d51515cfff6a120275
--- /dev/null
+++ b/tools/data/hvu/label_map.json
@@ -0,0 +1 @@
+{"action": ["abseiling", "acrobatics", "acting_in_play", "adjusting_glasses", "air_drumming", "alligator_wrestling", "alpine_skiing", "american_football", "angling", "answering_questions", "applauding", "applying_cream", "archaeological_excavation", "archery", "arguing", "arm_wrestling", "arranging_flowers", "assembling_bicycle", "assembling_computer", "attending_conference", "auctioning", "auto_racing", "backflip_human_", "baking_cookies", "ball_game", "bandaging", "barbequing", "bartending", "base_jumping", "baseball", "basketball_moves", "bathing",
"bathing_dog", "baton_twirling", "battle_rope_training", "beach_soccer", "beatboxing", "bee_keeping", "belly_dancing", "bench_pressing", "bending_back", "bending_metal", "biking_through_snow", "blasting_sand", "blowdrying_hair", "blowing_bubble_gum", "blowing_glass", "blowing_leaves", "blowing_nose", "blowing_out_candles", "bmx", "boating", "bobsledding", "bodybuilding", "bodysurfing", "bookbinding", "bottling", "bouldering", "bouncing_on_bouncy_castle", "bouncing_on_trampoline", "bowling", "boxing", "braiding_hair", "breading_or_breadcrumbing", "breakdancing", "breaking_boards", "breathing_fire", "brush_painting", "brushing_hair", "brushing_teeth", "building_cabinet", "building_lego", "building_sandcastle", "building_shed", "bull_fighting", "bulldozing", "bungee_jumping", "burping", "busking", "calculating", "calf_roping", "calligraphy", "canoeing_or_kayaking", "capoeira", "capsizing", "card_game", "card_stacking", "card_throwing", "carrying_baby", "cartwheeling", "carving_ice", "carving_pumpkin", "casting_fishing_line", "catching_fish", "catching_or_throwing_baseball", "catching_or_throwing_frisbee", "catching_or_throwing_softball", "caving", "celebrating", "changing_gear_in_car", "changing_oil", "changing_wheel_not_on_bike_", "checking_tires", "cheering", "cheerleading", "chewing_gum", "chiseling_stone", "chiseling_wood", "chopping_meat", "chopping_vegetables", "chopping_wood", "choreography", "clam_digging", "clapping", "clay_pottery_making", "clean_and_jerk", "cleaning_gutters", "cleaning_pool", "cleaning_shoes", "cleaning_toilet", "cleaning_windows", "climbing", "climbing_a_rope", "climbing_ladder", "climbing_tree", "clipping_cat_claws", "coloring_in", "combing_hair", "contact_juggling", "contorting", "control", "cooking", "cooking_egg", "cooking_on_campfire", "cooking_sausages_not_on_barbeque_", "cooking_scallops", "cosplaying", "counting_money", "country_line_dancing", "cracking_back", "cracking_knuckles", "cracking_neck", "craft", "crawling_baby", "crochet", "croquet", "cross", "cross_country_cycling", "crossing_eyes", "crossing_river", "crying", "cumbia", "curling_hair", "curling_sport_", "cutting_apple", "cutting_nails", "cutting_orange", "cutting_pineapple", "cutting_the_grass", "cutting_watermelon", "cycling", "dance", "dancing_ballet", "dancing_charleston", "dancing_gangnam_style", "dancing_macarena", "deadlifting", "decorating_the_christmas_tree", "delivering_mail", "dining", "directing_traffic", "disc_dog", "disc_golfing", "diving", "diving_cliff", "docking_boat", "dodgeball", "doing_a_powerbomb", "doing_aerobics", "doing_jigsaw_puzzle", "doing_karate", "doing_kickboxing", "doing_laundry", "doing_motocross", "doing_nails", "downhill_mountain_biking", "drawing", "dribbling_basketball", "drinking", "drinking_shots", "driving_car", "driving_tractor", "drooling", "drop_kicking", "drum_corps", "drumming_fingers", "dumpster_diving", "dunking_basketball", "dyeing_eyebrows", "dyeing_hair", "eating", "eating_burger", "eating_cake", "eating_carrots", "eating_chips", "eating_doughnuts", "eating_hotdog", "eating_ice_cream", "eating_spaghetti", "eating_watermelon", "egg_hunting", "embroidering", "equitation", "exercising_with_an_exercise_ball", "extinguishing_fire", "faceplanting", "falling_off_bike", "falling_off_chair", "feeding_birds", "feeding_fish", "feeding_goats", "fencing_sport_", "fidgeting", "fight", "figure_skating", "finger_snapping", "fishing", "fixing_bicycle", "fixing_hair", "fixing_the_roof", "flint_knapping", "flipping_pancake", "fly_casting", "fly_fishing", 
"fly_tying", "flying_kite", "folding_clothes", "folding_napkins", "folding_paper", "folk_dance", "front_raises", "frying", "frying_vegetables", "futsal", "gambling", "geocaching", "getting_a_haircut", "getting_a_piercing", "getting_a_tattoo", "giving_or_receiving_award", "gliding", "gold_panning", "golf", "golf_chipping", "golf_driving", "golf_putting", "gospel_singing_in_church", "grappling", "grilling", "grinding_meat", "grooming_dog", "grooming_horse", "gymnastics", "gymnastics_tumbling", "hammer_throw", "hand_car_wash", "hand_washing_clothes", "harvest", "head_stand", "headbanging", "headbutting", "high_jump", "high_kick", "historical_reenactment", "hitting_a_pinata", "hitting_baseball", "hockey_stop", "holding_snake", "home_roasting_coffee", "hopscotch", "hoverboarding", "huddling", "hugging_baby", "hugging_not_baby_", "hula_hooping", "hunt_seat", "hurdling", "hurling_sport_", "ice_climbing", "ice_fishing", "ice_skating", "ice_swimming", "inflating_balloons", "inline_skating", "installing_carpet", "ironing", "ironing_hair", "javelin_throw", "jaywalking", "jetskiing", "jogging", "juggling_balls", "juggling_fire", "juggling_soccer_ball", "jumping", "jumping_bicycle", "jumping_into_pool", "jumping_jacks", "jumpstyle_dancing", "karaoke", "kicking_field_goal", "kicking_soccer_ball", "kissing", "kitesurfing", "knitting", "krumping", "land_sailing", "laughing", "lawn_mower_racing", "laying_bricks", "laying_concrete", "laying_stone", "laying_tiles", "layup_drill_in_basketball", "learning", "leatherworking", "licking", "lifting_hat", "lighting_fire", "lock_picking", "logging", "long_jump", "longboarding", "looking_at_phone", "luge", "lunge", "making_a_cake", "making_a_lemonade", "making_a_sandwich", "making_an_omelette", "making_balloon_shapes", "making_bubbles", "making_cheese", "making_horseshoes", "making_jewelry", "making_paper_aeroplanes", "making_pizza", "making_snowman", "making_sushi", "making_tea", "making_the_bed", "marching", "marching_percussion", "marriage_proposal", "massaging_back", "massaging_feet", "massaging_legs", "massaging_neck", "massaging_person_s_head", "milking_cow", "modern_dance", "moon_walking", "mopping_floor", "mosh_pit_dancing", "motorcycling", "mountain_biking", "mountain_climber_exercise_", "moving_furniture", "mowing_lawn", "mushroom_foraging", "needle_felting", "needlework", "news_anchoring", "opening_bottle_not_wine_", "opening_door", "opening_present", "opening_refrigerator", "opening_wine_bottle", "origami", "outdoor_recreation", "packing", "painting_fence", "painting_furniture", "pan_frying", "parachuting", "paragliding", "parasailing", "parkour", "passing_american_football_in_game_", "passing_american_football_not_in_game_", "passing_soccer_ball", "peeling_apples", "peeling_potatoes", "percussion", "person_collecting_garbage", "petting_animal_not_cat_", "petting_cat", "photobombing", "photocopying", "photograph", "physical_exercise", "picking_fruit", "pillow_fight", "pinching", "pirouetting", "pitch", "planing_wood", "planting_trees", "plastering", "plataform_diving", "playing_accordion", "playing_badminton", "playing_bagpipes", "playing_basketball", "playing_bass_guitar", "playing_beer_pong", "playing_blackjack", "playing_cello", "playing_chess", "playing_clarinet", "playing_congas", "playing_controller", "playing_cricket", "playing_cymbals", "playing_darts", "playing_didgeridoo", "playing_dominoes", "playing_drums", "playing_field_hockey", "playing_flute", "playing_gong", "playing_guitar", "playing_hand_clapping_games", "playing_harmonica", 
"playing_harp", "playing_ice_hockey", "playing_keyboard", "playing_kickball", "playing_lacrosse", "playing_laser_tag", "playing_lute", "playing_maracas", "playing_marbles", "playing_monopoly", "playing_netball", "playing_ocarina", "playing_organ", "playing_paintball", "playing_pan_pipes", "playing_piano", "playing_pinball", "playing_ping_pong", "playing_poker", "playing_polo", "playing_recorder", "playing_rubiks_cube", "playing_saxophone", "playing_scrabble", "playing_squash_or_racquetball", "playing_ten_pins", "playing_tennis", "playing_trombone", "playing_trumpet", "playing_ukulele", "playing_violin", "playing_volleyball", "playing_water_polo", "playing_with_trains", "playing_xylophone", "poking_bellybutton", "pole_vault", "polishing_forniture", "polishing_metal", "popping_balloons", "pouring_beer", "powerbocking", "preparing_pasta", "preparing_salad", "presenting_weather_forecast", "print", "public_speaking", "pull_ups", "pumping_fist", "pumping_gas", "punch", "punching_bag", "punching_person_boxing_", "purl", "push_up", "pushing_car", "pushing_cart", "pushing_wheelbarrow", "pushing_wheelchair", "putting_in_contact_lenses", "putting_on_eyeliner", "putting_on_foundation", "putting_on_lipstick", "putting_on_mascara", "putting_on_sari", "putting_on_shoes", "rafting", "raising_eyebrows", "raking_leaves", "reading", "reading_book", "reading_newspaper", "recording_music", "recreation", "recreational_fishing", "removing_curlers", "repairing_puncture", "riding_a_bike", "riding_bumper_cars", "riding_camel", "riding_elephant", "riding_mechanical_bull", "riding_mower", "riding_mule", "riding_or_walking_with_horse", "riding_scooter", "riding_snow_blower", "riding_unicycle", "ripping_paper", "river_tubing", "roasting", "roasting_marshmallows", "roasting_pig", "robot_dancing", "rock_climbing", "rock_scissors_paper", "rodeo", "roller_skating", "rollerblading", "rolling_pastry", "roof_shingle_removal", "rope_pushdown", "running", "running_on_treadmill", "sailing", "salsa_dancing", "sanding_floor", "sausage_making", "sawing_wood", "scrambling_eggs", "scrapbooking", "scrubbing_face", "scuba_diving", "separating_eggs", "setting_table", "sewing", "shaking_hands", "shaking_head", "shaping_bread_dough", "sharpening_knives", "sharpening_pencil", "shaving_head", "shaving_legs", "shearing_sheep", "shining_flashlight", "shining_shoes", "shooting", "shooting_basketball", "shooting_goal_soccer_", "shopping", "shot_put", "shoveling_snow", "shucking_oysters", "shuffling_cards", "shuffling_feet", "side_kick", "sign_language_interpreting", "singing", "sipping_cup", "sitting", "situp", "skateboarding", "ski_jumping", "skiing", "skiing_crosscountry", "skiing_mono", "skiing_slalom", "skipping_rope", "skipping_stone", "skydiving", "slacklining", "slapping", "sled_dog_racing", "sledding", "sleeping", "smashing", "smelling_feet", "smile", "smoking", "smoking_hookah", "smoking_pipe", "snatch_weight_lifting", "sneezing", "snorkeling", "snow_tubing", "snowboarding", "snowkiting", "snowmobiling", "soccer", "softball", "somersaulting", "sparring", "spelunking", "spinning_poi", "sports_training", "spray_painting", "spread_mulch", "springboard_diving", "sprint", "square_dancing", "squat", "standing", "standing_on_hands", "staring", "steer_roping", "sticking_tongue_out", "stitch", "stomping_grapes", "stone_carving", "strength_training", "stretching_arm", "stretching_leg", "sucking_lolly", "surf_fishing", "surfing_crowd", "surfing_water", "sweeping_floor", "swimming", "swimming_backstroke", "swimming_breast_stroke", 
"swimming_butterfly_stroke", "swimming_front_crawl", "swing_dancing", "swinging_baseball_bat", "swinging_on_something", "sword_fighting", "sword_swallowing", "table_soccer", "tackling", "tagging_graffiti", "tai_chi", "talking_on_cell_phone", "tango_dancing", "tap_dancing", "tapping_guitar", "tapping_pen", "tasting_beer", "tasting_food", "tasting_wine", "testifying", "texting", "threading_needle", "throwing_axe", "throwing_ball_not_baseball_or_american_football_", "throwing_discus", "throwing_knife", "throwing_snowballs", "throwing_tantrum", "throwing_water_balloon", "tickling", "tie_dying", "tightrope_walking", "tiptoeing", "tobogganing", "tossing_coin", "track_and_field", "trail_riding", "training_dog", "trapezing", "trimming_or_shaving_beard", "trimming_shrubs", "trimming_trees", "triple_jump", "twiddling_fingers", "tying_bow_tie", "tying_knot_not_on_a_tie_", "tying_necktie", "tying_shoe_laces", "unboxing", "underwater_diving", "unloading_truck", "using_a_microscope", "using_a_paint_roller", "using_a_power_drill", "using_a_sledge_hammer", "using_a_wrench", "using_atm", "using_bagging_machine", "using_circular_saw", "using_inhaler", "using_puppets", "using_remote_controller_not_gaming_", "using_segway", "using_the_monkey_bar", "using_the_pommel_horse", "vacuuming_floor", "visiting_the_zoo", "wading_through_mud", "wading_through_water", "waiting_in_line", "waking_up", "walking", "walking_the_dog", "walking_through_snow", "washing_dishes", "washing_feet", "washing_hair", "washing_hands", "waste", "watching_tv", "water_skiing", "water_sliding", "watering_plants", "waving_hand", "waxing_back", "waxing_chest", "waxing_eyebrows", "waxing_legs", "weaving", "weaving_basket", "weaving_fabric", "welding", "whistling", "wicker_weaving", "windsurfing", "winking", "wood_burning_art_", "worship", "wrapping_present", "wrestling", "writing", "yarn_spinning", "yawning", "yoga", "zumba"], "attribute": ["afro", "aggression", "al_dente", "angora", "art_paper", "asphalt", "azure", "bangs", "barechestedness", "beauty", "beige", "black", "black_and_white", "black_hair", "blond", "blue", "bmw", "boiling", "brass", "bricks_and_mortar", "brown", "brown_hair", "caffeine", "calm", "camouflage", "caramel_color", "cardboard", "ceramic", "citric_acid", "classic", "clay", "cleft", "cobalt_blue", "coca_cola", "complexion", "concrete", "cool", "dairy", "darkness", "daytime", "deciduous", "denim", "drama", "elder", "electric_blue", "emerald", "evergreen", "explosive_material", "floating", "fluid", "flyweight", "forward", "freezing", "fun", "glitter", "gold", "granite", "green", "happy", "human_hair_color", "hunky", "inflatable", "iron", "laminate", "layered_hair", "leather", "leisure", "lilac", "long_hair", "magenta", "maroon", "metal", "metropolis", "military", "moist", "monochrome", "multimedia", "neon", "orange", "origami_paper", "paper", "patchwork", "peach", "pigtail", "pink", "plane", "plastic", "platinum_blond", "plush", "plywood", "polka_dot", "pompadour", "purple", "rapid", "red", "red_hair", "reflection", "satin", "shade", "silk", "silver", "sweetness", "symmetry", "synthetic_rubber", "teal", "transparency_and_translucency", "turquoise", "velvet", "violet", "white", "wood", "wool", "woolen", "woven_fabric", "wrinkle", "yellow", "youth"], "concept": ["aerial_photography", "agriculture", "air_force", "air_sports", "american_food", "ancient_history", "angle", "animal_migration", "animal_source_foods", "animal_sports", "arch", "architecture", "army", "art", "artistic_gymnastics", "asian_food", "athletics", 
"audience", "automotive_design", "automotive_exterior", "aviation", "baked_goods", "ball_over_a_net_games", "bat_and_ball_games", "benthos", "blessing", "boardsport", "brand", "business", "cable_management", "cellular_network", "choir", "circle", "circus", "class", "classic_car", "classical_music", "clergy", "clip_art", "close_up", "collaboration", "color_guard", "combat_sport", "comfort", "comfort_food", "commodity", "community", "computer_program", "concert_band", "confectionery", "construction", "contact_sport", "convenience_food", "costume_design", "court", "court_game", "crew", "crowd", "cube", "cuisine", "currency", "cycle_sport", "cylinder", "decor", "design", "dialog_box", "diet_food", "display_advertising", "dog_breed", "dog_sports", "doubles", "dressage", "east_asian_food", "ecosystem", "electrical_network", "electricity", "electronics", "emergency", "emergency_service", "emotion", "endurance_sports", "energy", "engineering", "ensemble", "entertainment", "equestrian_sport", "erg", "european_food", "extreme_sport", "facial_expression", "family", "fashion_design", "fast_food", "fauna", "fictional_character", "field_game", "film", "finger_food", "fixed_link", "floral_design", "floristry", "font", "fried_food", "friendship", "frozen_food", "games", "geological_phenomenon", "geology", "german_food", "golf_club", "graffito", "graphic_design", "graphics", "grilled_food", "hairstyle", "handwriting", "health_care", "heart", "heat", "herd", "history", "human_behavior", "individual_sports", "indoor_games_and_sports", "industry", "infrastructure", "interaction", "interior_design", "inventory", "italian_food", "japanese_cuisine", "japanese_martial_arts", "job", "junk_food", "kite_sports", "land_vehicle", "laser", "laughter", "law_enforcement", "light_commercial_vehicle", "lighting", "line", "line_art", "local_food", "lockstitch", "logo", "love", "luxury_vehicle", "luxury_yacht", "major_appliance", "male", "management", "map", "marching_band", "marine_mammal", "martial_arts", "mass_production", "match_play", "meal", "medal_play", "medical", "medicine", "memorial", "mesh", "meteorological_phenomenon", "mid_size_car", "military_officer", "military_organization", "military_rank", "mineral", "mixture", "mode_of_transport", "modern_art", "money", "monochrome_photography", "motorsport", "music", "musical_ensemble", "natural_foods", "nature", "news", "non_sporting_group", "number", "off_road", "official", "orchestra", "organism", "pachyderm", "packaging_and_labeling", "painting", "party_supply", "pattern", "people", "performance", "performing_arts", "physical_fitness", "pint_us", "plaid", "plant_community", "plaster", "police", "pollinator", "pollution", "pop_music", "primate", "public_transport", "public_utility", "pyramid", "racquet_sport", "rapid_transit", "real_estate", "recipe", "rectangle", "religion", "research", "rock", "roller_sport", "romance", "rose_order", "seafood", "security", "selfie", "service", "shadow", "shelving", "shoal", "shooting_sport", "side_dish", "silhouette", "singles", "skin_care", "social_group", "software", "song", "spanish_cuisine", "sphere", "spiral", "spoor", "sport", "spotlight", "spring_break", "square", "star", "stick_and_ball_games", "stick_and_ball_sports", "still_life", "still_life_photography", "stock_photography", "street_art", "street_food", "striking_combat_sports", "stucco", "superfood", "surface_water_sports", "symbol", "tartan", "taste", "team", "team_sport", "technology", "telephony", "television_program", "tool", "tourism", "towed_water_sport", 
"tradition", "traditional_sport", "traffic", "tread", "triangle", "tribe", "troop", "underwater", "vegetarian_food", "vegetation", "video_game_software", "visual_arts", "war", "waste_containment", "water_ball_sports", "water_sport", "water_transportation", "watercraft", "weapon", "weapon_combat_sports", "website", "whole_food", "wildlife", "wind", "windsports", "winter_sport"], "event": ["800_metres", "adventure", "air_travel", "art_exhibition", "auto_show", "autumn", "award_ceremony", "banquet", "bedtime", "breakfast", "broad_jump", "brunch", "carnival", "ceremony", "championship", "christmas", "competition", "concert", "conference", "convention", "conversation", "decathlon", "demonstration", "dinner", "disaster", "evening", "exhibition", "festival", "flight", "freight_transport", "general_aviation", "graduation", "halloween", "heptathlon", "holiday", "lecture", "lunch", "manicure", "marathon", "massage", "meeting", "morning", "multi_sport_event", "news_conference", "night", "parade", "party", "photo_shoot", "picnic", "presentation", "protest", "public_event", "race", "ritual", "road_trip", "rock_concert", "safari", "seminar", "ski_cross", "speech", "spring", "summer", "sunrise_and_sunset", "supper", "tournament", "vacation", "wedding", "wedding_reception", "winter"], "object": ["abdomen", "academic_dress", "accordion", "accordionist", "acoustic_electric_guitar", "acoustic_guitar", "acrylic_paint", "action_figure", "active_undergarment", "adding_machine", "aegean_cat", "aerialist", "african_elephant", "agaric", "agaricaceae", "agaricomycetes", "agaricus", "agricultural_machinery", "agriculturist", "aioli", "air_bubble", "air_gun", "aircraft", "airliner", "alaskan_malamute", "album_cover", "alcoholic_beverage", "ale", "algae", "all_terrain_vehicle", "all_xbox_accessory", "alligator", "alloy_wheel", "alpinist", "alto_horn", "american_alligator", "american_pit_bull_terrier", "amusement_ride", "ananas", "anchor", "angle_grinder", "animal_fat", "ankle", "annual_plant", "antique", "antique_car", "appetizer", "apple", "aqua", "aqualung", "aquanaut", "aquarium", "aquatic_plant", "aquifoliaceae", "arabian_camel", "arcade_game", "archer", "arecales", "arm", "artifact", "artificial_fly", "artificial_turf", "artisan", "artwork", "athlete", "athletic_shoe", "audio_engineer", "audio_equipment", "auto_part", "automaton", "automotive_engine_part", "automotive_exhaust", "automotive_lighting", "automotive_mirror", "automotive_tire", "automotive_wheel_system", "automotive_window_part", "ax", "ax_handle", "baby_buggy", "baby_carrier", "baby_products", "baby_toys", "back", "backboard", "backhoe", "backseat", "bag", "bagel", "baggage", "bagpipes", "bait", "baker", "balance_beam", "balcony", "ball", "ballet_dancer", "ballet_skirt", "balloon", "baluster", "bandage", "banderillero", "bandoneon", "banjo", "banner", "barbell", "barber", "baritone_saxophone", "barramundi", "barrel", "barrow", "bartender", "barware", "baseball_bat", "baseball_cap", "baseball_equipment", "baseball_player", "basket", "basketball_player", "bass", "bass_drum", "bass_fiddle", "bass_guitar", "bass_oboe", "bassinet", "bassist", "bassoon", "bathing_cap", "bathroom_accessory", "bathroom_sink", "bathtub", "batter", "bayonne_ham", "bead", "beak", "beam", "bean", "beanie", "beard", "bed", "bed_frame", "bed_sheet", "bedding", "bedrock", "bee", "beef", "beef_tenderloin", "beehive", "beekeeper", "beer", "beer_cocktail", "beer_glass", "belay_device", "bell_peppers_and_chili_peppers", "bench", "berry", "beyaz_peynir", "bib", "bichon", "bicycle", 
"bicycle_accessory", "bicycle_chain", "bicycle_drivetrain_part", "bicycle_frame", "bicycle_handlebar", "bicycle_helmet", "bicycle_part", "bicycle_saddle", "bicycle_tire", "bicycle_wheel", "bidet", "big_cats", "bikini", "billboard", "bin", "birch", "bird", "birthday_cake", "biscuit", "black_belt", "black_cat", "blackboard", "blacksmith", "blade", "blazer", "blender", "block", "blood", "blossom", "blouse", "blue_collar_worker", "bmx_bike", "boa_constrictor", "board_game", "boas", "boat", "boats_and_boating_equipment_and_supplies", "bobsled", "bocce_ball", "bodybuilder", "bolete", "bonfire", "bongo", "bony_fish", "book", "bookcase", "boot", "bottle", "bottled_water", "boulder", "bouquet", "bow_and_arrow", "bow_tie", "bowed_string_instrument", "bowie_knife", "bowl", "bowler", "bowling_ball", "bowling_equipment", "bowling_pin", "box", "boxing_equipment", "boxing_glove", "boy", "bracelet", "brake_disk", "branch", "brass_instrument", "brassiere", "bratwurst", "bread", "bread_dough", "brick", "bricklayer", "brickwork", "bridal_clothing", "bride", "bridle", "briefs", "broccoli", "brochette", "bromeliaceae", "broom", "broth", "brush", "bubble", "bubble_gum", "bucket", "bugle", "bull", "bulldozer", "bullfighter", "bumper", "bumper_car", "bun", "bungee", "buoyancy_compensator", "bus", "businessperson", "butcher", "buttercream", "button", "button_accordion", "cab", "cabin_cruiser", "cabinet", "cabinetry", "cable", "caesar_salad", "cage", "cake", "calf", "camel", "camera", "camera_accessory", "camera_lens", "camera_operator", "camgirl", "campfire", "candle", "cannon", "canoe", "cap", "car", "car_mirror", "car_seat", "car_seat_cover", "car_tire", "car_wheel", "carbonara", "carbonated_soft_drinks", "cardboard_box", "caricaturist", "carnivoran", "carpenter", "carpet", "carriage", "carrot", "cart", "carton", "cartoon", "carving", "cash", "cash_machine", "cat", "catamaran", "cattle_like_mammal", "ceiling", "celesta", "cellist", "cello", "cellular_telephone", "center_console", "central_processing_unit", "centrepiece", "chain", "chain_link_fencing", "chain_saw", "chair", "chalk", "champagne", "champagne_stemware", "charcoal", "charcuterie", "chariot", "chassis", "cheek", "cheerleader", "cheerleading_uniform", "cheese", "cheese_pizza", "cheeseburger", "chef", "cherry", "chess_master", "chessboard", "chessman", "chest", "chest_hair", "chest_of_drawers", "chicken", "chihuahua", "child", "chin", "chip", "chocolate", "chocolate_brownie", "chocolate_cake", "chocolate_chip_cookie", "chocolate_spread", "choreographer", "christmas_decoration", "christmas_lights", "christmas_tree", "chute", "circuit", "circuit_component", "circular_saw", "circus_acrobat", "citrullus", "citrus", "city_car", "clam", "clams_oysters_mussels_and_scallops", "clarinet", "clarinet_family", "clavier", "clementine", "climber", "climbing_frame", "climbing_harness", "closet", "clothes_closet", "clothes_dryer", "clothes_hamper", "clothing", "cloud", "clown", "coat", "cobblestone", "cockapoo", "cocktail", "cocktail_dress", "cocktail_garnish", "coconut", "cod", "coffee", "coffee_bean", "coffee_cup", "coffee_table", "coin", "cola", "colander", "cold_weapon", "collage", "collar", "collection", "collie", "color_television", "colt", "colubridae", "column", "comb", "comforter", "commercial_vehicle", "common_pet_parakeet", "communication_device", "commuter", "compact_car", "compact_van", "companion_dog", "composite_material", "compound_microscope", "computer", "computer_accessory", "computer_case", "computer_component", "computer_cooling", 
"computer_hardware", "computer_keyboard", "concert_grand", "concertina", "condiment", "conifer", "construction_equipment", "construction_worker", "convertible", "cookie", "cookie_sheet", "cookies_and_crackers", "cookware_accessory", "cookware_and_bakeware", "cor_anglais", "coral", "coral_reef_fish", "cornet", "cosmetics", "costume", "couch", "countertop", "coverall", "cow_goat_family", "cowbarn", "cowboy", "cowboy_hat", "craftsman", "crampon", "crane", "cravat", "cream", "cream_cheese", "cricket_bat", "cricketer", "crochet_needle", "crocodile", "crocodilia", "crop", "croquet_mallet", "crossword_puzzle", "cruciferous_vegetables", "crystal", "cuatro", "cucumber", "cucumber_gourd_and_melon_family", "cucumis", "cucurbita", "cumulus", "cup", "cupboard", "curbstone", "curd", "curtain", "customer", "cut_flowers", "cutlery", "cymbal", "dairy_cattle", "dairy_cow", "dairy_product", "dance_dress", "dancer", "dashboard", "data_storage_device", "date_palm", "defenseman", "desk", "desktop_computer", "dessert", "dhow", "diaper", "diatonic_button_accordion", "digital_clock", "dining_table", "dinnerware_set", "dip", "discinaceae", "dish", "dishware", "dishwasher", "disk_jockey", "display_case", "display_device", "display_window", "distilled_beverage", "divemaster", "diver", "diving_equipment", "diving_mask", "dobok", "document", "dog", "dog_sled", "doll", "dolphin", "dome", "domestic_rabbit", "donkey", "door", "door_handle", "double_bass", "dough", "drawer", "dress", "dress_shirt", "drill", "drink", "drinker", "drinking_water", "drinkware", "drop", "drum", "drumhead", "drummer", "drumstick", "dry_suit", "dryer", "duck", "ducks_geese_and_swans", "dumbbell", "dump_truck", "duplicator", "dustpan", "ear", "earl_grey_tea", "earrings", "eating_apple", "edger", "edible_mushroom", "egg", "egg_yolk", "electric_guitar", "electric_organ", "electric_piano", "electrical_supply", "electrical_wiring", "electronic_component", "electronic_device", "electronic_keyboard", "electronic_musical_instrument", "electronic_signage", "electronics_accessory", "elephant", "elliptical_trainer", "emblem", "emergency_vehicle", "engine", "engineer", "envelope", "epee", "equestrian", "espresso", "euphonium", "executive_car", "exercise_bike", "exercise_equipment", "exercise_machine", "exhaust_system", "eye", "eye_shadow", "eyebrow", "eyelash", "eyewear", "facade", "face", "facial_hair", "family_car", "fan", "farm_machine", "farmer", "farmworker", "fashion_accessory", "fashion_model", "faucet", "feather", "feather_boa", "feature_phone", "fedora", "fence", "fencing_sword", "fencing_weapon", "fern", "ferry", "fiddle", "field_hockey_ball", "figure_skater", "figurine", "fin", "finger", "finger_paint", "fipple_flute", "fir", "fire", "firearm", "firefighter", "fireplace", "fish", "fish_feeder", "fisherman", "fishing_bait", "fishing_lure", "fishing_rod", "fishing_vessel", "fitness_professional", "flag", "flag_of_the_united_states", "flagstone", "flashlight", "flat_panel_display", "flatbread", "flautist", "flightless_bird", "flooring", "florist", "flour", "flourless_chocolate_cake", "flower", "flower_bouquet", "flowering_plant", "flowerpot", "flush_toilet", "flute", "flutist", "fly", "foal", "foil", "folk_dancer", "folk_instrument", "fondant", "food", "food_processor", "foot", "football_equipment_and_supplies", "football_helmet", "football_player", "footwear", "forehead", "fork", "forklift_truck", "formal_wear", "fortepiano", "foundation", "fountain", "fountain_pen", "free_reed_aerophone", "french_fries", "fret", "fried_egg", "fried_rice", "frost", 
"frozen_dessert", "fruit", "fruit_tree", "frying_pan", "fuel", "full_size_car", "fungus", "fur", "fur_clothing", "furniture", "gadget", "galliformes", "game_controller", "garbage_heap", "garbage_man", "garbage_truck", "garden_roses", "gardener", "garmon", "garnish", "gas_burner", "gas_pump", "gas_ring", "gate", "gauge", "gazebo", "gear", "gearshift", "gemstone", "german_shepherd_dog", "german_spitz", "gift", "gin_and_tonic", "giraffe", "girl", "glass", "glassblower", "glasses", "glider", "glockenspiel", "glove", "glutinous_rice", "go_kart", "goal", "goat", "goat_antelope", "goggles", "golden_retriever", "goldfish", "golf_ball", "golf_equipment", "golfcart", "golfer", "gourd", "gown", "graffiti", "grand_piano", "grape", "grapevine_family", "grass", "gravel", "great_dane", "greek_salad", "green_algae", "green_bean", "greenland_dog", "grenadier", "greyhound", "griddle", "grocer", "groom", "groundcover", "guard_dog", "guard_rail", "guitar", "guitar_accessory", "guitarist", "gymnast", "hair", "hair_accessory", "hair_coloring", "hair_dryer", "hairbrush", "hairdresser", "halter", "hamburger", "hammer", "hand", "hand_calculator", "hand_drum", "hand_glass", "handbag", "handcart", "handlebar", "handrail", "hang_glider", "hard_hat", "hardware", "hardware_accessory", "harmonica", "harp", "harvester", "hat", "hatchback", "hatchet", "havanese", "hay", "head", "head_restraint", "headgear", "headphones", "headpiece", "hearth", "heat_sink", "hedge", "heel", "helmet", "herb", "high_heeled_footwear", "highchair", "hip", "hockey_protective_equipment", "hockey_stick", "home_accessories", "home_appliance", "home_door", "home_fencing", "home_game_console_accessory", "honey_bee", "honeycomb", "hood", "hoodie", "horizontal_bar", "horn", "hors_d_oeuvre", "horse", "horse_and_buggy", "horse_harness", "horse_like_mammal", "horse_supplies", "horse_tack", "horse_trainer", "horseman", "hospital_bed", "hot_air_balloon", "hot_pot", "hot_tub", "household_cleaning_supply", "houseplant", "hub_gear", "hubcap", "human", "human_body", "human_leg", "hunting_dog", "hurdle", "hybrid_bicycle", "ice", "ice_cream", "ice_cream_cone", "ice_lolly", "ice_skate", "iceberg", "icing", "illustration", "indian_elephant", "infant", "infant_bed", "infantry", "inflatable_boat", "ingredient", "input_device", "insect", "invertebrate", "io_card", "iris", "ivy", "jack_o_lantern", "jacket", "jasmine_rice", "javelin", "jaw", "jeans", "jersey", "jewellery", "jigsaw_puzzle", "jockey", "joint", "jointer", "journalist", "joystick", "juggler", "juice", "jungle_gym", "kayak", "kettle", "keyboard_instrument", "keyboard_player", "kielbasa", "kilt", "kisser", "kitchen_appliance", "kitchen_knife", "kite", "kitten", "knackwurst", "knee", "knife", "knit_cap", "knitting_needle", "knot", "koi", "konghou", "lab_coat", "label", "labrador_retriever", "lace", "lacrosse_stick", "lacrosse_training_equipment", "ladder", "lamp", "laptop", "lasso", "latch", "lathe", "laundry", "lawn", "lcd_tv", "lead_pencil", "leaf", "leaf_vegetable", "leash", "led_backlit_lcd_display", "leggings", "lemon", "lemonade", "lens", "leotard", "lettuce", "lever", "ligament", "light_bulb", "light_fixture", "light_microscope", "lighter", "lighting_accessory", "lineman", "linens", "lingerie", "lip", "lip_gloss", "lipstick", "liquor_shelf", "litter", "little_black_dress", "livestock", "lobe", "lock", "locker", "locomotive", "loggerhead", "lollipop", "longboard", "loom", "lotion", "loudspeaker", "lovebird", "loveseat", "lumber", "lute", "macaw", "machine", "machine_tool", "magazine", "maillot", 
"makeup", "mallet", "maltese", "mammal", "man", "mandarin_orange", "mandolin", "mane", "maraca", "marcher", "mare", "marimba", "marine_invertebrates", "marines", "mask", "mason_jar", "mast", "mat", "matador", "matsutake", "mattress", "mattress_pad", "mcintosh", "measuring_instrument", "meat", "meat_grinder", "mechanic", "media_player", "medical_assistant", "medical_equipment", "medical_glove", "medicine_ball", "melee_weapon", "mellophone", "melon", "membrane_winged_insect", "mender", "metal_lathe", "metalsmith", "microcontroller", "microphone", "microscope", "microwave_oven", "miler", "military_camouflage", "military_person", "military_uniform", "milk", "miniature_poodle", "minibus", "minivan", "mirror", "mixer", "mixing_bowl", "mixing_console", "mobile_device", "mobile_phone", "model", "monument", "moped", "moss", "motherboard", "motocross_bike", "motor_scooter", "motor_ship", "motor_vehicle", "motorboat", "motorcycle", "motorcycle_accessories", "motorcyclist", "motorized_wheelchair", "mountain_bike", "mountaineer", "moustache", "mouth", "mower", "mud", "mug", "mule", "mural", "muscle", "musher", "mushroom", "musical_instrument", "musical_instrument_accessory", "musical_keyboard", "musician", "musket", "nail", "nail_polish", "neck", "necklace", "necktie", "needle", "neon_lamp", "neon_sign", "net", "newscaster", "newspaper", "nib", "nightwear", "non_alcoholic_beverage", "non_commissioned_officer", "non_skin_percussion_instrument", "noodle", "nose", "numeric_keypad", "oars", "oboist", "ocarina", "off_road_vehicle", "office_equipment", "office_supplies", "oil_paint", "open_wheel_car", "optical_instrument", "orator", "organ", "organ_pipe", "organist", "outdoor_furniture", "outdoor_grill", "outdoor_play_equipment", "outdoor_power_equipment", "outdoor_shoe", "outdoor_structure", "outerwear", "output_device", "overhead_power_line", "ox", "oxygen_mask", "oyster", "oyster_mushroom", "oyster_shell", "pack_animal", "paddle", "padlock", "paintball_equipment", "paintball_gun", "palm_tree", "pan", "panelist", "pantyhose", "paper_product", "paper_towel", "parachute", "parakeet", "parallel_bars", "park_bench", "parquet", "parrot", "parsley", "passenger", "passenger_ship", "pasta", "pastry", "patient", "paving", "paw", "pawn", "pearl", "pebble", "pedestrian", "peel", "pen", "pencil", "pencil_sharpener", "pepperoni", "percussion_accessory", "percussion_instrument", "percussionist", "performance_car", "perico", "personal_computer", "personal_digital_assistant", "personal_flotation_device", "personal_protective_equipment", "petal", "pezizales", "photocopier", "physical_therapist", "physician", "pianet", "pianist", "piano", "piano_keyboard", "picador", "picket_fence", "pickup_truck", "picnic_boat", "pig", "pig_like_mammal", "pigeon", "pigeons_and_doves", "pillow", "pilot_boat", "pinata", "pinball_machine", "pine", "pine_family", "pineapple", "pinscher", "pint_glass", "pipe", "pizza", "pizza_cheese", "plant", "plant_stem", "plastic_bag", "plate", "platter", "play_vehicle", "player", "playground_slide", "playpen", "playstation_3_accessory", "playstation_accessory", "pliers", "plimsoll", "plucked_string_instruments", "plumbing", "plumbing_fixture", "pocket", "pointer", "pole", "police_officer", "polo_mallet", "polo_pony", "polo_shirt", "pomeranian", "pommel_horse", "pontoon", "pony", "poodle", "porcelain", "portable_communications_device", "portable_media_player", "portrait", "poster", "potato", "potato_and_tomato_genus", "pothole", "powdered_sugar", "power_drill", "power_mower", "power_shovel", "printer", 
"produce", "professional_golfer", "propeller", "protective_equipment_in_gridiron_football", "protective_gear_in_sports", "pug", "pumpkin", "pungsan_dog", "puppy", "putter", "puzzle", "queen", "quill", "rabbit", "race_car", "racer", "racing_bicycle", "racket", "radial", "random_orbital_sander", "ranged_weapon", "rear_view_mirror", "recycling_bin", "red_carpet", "red_meat", "red_wine", "redhead", "reed_instrument", "refrigerator", "rein", "remote_control", "reptile", "researcher", "retaining_wall", "retriever", "ribbon", "rice", "rifle", "rim", "ring", "road_bicycle", "roast_beef", "robot", "rock_climbing_equipment", "rock_star", "rodent", "roller_blades", "roller_skates", "rolling_pin", "roof", "root", "root_vegetable", "rope", "rose", "rose_family", "rotisserie", "royal_icing", "rubber_boot", "rubble", "runner", "running_shoe", "saddle", "safe", "safety_belt", "safety_bicycle", "safety_glove", "sail", "sailboat", "sailing_ship", "salad", "salmon", "samoyed", "sand", "sand_wedge", "sandal", "sandbox", "sandwich", "sapsali", "sari", "sarong", "sash_window", "sashimi", "saucer", "sauces", "sausage", "saw", "saxhorn", "saxophone", "saxophonist", "scaffolding", "scale_model", "scaled_reptile", "scanner", "scarf", "schipperke", "schnoodle", "schooner", "scientific_instrument", "scissors", "scooter", "scoreboard", "scow", "scrap", "screen", "scuba_diver", "sculptor", "sculpture", "sea_ice", "sea_kayak", "sea_turtle", "seabird", "seaplane", "seat_belt", "seaweed", "sedan", "seed", "segway", "senior_citizen", "serger", "serpent", "serveware", "sewing_machine", "sewing_machine_needle", "shaving_cream", "shed", "sheep", "shelf", "shih_tzu", "ship", "shipwreck", "shirt", "shoe", "shopkeeper", "shopping_basket", "shopping_cart", "shorts", "shoulder", "shovel", "shower_curtain", "shrimp", "shrub", "siberian_husky", "sicilian_pizza", "sideboard", "siding", "sign", "singer", "singlet", "sink", "skateboard", "skateboarder", "skateboarding_equipment_and_supplies", "sketch", "skewer", "ski", "ski_binding", "ski_equipment", "ski_pole", "skidder", "skiff", "skin", "skin_head_percussion_instrument", "skirt", "slate_roof", "sled", "sled_dog", "sleeper", "sleeve", "sloop", "slot", "slot_machine", "small_appliance", "smartphone", "smoke", "snack", "snake", "snare_drum", "sneakers", "snorkel", "snout", "snow_thrower", "snowboard", "snowmobile", "snowplow", "snowshoe", "snowsuit", "soccer_ball", "soccer_player", "sock", "soft_drink", "soil", "soup", "space_bar", "spaghetti", "spaniel", "spatula", "speaker", "speedometer", "speleothem", "spice", "spin_dryer", "spinach", "spinach_salad", "spindle", "spinet", "spinning_wheel", "spitz", "spoke", "spokesperson", "spoon", "sport_kite", "sport_utility_vehicle", "sports_car", "sports_equipment", "sports_uniform", "sportswear", "spring_greens", "sprinkler", "spruce", "spume", "square_dancer", "squash", "stairs", "stalagmite", "stall", "stallion", "standard_poodle", "statue", "steak", "steam_iron", "steamed_rice", "steel", "steel_drum", "steering_part", "steering_wheel", "stemware", "stew", "stick", "stock_car", "stock_dove", "stocking", "stomach", "stone_wall", "stony_coral", "storage_basket", "stout", "stove_and_oven", "strainer", "straw", "streamer_fly", "street_light", "string_instrument", "string_instrument_accessory", "stubble", "student", "stuffed_toy", "stuffing", "stunt_performer", "subcompact_car", "subwoofer", "sugar_cake", "sugar_paste", "suit", "sun", "sun_hat", "sunbather", "sunglasses", "sunlight", "supercar", "superhero", "surfboard", 
"surfing_equipment_and_supplies", "sushi", "swab", "swan", "sweater", "sweet_grass", "swimmer", "swimsuit_bottom", "swimwear", "swing", "switch", "synthesizer", "t_shirt", "tabby_cat", "table", "table_knife", "table_tennis_racket", "tablecloth", "tabletop_game", "tableware", "tachometer", "taglierini", "tail", "tall_ship", "tank", "tarpaulin", "tattoo", "tea", "teacher", "teapot", "teddy_bear", "telephone", "television_presenter", "television_reporter", "television_set", "tennis_equipment_and_supplies", "tennis_player", "tennis_pro", "tennis_racket", "tenor_saxophonist", "tent", "terrestrial_animal", "terrestrial_plant", "terrier", "text", "textile", "theater_curtain", "therapist", "thigh", "thorns_spines_and_prickles", "thread", "thumb", "tights", "tile", "tiple", "tire", "toast", "toddler", "toe", "toilet", "toilet_tissue", "tom_tom_drum", "tomahawk", "tomato", "tongue", "tooth", "toothbrush", "top", "toppings", "torch", "torso", "torte", "tower", "toy", "toy_box", "toy_poodle", "track_spikes", "tractor", "traffic_cop", "traffic_light", "trail_bike", "trailer", "trailer_truck", "train", "trampoline", "trapeze", "travel_trailer", "tree", "tricycle", "trigger", "trombone", "trousers", "trowel", "truck", "trumpet", "trumpeter", "tub", "tudung", "tusk", "tuxedo", "twig", "uke", "umbrella", "undergarment", "underpants", "uneven_parallel_bars", "unicycle", "unicyclist", "uniform", "urinal", "vacuum_cleaner", "van", "vascular_plant", "vase", "vaulter", "vegetable", "vehicle", "vehicle_brake", "vehicle_door", "vehicle_registration_plate", "venison", "vertebrate", "vibraphone", "video_game_console", "vigil_light", "vintage_car", "vintage_clothing", "violin", "violin_family", "violinist", "violist", "vitis", "vizsla", "volleyball_net", "volleyball_player", "wagon", "waist", "waiter", "walk_behind_mower", "walker", "walking_shoe", "wall", "wardrobe", "washbasin", "washing_machine", "waste_container", "watch", "water", "water_bird", "water_feature", "water_polo_cap", "water_ski", "watercolor_paint", "waterfowl", "watering_can", "watermelon", "wave", "wedding_ceremony_supply", "wedding_dress", "wedding_ring", "weightlifter", "weights", "welder", "west_highland_white_terrier", "wetsuit", "whaler", "whales_dolphins_and_porpoises", "wheat_beer", "wheel", "wheelchair", "whipped_cream", "whippet", "whisk", "whiskers", "whisky", "whistle", "white_coat", "white_collar_worker", "white_rice", "wicker_basket", "wicket", "wig", "wildflower", "wildlife_biologist", "wind_instrument", "wind_wave", "window", "window_blind", "window_covering", "window_screen", "window_treatment", "windshield", "windshield_wiper", "wine", "wine_glass", "wing", "winter_squash", "wiper", "wire", "wire_fencing", "wok", "woman", "wood_burning_stove", "wood_stain", "woodwind_instrument", "woody_plant", "workman", "wrench", "wrestler", "wrestling_mat", "wrestling_singlet", "wrist", "xylophone", "yacht", "yakitori", "yolk"], "scene": ["aeolian_landform", "aisle", "alley", "amusement_park", "animal_shelter", "apartment", "apiary", "archaeological_site", "arena", "arroyo", "attic", "auditorium", "automobile_repair_shop", "backyard", "badlands", "bakery", "ballpark", "ballroom", "bank", "bar", "barbershop", "barn", "baseball_field", "baseball_positions", "basement", "basketball_court", "bathroom", "batting_cage", "bay", "bayou", "bazaar", "beach", "beauty_salon", "bedroom", "boardwalk", "body_of_water", "boutique", "bowling_alley", "boxing_ring", "bridge", "building", "bullring", "butcher_shop", "canyon", "cape", "carport", "casino", "cave", 
"channel", "chapel", "cityscape", "cliff", "clinic", "coast", "coastal_and_oceanic_landforms", "cockpit", "cocktail_lounge", "concert_hall", "condominium", "conference_hall", "coral_reef", "courtyard", "creek", "day_nursery", "deck", "desert", "dining_room", "dock", "downtown", "dune", "ecoregion", "escarpment", "estate", "factory", "fair", "farm", "fault", "field", "field_lacrosse", "fire_department", "fish_pond", "floor", "fluvial_landforms_of_streams", "football_stadium", "forest", "formation", "foundry", "function_hall", "garage", "garden", "garden_buildings", "glacial_lake", "golf_course", "grassland", "grocery_store", "grove", "gym", "hall", "harbor", "haze", "headland", "highland", "hill", "historic_site", "home", "horizon", "hospital", "hot_spring", "hotel", "hotel_room", "house", "hut", "ice_hockey_position", "ice_hockey_rink", "ice_rink", "inlet", "intersection", "kindergarten", "kitchen", "laboratory", "lake", "land_lot", "landmark", "landscape", "lane", "lecture_room", "leisure_centre", "littoral", "living_room", "log_cabin", "marina", "market", "marsh", "massif", "meadow", "meander", "metropolitan_area", "mountain", "mountain_pass", "mountain_range", "mountainous_landforms", "music_venue", "musical_theatre", "national_park", "natural_resources", "nature_reserve", "neighbourhood", "nightclub", "office", "opera", "outcrop", "paddy_field", "palace", "panorama", "park", "parking", "pasture", "path", "patio", "pavilion", "pedestrian_crossing", "performing_arts_center", "piste", "place_of_worship", "plain", "plateau", "playground", "plaza", "pond", "port", "property", "public_space", "race_track", "ranch", "reef", "religious_institute", "reservoir", "residential_area", "resort", "restaurant", "restroom", "retail", "ridge", "riparian_zone", "river", "riverbed", "road", "road_highway", "room", "rural_area", "sandbank", "sandbar", "school", "sea", "seashore", "seaside", "shack", "shooting_range", "shopping_mall", "shore", "sidewalk", "ski_slope", "sky", "skyline", "skyscraper", "snow_covered_landscape", "sport_venue", "stable", "stadium", "stage", "strand", "stream", "stream_bed", "street", "suburb", "summit", "supermarket", "swamp", "swimming_pool", "tavern", "television_room", "tennis_camp", "tennis_court", "terrain", "theatre", "toolroom", "tourist_attraction", "tower_block", "town", "town_square", "track", "tropical_beach", "tropics", "tunnel", "urban_area", "urban_design", "valley", "village", "walkway", "warehouse", "watercourse", "waterfall", "waterway", "wetland", "wildlife_region", "workshop", "yard", "zoo"]} diff --git a/tools/data/hvu/parse_tag_list.py b/tools/data/hvu/parse_tag_list.py new file mode 100644 index 0000000000000000000000000000000000000000..af5c8667fe5433470afc96fa91a9ad0c0586b4f4 --- /dev/null +++ b/tools/data/hvu/parse_tag_list.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmengine + +tag_list = '../../../data/hvu/annotations/hvu_categories.csv' + +lines = open(tag_list).readlines() +lines = [x.strip().split(',') for x in lines[1:]] +tag_categories = {} +for line in lines: + tag, category = line + tag_categories.setdefault(category, []).append(tag) + +for k in tag_categories: + tag_categories[k].sort() + +mmengine.dump(tag_categories, 'hvu_tags.json') diff --git a/tools/data/jester/README.md b/tools/data/jester/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8a05dc72e34e7c5e59536b5cc022ee0efc199a9 --- /dev/null +++ b/tools/data/jester/README.md @@ -0,0 +1,143 @@ +# Preparing Jester + +## Introduction + + + +```BibTeX +@InProceedings{Materzynska_2019_ICCV, + author = {Materzynska, Joanna and Berger, Guillaume and Bax, Ingo and Memisevic, Roland}, + title = {The Jester Dataset: A Large-Scale Video Dataset of Human Gestures}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, + month = {Oct}, + year = {2019} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://developer.qualcomm.com/software/ai-datasets/jester). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/jester/`. + +## Step 1. Prepare Annotations + +First of all, you have to sign in and download annotations to `$MMACTION2/data/jester/annotations` on the official [website](https://developer.qualcomm.com/software/ai-datasets/jester). + +## Step 2. Prepare RGB Frames + +Since the [jester website](https://developer.qualcomm.com/software/ai-datasets/jester) doesn't provide the original video data and only extracted RGB frames are available, you have to directly download RGB frames from [jester website](https://developer.qualcomm.com/software/ai-datasets/jester). + +You can download all RGB frame parts on [jester website](https://developer.qualcomm.com/software/ai-datasets/jester) to `$MMACTION2/data/jester/` and use the following command to extract. + +```shell +cd $MMACTION2/data/jester/ +cat 20bn-jester-v1-?? | tar zx +cd $MMACTION2/tools/data/jester/ +``` + +For users who only want to use RGB frames, you can skip to step 5 to generate file lists in the format of rawframes. Since the prefix of official JPGs is "%05d.jpg" (e.g., "00001.jpg"), +we add `"filename_tmpl='{:05}.jpg'"` to the dict of `data.train`, `data.val` and `data.test` in the config files related with jester like this: + +``` +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## Step 3. Extract Flow + +This part is **optional** if you only want to use RGB frames. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. 
+ +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/jester_extracted/ +ln -s /mnt/SSD/jester_extracted/ ../../../data/jester/rawframes +``` + +Then, you can run the following script to extract optical flow based on RGB frames. + +```shell +cd $MMACTION2/tools/data/jester/ +bash extract_flow.sh +``` + +## Step 4. Encode Videos + +This part is **optional** if you only want to use RGB frames. + +You can run the following script to encode videos. + +```shell +cd $MMACTION2/tools/data/jester/ +bash encode_videos.sh +``` + +## Step 5. Generate File List + +You can run the follow script to generate file list in the format of rawframes and videos. + +```shell +cd $MMACTION2/tools/data/jester/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## Step 5. Check Directory Structure + +After the whole data process for Jester preparation, +you will get the rawframes (RGB + Flow), and annotation files for Jester. + +In the context of the whole project (for Jester only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ jester +โ”‚ โ”‚ โ”œโ”€โ”€ jester_{train,val}_list_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ jester_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ 1.mp4 +โ”‚ | | โ”œโ”€โ”€ 2.mp4 +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ rawframes +โ”‚ | | โ”œโ”€โ”€ 1 +โ”‚ | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2 +โ”‚ | | โ”œโ”€โ”€ ... + +``` + +For training and evaluating on Jester, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/jester/README_zh-CN.md b/tools/data/jester/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..858e6abe204a4163cca42514021d63f973fbf97c --- /dev/null +++ b/tools/data/jester/README_zh-CN.md @@ -0,0 +1,143 @@ +# ๅ‡†ๅค‡ Jester + +## ็ฎ€ไป‹ + + + +```BibTeX +@InProceedings{Materzynska_2019_ICCV, + author = {Materzynska, Joanna and Berger, Guillaume and Bax, Ingo and Memisevic, Roland}, + title = {The Jester Dataset: A Large-Scale Video Dataset of Human Gestures}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, + month = {Oct}, + year = {2019} +} +``` + +็”จๆˆทๅฏไปฅๅ‚็…งๆ•ฐๆฎ้›† [ๅฎ˜็ฝ‘](https://developer.qualcomm.com/software/ai-datasets/jester)๏ผŒ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅ‡†ๅค‡ๆ•ฐๆฎ้›†ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/jester/`ใ€‚ + +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆท้œ€่ฆๅœจ [ๅฎ˜็ฝ‘](https://developer.qualcomm.com/software/ai-datasets/jester) ๅฎŒๆˆๆณจๅ†Œ๏ผŒๆ‰่ƒฝไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ไธ‹่ฝฝๅฅฝ็š„ๆ ‡ๆณจๆ–‡ไปถ้œ€่ฆๆ”พๅœจ `$MMACTION2/data/jester/annotations` ๆ–‡ไปถๅคนไธ‹ใ€‚ + +## ๆญฅ้ชค 2. 
ๅ‡†ๅค‡ RGB ๅธง + +[jester ๅฎ˜็ฝ‘](https://developer.qualcomm.com/software/ai-datasets/jester) ๅนถๆœชๆไพ›ๅŽŸๅง‹่ง†้ข‘ๆ–‡ไปถ๏ผŒๅชๆไพ›ไบ†ๅฏนๅŽŸ่ง†้ข‘ๆ–‡ไปถ่ฟ›่กŒๆŠฝๅ–ๅพ—ๅˆฐ็š„ RGB ๅธง๏ผŒ็”จๆˆทๅฏๅœจ [jester ๅฎ˜็ฝ‘](https://developer.qualcomm.com/software/ai-datasets/jester) ็›ดๆŽฅไธ‹่ฝฝใ€‚ + +ๅฐ†ไธ‹่ฝฝๅฅฝ็š„ๅŽ‹็ผฉๆ–‡ไปถๆ”พๅœจ `$MMACTION2/data/jester/` ๆ–‡ไปถๅคนไธ‹๏ผŒๅนถไฝฟ็”จไปฅไธ‹่„šๆœฌ่ฟ›่กŒ่งฃๅŽ‹ใ€‚ + +```shell +cd $MMACTION2/data/jester/ +cat 20bn-jester-v1-?? | tar zx +cd $MMACTION2/tools/data/jester/ +``` + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ RGB ๅธง๏ผŒๅˆ™ๅฏไปฅ่ทณ่ฟ‡ไธญ้—ดๆญฅ้ชค่‡ณๆญฅ้ชค 5 ไปฅ็›ดๆŽฅ็”Ÿๆˆ่ง†้ข‘ๅธง็š„ๆ–‡ไปถๅˆ—่กจใ€‚ +็”ฑไบŽๅฎ˜็ฝ‘็š„ JPG ๆ–‡ไปถๅๅฝขๅฆ‚ "%05d.jpg" ๏ผˆๆฏ”ๅฆ‚๏ผŒ"00001.jpg"๏ผ‰๏ผŒ้œ€่ฆๅœจ้…็ฝฎๆ–‡ไปถ็š„ `data.train`, `data.val` ๅ’Œ `data.test` ๅค„ๆทปๅŠ  `"filename_tmpl='{:05}.jpg'"` ไปฃ็ ๏ผŒไปฅไฟฎๆ”นๆ–‡ไปถๅๆจกๆฟใ€‚ + +```python +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## ๆญฅ้ชค 3. ๆŠฝๅ–ๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ RGB ๅธง่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœๆ‹ฅๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธญใ€‚ + +ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒ่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"๏ผ‰ +mkdir /mnt/SSD/jester_extracted/ +ln -s /mnt/SSD/jester_extracted/ ../../../data/jester/rawframes +``` + +ๅฆ‚ๆžœๆƒณๆŠฝๅ–ๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌไปŽ RGB ๅธงไธญๆŠฝๅ–ๅ‡บๅ…‰ๆตใ€‚ + +```shell +cd $MMACTION2/tools/data/jester/ +bash extract_flow.sh +``` + +## ๆญฅ้ชค 4: ็ผ–็ ่ง†้ข‘ + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ RGB ๅธง่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +็”จๆˆทๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค่ฟ›่กŒ่ง†้ข‘็ผ–็ ใ€‚ + +```shell +cd $MMACTION2/tools/data/jester/ +bash encode_videos.sh +``` + +## ๆญฅ้ชค 5. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +cd $MMACTION2/tools/data/jester/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## ๆญฅ้ชค 6. ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅœจๅฎŒๆˆๆ‰€ๆœ‰ Jester ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ +็”จๆˆทๅฏไปฅ่Žทๅพ—ๅฏนๅบ”็š„ RGB + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒJester ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ jester +โ”‚ โ”‚ โ”œโ”€โ”€ jester_{train,val}_list_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ jester_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ 1.mp4 +โ”‚ | | โ”œโ”€โ”€ 2.mp4 +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ rawframes +โ”‚ | | โ”œโ”€โ”€ 1 +โ”‚ | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... 
+โ”‚ | | โ”œโ”€โ”€ 2 +โ”‚ | | โ”œโ”€โ”€ ... + +``` + +ๅ…ณไบŽๅฏน jester ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/jester/encode_videos.sh b/tools/data/jester/encode_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..b746a3d05792381e53eb07f3dfd6b2a3314716b2 --- /dev/null +++ b/tools/data/jester/encode_videos.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_videos.py ../../data/jester/rawframes/ ../../data/jester/videos/ --fps 12 --level 1 --start-idx 1 --filename-tmpl '%05d' +echo "Encode videos" + +cd jester/ diff --git a/tools/data/jester/extract_flow.sh b/tools/data/jester/extract_flow.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e81a84dfed9d56657c974b137aa283957ce0caa --- /dev/null +++ b/tools/data/jester/extract_flow.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/jester/rawframes/ ../../data/jester/rawframes/ --task flow --level 1 --flow-type tvl1 --input-frames +echo "Flow (tv-l1) Generated" +cd jester/ diff --git a/tools/data/jester/generate_rawframes_filelist.sh b/tools/data/jester/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..f4b8207205a54deafc07e0097e74e32b8eeecc3d --- /dev/null +++ b/tools/data/jester/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/jester/ diff --git a/tools/data/jester/generate_videos_filelist.sh b/tools/data/jester/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..71f145222a267c82cadfdd5b71eb4f6fdd3992f9 --- /dev/null +++ b/tools/data/jester/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/jester/ diff --git a/tools/data/jester/label_map.txt b/tools/data/jester/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3b080fcae5667b5964933ee424cd2fd7f0d1f27 --- /dev/null +++ b/tools/data/jester/label_map.txt @@ -0,0 +1,27 @@ +Swiping Left +Swiping Right +Swiping Down +Swiping Up +Pushing Hand Away +Pulling Hand In +Sliding Two Fingers Left +Sliding Two Fingers Right +Sliding Two Fingers Down +Sliding Two Fingers Up +Pushing Two Fingers Away +Pulling Two Fingers In +Rolling Hand Forward +Rolling Hand Backward +Turning Hand Clockwise +Turning Hand Counterclockwise +Zooming In With Full Hand +Zooming Out With Full Hand +Zooming In With Two Fingers +Zooming Out With Two Fingers +Thumb Up +Thumb Down +Shaking Hand +Stop Sign +Drumming Fingers +No gesture +Doing other things diff --git a/tools/data/jhmdb/README.md b/tools/data/jhmdb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4450087c7a1177ec1311602f883a0fe9e1a8ad4f --- /dev/null +++ b/tools/data/jhmdb/README.md @@ -0,0 +1,101 @@ +# Preparing JHMDB + +## Introduction + + + +```BibTeX +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +For basic dataset information, you can refer to the dataset [website](http://jhmdb.is.tue.mpg.de/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/jhmdb/`. + +## Download and Extract + +You can download the RGB frames, optical flow and ground truth annotations from [google drive](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct). +The data are provided from [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md), which is adapted from [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector). + +After downloading the `JHMDB.tar.gz` file and put it in `$MMACTION2/tools/data/jhmdb/`, you can run the following command to extract. + +```shell +tar -zxvf JHMDB.tar.gz +``` + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/JHMDB/ +ln -s /mnt/SSD/JHMDB/ ../../../data/jhmdb +``` + +## Check Directory Structure + +After extracting, you will get the `FlowBrox04` directory, `Frames` directory and `JHMDB-GT.pkl` for JHMDB. + +In the context of the whole project (for JHMDB only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ jhmdb +โ”‚ | โ”œโ”€โ”€ FlowBrox04 +โ”‚ | | โ”œโ”€โ”€ brush_hair +โ”‚ | | | โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0 +โ”‚ | | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00039.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00040.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2 +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ wave +โ”‚ | | | โ”œโ”€โ”€ 21_wave_u_nm_np1_fr_goo_5 +โ”‚ | | | โ”œโ”€โ”€ ... 
+โ”‚ | | | โ”œโ”€โ”€ Wie_man_winkt!!_wave_u_cm_np1_fr_med_0 +โ”‚ | โ”œโ”€โ”€ Frames +โ”‚ | | โ”œโ”€โ”€ brush_hair +โ”‚ | | | โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0 +โ”‚ | | | | โ”œโ”€โ”€ 00001.png +โ”‚ | | | | โ”œโ”€โ”€ 00002.png +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00039.png +โ”‚ | | | | โ”œโ”€โ”€ 00040.png +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2 +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ wave +โ”‚ | | | โ”œโ”€โ”€ 21_wave_u_nm_np1_fr_goo_5 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Wie_man_winkt!!_wave_u_cm_np1_fr_med_0 +โ”‚ | โ”œโ”€โ”€ JHMDB-GT.pkl + +``` + +:::{note} +The `JHMDB-GT.pkl` exists as a cache, it contains 6 items as follows: + +1. `labels` (list): List of the 21 labels. +2. `gttubes` (dict): Dictionary that contains the ground truth tubes for each video. + A **gttube** is dictionary that associates with each index of label and a list of tubes. + A **tube** is a numpy array with `nframes` rows and 5 columns, each col is in format like ` `. +3. `nframes` (dict): Dictionary that contains the number of frames for each video, like `'walk/Panic_in_the_Streets_walk_u_cm_np1_ba_med_5': 16`. +4. `train_videos` (list): A list with `nsplits=1` elements, each one containing the list of training videos. +5. `test_videos` (list): A list with `nsplits=1` elements, each one containing the list of testing videos. +6. `resolution` (dict): Dictionary that outputs a tuple (h,w) of the resolution for each video, like `'pour/Bartender_School_Students_Practice_pour_u_cm_np1_fr_med_1': (240, 320)`. + +::: diff --git a/tools/data/jhmdb/README_zh-CN.md b/tools/data/jhmdb/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..9ba6bffea15349b6ff6a39585c05b2dc7342abe3 --- /dev/null +++ b/tools/data/jhmdb/README_zh-CN.md @@ -0,0 +1,98 @@ +# ๅ‡†ๅค‡ JHMDB + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. 
on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](http://jhmdb.is.tue.mpg.de/)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/jhmdb/`ใ€‚ + +## ไธ‹่ฝฝๅ’Œ่งฃๅŽ‹ + +็”จๆˆทๅฏไปฅไปŽ [่ฟ™้‡Œ](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct) ไธ‹่ฝฝ RGB ๅธง๏ผŒๅ…‰ๆตๅ’Œ็œŸๅฎžๆ ‡็ญพๆ–‡ไปถใ€‚ +่ฏฅๆ•ฐๆฎ็”ฑ [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md) ไปฃ็ ๅบ“ๆไพ›๏ผŒๅ‚่€ƒ่‡ช [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector)ใ€‚ + +็”จๆˆทๅœจไธ‹่ฝฝ `JHMDB.tar.gz` ๆ–‡ไปถๅŽ๏ผŒ้œ€ๅฐ†ๅ…ถๆ”พ็ฝฎๅœจ `$MMACTION2/tools/data/jhmdb/` ็›ฎๅฝ•ไธ‹๏ผŒๅนถไฝฟ็”จไปฅไธ‹ๆŒ‡ไปค่ฟ›่กŒ่งฃๅŽ‹๏ผš + +```shell +tar -zxvf JHMDB.tar.gz +``` + +ๅฆ‚ๆžœๆ‹ฅๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธญใ€‚ + +ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒ่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"๏ผ‰ +mkdir /mnt/SSD/JHMDB/ +ln -s /mnt/SSD/JHMDB/ ../../../data/jhmdb +``` + +## ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅฎŒๆˆ่งฃๅŽ‹ๅŽ๏ผŒ็”จๆˆทๅฐ†ๅพ—ๅˆฐ `FlowBrox04` ๆ–‡ไปถๅคน๏ผŒ`Frames` ๆ–‡ไปถๅคนๅ’Œ `JHMDB-GT.pkl` ๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒJHMDB ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ jhmdb +โ”‚ | โ”œโ”€โ”€ FlowBrox04 +โ”‚ | | โ”œโ”€โ”€ brush_hair +โ”‚ | | | โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0 +โ”‚ | | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00039.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00040.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2 +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ wave +โ”‚ | | | โ”œโ”€โ”€ 21_wave_u_nm_np1_fr_goo_5 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Wie_man_winkt!!_wave_u_cm_np1_fr_med_0 +โ”‚ | โ”œโ”€โ”€ Frames +โ”‚ | | โ”œโ”€โ”€ brush_hair +โ”‚ | | | โ”œโ”€โ”€ April_09_brush_hair_u_nm_np1_ba_goo_0 +โ”‚ | | | | โ”œโ”€โ”€ 00001.png +โ”‚ | | | | โ”œโ”€โ”€ 00002.png +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00039.png +โ”‚ | | | | โ”œโ”€โ”€ 00040.png +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2 +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ wave +โ”‚ | | | โ”œโ”€โ”€ 21_wave_u_nm_np1_fr_goo_5 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ Wie_man_winkt!!_wave_u_cm_np1_fr_med_0 +โ”‚ | โ”œโ”€โ”€ JHMDB-GT.pkl + +``` + +**ๆณจๆ„**๏ผš`JHMDB-GT.pkl` ไฝœไธบไธ€ไธช็ผ“ๅญ˜ๆ–‡ไปถ๏ผŒๅฎƒๅŒ…ๅซ 6 ไธช้กน็›ฎ๏ผš + +1. `labels` (list)๏ผš21 ไธช่กŒไธบ็ฑปๅˆซๅ็งฐ็ป„ๆˆ็š„ๅˆ—่กจ +2. `gttubes` (dict)๏ผšๆฏไธช่ง†้ข‘ๅฏนๅบ”็š„ๅŸบๅ‡† tubes ็ป„ๆˆ็š„ๅญ—ๅ…ธ + **gttube** ๆ˜ฏ็”ฑๆ ‡็ญพ็ดขๅผ•ๅ’Œ tube ๅˆ—่กจ็ป„ๆˆ็š„ๅญ—ๅ…ธ + **tube** ๆ˜ฏไธ€ไธช `nframes` ่กŒๅ’Œ 5 ๅˆ—็š„ numpy array๏ผŒๆฏไธ€ๅˆ—็š„ๅฝขๅผๅฆ‚ ` ` +3. `nframes` (dict)๏ผš็”จไปฅ่กจ็คบๆฏไธช่ง†้ข‘ๅฏนๅบ”็š„ๅธงๆ•ฐ๏ผŒๅฆ‚ `'walk/Panic_in_the_Streets_walk_u_cm_np1_ba_med_5': 16` +4. `train_videos` (list)๏ผšๅŒ…ๅซ `nsplits=1` ็š„ๅ…ƒ็ด ๏ผŒๆฏไธ€้กน้ƒฝๅŒ…ๅซไบ†่ฎญ็ปƒ่ง†้ข‘็š„ๅˆ—่กจ +5. `test_videos` (list)๏ผšๅŒ…ๅซ `nsplits=1` ็š„ๅ…ƒ็ด ๏ผŒๆฏไธ€้กน้ƒฝๅŒ…ๅซไบ†ๆต‹่ฏ•่ง†้ข‘็š„ๅˆ—่กจ +6. 
`resolution` (dict)๏ผšๆฏไธช่ง†้ข‘ๅฏนๅบ”็š„ๅˆ†่พจ็އ๏ผˆๅฝขๅฆ‚ (h,w)๏ผ‰๏ผŒๅฆ‚ `'pour/Bartender_School_Students_Practice_pour_u_cm_np1_fr_med_1': (240, 320)` diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7c5bb8bb6166ff42b48c99f204a72b14fd0a29d8 --- /dev/null +++ b/tools/data/kinetics/README.md @@ -0,0 +1,184 @@ +# Preparing Kinetics-\[400/600/700\] + +## Introduction + + + +```BibTeX +@inproceedings{inproceedings, + author = {Carreira, J. and Zisserman, Andrew}, + year = {2017}, + month = {07}, + pages = {4724-4733}, + title = {Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset}, + doi = {10.1109/CVPR.2017.502} +} +``` + +For basic dataset information, please refer to the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/). + +:::{note} +Because of the expirations of some YouTube links, the sizes of kinetics dataset copies may be different. Here are the sizes of our kinetics dataset copies that used to train all checkpoints. + +| Dataset | training videos | validation videos | +| :---------: | :-------------: | :---------------: | +| kinetics400 | 240436 | 19796 | +| Kinetics600 | 383393 | 27910 | +| Kinetics700 | 542357 | 34824 | + +::: + +`````{tabs} + +````{group-tab} Download by MIM +:::{note} +All experiments on Kinetics in MMAction2 are based on this version, we recommend users to try this version. +::: + +MIM supports downloading from OpenDataLab and preprocessing Kinetics-400/600/700 dataset with one command line. + +```Bash +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login +# download and preprocess Kinetics-400 by MIM. Note that this might take a long time. +mim download mmaction2 --dataset kinetics400 +# download and preprocess Kinetics-600 by MIM. Note that this might take a long time. +mim download mmaction2 --dataset kinetics600 +# download and preprocess Kinetics-700 by MIM. Note that this might take a long time. +mim download mmaction2 --dataset kinetics700 + +``` + +```` + +````{group-tab} Download form Official Source + +## Step 1. Prepare Annotations + +The scripts can be used for preparing kinetics400, kinetics600, kinetics700. To prepare different version of kinetics, you need to replace `${DATASET}` in the following examples with the specific dataset name. The choices of dataset names are `kinetics400`, `kinetics600` and `kinetics700`. +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/${DATASET}/`. + +First of all, you can run the following script to prepare annotations by downloading from the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/). + +```shell +bash download_annotations.sh ${DATASET} +``` + +Since some video urls are invalid, the number of video items in current official annotations are less than the original official ones. +So we provide an alternative way to download the older one as a reference. +Among these, the annotation files of Kinetics400 and Kinetics600 are from [official crawler](https://github.com/activitynet/ActivityNet/tree/199c9358907928a47cdfc81de4db788fddc2f91d/Crawler/Kinetics/data), +the annotation files of Kinetics700 are from [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) downloaded in 05/02/2021. + +```shell +bash download_backup_annotations.sh ${DATASET} +``` + +## Step 2. 
Prepare Videos + +You can run the following script to prepare the videos. +The code is adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. + +```shell +bash download_videos.sh ${DATASET} +``` + +**Important**: If you have already downloaded the video dataset using the download script above, +you must replace all whitespaces in the class names for ease of processing by running + +```shell +bash rename_classnames.sh ${DATASET} +``` + +For better decoding speed, you can resize the original videos into a smaller-sized, densely encoded version by: + +```bash +python ../resize_videos.py ../../../data/${DATASET}/videos_train/ ../../../data/${DATASET}/videos_train_256p_dense_cache --dense --level 2 +``` + +You can also download from [Academic Torrents](https://academictorrents.com/) ([kinetics400](https://academictorrents.com/details/184d11318372f70018cf9a72ef867e2fb9ce1d26) & [kinetics700](https://academictorrents.com/details/49f203189fb69ae96fb40a6d0e129949e1dfec98) with short edge 256 pixels are available) and [cvdfoundation/kinetics-dataset](https://github.com/cvdfoundation/kinetics-dataset) (hosted by the Common Visual Data Foundation; Kinetics400/Kinetics600/Kinetics-700-2020 are available). + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. You can run the following script to soft link the extracted frames. + +```shell +# execute these lines (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/${DATASET}_extracted_train/ +ln -s /mnt/SSD/${DATASET}_extracted_train/ ../../../data/${DATASET}/rawframes_train/ +mkdir /mnt/SSD/${DATASET}_extracted_val/ +ln -s /mnt/SSD/${DATASET}_extracted_val/ ../../../data/${DATASET}/rawframes_val/ +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh ${DATASET} +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV with the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh ${DATASET} +``` + +If both are required, run the following script to extract frames. + +```shell +bash extract_frames.sh ${DATASET} +``` + +The commands above can generate images with a new short edge of 256. If you want to generate images with short edge 320 (320p), or with a fixed size of 340x256, you can change the args `--new-short 256` to `--new-short 320` or `--new-width 340 --new-height 256`. +More details can be found in [prepare dataset](/docs/en/user_guides/prepare_dataset.md). + +## Step 4. Generate File List + +You can run the following scripts to generate file lists in the video and rawframe formats, respectively. + +```shell +bash generate_videos_filelist.sh ${DATASET} +# execute the command below when rawframes are ready +bash generate_rawframes_filelist.sh ${DATASET} +``` + +```` +````` + +### Folder Structure + +After the whole data pipeline for Kinetics preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Kinetics.
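Before moving on to the directory layout below, it can be worth a quick sanity check that every entry in a generated video file list points at a file on disk. The snippet below is only a minimal illustrative sketch, not part of the dataset tooling: it assumes each line of the video list has the form `relative/path.mp4 label_index` (rawframe lists use a different multi-column format), and the `data/kinetics400/...` paths are placeholders for your own setup.

```python
# Minimal sanity check for a generated Kinetics video file list (illustrative only).
# Assumed line format: "<class_dir>/<clip>.mp4 <label_index>".
# Class directories should already be whitespace-free after rename_classnames.sh.
import os

data_root = 'data/kinetics400/videos_train'                      # placeholder path
filelist = 'data/kinetics400/kinetics400_train_list_videos.txt'  # placeholder path

total, missing = 0, 0
with open(filelist) as f:
    for line in f:
        if not line.strip():
            continue
        rel_path, _label = line.rsplit(maxsplit=1)  # split off the trailing label index
        total += 1
        if not os.path.exists(os.path.join(data_root, rel_path)):
            missing += 1

print(f'{total} entries checked, {missing} missing video files')
```

The same check can be repeated on the validation list before launching training.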
+ +In the context of the whole project (for Kinetics only), the *minimal* folder structure will look like: +(*minimal* means that some data are not necessary: for example, you may want to evaluate kinetics using the original video format.) + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ${DATASET} +โ”‚ โ”‚ โ”œโ”€โ”€ ${DATASET}_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ ${DATASET}_val_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos_train +โ”‚ โ”‚ โ”œโ”€โ”€ videos_val +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ abseiling +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 0wR5jVB-WPk_000417_000427.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ wrapping_present +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ zumba +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_train (optional) +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_val (optional) + +``` + +For training and evaluating on Kinetics, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..bf81e28c012425864a88daf31ccf31c7bd354fbf --- /dev/null +++ b/tools/data/kinetics/README_zh-CN.md @@ -0,0 +1,174 @@ +# ๅ‡†ๅค‡ Kinetics-\[400/600/700\] + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{inproceedings, + author = {Carreira, J. and Zisserman, Andrew}, + year = {2017}, + month = {07}, + pages = {4724-4733}, + title = {Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset}, + doi = {10.1109/CVPR.2017.502} +} +``` + +่ฏทๅ‚็…ง [ๅฎ˜ๆ–น็ฝ‘็ซ™](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) ไปฅ่Žทๅ–ๆ•ฐๆฎ้›†ๅŸบๆœฌไฟกๆฏใ€‚ + +:::{note} +็”ฑไบŽ้ƒจๅˆ† YouTube ้“พๆŽฅๅคฑๆ•ˆ๏ผŒ็ˆฌๅ–็š„ Kinetics ๆ•ฐๆฎ้›†ๅคงๅฐๅฏ่ƒฝไธŽๅŽŸ็‰ˆไธๅŒใ€‚ไปฅไธ‹ๆ˜ฏๆˆ‘ไปฌๆ‰€ไฝฟ็”จ Kinetics ๆ•ฐๆฎ้›†็š„ๅคงๅฐ๏ผš + +| ๆ•ฐๆฎ้›† | ่ฎญ็ปƒ่ง†้ข‘ | ้ชŒ่ฏ้›†่ง†้ข‘ | +| :---------: | :------: | :--------: | +| Kinetics400 | 240436 | 19796 | +| Kinetics600 | 383393 | 27910 | +| Kinetics700 | 542357 | 34824 | +| ::: | | | + +`````{tabs} + +````{group-tab} ไฝฟ็”จ MIM ไธ‹่ฝฝ +:::{note} +MMAction2 ไปฃ็ ไป“ๅบ“ไธญๆไพ›็š„ Kinetics ๅฎž้ชŒๆ€ง่ƒฝ๏ผŒ้ƒฝๆ˜ฏๅŸบไบŽ่ฟ™ไธช็‰ˆๆœฌ็š„ๆ•ฐๆฎๅพ—ๅˆฐ็š„ใ€‚ๆˆ‘ไปฌๅปบ่ฎฎ็”จๆˆทไฝฟ็”จ่ฟ™ไธช็‰ˆๆœฌ็š„ Kinetics ๆ•ฐๆฎ้›†่ฟ›่กŒๅฎž้ชŒใ€‚ +::: + +# MIM ๆ”ฏๆŒไธ‹่ฝฝ Kinetics-400/600/700 ๆ•ฐๆฎ้›†ใ€‚็”จๆˆทๅฏไปฅ้€š่ฟ‡ไธ€่กŒๅ‘ฝไปค๏ผŒไปŽ OpenDataLab ่ฟ›่กŒไธ‹่ฝฝ๏ผŒๅนถ่ฟ›่กŒ้ข„ๅค„็†ใ€‚ +```Bash +# ๅฎ‰่ฃ… OpenXLab CLI ๅทฅๅ…ท +pip install -U openxlab +# ็™ปๅฝ• OpenXLab +openxlab login +# ้€š่ฟ‡ MIM ่ฟ›่กŒ Kinetics-400 ๆ•ฐๆฎ้›†ไธ‹่ฝฝ๏ผŒ้ข„ๅค„็†ใ€‚ๆณจๆ„่ฟ™ๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ด +mim download mmaction2 --dataset kinetics400 +# ้€š่ฟ‡ MIM ่ฟ›่กŒ Kinetics-600 ๆ•ฐๆฎ้›†ไธ‹่ฝฝ๏ผŒ้ข„ๅค„็†ใ€‚ๆณจๆ„่ฟ™ๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ด +mim download mmaction2 --dataset kinetics600 +# ้€š่ฟ‡ MIM ่ฟ›่กŒ Kinetics-700 ๆ•ฐๆฎ้›†ไธ‹่ฝฝ๏ผŒ้ข„ๅค„็†ใ€‚ๆณจๆ„่ฟ™ๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ด +mim download mmaction2 --dataset kinetics700 +``` + +```` + +````{group-tab} ไปŽๅฎ˜ๆ–นๆบไธ‹่ฝฝ +## 1. 
ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถ + +ๆญค่„šๆœฌ็”จไบŽๅ‡†ๅค‡ๆ•ฐๆฎ้›† kinetics400๏ผŒkinetics600๏ผŒkinetics700ใ€‚ไธบๅ‡†ๅค‡ kinetics ๆ•ฐๆฎ้›†็š„ไธๅŒ็‰ˆๆœฌ๏ผŒ็”จๆˆท้œ€ๅฐ†่„šๆœฌไธญ็š„ `${DATASET}` ่ต‹ๅ€ผไธบๆ•ฐๆฎ้›†ๅฏนๅบ”็‰ˆๆœฌๅ็งฐ๏ผŒๅฏ้€‰้กนไธบ `kinetics400`๏ผŒ`kinetics600`๏ผŒ `kinetics700`ใ€‚ +ๅœจๅผ€ๅง‹ไน‹ๅ‰๏ผŒ็”จๆˆท้œ€็กฎไฟๅฝ“ๅ‰็›ฎๅฝ•ไธบ `$MMACTION2/tools/data/${DATASET}/`ใ€‚ +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌไปŽ [Kinetics ๆ•ฐๆฎ้›†ๅฎ˜็ฝ‘](https://deepmind.com/research/open-source/open-source-datasets/kinetics/)ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถๅนถ่ฟ›่กŒ้ข„ๅค„็†๏ผš + +```shell +bash download_annotations.sh ${DATASET} +``` + +็”ฑไบŽ้ƒจๅˆ†่ง†้ข‘็š„ URL ไธๅฏ็”จ๏ผŒๅฝ“ๅ‰ๅฎ˜ๆ–นๆ ‡ๆณจไธญๆ‰€ๅซ่ง†้ข‘ๆ•ฐ้‡ๅฏ่ƒฝๅฐไบŽๅˆๅง‹็‰ˆๆœฌใ€‚ๆ‰€ไปฅ MMAction2 ๆไพ›ไบ†ๅฆไธ€็งๆ–นๅผไปฅ่Žทๅ–ๅˆๅง‹็‰ˆๆœฌๆ ‡ๆณจไฝœไธบๅ‚่€ƒใ€‚ +ๅœจ่ฟ™ๅ…ถไธญ๏ผŒKinetics400 ๅ’Œ Kinetics600 ็š„ๆ ‡ๆณจๆ–‡ไปถๆฅ่‡ช [ๅฎ˜ๆ–น็ˆฌ่™ซ](https://github.com/activitynet/ActivityNet/tree/199c9358907928a47cdfc81de4db788fddc2f91d/Crawler/Kinetics/data)๏ผŒ +Kinetics700 ็š„ๆ ‡ๆณจๆ–‡ไปถไบŽ 05/02/2021 ไธ‹่ฝฝ่‡ช [็ฝ‘็ซ™](https://deepmind.com/research/open-source/open-source-datasets/kinetics/)ใ€‚ + +```shell +bash download_backup_annotations.sh ${DATASET} +``` + +## 2. ๅ‡†ๅค‡่ง†้ข‘ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌๅ‡†ๅค‡่ง†้ข‘๏ผŒ่ง†้ข‘ๅ‡†ๅค‡ไปฃ็ ไฟฎๆ”น่‡ช [ๅฎ˜ๆ–น็ˆฌ่™ซ](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)ใ€‚ๆณจๆ„่ฟ™ไธ€ๆญฅ้ชคๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ดใ€‚ + +```shell +bash download_videos.sh ${DATASET} +``` + +**้‡่ฆๆ็คบ**๏ผšๅฆ‚ๆžœๅœจๆญคไน‹ๅ‰ๅทฒไธ‹่ฝฝๅฅฝ Kinetics ๆ•ฐๆฎ้›†็š„่ง†้ข‘๏ผŒ่ฟ˜้œ€ไฝฟ็”จ้‡ๅ‘ฝๅ่„šๆœฌๆฅๆ›ฟๆขๆމ็ฑปๅไธญ็š„็ฉบๆ ผ๏ผš + +```shell +bash rename_classnames.sh ${DATASET} +``` + +ไธบๆๅ‡่งฃ็ ้€Ÿๅบฆ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌๅฐ†ๅŽŸๅง‹่ง†้ข‘็ผฉๆ”พ่‡ณๆ›ดๅฐ็š„ๅˆ†่พจ็އ๏ผˆๅˆฉ็”จ็จ ๅฏ†็ผ–็ ๏ผ‰๏ผš + +```bash +python ../resize_videos.py ../../../data/${DATASET}/videos_train/ ../../../data/${DATASET}/videos_train_256p_dense_cache --dense --level 2 +``` + +ไนŸๅฏไปฅไปŽ [Academic Torrents](https://academictorrents.com/) ไธญไธ‹่ฝฝ็Ÿญ่พน้•ฟๅบฆไธบ 256 ็š„ [kinetics400](https://academictorrents.com/details/184d11318372f70018cf9a72ef867e2fb9ce1d26) ๅ’Œ [kinetics700](https://academictorrents.com/details/49f203189fb69ae96fb40a6d0e129949e1dfec98)๏ผŒๆˆ–ไปŽ Common Visual Data Foundation ็ปดๆŠค็š„ [cvdfoundation/kinetics-dataset](https://github.com/cvdfoundation/kinetics-dataset) ไธญไธ‹่ฝฝ Kinetics400/Kinetics600/Kinetics-700-2020ใ€‚ + +## 3. 
ๆๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทไป…ไฝฟ็”จ video loader๏ผŒๅˆ™ๅฏไปฅ่ทณ่ฟ‡ๆœฌๆญฅใ€‚ + +ๅœจๆๅ–ไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทๆœ‰่ถณๅคŸ็š„ SSD ็ฉบ้—ด๏ผŒ้‚ฃไนˆๅปบ่ฎฎๅฐ†่ง†้ข‘ๆŠฝๅ–ไธบ RGB ๅธงไปฅๆๅ‡ I/O ๆ€ง่ƒฝใ€‚็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌไธบๆŠฝๅ–ๅพ—ๅˆฐ็š„ๅธงๆ–‡ไปถๅคนๅปบ็ซ‹่ฝฏ่ฟžๆŽฅ๏ผš + +```shell +# ๆ‰ง่กŒไปฅไธ‹่„šๆœฌ (ๅ‡่ฎพ SSD ่ขซๆŒ‚่ฝฝๅœจ "/mnt/SSD/") +mkdir /mnt/SSD/${DATASET}_extracted_train/ +ln -s /mnt/SSD/${DATASET}_extracted_train/ ../../../data/${DATASET}/rawframes_train/ +mkdir /mnt/SSD/${DATASET}_extracted_val/ +ln -s /mnt/SSD/${DATASET}_extracted_val/ ../../../data/${DATASET}/rawframes_val/ +``` + +ๅฆ‚ๆžœ็”จๆˆทๅชไฝฟ็”จ RGB ๅธง๏ผˆ็”ฑไบŽๅ…‰ๆตๆๅ–้žๅธธ่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘ๆ‰ง่กŒไปฅไธ‹่„šๆœฌ๏ผŒไป…็”จ denseflow ๆๅ– RGB ๅธง๏ผš + +```shell +bash extract_rgb_frames.sh ${DATASET} +``` + +ๅฆ‚ๆžœ็”จๆˆทๆœชๅฎ‰่ฃ… denseflow๏ผŒไปฅไธ‹่„šๆœฌๅฏไปฅไฝฟ็”จ OpenCV ่ฟ›่กŒ RGB ๅธง็š„ๆๅ–๏ผŒไฝ†่ง†้ข‘ๅŽŸๅˆ†่พจ็އๅคงๅฐไผš่ขซไฟ็•™๏ผš + +```shell +bash extract_rgb_frames_opencv.sh ${DATASET} +``` + +ๅฆ‚ๆžœๅŒๆ—ถ้œ€่ฆ RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅฏไฝฟ็”จๅฆ‚ไธ‹่„šๆœฌๆŠฝๅธง๏ผš + +```shell +bash extract_frames.sh ${DATASET} +``` + +ไปฅไธŠ็š„ๅ‘ฝไปค็”Ÿๆˆ็Ÿญ่พน้•ฟๅบฆไธบ 256 ็š„ RGB ๅธงๅ’Œๅ…‰ๆตๅธงใ€‚ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆ็”Ÿๆˆ็Ÿญ่พน้•ฟๅบฆไธบ 320 ็š„ๅธง (320p)๏ผŒๆˆ–ๆ˜ฏๅ›บๅฎšๅˆ†่พจ็އไธบ 340 x 256 ็š„ๅธง๏ผŒๅฏๆ”นๅ˜ๅ‚ๆ•ฐ `--new-short 256` ไธบ `--new-short 320` ๆˆ– `--new-width 340 --new-height 256`ใ€‚ +ๆ›ดๅคš็ป†่Š‚ๅฏไปฅๅ‚่€ƒ [ๆ•ฐๆฎๅ‡†ๅค‡](/docs/zh_cn/user_guides/prepare_dataset.md)ใ€‚ + +## 4. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ไธคไธช่„šๆœฌๅˆ†ๅˆซไธบ่ง†้ข‘ๅ’Œๅธงๆ–‡ไปถๅคน็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ๏ผš + +```shell +bash generate_videos_filelist.sh ${DATASET} +# ไธบๅธงๆ–‡ไปถๅคน็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ +bash generate_rawframes_filelist.sh ${DATASET} +``` + +```` +````` + +### ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆ•ดๅฎŒๆˆ Kinetics ็š„ๆ•ฐๆฎๅค„็†ๅŽ๏ผŒๅฐ†ๅพ—ๅˆฐๅธงๆ–‡ไปถๅคน๏ผˆRGB ๅธงๅ’Œๅ…‰ๆตๅธง๏ผ‰๏ผŒ่ง†้ข‘ไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช้กน็›ฎ็›ฎๅฝ•ไธ‹๏ผˆไป…้’ˆๅฏน Kinetics๏ผ‰๏ผŒ*ๆœ€็ฎ€* ็›ฎๅฝ•็ป“ๆž„ๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ${DATASET} +โ”‚ โ”‚ โ”œโ”€โ”€ ${DATASET}_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ ${DATASET}_val_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ โ”‚ โ”œโ”€โ”€ videos_train +โ”‚ โ”‚ โ”œโ”€โ”€ videos_val +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ abseiling +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ 0wR5jVB-WPk_000417_000427.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ wrapping_present +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ zumba +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_train๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes_val๏ผˆๅฏ้€‰๏ผ‰ + +``` + +ๅ…ณไบŽ Kinetics ๆ•ฐๆฎ้›†ไธŠ็š„่ฎญ็ปƒไธŽๆต‹่ฏ•๏ผŒ่ฏทๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/kinetics/download.py b/tools/data/kinetics/download.py new file mode 100644 index 0000000000000000000000000000000000000000..7e152eb7cb6fda48a8e6d8deeb762fadb4c95e95 --- /dev/null +++ b/tools/data/kinetics/download.py @@ -0,0 +1,230 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/activitynet/ActivityNet/ +# Original licence: Copyright (c) Microsoft, under the MIT License. 
+# ------------------------------------------------------------------------------ +import argparse +import glob +import json +import os +import shutil +import ssl +import subprocess +import uuid +from collections import OrderedDict + +import pandas as pd +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context + + +def create_video_folders(dataset, output_dir, tmp_dir): + """Creates a directory for each label name in the dataset.""" + if 'label-name' not in dataset.columns: + this_dir = os.path.join(output_dir, 'test') + if not os.path.exists(this_dir): + os.makedirs(this_dir) + # I should return a dict but ... + return this_dir + if not os.path.exists(output_dir): + os.makedirs(output_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + label_to_dir = {} + for label_name in dataset['label-name'].unique(): + this_dir = os.path.join(output_dir, label_name) + if not os.path.exists(this_dir): + os.makedirs(this_dir) + label_to_dir[label_name] = this_dir + return label_to_dir + + +def construct_video_filename(row, label_to_dir, trim_format='%06d'): + """Given a dataset row, this function constructs the output filename for a + given video.""" + basename = '%s_%s_%s.mp4' % (row['video-id'], + trim_format % row['start-time'], + trim_format % row['end-time']) + if not isinstance(label_to_dir, dict): + dirname = label_to_dir + else: + dirname = label_to_dir[row['label-name']] + output_filename = os.path.join(dirname, basename) + return output_filename + + +def download_clip(video_identifier, + output_filename, + start_time, + end_time, + tmp_dir='/tmp/kinetics/.tmp_dir', + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + start_time: float + Indicates the beginning time in seconds from where the video + will be trimmed. + end_time: float + Indicates the ending time in seconds of the trimmed video. + """ + # Defensive argument checking. + assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + # Construct command line for getting the direct video link. + tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4()) + + if not os.path.exists(output_filename): + if not os.path.exists(tmp_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', + '--no-check-certificate', '-f', 'mp4', '-o', + '"%s"' % tmp_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as err: + attempts += 1 + if attempts == num_attempts: + return status, err.output + else: + break + + tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] + # Construct command to trim the videos (ffmpeg required). 
+ command = [ + 'ffmpeg', '-i', + '"%s"' % tmp_filename, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '1', '-loglevel', 'panic', + '"%s"' % output_filename + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as err: + return status, err.output + + # Check if the video was successfully saved. + status = os.path.exists(output_filename) + os.remove(tmp_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): + """Wrapper for parallel processing purposes.""" + output_filename = construct_video_filename(row, label_to_dir, trim_format) + clip_id = os.path.basename(output_filename).split('.mp4')[0] + if os.path.exists(output_filename): + status = tuple([clip_id, True, 'Exists']) + return status + + downloaded, log = download_clip( + row['video-id'], + output_filename, + row['start-time'], + row['end-time'], + tmp_dir=tmp_dir) + status = tuple([clip_id, downloaded, log]) + return status + + +def parse_kinetics_annotations(input_csv, ignore_is_cc=False): + """Returns a parsed DataFrame. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'YouTube Identifier,Start time,End time,Class label' + returns: + ------- + dataset: DataFrame + Pandas with the following columns: + 'video-id', 'start-time', 'end-time', 'label-name' + """ + df = pd.read_csv(input_csv) + if 'youtube_id' in df.columns: + columns = OrderedDict([('youtube_id', 'video-id'), + ('time_start', 'start-time'), + ('time_end', 'end-time'), + ('label', 'label-name')]) + df.rename(columns=columns, inplace=True) + if ignore_is_cc: + df = df.loc[:, df.columns.tolist()[:-1]] + return df + + +def main(input_csv, + output_dir, + trim_format='%06d', + num_jobs=24, + tmp_dir='/tmp/kinetics'): + tmp_dir = os.path.join(tmp_dir, '.tmp_dir') + + # Reading and parsing Kinetics. + dataset = parse_kinetics_annotations(input_csv) + + # Creates folders where videos will be saved later. + label_to_dir = create_video_folders(dataset, output_dir, tmp_dir) + + # Download all clips. + if num_jobs == 1: + status_list = [] + for _, row in dataset.iterrows(): + status_list.append( + download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)) + else: + status_list = Parallel( + n_jobs=num_jobs)(delayed(download_clip_wrapper)( + row, label_to_dir, trim_format, tmp_dir) + for i, row in dataset.iterrows()) + + # Clean tmp dir. + shutil.rmtree(tmp_dir) + + # Save download report. + with open('download_report.json', 'w') as fobj: + fobj.write(json.dumps(status_list)) + + +if __name__ == '__main__': + description = 'Helper script for downloading and trimming kinetics videos.' 
+ p = argparse.ArgumentParser(description=description) + p.add_argument( + 'input_csv', + type=str, + help=('CSV file containing the following format: ' + 'YouTube Identifier,Start time,End time,Class label')) + p.add_argument( + 'output_dir', + type=str, + help='Output directory where videos will be saved.') + p.add_argument( + '-f', + '--trim-format', + type=str, + default='%06d', + help=('This will be the format for the ' + 'filename of trimmed videos: ' + 'videoid_%0xd(start_time)_%0xd(end_time).mp4')) + p.add_argument('-n', '--num-jobs', type=int, default=24) + p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics') + # help='CSV file of the previous version of Kinetics.') + main(**vars(p.parse_args())) diff --git a/tools/data/kinetics/download_annotations.sh b/tools/data/kinetics/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..1afc7b110b6c5aeb924e326248ad359e5a14a3cc --- /dev/null +++ b/tools/data/kinetics/download_annotations.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +DATA_DIR="../../../data/${DATASET}/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://storage.googleapis.com/deepmind-media/Datasets/${DATASET}.tar.gz + +tar -zxvf ${DATASET}.tar.gz --strip-components 1 -C ${DATA_DIR}/ +mv ${DATA_DIR}/train.csv ${DATA_DIR}/kinetics_train.csv +mv ${DATA_DIR}/validate.csv ${DATA_DIR}/kinetics_val.csv +mv ${DATA_DIR}/test.csv ${DATA_DIR}/kinetics_test.csv + +rm ${DATASET}.tar.gz +rm ${DATA_DIR}/*.json diff --git a/tools/data/kinetics/download_backup_annotations.sh b/tools/data/kinetics/download_backup_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..67745b5a0844d825d6338a108683a2154eafd70f --- /dev/null +++ b/tools/data/kinetics/download_backup_annotations.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +DATA_DIR="../../../data/${DATASET}/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + + +wget https://download.openmmlab.com/mmaction/dataset/${DATASET}/annotations/kinetics_train.csv +wget https://download.openmmlab.com/mmaction/dataset/${DATASET}/annotations/kinetics_val.csv +wget https://download.openmmlab.com/mmaction/dataset/${DATASET}/annotations/kinetics_test.csv + +mv kinetics_train.csv ${DATA_DIR}/kinetics_train.csv +mv kinetics_val.csv ${DATA_DIR}/kinetics_val.csv +mv kinetics_test.csv ${DATA_DIR}/kinetics_test.csv diff --git a/tools/data/kinetics/download_videos.sh b/tools/data/kinetics/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..deb8094c20a4c6ce286950d662e455376f1e8349 --- /dev/null +++ b/tools/data/kinetics/download_videos.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate kinetics +pip install --upgrade youtube-dl + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +DATA_DIR="../../../data/${DATASET}" +ANNO_DIR="../../../data/${DATASET}/annotations" +python download.py ${ANNO_DIR}/kinetics_train.csv ${DATA_DIR}/videos_train +python download.py ${ANNO_DIR}/kinetics_val.csv ${DATA_DIR}/videos_val + +source deactivate kinetics +conda remove -n kinetics --all diff --git a/tools/data/kinetics/environment.yml b/tools/data/kinetics/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d9959e88a91881de1be2d38928c63e9aa79938 --- /dev/null +++ b/tools/data/kinetics/environment.yml @@ -0,0 +1,36 @@ +name: kinetics +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/kinetics/extract_frames.sh b/tools/data/kinetics/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..9fd1c51b3e56926b7db061f7277b379c0a5e0224 --- /dev/null +++ b/tools/data/kinetics/extract_frames.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../ +python build_rawframes.py ../../data/${DATASET}/videos_train/ ../../data/${DATASET}/rawframes_train/ --level 2 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for train set" + +python build_rawframes.py ../../data/${DATASET}/videos_val/ ../../data/${DATASET}/rawframes_val/ --level 2 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for val set" + +cd ${DATASET}/ diff --git a/tools/data/kinetics/extract_rgb_frames.sh b/tools/data/kinetics/extract_rgb_frames.sh new file mode 100644 index 
0000000000000000000000000000000000000000..64997930303e74b8b636de59f8f174e822f29a19 --- /dev/null +++ b/tools/data/kinetics/extract_rgb_frames.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../ +python build_rawframes.py ../../data/${DATASET}/videos_train/ ../../data/${DATASET}/rawframes_train/ --level 2 --ext mp4 --task rgb --new-short 256 +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/${DATASET}/videos_val/ ../../data/${DATASET}/rawframes_val/ --level 2 --ext mp4 --task rgb --new-short 256 +echo "Raw frames (RGB only) generated for val set" + +cd ${DATASET}/ diff --git a/tools/data/kinetics/extract_rgb_frames_opencv.sh b/tools/data/kinetics/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..aa066e4307eb95e61eab33e1bfab8b8eb2f85b1a --- /dev/null +++ b/tools/data/kinetics/extract_rgb_frames_opencv.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../ +python build_rawframes.py ../../data/${DATASET}/videos_train/ ../../data/${DATASET}/rawframes_train/ --level 2 --ext mp4 --task rgb --new-short 256 --use-opencv +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/${DATASET}/videos_val/ ../../data/${DATASET}/rawframes_val/ --level 2 --ext mp4 --task rgb --new-short 256 --use-opencv +echo "Raw frames (RGB only) generated for val set" + +cd ${DATASET}/ diff --git a/tools/data/kinetics/generate_rawframes_filelist.sh b/tools/data/kinetics/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..dc41bfc69fbb86c2439eaff2d6c717a691679c43 --- /dev/null +++ b/tools/data/kinetics/generate_rawframes_filelist.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/rawframes_train/ --level 2 --format rawframes --num-split 1 --subset train --shuffle +echo "Train filelist for rawframes generated." + +PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/rawframes_val/ --level 2 --format rawframes --num-split 1 --subset val --shuffle +echo "Val filelist for rawframes generated." +cd tools/data/${DATASET}/ diff --git a/tools/data/kinetics/generate_videos_filelist.sh b/tools/data/kinetics/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..252e828428173bb4c07e5ea2d5436899e3712bc5 --- /dev/null +++ b/tools/data/kinetics/generate_videos_filelist.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../../../ +PYTHONPATH=. 
python tools/data/build_file_list.py ${DATASET} data/${DATASET}/videos_train/ --level 2 --format videos --num-split 1 --subset train --shuffle +echo "Train filelist for video generated." + +PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/videos_val/ --level 2 --format videos --num-split 1 --subset val --shuffle +echo "Val filelist for video generated." +cd tools/data/kinetics/ diff --git a/tools/data/kinetics/label_map_k400.txt b/tools/data/kinetics/label_map_k400.txt new file mode 100644 index 0000000000000000000000000000000000000000..9193a07c6bda30b85b591da52e5e4cb375c31c06 --- /dev/null +++ b/tools/data/kinetics/label_map_k400.txt @@ -0,0 +1,400 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +balloon blowing +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +drinking +drinking beer +drinking shots +driving car +driving tractor +drop kicking +drumming fingers +dunking basketball +dying hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +garbage collecting +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping into pool +jumpstyle dancing +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks 
+long jump +lunge +making a cake +making a sandwich +making bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking fruit +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing (not slalom or crosscountry) +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +sniffing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow tie +tying knot (not on a tie) +tying tie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping present +wrestling +writing +yawning +yoga +zumba diff --git a/tools/data/kinetics/label_map_k600.txt 
b/tools/data/kinetics/label_map_k600.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6bbf70797ecbfed445cc6b02039f3475ac3a502 --- /dev/null +++ b/tools/data/kinetics/label_map_k600.txt @@ -0,0 +1,600 @@ +abseiling +acting in play +adjusting glasses +air drumming +alligator wrestling +answering questions +applauding +applying cream +archaeological excavation +archery +arguing +arm wrestling +arranging flowers +assembling bicycle +assembling computer +attending conference +auctioning +backflip (human) +baking cookies +bandaging +barbequing +bartending +base jumping +bathing dog +battle rope training +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowdrying hair +blowing bubble gum +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bodysurfing +bookbinding +bottling +bouncing on bouncy castle +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +breaking boards +breathing fire +brush painting +brushing hair +brushing teeth +building cabinet +building lego +building sandcastle +building shed +bull fighting +bulldozing +bungee jumping +burping +busking +calculating +calligraphy +canoeing or kayaking +capoeira +capsizing +card stacking +card throwing +carrying baby +cartwheeling +carving ice +carving pumpkin +casting fishing line +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing gear in car +changing oil +changing wheel (not on bike) +checking tires +cheerleading +chewing gum +chiseling stone +chiseling wood +chopping meat +chopping vegetables +chopping wood +clam digging +clapping +clay pottery making +clean and jerk +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +coloring in +combing hair +contact juggling +contorting +cooking egg +cooking on campfire +cooking sausages (not on barbeque) +cooking scallops +cosplaying +counting money +country line dancing +cracking back +cracking knuckles +cracking neck +crawling baby +crossing eyes +crossing river +crying +cumbia +curling (sport) +curling hair +cutting apple +cutting nails +cutting orange +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +delivering mail +dining +directing traffic +disc golfing +diving cliff +docking boat +dodgeball +doing aerobics +doing jigsaw puzzle +doing laundry +doing nails +drawing +dribbling basketball +drinking shots +driving car +driving tractor +drooling +drop kicking +drumming fingers +dumpster diving +dunking basketball +dyeing eyebrows +dyeing hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +embroidering +exercising with an exercise ball +extinguishing fire +faceplanting +falling off bike +falling off chair +feeding birds +feeding fish +feeding goats +fencing (sport) +fidgeting +finger snapping +fixing bicycle +fixing hair +flint knapping +flipping pancake +fly tying +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +geocaching +getting a haircut +getting a piercing +getting a tattoo +giving or receiving award +gold panning +golf chipping +golf driving +golf putting +gospel singing in church +grinding meat +grooming dog 
+grooming horse +gymnastics tumbling +hammer throw +hand washing clothes +head stand +headbanging +headbutting +high jump +high kick +historical reenactment +hitting baseball +hockey stop +holding snake +home roasting coffee +hopscotch +hoverboarding +huddling +hugging (not baby) +hugging baby +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ice swimming +inflating balloons +installing carpet +ironing +ironing hair +javelin throw +jaywalking +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping bicycle +jumping into pool +jumping jacks +jumpstyle dancing +karaoke +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +land sailing +laughing +lawn mower racing +laying bricks +laying concrete +laying stone +laying tiles +leatherworking +licking +lifting hat +lighting fire +lock picking +long jump +longboarding +looking at phone +luge +lunge +making a cake +making a sandwich +making balloon shapes +making bubbles +making cheese +making horseshoes +making jewelry +making paper aeroplanes +making pizza +making snowman +making sushi +making tea +making the bed +marching +marriage proposal +massaging back +massaging feet +massaging legs +massaging neck +massaging person's head +milking cow +moon walking +mopping floor +mosh pit dancing +motorcycling +mountain climber (exercise) +moving furniture +mowing lawn +mushroom foraging +needle felting +news anchoring +opening bottle (not wine) +opening door +opening present +opening refrigerator +opening wine bottle +packing +paragliding +parasailing +parkour +passing American football (in game) +passing american football (not in game) +passing soccer ball +peeling apples +peeling potatoes +person collecting garbage +petting animal (not cat) +petting cat +photobombing +photocopying +picking fruit +pillow fight +pinching +pirouetting +planing wood +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing beer pong +playing blackjack +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing darts +playing didgeridoo +playing dominoes +playing drums +playing field hockey +playing flute +playing gong +playing guitar +playing hand clapping games +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing laser tag +playing lute +playing maracas +playing marbles +playing monopoly +playing netball +playing ocarina +playing organ +playing paintball +playing pan pipes +playing piano +playing pinball +playing ping pong +playing poker +playing polo +playing recorder +playing rubiks cube +playing saxophone +playing scrabble +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing with trains +playing xylophone +poking bellybutton +pole vault +polishing metal +popping balloons +pouring beer +preparing salad +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelbarrow +pushing wheelchair +putting in contact lenses +putting on eyeliner +putting on foundation +putting on lipstick +putting on mascara +putting on sari +putting on shoes +raising eyebrows +reading book +reading newspaper +recording music +repairing puncture +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mule +riding or walking with horse 
+riding scooter +riding snow blower +riding unicycle +ripping paper +roasting marshmallows +roasting pig +robot dancing +rock climbing +rock scissors paper +roller skating +rolling pastry +rope pushdown +running on treadmill +sailing +salsa dancing +sanding floor +sausage making +sawing wood +scrambling eggs +scrapbooking +scrubbing face +scuba diving +separating eggs +setting table +sewing +shaking hands +shaking head +shaping bread dough +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining flashlight +shining shoes +shooting basketball +shooting goal (soccer) +shopping +shot put +shoveling snow +shucking oysters +shuffling cards +shuffling feet +side kick +sign language interpreting +singing +sipping cup +situp +skateboarding +ski jumping +skiing crosscountry +skiing mono +skiing slalom +skipping rope +skipping stone +skydiving +slacklining +slapping +sled dog racing +sleeping +smashing +smelling feet +smoking +smoking hookah +smoking pipe +snatch weight lifting +sneezing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spelunking +spinning poi +spray painting +springboard diving +square dancing +squat +standing on hands +staring +steer roping +sticking tongue out +stomping grapes +stretching arm +stretching leg +sucking lolly +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swimming front crawl +swing dancing +swinging baseball bat +swinging on something +sword fighting +sword swallowing +tackling +tagging graffiti +tai chi +talking on cell phone +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +tasting wine +testifying +texting +threading needle +throwing axe +throwing ball (not baseball or American football) +throwing discus +throwing knife +throwing snowballs +throwing tantrum +throwing water balloon +tickling +tie dying +tightrope walking +tiptoeing +tobogganing +tossing coin +training dog +trapezing +trimming or shaving beard +trimming shrubs +trimming trees +triple jump +twiddling fingers +tying bow tie +tying knot (not on a tie) +tying necktie +tying shoe laces +unboxing +unloading truck +using a microscope +using a paint roller +using a power drill +using a sledge hammer +using a wrench +using atm +using bagging machine +using circular saw +using inhaler +using puppets +using remote controller (not gaming) +using segway +vacuuming floor +visiting the zoo +wading through mud +wading through water +waiting in line +waking up +walking the dog +walking through snow +washing dishes +washing feet +washing hair +washing hands +watching tv +water skiing +water sliding +watering plants +waving hand +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +weaving fabric +welding +whistling +windsurfing +winking +wood burning (art) +wrapping present +wrestling +writing +yarn spinning +yawning +yoga +zumba diff --git a/tools/data/kinetics/label_map_k700.txt b/tools/data/kinetics/label_map_k700.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbcedf99bf9e4bbb71bb061401e422c98a300956 --- /dev/null +++ b/tools/data/kinetics/label_map_k700.txt @@ -0,0 +1,700 @@ +abseiling +acting in play +adjusting glasses +air drumming +alligator wrestling +answering questions +applauding +applying cream +archaeological excavation +archery +arguing +arm wrestling +arranging flowers +arresting +assembling bicycle +assembling computer +attending conference +auctioning +baby waking up +backflip (human) +baking 
cookies +bandaging +barbequing +bartending +base jumping +bathing dog +battle rope training +beatboxing +bee keeping +being excited +being in zero gravity +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blending fruit +blowdrying hair +blowing bubble gum +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bodysurfing +bookbinding +bottling +bouncing ball (not juggling) +bouncing on bouncy castle +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +breaking boards +breaking glass +breathing fire +brush painting +brushing floor +brushing hair +brushing teeth +building cabinet +building lego +building sandcastle +building shed +bulldozing +bungee jumping +burping +busking +calculating +calligraphy +canoeing or kayaking +capoeira +capsizing +card stacking +card throwing +carrying baby +carrying weight +cartwheeling +carving ice +carving marble +carving pumpkin +carving wood with a knife +casting fishing line +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing gear in car +changing oil +changing wheel (not on bike) +chasing +checking tires +checking watch +cheerleading +chewing gum +chiseling stone +chiseling wood +chopping meat +chopping wood +clam digging +clapping +clay pottery making +clean and jerk +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +closing door +coloring in +combing hair +contact juggling +contorting +cooking chicken +cooking egg +cooking on campfire +cooking sausages (not on barbeque) +cooking scallops +cosplaying +coughing +counting money +country line dancing +cracking back +cracking knuckles +cracking neck +crawling baby +crocheting +crossing eyes +crossing river +crying +cumbia +curling (sport) +curling eyelashes +curling hair +cutting apple +cutting cake +cutting nails +cutting orange +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +dealing cards +decorating the christmas tree +decoupage +delivering mail +digging +dining +directing traffic +disc golfing +diving cliff +docking boat +dodgeball +doing aerobics +doing jigsaw puzzle +doing laundry +doing nails +doing sudoku +drawing +dribbling basketball +drinking shots +driving car +driving tractor +drooling +drop kicking +drumming fingers +dumpster diving +dunking basketball +dyeing eyebrows +dyeing hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating nachos +eating spaghetti +eating watermelon +egg hunting +embroidering +entering church +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +falling off bike +falling off chair +feeding birds +feeding fish +feeding goats +fencing (sport) +fidgeting +filling cake +filling eyebrows +finger snapping +fixing bicycle +fixing hair +flint knapping +flipping bottle +flipping pancake +fly tying +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +gargling +geocaching +getting a haircut +getting a piercing +getting a tattoo +giving or receiving award +gold panning +golf chipping +golf driving +golf putting +gospel singing in church +grinding meat +grooming cat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +hand washing clothes +head stand +headbanging +headbutting +helmet 
diving +herding cattle +high fiving +high jump +high kick +historical reenactment +hitting baseball +hockey stop +holding snake +home roasting coffee +hopscotch +hoverboarding +huddling +hugging (not baby) +hugging baby +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ice swimming +inflating balloons +installing carpet +ironing +ironing hair +javelin throw +jaywalking +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping bicycle +jumping into pool +jumping jacks +jumping sofa +jumpstyle dancing +karaoke +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +land sailing +laughing +lawn mower racing +laying bricks +laying concrete +laying decking +laying stone +laying tiles +leatherworking +letting go of balloon +licking +lifting hat +lighting candle +lighting fire +listening with headphones +lock picking +long jump +longboarding +looking at phone +looking in mirror +luge +lunge +making a cake +making a sandwich +making balloon shapes +making bubbles +making cheese +making horseshoes +making jewelry +making latte art +making paper aeroplanes +making pizza +making slime +making snowman +making sushi +making tea +making the bed +marching +marriage proposal +massaging back +massaging feet +massaging legs +massaging neck +massaging person's head +metal detecting +milking cow +milking goat +mixing colours +moon walking +mopping floor +mosh pit dancing +motorcycling +mountain climber (exercise) +moving baby +moving child +moving furniture +mowing lawn +mushroom foraging +needle felting +news anchoring +opening bottle (not wine) +opening coconuts +opening door +opening present +opening refrigerator +opening wine bottle +packing +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +passing soccer ball +peeling apples +peeling banana +peeling potatoes +person collecting garbage +petting animal (not cat) +petting cat +petting horse +photobombing +photocopying +picking apples +picking blueberries +pillow fight +pinching +pirouetting +planing wood +planting trees +plastering +playing accordion +playing american football +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing beer pong +playing billiards +playing blackjack +playing cards +playing cello +playing checkers +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing darts +playing didgeridoo +playing dominoes +playing drums +playing field hockey +playing flute +playing gong +playing guitar +playing hand clapping games +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing laser tag +playing lute +playing mahjong +playing maracas +playing marbles +playing monopoly +playing netball +playing nose flute +playing oboe +playing ocarina +playing organ +playing paintball +playing pan pipes +playing piano +playing piccolo +playing pinball +playing ping pong +playing poker +playing polo +playing recorder +playing road hockey +playing rounders +playing rubiks cube +playing saxophone +playing scrabble +playing shuffleboard +playing slot machine +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing with trains +playing xylophone +poaching eggs +poking bellybutton +pole vault +polishing furniture +polishing metal +popping balloons +pouring beer +pouring milk +pouring wine +preparing salad +presenting weather 
forecast +pretending to be a statue +pull ups +pulling espresso shot +pulling rope (game) +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelbarrow +pushing wheelchair +putting in contact lenses +putting on eyeliner +putting on foundation +putting on lipstick +putting on mascara +putting on sari +putting on shoes +putting wallpaper on wall +raising eyebrows +reading book +reading newspaper +recording music +repairing puncture +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mule +riding or walking with horse +riding scooter +riding snow blower +riding unicycle +ripping paper +roasting marshmallows +roasting pig +robot dancing +rock climbing +rock scissors paper +roller skating +rolling eyes +rolling pastry +rope pushdown +running on treadmill +sailing +salsa dancing +saluting +sanding floor +sanding wood +sausage making +sawing wood +scrambling eggs +scrapbooking +scrubbing face +scuba diving +seasoning food +separating eggs +setting table +sewing +shaking hands +shaking head +shaping bread dough +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining flashlight +shining shoes +shoot dance +shooting basketball +shooting goal (soccer) +shooting off fireworks +shopping +shot put +shouting +shoveling snow +shredding paper +shucking oysters +shuffling cards +shuffling feet +side kick +sieving +sign language interpreting +silent disco +singing +sipping cup +situp +skateboarding +ski ballet +ski jumping +skiing crosscountry +skiing mono +skiing slalom +skipping rope +skipping stone +skydiving +slacklining +slapping +sled dog racing +sleeping +slicing onion +smashing +smelling feet +smoking +smoking hookah +smoking pipe +snatch weight lifting +sneezing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spelunking +spinning plates +spinning poi +splashing water +spray painting +spraying +springboard diving +square dancing +squat +squeezing orange +stacking cups +stacking dice +standing on hands +staring +steer roping +steering car +sticking tongue out +stomping grapes +stretching arm +stretching leg +sucking lolly +surfing crowd +surfing water +surveying +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swimming front crawl +swimming with dolphins +swimming with sharks +swing dancing +swinging baseball bat +swinging on something +sword fighting +sword swallowing +tackling +tagging graffiti +tai chi +taking photo +talking on cell phone +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +tasting wine +testifying +texting +threading needle +throwing axe +throwing ball (not baseball or American football) +throwing discus +throwing knife +throwing snowballs +throwing tantrum +throwing water balloon +tickling +tie dying +tightrope walking +tiptoeing +tobogganing +tossing coin +tossing salad +training dog +trapezing +treating wood +trimming or shaving beard +trimming shrubs +trimming trees +triple jump +twiddling fingers +tying bow tie +tying knot (not on a tie) +tying necktie +tying shoe laces +unboxing +uncorking champagne +unloading truck +using a microscope +using a paint roller +using a power drill +using a sledge hammer +using a wrench +using atm +using bagging machine +using circular saw +using inhaler +using megaphone +using puppets +using remote controller (not gaming) +using segway +vacuuming car +vacuuming floor +visiting the zoo +wading through mud +wading through water 
+waiting in line +waking up +walking on stilts +walking the dog +walking through snow +walking with crutches +washing dishes +washing feet +washing hair +washing hands +watching tv +water skiing +water sliding +watering plants +waving hand +waxing armpits +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +weaving fabric +welding +whistling +windsurfing +winking +wood burning (art) +wrapping present +wrestling +writing +yarn spinning +yawning +yoga +zumba diff --git a/tools/data/kinetics/preprocess_k400.sh b/tools/data/kinetics/preprocess_k400.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a886511365d25186e13e1a9d5b3adbc0ca2e5d7 --- /dev/null +++ b/tools/data/kinetics/preprocess_k400.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +mv $(dirname $DATA_ROOT)/Kinetics-400 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k600.sh b/tools/data/kinetics/preprocess_k600.sh new file mode 100644 index 0000000000000000000000000000000000000000..2af2318f269cd23520126076340a515ad23f6a86 --- /dev/null +++ b/tools/data/kinetics/preprocess_k600.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +mv $(dirname $DATA_ROOT)/Kinetics600 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k700.sh b/tools/data/kinetics/preprocess_k700.sh new file mode 100644 index 0000000000000000000000000000000000000000..601ce75837b13c802b019d50e7cb2f21249ca7c0 --- /dev/null +++ b/tools/data/kinetics/preprocess_k700.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +mv $(dirname $DATA_ROOT)/Kinetics_700 $DATA_ROOT diff --git a/tools/data/kinetics/rename_classnames.sh b/tools/data/kinetics/rename_classnames.sh new file mode 100644 index 0000000000000000000000000000000000000000..5a338ac0fe7f9b246d20111cf06f4df339e01551 --- /dev/null +++ b/tools/data/kinetics/rename_classnames.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Rename classname for convenience +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../../../data/${DATASET}/ +ls ./videos_train | while read class; do \ + newclass=`echo $class | tr " " "_" `; + if [ "${class}" != "${newclass}" ] + then + mv "videos_train/${class}" "videos_train/${newclass}"; + fi +done + +ls ./videos_val | while read class; do \ + newclass=`echo $class | tr " " "_" `; + if [ "${class}" != "${newclass}" ] + then + mv "videos_val/${class}" "videos_val/${newclass}"; + fi +done + +cd ../../tools/data/kinetics/ diff --git a/tools/data/kinetics710/README.md b/tools/data/kinetics710/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7494f41dfd05b74a29cbe5929e344c259eecddd5 --- /dev/null +++ b/tools/data/kinetics710/README.md @@ -0,0 +1,91 @@ +# Preparing Kinetics-710 + +## Introduction + + + +```BibTeX +@misc{li2022uniformerv2, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao}, + year={2022}, + 
eprint={2211.09552}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, please refer to the [paper](https://arxiv.org/pdf/2211.09552.pdf). The scripts in this directory can be used to prepare Kinetics-710. MMAction2 supports the Kinetics-710 +dataset as a concat dataset, which means it only provides lists of annotation files and reuses the original videos of the Kinetics-400/600/700 datasets. You can refer to the [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py) +for details; it also serves as a template for using a concat dataset in MMAction2 (a minimal sketch is shown below). +Before we start, please make sure that your current directory is `$MMACTION2`. + +## Step 1. Download Kinetics 400/600/700 + +Kinetics-710 is a video benchmark based on Kinetics-400/600/700: it merges the training sets of these Kinetics datasets and removes duplicate videos according to their YouTube IDs. MMAction2 provides annotation files based on the Kinetics-400/600/700 releases on [OpenDataLab](https://opendatalab.com/), so we suggest you download Kinetics-400/600/700 from OpenDataLab first via [MIM](https://github.com/open-mmlab/mim). + +```shell +# install the OpenXLab CLI tools +pip install -U openxlab +# log in to OpenXLab +openxlab login +# download Kinetics-400/600/700; note that this may take a long time +mim download mmaction2 --dataset kinetics400 +mim download mmaction2 --dataset kinetics600 +mim download mmaction2 --dataset kinetics700 + +``` + +## Step 2. Download Kinetics-710 Annotations + +We provide the Kinetics-710 annotation lists corresponding to the OpenDataLab version of Kinetics. You can download them from Aliyun and unzip them into `$MMACTION2/data/`: + +```shell +wget -P data https://download.openmmlab.com/mmaction/dataset/kinetics710/annotations.zip +cd data && unzip annotations.zip && cd .. + +``` + +## Step 3. Folder Structure + +After finishing the whole data pipeline for Kinetics preparation, +you will have the videos and annotation files for Kinetics-710. + +In the context of the whole project (for Kinetics only), the *minimal* folder structure will look like the following +(*minimal* means that some data are not necessary; for example, you may want to evaluate Kinetics using the original video format): + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ kinetics400 +โ”‚ โ”‚ โ”œโ”€โ”€ videos_train +โ”‚ โ”‚ โ”œโ”€โ”€ videos_val +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ jf7RDuUTrsQ.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”œโ”€โ”€ kinetics600 +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol_00 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ -A5JFdMXB_k_000018_000028.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol63 +โ”‚ โ”œโ”€โ”€ kinetics700 +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol_00 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ -Paa0R0tQ1w_000009_000019.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol63 +โ”‚ โ”œโ”€โ”€ kinetics710 +โ”‚ โ”‚ โ”œโ”€โ”€ k400_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k400_val_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k600_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k600_val_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k700_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k700_val_list_videos.txt +``` + +For training and evaluating on Kinetics, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
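To make the concat-dataset setup above concrete, here is a minimal sketch of how a Kinetics-710 training set could be assembled from the three annotation lists. It is illustrative only: the data paths, the batch size, and the `train_pipeline` variable are placeholders, and the linked uniformerv2 config remains the authoritative reference for the exact fields MMAction2 expects.

```python
# Illustrative sketch (not the official config): build a Kinetics-710 training
# set by concatenating the K400/K600/K700 subsets listed under data/kinetics710/.
k400_trainset = dict(
    type='VideoDataset',
    ann_file='data/kinetics710/k400_train_list_videos.txt',   # assumed path
    data_prefix=dict(video='data/kinetics400/videos_train'),  # assumed path
    pipeline=train_pipeline)  # train_pipeline is assumed to be defined earlier in the config
k600_trainset = dict(
    type='VideoDataset',
    ann_file='data/kinetics710/k600_train_list_videos.txt',
    data_prefix=dict(video='data/kinetics600/videos'),
    pipeline=train_pipeline)
k700_trainset = dict(
    type='VideoDataset',
    ann_file='data/kinetics710/k700_train_list_videos.txt',
    data_prefix=dict(video='data/kinetics700/videos'),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=8,   # placeholder value
    num_workers=8,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='ConcatDataset',
        datasets=[k400_trainset, k600_trainset, k700_trainset]))
```

A common design choice is to validate on a single subset (for example, the Kinetics-400 val list) rather than on the concatenation, so that accuracy numbers stay comparable with standard Kinetics-400 results.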
diff --git a/tools/data/kinetics710/README_zh-CN.md b/tools/data/kinetics710/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..17b817b0beea8c0eeeb79b4aea46c584fe96fb97 --- /dev/null +++ b/tools/data/kinetics710/README_zh-CN.md @@ -0,0 +1,89 @@ +# ๅ‡†ๅค‡ Kinetics-710 + +## ไป‹็ป + + + +```BibTeX +@misc{li2022uniformerv2, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao}, + year={2022}, + eprint={2211.09552}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +ๅ…ณไบŽๅŸบๆœฌๆ•ฐๆฎ้›†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒ [่ฎบๆ–‡](https://arxiv.org/pdf/2211.09552.pdf)ใ€‚่ฟ™ไบ›่„šๆœฌๅฏไปฅ็”จไบŽๅ‡†ๅค‡ kinetics-710ใ€‚MMAction2 ไปฅ Concat Daataset ็š„ๅฝขๅผๆ”ฏๆŒไบ† Kinetics-710 ๆ•ฐๆฎ้›†๏ผŒๆˆ‘ไปฌๅชๆไพ›ไธ€ไธชๆณจ้‡Šๆ–‡ไปถๅˆ—่กจ๏ผŒๅนถๅˆฉ็”จ Kinetics-400/600/700 ๆ•ฐๆฎ้›†็š„ๅŽŸๅง‹ๆ•ฐๆฎใ€‚ไฝ ๅฏไปฅๅ‚่€ƒ [้…็ฝฎ](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py) ไบ†่งฃ่ฏฆๆƒ…๏ผŒๅฎƒไนŸๆไพ›ไบ†ไธ€ไธชๆจกๆฟ้…็ฝฎ๏ผŒ่ฏดๆ˜Žไบ†ๅฆ‚ไฝ•ๅœจ MMAction2 ไธญไฝฟ็”จ Concat Datasetใ€‚ +ๅœจๆˆ‘ไปฌๅผ€ๅง‹ไน‹ๅ‰๏ผŒ่ฏท็กฎไฟ็›ฎๅฝ•ไฝไบŽ `$MMACTION2`ใ€‚ + +## ็ฌฌไธ€ๆญฅ๏ผšไธ‹่ฝฝ Kinetics 400/600/700 + +Kinetics-710 ๆ˜ฏๅŸบไบŽ Kinetics-400/600/700 ็š„่ง†้ข‘ๆ•ฐๆฎ้›†๏ผŒๅฎƒๅˆๅนถไบ†่ฟ™ไบ› Kinetics ๆ•ฐๆฎ้›†็š„่ฎญ็ปƒ้›†๏ผŒๅนถๆ นๆฎ Youtube ID ๅˆ ้™คไบ†้‡ๅค็š„่ง†้ข‘ใ€‚MMAction2 ๆไพ›ไบ†ไธ€ไธชๅŸบไบŽ Kinetics-400/600/700 ็š„ OpenDataLab ็‰ˆๆœฌ็š„ๆ ‡ๆณจๆ–‡ไปถ๏ผŒไฝ ๅฏไปฅ้€š่ฟ‡ [MIM](https://github.com/open-mmlab/mim) ไปŽ OpenDataLab ไธ‹่ฝฝใ€‚ + +```shell +# ๅฎ‰่ฃ… OpenXLab CLI ๅทฅๅ…ท +pip install -U openxlab +# ็™ปๅฝ• OpenXLab +openxlab login +# ไธ‹่ฝฝ Kinetics-400/600/700๏ผŒๆณจๆ„่ฟ™ๅฏ่ƒฝ้œ€่ฆๅพˆ้•ฟๆ—ถ้—ดใ€‚ +mim download mmaction2 --dataset kinetics400 +mim download mmaction2 --dataset kinetics600 +mim download mmaction2 --dataset kinetics700 + +``` + +## ็ฌฌไบŒๆญฅ๏ผšไธ‹่ฝฝ Kinetics-710 ๆ ‡ๆณจๆ–‡ไปถ + +ๆˆ‘ไปฌๆไพ›ไบ†ไธŽ OpenDataLab ็‰ˆๆœฌ Kinetics ็›ธๅฏนๅบ”็š„ Kinetics-710 ๆ ‡ๆณจๆ–‡ไปถๅˆ—่กจ๏ผŒไฝ ๅฏไปฅไปŽ้˜ฟ้‡Œไบ‘ไธ‹่ฝฝๅฎƒ๏ผŒๅนถๅฐ†ๅ…ถ่งฃๅŽ‹ๅˆฐ `$MMACTION2/data/` + +```shell +wget -P data https://download.openmmlab.com/mmaction/dataset/kinetics710/annotations.zip +cd data && unzip annotations.zip && cd .. + +``` + +## ็ฌฌไธ‰ๆญฅ๏ผšๆ–‡ไปถๅคน็ป“ๆž„ + +ๅฎŒๆˆ Kinetics ๅ‡†ๅค‡็š„ๆ•ดไธชๆ•ฐๆฎๆต็จ‹ๅŽใ€‚ +ไฝ ๅฏไปฅๅพ—ๅˆฐ Kinetics-710 ็š„่ง†้ข‘ๅ’Œๆณจ้‡Šๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช้กน็›ฎ็›ฎๅฝ•ไธ‹๏ผˆไป…้’ˆๅฏน Kinetics๏ผ‰๏ผŒ*ๆœ€ๅฐ*็š„ๆ–‡ไปถๅคน็ป“ๆž„ๅฆ‚ไธ‹๏ผš +๏ผˆ*ๆœ€ๅฐ*ๆ„ๅ‘ณ็€ไธ€ไบ›ๆ•ฐๆฎๆ˜ฏไธๅฟ…่ฆ็š„๏ผšไพ‹ๅฆ‚๏ผŒไฝ ๅฏ่ƒฝๆƒณ่ฆไฝฟ็”จๅŽŸๅง‹่ง†้ข‘ๆ ผๅผ่ฏ„ไผฐ kineticsใ€‚๏ผ‰ + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ kinetics400 +โ”‚ โ”‚ โ”œโ”€โ”€ videos_train +โ”‚ โ”‚ โ”œโ”€โ”€ videos_val +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ jf7RDuUTrsQ.mp4 +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”œโ”€โ”€ kinetics600 +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol_00 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ -A5JFdMXB_k_000018_000028.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol63 +โ”‚ โ”œโ”€โ”€ kinetics700 +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol_00 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ -Paa0R0tQ1w_000009_000019.mp4 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... 
+โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ vol63 +โ”‚ โ”œโ”€โ”€ kinetics710 +โ”‚ โ”‚ โ”œโ”€โ”€ k400_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k400_val_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k600_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k600_val_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k700_train_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ k700_val_list_videos.txt +``` + +ๅ…ณไบŽๅœจ Kinetics ไธŠ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ่ฏ„ไผฐ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/kinetics710/label_map_k710.txt b/tools/data/kinetics710/label_map_k710.txt new file mode 100644 index 0000000000000000000000000000000000000000..12834df684e8e8095835d351ab66f3964a5e4e6f --- /dev/null +++ b/tools/data/kinetics710/label_map_k710.txt @@ -0,0 +1,710 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +inflating balloons +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel (not on bike) +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages (not on barbeque) +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +sipping cup +drinking beer +drinking shots +driving car +driving tractor +drop kicking +drumming fingers +dunking basketball +dyeing hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +person collecting garbage +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging (not baby) +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping into pool +jumpstyle dancing 
+kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks +long jump +lunge +making a cake +making a sandwich +making the bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle (not wine) +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking apples +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing mono +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +smelling feet +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow tie +tying knot (not on a tie) +tying necktie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping 
present +wrestling +writing +yawning +yoga +zumba +poaching eggs +playing nose flute +entering church +closing door +helmet diving +doing sudoku +coughing +seasoning food +peeling banana +eating nachos +waxing armpits +shouting +silent disco +polishing furniture +taking photo +dealing cards +putting wallpaper on wall +uncorking champagne +curling eyelashes +brushing floor +pulling espresso shot +playing american football +grooming cat +playing checkers +moving child +stacking cups +squeezing orange +opening coconuts +rolling eyes +picking blueberries +playing road hockey +carving wood with a knife +slicing onion +saluting +letting go of balloon +breaking glass +carrying weight +mixing colours +moving baby +blending fruit +pouring milk +surveying +making slime +sieving +walking with crutches +flipping bottle +playing billiards +arresting +listening with headphones +spinning plates +carving marble +cutting cake +shoot dance +being excited +petting horse +splashing water +filling cake +stacking dice +checking watch +treating wood +laying decking +shooting off fireworks +pouring wine +pretending to be a statue +steering car +playing rounders +looking in mirror +jumping sofa +lighting candle +walking on stilts +crocheting +playing piccolo +vacuuming car +high fiving +playing shuffleboard +chasing +pulling rope (game) +being in zero gravity +sanding wood +decoupage +using megaphone +making latte art +ski ballet +playing oboe +bouncing ball (not juggling) +playing mahjong +herding cattle +swimming with sharks +milking goat +swimming with dolphins +metal detecting +playing slot machine +polishing metal +throwing tantrum +lawn mower racing +laying stone +cutting orange +skipping stone +pouring beer +making bubbles +jaywalking +leatherworking +card stacking +putting on eyeliner +card throwing +chewing gum +falling off bike +repairing puncture +dumpster diving +tiptoeing +sleeping +using circular saw +cracking knuckles +pinching +chiseling wood +playing rubiks cube +weaving fabric +fencing (sport) +sword swallowing +lighting fire +vacuuming floor +combing hair +building lego +playing pinball +fly tying +playing lute +opening door +waving hand +rolling pastry +chiseling stone +threading needle +playing dominoes +opening wine bottle +playing with trains +steer roping +playing field hockey +separating eggs +sewing +talking on cell phone +needle felting +pushing wheelbarrow +using a paint roller +playing netball +lifting hat +massaging neck +blowing bubble gum +walking through snow +docking boat +clam digging +marriage proposal +packing +sausage making +licking +scrapbooking +flint knapping +lock picking +putting on lipstick +sawing wood +playing hand clapping games +geocaching +looking at phone +making cheese +poking bellybutton +contorting +fixing bicycle +using a microscope +using a wrench +doing jigsaw puzzle +making horseshoes +cooking scallops +square dancing +getting a piercing +playing ocarina +making paper aeroplanes +playing scrabble +visiting the zoo +crossing eyes +jumping bicycle +throwing water balloon +bodysurfing +pirouetting +luge +spelunking +watching tv +attending conference +curling (sport) +directing traffic +swimming front crawl +ice swimming +battle rope training +putting on mascara +bouncing on bouncy castle +smoking pipe +pillow fight +putting on sari +calligraphy +roasting pig +cracking back +shopping +burping +using bagging machine +staring +shucking oysters +blowdrying hair +smashing +playing laser tag +wading through mud +rope pushdown +preparing salad +making balloon shapes 
+tagging graffiti +adjusting glasses +using a power drill +trimming shrubs +popping balloons +playing pan pipes +using puppets +arguing +backflip (human) +riding snow blower +hand washing clothes +calculating +gospel singing in church +standing on hands +tasting wine +shaping bread dough +wading through water +falling off chair +throwing snowballs +building sandcastle +land sailing +tying shoe laces +jumping jacks +wood burning (art) +putting on foundation +putting on shoes +cumbia +archaeological excavation +mountain climber (exercise) +assembling bicycle +head stand +cutting apple +shuffling feet +bottling +breathing fire +using inhaler +historical reenactment +hugging baby +mushroom foraging +delivering mail +laying tiles +using atm +chopping meat +tightrope walking +mosh pit dancing +photobombing +coloring in +huddling +playing gong +laying concrete +breaking boards +acting in play +base jumping +tie dying +using a sledge hammer +playing ping pong +photocopying +winking +waking up +swinging baseball bat +twiddling fingers +playing polo +longboarding +ironing hair +bathing dog +moon walking +playing marbles +embroidering +playing beer pong +home roasting coffee +gold panning +karaoke +changing gear in car +raising eyebrows +yarn spinning +scrubbing face +fidgeting +planing wood +cosplaying +capsizing +tackling +shining flashlight +dyeing eyebrows +drooling +alligator wrestling +playing blackjack +carving ice +playing maracas +opening refrigerator +throwing knife +putting in contact lenses +passing soccer ball +casting fishing line +sucking lolly +installing carpet +bulldozing +roasting marshmallows +playing darts +chopping vegetables +bull fighting diff --git a/tools/data/mit/README.md b/tools/data/mit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..61dd24ba5d24dfb63f61028c3274d40f7e5a65ef --- /dev/null +++ b/tools/data/mit/README.md @@ -0,0 +1,128 @@ +# Preparing Moments in Time + +## Introduction + + + +```BibTeX +@article{monfortmoments, + title={Moments in Time Dataset: one million videos for event understanding}, + author={Monfort, Mathew and Andonian, Alex and Zhou, Bolei and Ramakrishnan, Kandan and Bargal, Sarah Adel and Yan, Tom and Brown, Lisa and Fan, Quanfu and Gutfruend, Dan and Vondrick, Carl and others}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2019}, + issn={0162-8828}, + pages={1--8}, + numpages={8}, + doi={10.1109/TPAMI.2019.2901464}, +} +``` + +For basic dataset information, you can refer to the dataset [website](http://moments.csail.mit.edu/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/mit/`. + +## Step 1. Prepare Annotations and Videos + +First of all, you have to visit the official [website](http://moments.csail.mit.edu/), fill in an application form for downloading the dataset. Then you will get the download link. You can use `bash preprocess_data.sh` to prepare annotations and videos. However, the download command is missing in that script. Remember to download the dataset to the proper place follow the comment in this script. + +For better decoding speed, you can resize the original videos into smaller sized, densely encoded version by: + +```shell +python ../resize_videos.py ../../../data/mit/videos/ ../../../data/mit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. 
+ +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, we recommend extracting frames there for better I/O performance. You can run the following script to soft-link the extracted frames. + +```shell +# execute these two lines (assuming the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/mit_extracted/ +ln -s /mnt/SSD/mit_extracted/ ../../../data/mit/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you did not install denseflow, you can still extract RGB frames using OpenCV with the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If both RGB frames and optical flow are required, run the following script to extract them. + +```shell +bash extract_frames.sh +``` + +## Step 3. Generate File List + +You can run the following script to generate file lists in the rawframes and videos formats. + +```shell +bash generate_{rawframes, videos}_filelist.sh +``` + +## Step 4. Check Directory Structure + +After the whole data pipeline for Moments in Time preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Moments in Time. + +In the context of the whole project (for Moments in Time only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ data +โ”‚ย ย  โ””โ”€โ”€ mit +โ”‚ย ย  โ”œโ”€โ”€ annotations +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ license.txt +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ moments_categories.txt +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ README.txt +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ trainingSet.csv +โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ validationSet.csv +โ”‚ย ย  โ”œโ”€โ”€ mit_train_rawframe_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ mit_train_video_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ mit_val_rawframe_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ mit_val_video_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ rawframes +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ training +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ adult+female+singing +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ 0P3XG_vf91c_35 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ img_00001.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ img_00002.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yt-zxQfALnTdfc_56 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yawning +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ _8zmP1e-EjU_2 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ validation +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ””โ”€โ”€ videos +โ”‚ย ย  โ”œโ”€โ”€ training +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ adult+female+singing +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ 0P3XG_vf91c_35.mp4 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yt-zxQfALnTdfc_56.mp4 +โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yawning +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ””โ”€โ”€ validation +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ””โ”€โ”€ mmaction +โ””โ”€โ”€ ... 
+ +``` + +For training and evaluating on Moments in Time, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/mit/README_zh-CN.md b/tools/data/mit/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..4761c49559b3bd4316135930d9324d0d8bcbc126 --- /dev/null +++ b/tools/data/mit/README_zh-CN.md @@ -0,0 +1,130 @@ +# ๅ‡†ๅค‡ Moments in Time + +## ็ฎ€ไป‹ + + + +```BibTeX +@article{monfortmoments, + title={Moments in Time Dataset: one million videos for event understanding}, + author={Monfort, Mathew and Andonian, Alex and Zhou, Bolei and Ramakrishnan, Kandan and Bargal, Sarah Adel and Yan, Tom and Brown, Lisa and Fan, Quanfu and Gutfruend, Dan and Vondrick, Carl and others}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2019}, + issn={0162-8828}, + pages={1--8}, + numpages={8}, + doi={10.1109/TPAMI.2019.2901464}, +} +``` + +็”จๆˆทๅฏไปฅๅ‚็…งๆ•ฐๆฎ้›† [ๅฎ˜็ฝ‘](http://moments.csail.mit.edu/)๏ผŒ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅ‡†ๅค‡ๆ•ฐๆฎ้›†ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/mit/`ใ€‚ + +## ๆญฅ้ชค 1. ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถๅ’Œ่ง†้ข‘ๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆท้œ€่ฆ่ฎฟ้—ฎ[ๅฎ˜็ฝ‘](http://moments.csail.mit.edu/)๏ผŒๅกซๅ†™็”ณ่ฏท่กจๆฅไธ‹่ฝฝๆ•ฐๆฎ้›†ใ€‚ +ๅœจๅพ—ๅˆฐไธ‹่ฝฝ้“พๆŽฅๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จ `bash preprocess_data.sh` ๆฅๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถๅ’Œ่ง†้ข‘ใ€‚ +่ฏทๆณจๆ„ๆญค่„šๆœฌๅนถๆฒกๆœ‰ไธ‹่ฝฝๆ ‡ๆณจๅ’Œ่ง†้ข‘ๆ–‡ไปถ๏ผŒ็”จๆˆท้œ€่ฆๆ นๆฎ่„šๆœฌๆ–‡ไปถไธญ็š„ๆณจ้‡Š๏ผŒๆๅ‰ไธ‹่ฝฝๅฅฝๆ•ฐๆฎ้›†๏ผŒๅนถๆ”พ/่ฝฏ้“พๆŽฅๅˆฐๅˆ้€‚็š„ไฝ็ฝฎใ€‚ + +ไธบๅŠ ๅฟซ่ง†้ข‘่งฃ็ ้€Ÿๅบฆ๏ผŒ็”จๆˆท้œ€่ฆ็ผฉๅฐๅŽŸ่ง†้ข‘็š„ๅฐบๅฏธ๏ผŒๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปค่Žทๅ–ๅฏ†้›†็ผ–็ ็‰ˆ่ง†้ข‘๏ผš + +```shell +python ../resize_videos.py ../../../data/mit/videos/ ../../../data/mit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. ๆŠฝๅ–ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธŠใ€‚ +็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒๆŒ‡ไปค่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"ไธŠ๏ผ‰ +mkdir /mnt/SSD/mit_extracted/ +ln -s /mnt/SSD/mit_extracted/ ../../../data/mit/rawframes +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌ่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +bash extract_frames.sh +``` + +## ๆญฅ้ชค 3. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +bash generate_{rawframes, videos}_filelist.sh +``` + +## ๆญฅ้ชค 4. 
ๆฃ€ๆŸฅ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆˆ Moments in Time ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅๅพ—ๅˆฐ Moments in Time ็š„ RGB ๅธง + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒMoments in Time ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ data +โ”‚ย ย  โ””โ”€โ”€ mit +โ”‚ย ย  โ”œโ”€โ”€ annotations +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ license.txt +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ moments_categories.txt +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ README.txt +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ trainingSet.csv +โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ validationSet.csv +โ”‚ย ย  โ”œโ”€โ”€ mit_train_rawframe_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ mit_train_video_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ mit_val_rawframe_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ mit_val_video_anno.txt +โ”‚ย ย  โ”œโ”€โ”€ rawframes +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ training +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ adult+female+singing +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ 0P3XG_vf91c_35 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ img_00001.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ img_00002.jpg +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yt-zxQfALnTdfc_56 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yawning +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ _8zmP1e-EjU_2 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ validation +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ””โ”€โ”€ videos +โ”‚ย ย  โ”œโ”€โ”€ training +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ adult+female+singing +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ 0P3XG_vf91c_35.mp4 +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yt-zxQfALnTdfc_56.mp4 +โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ yawning +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ”‚ย ย  โ””โ”€โ”€ validation +โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... +โ””โ”€โ”€ mmaction +โ””โ”€โ”€ ... 
+ +``` + +ๅ…ณไบŽๅฏน Moments in Times ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒๅฏไปฅๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/mit/extract_frames.sh b/tools/data/mit/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d5f76347feea8215193f7f197b719161e6133b1 --- /dev/null +++ b/tools/data/mit/extract_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mit/videos/training ../../data/mit/rawframes/training/ --level 2 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for train set" + +python build_rawframes.py ../../data/mit/vides/validation/ ../../data/mit/rawframes/validation/ --level 2 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for val set" + +cd mit/ diff --git a/tools/data/mit/extract_rgb_frames.sh b/tools/data/mit/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..a043d7d081f7774a8b2e97fee1846fc1743e6f02 --- /dev/null +++ b/tools/data/mit/extract_rgb_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mit/videos/training ../../data/mit/rawframes/training/ --level 2 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/mit/videos/validation ../../data/mit/rawframes/validation/ --level 2 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for val set" + +cd mit/ diff --git a/tools/data/mit/extract_rgb_frames_opencv.sh b/tools/data/mit/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..487952945191b9ddbef59570f83659e1ec180951 --- /dev/null +++ b/tools/data/mit/extract_rgb_frames_opencv.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mit/videos/training ../../data/mit/rawframes/training/ --level 2 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/mit/videos/validation ../../data/mit/rawframes/validation/ --level 2 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for val set" + +cd mit/ diff --git a/tools/data/mit/generate_rawframes_filelist.sh b/tools/data/mit/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..9f24a5338fcea348b15ad7fa85900105dffb5262 --- /dev/null +++ b/tools/data/mit/generate_rawframes_filelist.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/rawframes/training/ --level 2 --format rawframes --num-split 1 --subset train --shuffle +echo "Train filelist for rawframes generated." + +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/rawframes/validation/ --level 2 --format rawframes --num-split 1 --subset val --shuffle +echo "Val filelist for rawframes generated." +cd tools/data/mit/ diff --git a/tools/data/mit/generate_videos_filelist.sh b/tools/data/mit/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..705aa144e52a84064e359a5e48b4764a397aef84 --- /dev/null +++ b/tools/data/mit/generate_videos_filelist.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/videos/training/ --level 2 --format videos --num-split 1 --subset train --shuffle +echo "Train filelist for videos generated." + +PYTHONPATH=. 
python tools/data/build_file_list.py mit data/mit/videos/validation/ --level 2 --format videos --num-split 1 --subset val --shuffle +echo "Val filelist for videos generated." +cd tools/data/mit/ diff --git a/tools/data/mit/label_map.txt b/tools/data/mit/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..898789370cdc65f0ee92926ce9df213b5ff60bee --- /dev/null +++ b/tools/data/mit/label_map.txt @@ -0,0 +1,339 @@ +clapping +praying +dropping +burying +covering +flooding +leaping +drinking +slapping +cuddling +sleeping +preaching +raining +stitching +spraying +twisting +coaching +submerging +breaking +tuning +boarding +running +destroying +competing +giggling +shoveling +chasing +flicking +pouring +buttoning +hammering +carrying +surfing +pulling +squatting +aiming +crouching +tapping +skipping +washing +winking +queuing +locking +stopping +sneezing +flipping +sewing +clipping +working +rocking +asking +playing+fun +camping +plugging +pedaling +constructing +slipping +sweeping +screwing +shrugging +hitchhiking +cracking +scratching +trimming +selling +marching +stirring +kissing +jumping +starting +clinging +socializing +picking +splashing +licking +kicking +sliding +filming +driving +handwriting +steering +filling +crashing +stealing +pressing +shouting +hiking +vacuuming +pointing +giving +diving +hugging +building +swerving +dining +floating +cheerleading +leaning +sailing +singing +playing +hitting +bubbling +joining +bathing +raising +sitting +drawing +protesting +rinsing +coughing +smashing +slicing +balancing +rafting +kneeling +dunking +brushing +crushing +rubbing +punting +watering +playing+music +removing +tearing +imitating +teaching +cooking +reaching +studying +serving +bulldozing +shaking +discussing +dragging +gardening +performing +officiating +photographing +sowing +dripping +writing +clawing +bending +boxing +mopping +gripping +flowing +digging +tripping +cheering +buying +bicycling +feeding +emptying +unpacking +sketching +standing +weeding +stacking +drying +crying +spinning +frying +cutting +paying +eating +lecturing +dancing +adult+female+speaking +boiling +peeling +wrapping +wetting +attacking +welding +putting +swinging +carving +walking +dressing +inflating +climbing +shredding +reading +sanding +frowning +closing +hunting +clearing +launching +packaging +fishing +spilling +leaking +knitting +boating +sprinkling +baptizing +playing+sports +rolling +spitting +dipping +riding +chopping +extinguishing +applauding +calling +talking +adult+male+speaking +snowing +shaving +marrying +rising +laughing +crawling +flying +assembling +injecting +landing +operating +packing +descending +falling +entering +pushing +sawing +smelling +overflowing +fighting +waking +barbecuing +skating +painting +drilling +punching +tying +manicuring +plunging +grilling +pitching +towing +telephoning +crafting +knocking +playing+videogames +storming +placing +turning +barking +child+singing +opening +waxing +juggling +mowing +shooting +sniffing +interviewing +stomping +chewing +arresting +grooming +rowing +bowing +gambling +saluting +fueling +autographing +throwing +drenching +waving +signing +repairing +baking +smoking +skiing +drumming +child+speaking +blowing +cleaning +combing +spreading +racing +combusting +adult+female+singing +fencing +swimming +adult+male+singing +snuggling +shopping +bouncing +dusting +stroking +snapping +biting +roaring +guarding +unloading +lifting +instructing +folding +measuring +whistling +exiting +stretching +taping +squinting +catching 
+draining
+massaging
+scrubbing
+handcuffing
+celebrating
+jogging
+colliding
+bowling
+resting
+blocking
+smiling
+tattooing
+erupting
+howling
+parading
+grinning
+sprinting
+hanging
+planting
+speaking
+ascending
+yawning
+cramming
+burning
+wrestling
+poking
+tickling
+exercising
+loading
+piloting
+typing
diff --git a/tools/data/mit/preprocess_data.sh b/tools/data/mit/preprocess_data.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5e5bd8ec1e697d38e1127f7a4c9c3ea0d18eb8b2
--- /dev/null
+++ b/tools/data/mit/preprocess_data.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+DATA_DIR="../../../data/mit/"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} does not exist. Creating";
+  mkdir -p ${DATA_DIR}
+fi
+
+cd ${DATA_DIR}
+
+# Download the Moments_in_Time_Raw.zip here manually
+unzip Moments_in_Time_Raw.zip
+rm Moments_in_Time_Raw.zip
+
+if [ ! -d "./videos" ]; then
+  mkdir ./videos
+fi
+mv ./training ./videos && mv ./validation ./videos
+
+if [ ! -d "./annotations" ]; then
+  mkdir ./annotations
+fi
+
+mv *.txt annotations && mv *.csv annotations
+
+cd "../../tools/data/mit"
diff --git a/tools/data/mmit/README.md b/tools/data/mmit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b48614ea47fb8a4ed524ad92bfe3465019a60dc
--- /dev/null
+++ b/tools/data/mmit/README.md
@@ -0,0 +1,113 @@
+# Preparing Multi-Moments in Time
+
+## Introduction
+
+
+
+```BibTeX
+@misc{monfort2019multimoments,
+  title={Multi-Moments in Time: Learning and Interpreting Models for Multi-Action Video Understanding},
+  author={Mathew Monfort and Kandan Ramakrishnan and Alex Andonian and Barry A McNamara and Alex Lascelles, Bowen Pan, Quanfu Fan, Dan Gutfreund, Rogerio Feris, Aude Oliva},
+  year={2019},
+  eprint={1911.00232},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```
+
+For basic dataset information, you can refer to the dataset [website](http://moments.csail.mit.edu).
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/mmit/`.
+
+## Step 1. Prepare Annotations and Videos
+
+First of all, you have to visit the official [website](http://moments.csail.mit.edu/) and fill in an application form to download the dataset; you will then receive the download link. After that, you can use `bash preprocess_data.sh` to prepare the annotations and videos. Note that the script does not download anything itself, so remember to download the dataset to the proper place first, following the comments in the script.
+
+For better decoding speed, you can resize the original videos into a smaller, densely encoded version by:
+
+```
+python ../resize_videos.py ../../../data/mmit/videos/ ../../../data/mmit/videos_256p_dense_cache --dense --level 2
+```
+
+## Step 2. Extract RGB and Flow
+
+This part is **optional** if you only want to use the video loader.
+
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+
+First, you can run the following script to soft-link the raw-frame directory to the SSD (recommended if you have plenty of SSD space, for better I/O performance).
+
+```shell
+# execute these two lines (assume the SSD is mounted at "/mnt/SSD/")
+mkdir /mnt/SSD/mmit_extracted/
+ln -s /mnt/SSD/mmit_extracted/ ../../../data/mmit/rawframes
+```
+
+If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow.
+ +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 3. Generate File List + +you can run the follow script to generate file list in the format of rawframes or videos. + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## Step 4. Check Directory Structure + +After the whole data process for Multi-Moments in Time preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Multi-Moments in Time. + +In the context of the whole project (for Multi-Moments in Time only), the folder structure will look like: + +``` +mmaction2/ +โ””โ”€โ”€ data + โ””โ”€โ”€ mmit + โ”œโ”€โ”€ annotations + โ”‚ย ย  โ”œโ”€โ”€ moments_categories.txt + โ”‚ย ย  โ”œโ”€โ”€ trainingSet.txt + โ”‚ย ย  โ””โ”€โ”€ validationSet.txt + โ”œโ”€โ”€ mmit_train_rawframes.txt + โ”œโ”€โ”€ mmit_train_videos.txt + โ”œโ”€โ”€ mmit_val_rawframes.txt + โ”œโ”€โ”€ mmit_val_videos.txt + โ”œโ”€โ”€ rawframes + โ”‚ย ย  โ”œโ”€โ”€ 0-3-6-2-9-1-2-6-14603629126_5 + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00001.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00002.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00001.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00002.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ img_00001.jpg + โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ img_00002.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ””โ”€โ”€ yt-zxQfALnTdfc_56 + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ””โ”€โ”€ ... + + โ””โ”€โ”€ videos + โ””โ”€โ”€ adult+female+singing + โ”œโ”€โ”€ 0-3-6-2-9-1-2-6-14603629126_5.mp4 + โ””โ”€โ”€ yt-zxQfALnTdfc_56.mp4 + โ””โ”€โ”€ ... +``` + +For training and evaluating on Multi-Moments in Time, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/mmit/README_zh-CN.md b/tools/data/mmit/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..5b90efdb0b7f20087c2378c9cd5e3a15beb30d7d --- /dev/null +++ b/tools/data/mmit/README_zh-CN.md @@ -0,0 +1,115 @@ +# ๅ‡†ๅค‡ Multi-Moments in Time + +## ็ฎ€ไป‹ + + + +```BibTeX +@misc{monfort2019multimoments, + title={Multi-Moments in Time: Learning and Interpreting Models for Multi-Action Video Understanding}, + author={Mathew Monfort and Kandan Ramakrishnan and Alex Andonian and Barry A McNamara and Alex Lascelles, Bowen Pan, Quanfu Fan, Dan Gutfreund, Rogerio Feris, Aude Oliva}, + year={2019}, + eprint={1911.00232}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +็”จๆˆทๅฏไปฅๅ‚็…งๆ•ฐๆฎ้›† [ๅฎ˜็ฝ‘](http://moments.csail.mit.edu/)๏ผŒ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅ‡†ๅค‡ๆ•ฐๆฎ้›†ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/mmit/`ใ€‚ + +## ๆญฅ้ชค 1. 
Prepare Annotations and Videos + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆท้œ€่ฆ่ฎฟ้—ฎ[ๅฎ˜็ฝ‘](http://moments.csail.mit.edu/)๏ผŒๅกซๅ†™็”ณ่ฏท่กจๆฅไธ‹่ฝฝๆ•ฐๆฎ้›†ใ€‚ +ๅœจๅพ—ๅˆฐไธ‹่ฝฝ้“พๆŽฅๅŽ๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จ `bash preprocess_data.sh` ๆฅๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถๅ’Œ่ง†้ข‘ใ€‚ +่ฏทๆณจๆ„ๆญค่„šๆœฌๅนถๆฒกๆœ‰ไธ‹่ฝฝๆ ‡ๆณจๅ’Œ่ง†้ข‘ๆ–‡ไปถ๏ผŒ็”จๆˆท้œ€่ฆๆ นๆฎ่„šๆœฌๆ–‡ไปถไธญ็š„ๆณจ้‡Š๏ผŒๆๅ‰ไธ‹่ฝฝๅฅฝๆ•ฐๆฎ้›†๏ผŒๅนถๆ”พ/่ฝฏ้“พๆŽฅๅˆฐๅˆ้€‚็š„ไฝ็ฝฎใ€‚ + +ไธบๅŠ ๅฟซ่ง†้ข‘่งฃ็ ้€Ÿๅบฆ๏ผŒ็”จๆˆท้œ€่ฆ็ผฉๅฐๅŽŸ่ง†้ข‘็š„ๅฐบๅฏธ๏ผŒๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปค่Žทๅ–ๅฏ†้›†็ผ–็ ็‰ˆ่ง†้ข‘๏ผš + +``` +python ../resize_videos.py ../../../data/mmit/videos/ ../../../data/mmit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. ๆŠฝๅ–ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธŠใ€‚ +็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒๆŒ‡ไปค่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"ไธŠ๏ผ‰ +mkdir /mnt/SSD/mmit_extracted/ +ln -s /mnt/SSD/mmit_extracted/ ../../../data/mmit/rawframes +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌ่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +bash extract_frames.sh +``` + +## ๆญฅ้ชค 3. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## ๆญฅ้ชค 4. ๆฃ€ๆŸฅ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆˆ Multi-Moments in Time ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅๅพ—ๅˆฐ Multi-Moments in Time ็š„ RGB ๅธง + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒMulti-Moments in Time ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2/ +โ””โ”€โ”€ data + โ””โ”€โ”€ mmit + โ”œโ”€โ”€ annotations + โ”‚ย ย  โ”œโ”€โ”€ moments_categories.txt + โ”‚ย ย  โ”œโ”€โ”€ trainingSet.txt + โ”‚ย ย  โ””โ”€โ”€ validationSet.txt + โ”œโ”€โ”€ mmit_train_rawframes.txt + โ”œโ”€โ”€ mmit_train_videos.txt + โ”œโ”€โ”€ mmit_val_rawframes.txt + โ”œโ”€โ”€ mmit_val_videos.txt + โ”œโ”€โ”€ rawframes + โ”‚ย ย  โ”œโ”€โ”€ 0-3-6-2-9-1-2-6-14603629126_5 + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00001.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_x_00002.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00001.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ flow_y_00002.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ img_00001.jpg + โ”‚ย ย  โ”‚ย ย  โ””โ”€โ”€ img_00002.jpg + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ””โ”€โ”€ yt-zxQfALnTdfc_56 + โ”‚ย ย  โ”‚ย ย  โ”œโ”€โ”€ ... + โ”‚ย ย  โ””โ”€โ”€ ... + + โ””โ”€โ”€ videos + โ””โ”€โ”€ adult+female+singing + โ”œโ”€โ”€ 0-3-6-2-9-1-2-6-14603629126_5.mp4 + โ””โ”€โ”€ yt-zxQfALnTdfc_56.mp4 + โ””โ”€โ”€ ... 
+```
+
+ๅ…ณไบŽๅฏน Multi-Moments in Time ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒๅฏไปฅๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚
diff --git a/tools/data/mmit/extract_frames.sh b/tools/data/mmit/extract_frames.sh
new file mode 100644
index 0000000000000000000000000000000000000000..259c46baecb78a406062fd18781fde0059ecf5ed
--- /dev/null
+++ b/tools/data/mmit/extract_frames.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+cd ../
+python build_rawframes.py ../../data/mmit/videos/ ../../data/mmit/rawframes/ --task both --level 2 --flow-type tvl1 --ext mp4
+echo "Raw frames (RGB and Flow) Generated"
+cd mmit/
diff --git a/tools/data/mmit/extract_rgb_frames.sh b/tools/data/mmit/extract_rgb_frames.sh
new file mode 100644
index 0000000000000000000000000000000000000000..571adb8817004e38e23cac5478d9f7a0b68641e0
--- /dev/null
+++ b/tools/data/mmit/extract_rgb_frames.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+cd ../
+python build_rawframes.py ../../data/mmit/videos/ ../../data/mmit/rawframes/ --task rgb --level 2 --ext mp4
+
+echo "Generate raw frames (RGB only)"
+
+cd mmit/
diff --git a/tools/data/mmit/extract_rgb_frames_opencv.sh b/tools/data/mmit/extract_rgb_frames_opencv.sh
new file mode 100644
index 0000000000000000000000000000000000000000..835292718c65e40f55297452551eb5605cbbbab5
--- /dev/null
+++ b/tools/data/mmit/extract_rgb_frames_opencv.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+cd ../
+python build_rawframes.py ../../data/mmit/videos/ ../../data/mmit/rawframes/ --task rgb --level 2 --ext mp4 --use-opencv
+
+echo "Generate raw frames (RGB only)"
+
+cd mmit/
diff --git a/tools/data/mmit/generate_rawframes_filelist.sh b/tools/data/mmit/generate_rawframes_filelist.sh
new file mode 100644
index 0000000000000000000000000000000000000000..aaed71bb082376c26311c23120827c7fba9b811d
--- /dev/null
+++ b/tools/data/mmit/generate_rawframes_filelist.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+cd ../../../
+PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/rawframes/ --level 2 --format rawframes --num-split 1 --subset train --shuffle
+echo "Train filelist for rawframes generated."
+
+PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/rawframes/ --level 2 --format rawframes --num-split 1 --subset val --shuffle
+echo "Val filelist for rawframes generated."
+cd tools/data/mmit/
diff --git a/tools/data/mmit/generate_videos_filelist.sh b/tools/data/mmit/generate_videos_filelist.sh
new file mode 100644
index 0000000000000000000000000000000000000000..49460f4c5b436c1fb774f6917b747fcb045479d1
--- /dev/null
+++ b/tools/data/mmit/generate_videos_filelist.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+cd ../../../
+PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/videos/ --level 2 --format videos --num-split 1 --subset train --shuffle
+echo "Train filelist for videos generated."
+
+PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/videos/ --level 2 --format videos --num-split 1 --subset val --shuffle
+echo "Val filelist for videos generated."
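+# The two commands above generate the video file lists
+# (mmit_train_videos.txt and mmit_val_videos.txt under data/mmit/,
+# see the directory layout in README.md) before returning to this folder.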
+cd tools/data/mmit/ diff --git a/tools/data/mmit/label_map.txt b/tools/data/mmit/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..756feba2e004da2f343f20d1ba85fb2062dd9000 --- /dev/null +++ b/tools/data/mmit/label_map.txt @@ -0,0 +1,313 @@ +crafting +paddling +raining +weightlifting +clawing +hitchhiking +autographing +cooking +gripping +swerving +frowning +giving +tattooing +dipping +leaking +plunging +barking +stroking/petting +piloting +camping +towing +loading +parading +submerging +squeezing +sculpting +stomping +punting +kissing +smoking +pouring +texting +adult+male+speaking +adult+female+speaking +crying +unpacking +pointing +boating +landing +ironing +crouching +slapping +typing +ice+skating +boiling +chopping +bowling +fighting/attacking +tapping +applauding +driving +sprinting +slicing +approaching +waving +dusting +wrapping +knocking +snapping +gardening +combing +tickling +carving +smashing +smiling/grinning +dressing +pressing +lecturing +telephoning +exercising +riding +draining +flying +wrestling +boxing +rinsing +overflowing +inflating +picking +sowing +shaving +baking +shaking +running +throwing +stacking/piling +buttoning +leaping +fueling +pitching +child+speaking +breaking/destroying +lifting +filming/photographing +singing +reading +chewing +operating +bubbling +waxing +cleaning/washing +scooping +erasing +steering +playing+videogames +crashing +constructing/assembling +flooding +drinking +praying +shouting +winking +dining +repairing +tying +juggling +rolling +studying +marching +socializing +ascending/rising +arresting +cracking +laying +clinging +frying +vacuuming +combusting/burning +filling +standing +howling +dunking +spraying +bandaging +shivering +slipping +racing +roaring +planting +yawning +grilling +squinting +skiing +taping +trimming +preaching +resting +descending/lowering +clearing +screwing +chasing +speaking +manicuring +tripping +performing +teaching/instructing +blowing +painting +sneezing +packaging +punching +clapping +rotating/spinning +skating +cheerleading +balancing +child+singing +covering +snuggling/cuddling/hugging +bulldozing +jumping +sliding +barbecuing +weeding +swimming +shooting +dialing +measuring +pulling +celebrating +playing+fun +knitting +spreading +erupting +snowboarding +swinging +protesting +sitting +inserting +bouncing +surfing +extinguishing +unloading +aiming +bathing +hammering +fishing +opening +biting +packing +saluting +rafting +laughing +bicycling +rocking +storming +wetting +shrugging +handwriting +gambling +writing +skipping +dragging +unplugging +kicking +sawing +grooming +whistling +floating +diving +rubbing +bending +shoveling/digging +peeling +catching +closing +eating/feeding +falling +discussing +sweeping +massaging +locking +dancing +mowing +clipping +hanging +burying +reaching +kayaking +snowing +sleeping +climbing +flipping +tearing/ripping +folding +signing +cutting +stretching +stirring +licking +kneeling +sewing +dripping +queuing +pushing +pedaling +flossing +buying/selling/shopping +smelling/sniffing +emptying +sanding +smacking +carrying +adult+male+singing +poking +brushing +adult+female+singing +scratching +welding +crawling +skateboarding +turning +dropping +hunting +cheering +drawing +sprinkling +spitting +competing +bowing +hiking +drying +launching +twisting +crushing +hitting/colliding +shredding +plugging +gasping +rowing +calling +drumming +walking +removing +waking +stitching +coughing +playing+music +playing+sports +interviewing +scrubbing +splashing 
+officiating +mopping +flowing +sailing +drilling +squatting +handcuffing +spilling +marrying +injecting +jogging diff --git a/tools/data/mmit/preprocess_data.sh b/tools/data/mmit/preprocess_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..4d54052c04babaa2e0fda1ccc6b2ae2c27ff0584 --- /dev/null +++ b/tools/data/mmit/preprocess_data.sh @@ -0,0 +1,20 @@ +DATA_DIR="../../../data/mmit/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +# Download the Multi_Moments_in_Time_Raw.zip here manually +unzip Multi_Moments_in_Time_Raw.zip +rm Multi_Moments_in_Time.zip + +if [ ! -d "./annotations" ]; then + mkdir ./annotations +fi + +mv *.txt annotations && mv *.csv annotations + +cd - diff --git a/tools/data/msrvtt/README.md b/tools/data/msrvtt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1699cb3f3ff19c3e5881c56a1ecdaba103fbb515 --- /dev/null +++ b/tools/data/msrvtt/README.md @@ -0,0 +1,68 @@ +# Preparing MSR-VTT Retrieval/ Video Question-Answering Dataset + +## Introduction + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +Before preparing the dataset, please make sure that the directory is located at `$MMACTION2/tools/data/msrvtt/`. + +## Step 1. Download Annotation Files + +You can directly download the following annotation files related to MSR-VTT from the [Google Drive link](https://drive.google.com/drive/folders/12cr94wT8j7pR09AR2nmQg6o26Y1arI50) provided by [VindLU](https://github.com/klauscc) and place them in the `$MMACTION2/tools/data/msrvtt/annotations` directory: + +- [msrvtt_qa_train.json](https://drive.google.com/file/d/12dJq5_7v8FytrJwrPB_f22tET1MmGCNh/view?usp=drive_link) +- [msrvtt_qa_val.json](https://drive.google.com/file/d/138q-A-V8fCC2nBYJgqkQa3gBfXVNbNNd/view?usp=drive_link) +- [msrvtt_qa_test.json](https://drive.google.com/file/d/13IiEcUMHiNppWhGwVY1eAaip6iSJM35A/view?usp=drive_link) +- [msrvtt_qa_answer_list.json](https://drive.google.com/file/d/131euz_dssRkDTk3-ioAS5ZsvIxS_Tt4M/view?usp=drive_link) +- [msrvtt_mc_test.json](https://drive.google.com/file/d/13FrUQ2ZDsNDraP7lfnKvTArPIgdtHuLC/view?usp=drive_link) +- [msrvtt_ret_train9k.json](https://drive.google.com/file/d/13OVo0XRdVWTHlFFxbKg3daYCHsMbJxyd/view?usp=drive_link) +- [msrvtt_ret_train7k.json](https://drive.google.com/file/d/13ID97BX4ExO6mWPIUMp-GzXcPBkviSLx/view?usp=drive_link) +- [msrvtt_ret_test1k.json](https://drive.google.com/file/d/13FLrjI-aleKeU7LbJMDrYgktX7MbTbzu/view?usp=drive_link) +- [msrvtt_test1k.json](https://drive.google.com/file/d/12z6y-DNwIfICSzOhekbJwSbf7z2hlibE/view?usp=drive_link) + +## Step 2. Prepare Video Data + +You can refer to the [official website](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) of this dataset for basic information. 
Run the following commands to prepare the MSRVTT video files: + +```shell +# Download original videos +bash download_msrvtt.sh +# Preprocess videos to lower FPS and dimensions +bash compress_msrvtt.sh +``` + +After completing the above preparation steps, the directory structure will be as follows: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ””โ”€โ”€ msrvtt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_train.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_val.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_test.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_answer_list.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_mc_test.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_ret_train9k.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_ret_train7k.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_ret_test1k.json +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ msrvtt_test1k.json +โ”‚ โ”‚ โ””โ”€โ”€ videos_2fps_224 +โ”‚ โ”‚ โ”œโ”€โ”€ video0.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ video1.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ””โ”€โ”€ video9999.mp4 +``` diff --git a/tools/data/msrvtt/README_zh-CN.md b/tools/data/msrvtt/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..16f8e9ba05d770b66ab6b81438981817d8a850ef --- /dev/null +++ b/tools/data/msrvtt/README_zh-CN.md @@ -0,0 +1,68 @@ +# ๅ‡†ๅค‡ MSR-VTT ๆฃ€็ดข/่ง†้ข‘้—ฎ็ญ”ๆ•ฐๆฎ้›† + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/msrvtt/`ใ€‚ + +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +็”จๆˆทๅฏไปŽ [VindLU](https://github.com/klauscc/VindLU) ๆไพ›็š„ [Google Drive ้“พๆŽฅ](https://drive.google.com/drive/folders/12cr94wT8j7pR09AR2nmQg6o26Y1arI50)ไธญ็›ดๆŽฅไธ‹่ฝฝไปฅไธ‹ไธŽ MSR-VTT ็›ธๅ…ณ็š„ๆ ‡ๆณจๆ–‡ไปถ, ๅนถๆ”พ็ฝฎๅˆฐ `$MMACTION2/tools/data/msrvtt/annotations` ่ทฏๅพ„ไธ‹: + +- [msrvtt_qa_train.json](https://drive.google.com/file/d/12dJq5_7v8FytrJwrPB_f22tET1MmGCNh/view?usp=drive_link) +- [msrvtt_qa_val.json](https://drive.google.com/file/d/138q-A-V8fCC2nBYJgqkQa3gBfXVNbNNd/view?usp=drive_link) +- [msrvtt_qa_test.json](https://drive.google.com/file/d/13IiEcUMHiNppWhGwVY1eAaip6iSJM35A/view?usp=drive_link) +- [msrvtt_qa_answer_list.json](https://drive.google.com/file/d/131euz_dssRkDTk3-ioAS5ZsvIxS_Tt4M/view?usp=drive_link) +- [msrvtt_mc_test.json](https://drive.google.com/file/d/13FrUQ2ZDsNDraP7lfnKvTArPIgdtHuLC/view?usp=drive_link) +- [msrvtt_ret_train9k.json](https://drive.google.com/file/d/13OVo0XRdVWTHlFFxbKg3daYCHsMbJxyd/view?usp=drive_link) +- [msrvtt_ret_train7k.json](https://drive.google.com/file/d/13ID97BX4ExO6mWPIUMp-GzXcPBkviSLx/view?usp=drive_link) +- [msrvtt_ret_test1k.json](https://drive.google.com/file/d/13FLrjI-aleKeU7LbJMDrYgktX7MbTbzu/view?usp=drive_link) +- [msrvtt_test1k.json](https://drive.google.com/file/d/12z6y-DNwIfICSzOhekbJwSbf7z2hlibE/view?usp=drive_link) + +## ๆญฅ้ชค 2. 
ๅ‡†ๅค‡่ง†้ข‘ๆ•ฐๆฎ + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„[ๅฎ˜็ฝ‘](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚่ฟ่กŒไธ‹้ข็š„ๅ‘ฝไปคๅ‡†ๅค‡ MSRVTT ่ง†้ข‘ๆ–‡ไปถ: + +```shell +# download original videos +bash download_msrvtt.sh +# preprocess videos to lower FPS and dimension +bash compress_msrvtt.sh +``` + +ๅฎŒๆˆไธŠ่ฟฐๅ‡†ๅค‡ๆญฅ้ชคๅŽ๏ผŒๆ–‡ไปถ็›ฎๅฝ•ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ””โ”€โ”€ msrvtt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_train.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_val.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_test.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_qa_answer_list.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_mc_test.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_ret_train9k.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_ret_train7k.json +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ msrvtt_ret_test1k.json +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ msrvtt_test1k.json +โ”‚ โ”‚ โ””โ”€โ”€ videos_2fps_224 +โ”‚ โ”‚ โ”œโ”€โ”€ video0.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ video1.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ””โ”€โ”€ video9999.mp4 +``` diff --git a/tools/data/msrvtt/compress.py b/tools/data/msrvtt/compress.py new file mode 100644 index 0000000000000000000000000000000000000000..5daca13f95967832d888e9086dd9aa4f438f38dd --- /dev/null +++ b/tools/data/msrvtt/compress.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Used to compress videos (FPS and dimensions) in the Singularity project. + +copied from https://github.com/klauscc/VindLU +""" +import argparse +import os +import shutil +import subprocess +from multiprocessing import Pool +from os.path import exists, join +from pathlib import Path + +try: + from psutil import cpu_count +except ImportError: + from multiprocessing import cpu_count + +from functools import partial + +from PIL import Image +from tqdm import tqdm + + +def resize_image(input_path, output_path, size=224): + with Image.open(input_path) as img: + w, h = img.width, img.height + r = 1. * w / h + if w > h: + h = size + w = r * size + else: + h = size / r + w = size + + img_resized = img.resize((int(w), int(h))) + img_resized.save(output_path) + + +def _compress_images(input_output_pair, size=224): + """Scale and downsample an input image to a given fps and size (shorter + side size). + + This also removes the audio from the image. + """ + input_image_path, output_image_path = input_output_pair + try: + resize_image(input_image_path, output_image_path, size) + except Exception as e: + print(f'Caught Exception {e}') + + +def _compress_videos(input_output_pair, size=224, fps=3): + """Scale and downsample an input video to a given fps and size (shorter + side size). + + This also removes the audio from the video. 
+ """ + input_file_path, output_file_path = input_output_pair + try: + command = [ + 'ffmpeg', + '-y', # (optional) overwrite output file if it exists + '-i', + input_file_path, + '-filter:v', # no audio + f"scale='if(gt(a,1),trunc(oh*a/2)*2,{size})':'if(gt(a,1),{size},trunc(ow*a/2)*2)'", # noqa: E501 + '-map', + '0:v', # no audio + '-r', + str(fps), # frames per second + # '-g', str(16), + output_file_path, + ] + subprocess.run( + command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + except Exception as e: + raise e + + +def _compress(input_output_pair, fps=3, size=224, file_type='image'): + if file_type == 'image': + _compress_images(input_output_pair, size) + elif file_type == 'video': + _compress_videos(input_output_pair, size, fps) + + +def prepare_input_output_pairs(input_root, + output_root, + input_file_list_path=None): + # filename list in `input_file_list_path` can be created very fast using `ls -U . >> ../video_filenames.txt` # noqa: E501 + if input_file_list_path: + with open(input_file_list_path, 'r') as f: + filenames = [s.strip() for s in f.readlines()] + else: + filenames = [ + video_path.name for video_path in Path(input_root).glob('*.mp4') + ] + print(f'There are {len(filenames)} video/images files loaded from list.') + input_file_path_list = [] + output_file_path_list = [] + for e in tqdm(filenames, desc='find un-processed videos/images'): + input_file_path = join(input_root, e) + output_file_path = join(output_root, e) + if not exists(output_file_path): + input_file_path_list.append(input_file_path) + output_file_path_list.append(output_file_path) + return input_file_path_list, output_file_path_list + + +def run_compress(): + parser = argparse.ArgumentParser( + description='Compress videos/images for speed-up') + parser.add_argument( + '--input_root', type=str, help='input root', required=True) + parser.add_argument( + '--input_file_list_path', + type=str, + default=None, + help='list of video filenames under args.input_root, it can be ' + 'created efficiently with `ls -U /path/to/video >> /path/to/video_filenames.txt`' # noqa: E501 + ) + parser.add_argument( + '--output_root', type=str, help='output root', required=True) + parser.add_argument( + '--size', + type=int, + default=224, + help='shorter side size, aspect ratio is kept') + parser.add_argument('--num_workers', type=int, default=24, help='#workers') + parser.add_argument( + '--fps', + type=int, + default=3, + help='fps for output video, ignored if file_type == image') + parser.add_argument( + '--file_type', + type=str, + choices=['image', 'video'], + help='input file type') + args = parser.parse_args() + + # set paths + input_root = args.input_root + output_root = args.output_root + assert input_root != output_root + if not exists(output_root): + os.makedirs(output_root, exist_ok=True) + + # prepare and find un-processed + input_file_path_list, output_file_path_list = prepare_input_output_pairs( + input_root, + output_root, + input_file_list_path=args.input_file_list_path, + ) + print(f'input_file_path_list[:3] {input_file_path_list[:3]}') + print(f'output_file_path_list[:3] {output_file_path_list[:3]}') + print('Total videos/images need to process: {}'.format( + len(input_file_path_list))) + + # start parallel jobs + num_cores = cpu_count() + num_workers = args.num_workers + print( + f'Begin with {num_cores}-core logical processor, {num_workers} workers' + ) + compress = partial( + _compress, fps=args.fps, size=args.size, file_type=args.file_type) + input_pairs = list(zip(input_file_path_list, 
output_file_path_list)) + with Pool(num_workers) as pool, tqdm( + total=len(input_file_path_list), + desc='re-encoding videos/images') as pbar: + for idx, _ in enumerate( + pool.imap_unordered(compress, input_pairs, chunksize=32)): + pbar.update(1) + + # copy-paste failed files + print('Compress finished, copy-paste failed files...') + copy_count = 0 + for input_file_path, output_file_path in zip(input_file_path_list, + output_file_path_list): + if exists(input_file_path): + if exists(output_file_path) is False or os.path.getsize( + output_file_path) < 1.: + copy_count += 1 + shutil.copyfile(input_file_path, output_file_path) + print('Copy and replace file: {}'.format(output_file_path)) + print(f'copy_count {copy_count}') + + +if __name__ == '__main__': + run_compress() diff --git a/tools/data/msrvtt/compress_msrvtt.sh b/tools/data/msrvtt/compress_msrvtt.sh new file mode 100644 index 0000000000000000000000000000000000000000..b62744fbe977960a9d90e254460c441508a28561 --- /dev/null +++ b/tools/data/msrvtt/compress_msrvtt.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +FPS=2 +SIZE=224 +DATA_DIR="../../../data/msrvtt/videos" +OUT_DIR="../../../data/msrvtt/videos_2fps_224" + +python compress.py \ + --input_root=${DATA_DIR} --output_root=${OUT_DIR} \ + --fps=${FPS} --size=${SIZE} --file_type=video --num_workers 24 diff --git a/tools/data/msrvtt/download_msrvtt.sh b/tools/data/msrvtt/download_msrvtt.sh new file mode 100644 index 0000000000000000000000000000000000000000..9a7d1890855560b573b8b75544fe30541f7230da --- /dev/null +++ b/tools/data/msrvtt/download_msrvtt.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/msrvtt" +mkdir -p ${DATA_DIR} + +if [ -f "MSRVTT.zip" ]; then + echo "MSRVTT.zip exists, skip downloading!" +else + echo "Downloading MSRVTT.zip." + wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip +fi + +echo "Processing videos started." +unzip -q MSRVTT.zip -d ${DATA_DIR} +mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/MSRVTT/videos/all" -name "video*.mp4" -exec mv {} "${DATA_DIR}/videos/" \; +echo "Processing videos completed." + +rm -rf "${DATA_DIR}/MSRVTT" +rm -rf "${DATA_DIR}/msrvtt_data" +rm msrvtt_data.zip +rm MSRVTT.zip +echo "The preparation of the msrvtt dataset has been successfully completed." diff --git a/tools/data/multisports/README.md b/tools/data/multisports/README.md new file mode 100644 index 0000000000000000000000000000000000000000..54f903e18ec3f574c366f9ef332a4bb6ef1bd5e7 --- /dev/null +++ b/tools/data/multisports/README.md @@ -0,0 +1,111 @@ +# Preparing Multisports + +## Introduction + + + +```BibTeX +@inproceedings{li2021multisports, + title={Multisports: A multi-person video dataset of spatio-temporally localized sports actions}, + author={Li, Yixuan and Chen, Lei and He, Runyu and Wang, Zhenzhi and Wu, Gangshan and Wang, Limin}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={13536--13545}, + year={2021} +} +``` + +For basic dataset information, please refer to the official [project](https://deeperaction.github.io/datasets/multisports.html) and the [paper](https://arxiv.org/abs/2105.07404). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/multisports/`. + +## Step 1. Prepare Annotations + +First of all, you have to download annotations and videos to `$MMACTION2/data/multisports` on the official [website](https://github.com/MCG-NJU/MultiSports), please also download the Person Boxes and put it to `$MMACTION2/data/multisports`. 
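+
+If you want to sanity-check the download before moving on, the annotation pickle can be inspected directly. The snippet below is a minimal sketch, run from the `$MMACTION2` root: the file location follows the layout shown in Step 2, and the key names (`train_videos`, `test_videos`, `gttubes`, `resolution`) are taken from `parse_anno.py` in this directory, so treat them as assumptions rather than a documented schema.
+
+```python
+# Minimal sanity check of the downloaded MultiSports annotation pickle.
+# Key layout assumed from parse_anno.py / format_det_result.py.
+from mmengine import load
+
+annos = load('data/multisports/trainval/multisports_GT.pkl')
+print('train videos:', len(annos['train_videos'][0]))
+print('val videos:', len(annos['test_videos'][0]))
+print('videos with GT tubes:', len(annos['gttubes']))
+# 'resolution' maps each video key to its (height, width)
+video_key = next(iter(annos['resolution']))
+print(video_key, '->', annos['resolution'][video_key])
+```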
+ +## Step 2. Prepare Videos + +Before this step, please make sure the folder structure looks like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ multisports +โ”‚ | โ”œโ”€โ”€ MultiSports_box.zip +โ”‚ | โ”œโ”€โ”€ trainval +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics.zip +โ”‚ | | โ”œโ”€โ”€ basketball.zip +โ”‚ | | โ”œโ”€โ”€ multisports_GT.pkl +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ test +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics.zip +โ”‚ | | โ”œโ”€โ”€ basketball.zip +โ”‚ | | โ”œโ”€โ”€... +``` + +Then, you can use the following command to uncompress. + +```shell +cd $MMACTION2/data/multisports/ +unzip MultiSports_box.zip +cd $MMACTION2/data/multisports/trainval +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/data/multisports/test +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/tools/data/multisports/ +``` + +## Step 3. Convert Annotations + +you can run the following script to convert annotations and proposals as we need. + +```shell +cd $MMACTION2/tools/data/multisports/ +python parse_anno.py +``` + +## Step 5. Check Directory Structure + +After the whole data process, you will get the videos and annotation files for MultiSports. + +In the context of the whole project (for MultiSports only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ multisports +โ”‚ | โ”œโ”€โ”€ annotations +| โ”‚ | โ”œโ”€โ”€ multisports_dense_proposals_test.recall_96.13.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_dense_proposals_train.recall_96.13.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_dense_proposals_val.recall_96.13.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_GT.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_train.csv +| โ”‚ | โ”œโ”€โ”€ multisports_val.csv +โ”‚ | โ”œโ”€โ”€ trainval +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics +| โ”‚ | | โ”œโ”€โ”€ v__wAgwttPYaQ_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v__wAgwttPYaQ_c002.mp4 +| โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ basketball +| โ”‚ | | โ”œโ”€โ”€ v_-6Os86HzwCs_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v_-6Os86HzwCs_c002.mp4 +| โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ multisports_GT.pkl +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ test +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics +| โ”‚ | | โ”œโ”€โ”€ v_2KroSzspz-c_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v_2KroSzspz-c_c002.mp4 +| โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ basketball +| โ”‚ | | โ”œโ”€โ”€ v_1tefH1iPbGM_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v_1tefH1iPbGM_c002.mp4 +โ”‚ | | โ”œโ”€โ”€... +``` + +We don't need the zip files under the project, you can handle them as you want. +For training and evaluating on MultiSports, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
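+
+If you want to double-check the conversion done in Step 3, every row of `multisports_train.csv` / `multisports_val.csv` describes one person box in one frame. The sketch below reads a single row back from the `$MMACTION2` root; the column order (filename, frame index, normalized box, label index, entity id) is inferred from `parse_anno.py` in this directory and is an assumption, not a documented format.
+
+```python
+# Read back one row written by parse_anno.py.
+# Columns: filename, frame index, x1, y1, x2, y2 (normalized to [0, 1]),
+# label index, entity (person) id -- inferred from the script, not an API.
+import csv
+
+with open('data/multisports/annotations/multisports_train.csv') as f:
+    filename, frame, x1, y1, x2, y2, label, entity_id = next(csv.reader(f))
+
+print(filename, int(frame), [float(v) for v in (x1, y1, x2, y2)],
+      int(label), int(entity_id))
+```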
diff --git a/tools/data/multisports/README_zh-CN.md b/tools/data/multisports/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..39a35e9b4d918491af9aae89ddc3d0774c43c8a2 --- /dev/null +++ b/tools/data/multisports/README_zh-CN.md @@ -0,0 +1,111 @@ +# ๅ‡†ๅค‡ MultiSports + +## ไป‹็ป + + + +```BibTeX +@inproceedings{li2021multisports, + title={Multisports: A multi-person video dataset of spatio-temporally localized sports actions}, + author={Li, Yixuan and Chen, Lei and He, Runyu and Wang, Zhenzhi and Wu, Gangshan and Wang, Limin}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={13536--13545}, + year={2021} +} +``` + +ๅ…ณไบŽๅŸบๆœฌๆ•ฐๆฎ้›†ไฟกๆฏ๏ผŒ่ฏทๅ‚่€ƒๅฎ˜ๆ–น [้กน็›ฎ](https://deeperaction.github.io/datasets/multisports.html) ๅ’Œ [่ฎบๆ–‡](https://arxiv.org/abs/2105.07404)ใ€‚ +ๅœจๆˆ‘ไปฌๅผ€ๅง‹ไน‹ๅ‰๏ผŒ่ฏท็กฎไฟ็›ฎๅฝ•ไฝไบŽ `$MMACTION2/tools/data/multisports/`ใ€‚ + +## ็ฌฌไธ€ๆญฅ๏ผšๅ‡†ๅค‡ๆ ‡ๆณจ + +้ฆ–ๅ…ˆ๏ผŒไฝ ๅฟ…้กปไปŽๅฎ˜ๆ–น [็ฝ‘็ซ™](https://github.com/MCG-NJU/MultiSports) ไธ‹่ฝฝๆ ‡ๆณจๅ’Œ่ง†้ข‘ๅˆฐ `$MMACTION2/data/multisports`๏ผŒ่ฏทๅŒๆ—ถไธ‹่ฝฝไบบ็‰ฉๆฃ€ๆต‹ๆก†ๅนถๅฐ†ๅ…ถๆ”พๅˆฐ `$MMACTION2/data/multisports`ใ€‚ + +## ็ฌฌไบŒๆญฅ๏ผšๅ‡†ๅค‡่ง†้ข‘ + +ๅœจ่ฟ™ไธ€ๆญฅไน‹ๅ‰๏ผŒ่ฏท็กฎไฟๆ–‡ไปถๅคน็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ multisports +โ”‚ | โ”œโ”€โ”€ MultiSports_box.zip +โ”‚ | โ”œโ”€โ”€ trainval +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics.zip +โ”‚ | | โ”œโ”€โ”€ basketball.zip +โ”‚ | | โ”œโ”€โ”€ multisports_GT.pkl +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ test +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics.zip +โ”‚ | | โ”œโ”€โ”€ basketball.zip +โ”‚ | | โ”œโ”€โ”€... +``` + +็„ถๅŽ๏ผŒไฝ ๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปค่ฟ›่กŒ่งฃๅŽ‹ใ€‚ + +```shell +cd $MMACTION2/data/multisports/ +unzip MultiSports_box.zip +cd $MMACTION2/data/multisports/trainval +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/data/multisports/test +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/tools/data/multisports/ +``` + +## ็ฌฌไธ‰ๆญฅ๏ผš่ฝฌๆขๆ ‡ๆณจๆ–‡ไปถ + +ไฝ ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌๆฅ่ฝฌๆขๆˆ‘ไปฌ้œ€่ฆ็š„ๆ ‡ๆณจๆ–‡ไปถๅ’Œๅ€™้€‰ๆก†ใ€‚ + +```shell +cd $MMACTION2/tools/data/multisports/ +python parse_anno.py +``` + +## ็ฌฌไบ”ๆญฅ๏ผšๆฃ€ๆŸฅ็›ฎๅฝ•็ป“ๆž„ + +ๅฎŒๆˆๆ•ดไธชๆ•ฐๆฎๅค„็†ๅŽ๏ผŒไฝ ๅฐ†ๅพ—ๅˆฐ MultiSports ๆ•ฐๆฎ้›†็š„่ง†้ข‘ๅ’Œๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช้กน็›ฎ็š„็›ฎๅฝ•ไธญ๏ผˆไป…้’ˆๅฏน MultiSports๏ผ‰๏ผŒๆ–‡ไปถๅคน็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ multisports +โ”‚ | โ”œโ”€โ”€ annotations +| โ”‚ | โ”œโ”€โ”€ multisports_dense_proposals_test.recall_96.13.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_dense_proposals_train.recall_96.13.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_dense_proposals_val.recall_96.13.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_GT.pkl +| โ”‚ | โ”œโ”€โ”€ multisports_train.csv +| โ”‚ | โ”œโ”€โ”€ multisports_val.csv +โ”‚ | โ”œโ”€โ”€ trainval +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics +| โ”‚ | | โ”œโ”€โ”€ v__wAgwttPYaQ_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v__wAgwttPYaQ_c002.mp4 +| โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ basketball +| โ”‚ | | โ”œโ”€โ”€ v_-6Os86HzwCs_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v_-6Os86HzwCs_c002.mp4 +| โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ multisports_GT.pkl +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ test +โ”‚ | | โ”œโ”€โ”€ aerobic_gymnastics +| โ”‚ | | โ”œโ”€โ”€ v_2KroSzspz-c_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v_2KroSzspz-c_c002.mp4 +| โ”‚ | | โ”œโ”€โ”€ ... 
+โ”‚ | | โ”œโ”€โ”€ basketball +| โ”‚ | | โ”œโ”€โ”€ v_1tefH1iPbGM_c001.mp4 +| โ”‚ | | โ”œโ”€โ”€ v_1tefH1iPbGM_c002.mp4 +โ”‚ | | โ”œโ”€โ”€... +``` + +ๆˆ‘ไปฌไธ้œ€่ฆ้กน็›ฎไธ‹็š„ zip ๆ–‡ไปถ๏ผŒไฝ ๅฏไปฅๆŒ‰็…ง่‡ชๅทฑ็š„ๆ„ๆ„ฟๅค„็†ๅฎƒไปฌใ€‚ +ๅ…ณไบŽๅœจ MultiSports ไธŠ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ่ฏ„ไผฐ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/multisports/format_det_result.py b/tools/data/multisports/format_det_result.py new file mode 100644 index 0000000000000000000000000000000000000000..693036f0d6b2777aa59b0f9bc5f9959f7b4977e9 --- /dev/null +++ b/tools/data/multisports/format_det_result.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser + +import numpy as np +from mmengine import dump, load +from rich.progress import track + +from mmaction.evaluation import link_tubes + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('test-result', help='path of dumped reuslts') + parser.add_argument( + '--anno-path', + default='data/multisports/videos/trainval/multisports_GT.pkl') + parser.add_argument( + '--frm_out_path', + default=None, + help='frame-level detection results output path') + parser.add_argument( + '--tube_out_path', + default=None, + help='tube-level detection results output path') + args = parser.parse_args() + if not args.frm_out_path: + args.frm_out_path = args.test_result[:-4] + '-formated.pkl' + if not args.tube_out_path: + args.tube_out_path = args.test_result[:-4] + '_vid_dets.pkl' + return args + + +def format_det_result(): + """convert test results to specified format in MultiSports competition.""" + test_results = load(args.test_result) + annos = load(args.anno_path) + test_videos = annos['test_videos'][0] + resolutions = annos['resolution'] + frm_dets = [] + for pred in track(test_results, description='formating...'): + video_key = pred['video_id'].split('.mp4')[0] + frm_num = pred['timestamp'] + bboxes = pred['pred_instances']['bboxes'] + cls_scores = pred['pred_instances']['scores'] + for bbox, cls_score in zip(bboxes, cls_scores): + video_idx = test_videos.index(video_key) + pred_label = np.argmax(cls_score) + score = cls_score[pred_label] + h, w = resolutions[video_key] + bbox *= np.array([w, h, w, h]) + instance_result = np.array( + [video_idx, frm_num, pred_label, score, *bbox]) + frm_dets.append(instance_result) + frm_dets = np.array(frm_dets) + video_tubes = link_tubes(annos, frm_dets, K=1) + dump(frm_dets, args.frm_out_path) + dump(video_tubes, args.tube_out_path) + + +if __name__ == '__main__': + args = parse_args() + format_det_result() diff --git a/tools/data/multisports/label_map.txt b/tools/data/multisports/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a4428a54a612eb653f9aae9005d72ccad7530b4 --- /dev/null +++ b/tools/data/multisports/label_map.txt @@ -0,0 +1,66 @@ +0: aerobic_push_up +1: aerobic_explosive_push_up +2: aerobic_explosive_support +3: aerobic_leg_circle +4: aerobic_helicopter +5: aerobic_support +6: aerobic_v_support +7: aerobic_horizontal_support +8: aerobic_straight_jump +9: aerobic_illusion +10: aerobic_bent_leg(s)_jump +11: aerobic_pike_jump +12: aerobic_straddle_jump +13: aerobic_split_jump +14: aerobic_scissors_leap +15: aerobic_kick_jump +16: aerobic_off_axis_jump +17: aerobic_butterfly_jump +18: aerobic_split +19: aerobic_turn +20: aerobic_balance_turn +21: volleyball_serve +22: volleyball_block +23: volleyball_first_pass +24: volleyball_defend +25: volleyball_protect +26: 
volleyball_second_pass +27: volleyball_adjust +28: volleyball_save +29: volleyball_second_attack +30: volleyball_spike +31: volleyball_dink +32: volleyball_no_offensive_attack +33: football_shoot +34: football_long_pass +35: football_short_pass +36: football_through_pass +37: football_cross +38: football_dribble +39: football_trap +40: football_throw +41: football_diving +42: football_tackle +43: football_steal +44: football_clearance +45: football_block +46: football_press +47: football_aerial_duels +48: basketball_pass +49: basketball_drive +50: basketball_dribble +51: basketball_3-point_shot +52: basketball_2-point_shot +53: basketball_free_throw +54: basketball_block +55: basketball_offensive_rebound +56: basketball_defensive_rebound +57: basketball_pass_steal +58: basketball_dribble_steal +59: basketball_interfere_shot +60: basketball_pick-and-roll_defensive +61: basketball_sag +62: basketball_screen +63: basketball_pass-inbound +64: basketball_save +65: basketball_jump_ball diff --git a/tools/data/multisports/parse_anno.py b/tools/data/multisports/parse_anno.py new file mode 100644 index 0000000000000000000000000000000000000000..4987bc385996d835b9bb5e385fe5f494c9933bbe --- /dev/null +++ b/tools/data/multisports/parse_anno.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import csv +import os +import os.path as osp +from argparse import ArgumentParser + +import numpy as np +from mmengine import dump, list_dir_or_file, load + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + '--data-root', + default='data/multisports', + help='the directory to multisports annotations') + parser.add_argument( + '--out-root', + default='data/multisports', + help='output directory of output annotation files') + parser.add_argument('--dump-proposals', action='store_true') + args = parser.parse_args() + return args + + +def parse_anno(args): + if not osp.exists(args.out_root): + os.makedirs(osp.join(args.out_root, 'annotations')) + + anno_path = osp.join(args.data_root, 'annotations/multisports_GT.pkl') + annos = load(anno_path) + + # convert key in proposal file to filename + key2filename = { + video.split('/')[1]: video + '.mp4' + for video in annos['nframes'].keys() + } + test_videos = [ + file for file in list_dir_or_file( + osp.join(args.data_root, 'test'), recursive=True) + if file.endswith('.mp4') + ] + key2filename.update( + {video.split('/')[1][:-4]: video + for video in test_videos}) + # convert proposal bboxes + if args.dump_proposals: + proposals_path = osp.join(args.data_root, + 'annotations/MultiSports_box') + for proposals in os.listdir(proposals_path): + proposal_info = load(osp.join(proposals_path, proposals)) + proposal_out = dict() + for key in proposal_info.keys(): + key_split = key.split(',') + if key_split[0] in key2filename.keys(): + new_key = \ + f'{key2filename[key_split[0]]},{int(key_split[1]):04d}' + proposal_out[new_key] = proposal_info[key] + target_path = osp.join(args.out_root, 'annotations', + 'multisports_dense_proposals_' + proposals) + dump(proposal_out, target_path) + # dump train and val list + for split in ['train', 'val']: + out_anno_path = osp.join(args.out_root, 'annotations', + f'multisports_{split}.csv') + with open(out_anno_path, 'w') as csv_f: + writer = csv.writer(csv_f) + if split == 'train': + video_list = annos['train_videos'][0] + elif split == 'val': + video_list = annos['test_videos'][0] + gt_tubes = annos['gttubes'] + resolutions = annos['resolution'] + for video_id in video_list: + vid_tubes = 
gt_tubes[video_id] + h, w = resolutions[video_id] + for label, tubes in vid_tubes.items(): + entity_id = 0 + for tube in tubes: + for frame_anno in tube: + frame_stamp = int(frame_anno[0]) + entity_box = frame_anno[1:] + entity_box /= np.array([w, h, w, h]) + entity_box = [f'{num:.3f}' for num in entity_box] + filename = video_id + '.mp4' + anno_line = [ + filename, frame_stamp, *entity_box, label, + entity_id + ] + writer.writerow(anno_line) + entity_id += 1 + + +if __name__ == '__main__': + args = parse_args() + parse_anno(args) diff --git a/tools/data/omnisource/README.md b/tools/data/omnisource/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e0ea002937683e520b5c6a69ba2d7cc5901c52d --- /dev/null +++ b/tools/data/omnisource/README.md @@ -0,0 +1,150 @@ +# Preparing OmniSource + +## Introduction + + + +```BibTeX +@article{duan2020omni, + title={Omni-sourced Webly-supervised Learning for Video Recognition}, + author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, + journal={arXiv preprint arXiv:2003.13042}, + year={2020} +} +``` + +We release a subset of the OmniSource web dataset used in the paper [Omni-sourced Webly-supervised Learning for Video Recognition](https://arxiv.org/abs/2003.13042). Since all web dataset in OmniSource are built based on the Kinetics-400 taxonomy, we select those web data related to the 200 classes in Mini-Kinetics subset (which is proposed in [Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification](https://arxiv.org/pdf/1712.04851.pdf)). + +We provide data from all sources that are related to the 200 classes in Mini-Kinetics (including Kinetics trimmed clips, Kinetics untrimmed videos, images from Google and Instagram, video clips from Instagram). To obtain this dataset, please first fill in the [request form](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link). We will share the download link to you after your request is received. Since we release all data crawled from the web without any filtering, the dataset is large and it may take some time to download them. We describe the size of the datasets in the following table: + +| Dataset Name | #samples | Size | Teacher Model | #samples after filtering | #samples similar to k200_val | +| :-------------: | :------: | :-----: | :--------------: | :----------------------: | :--------------------------: | +| k200_train | 76030 | 45.6G | N/A | N/A | N/A | +| k200_val | 4838 | 2.9G | N/A | N/A | N/A | +| googleimage_200 | 3050880 | 265.5G | TSN-R50-8seg | 1188695 | 967 | +| insimage_200 | 3654650 | 224.4G | TSN-R50-8seg | 879726 | 116 | +| insvideo_200 | 732855 | 1487.6G | SlowOnly-8x8-R50 | 330680 | 956 | +| k200_raw_train | 76027 | 963.5G | SlowOnly-8x8-R50 | N/A | N/A | + +The file structure of our uploaded OmniSource dataset looks like: + +``` +OmniSource/ +โ”œโ”€โ”€ annotations +โ”‚ โ”œโ”€โ”€ googleimage_200 +โ”‚ โ”‚ โ”œโ”€โ”€ googleimage_200.txt File list of all valid images crawled from Google. +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_googleimage_200_duplicate.txt Positive file list of images crawled from Google, which is similar to a validation example. +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_googleimage_200.txt Positive file list of images crawled from Google, filtered by the teacher model. +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_googleimage_200_wodup.txt Positive file list of images crawled from Google, filtered by the teacher model, after de-duplication. 
+โ”‚ โ”œโ”€โ”€ insimage_200 +โ”‚ โ”‚ โ”œโ”€โ”€ insimage_200.txt +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_insimage_200_duplicate.txt +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_insimage_200.txt +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_insimage_200_wodup.txt +โ”‚ โ”œโ”€โ”€ insvideo_200 +โ”‚ โ”‚ โ”œโ”€โ”€ insvideo_200.txt +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_8x8_insvideo_200_duplicate.txt +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_8x8_insvideo_200.txt +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_insvideo_200_wodup.txt +โ”‚ โ”œโ”€โ”€ k200_actions.txt The list of action names of the 200 classes in MiniKinetics. +โ”‚ โ”œโ”€โ”€ K400_to_MiniKinetics_classidx_mapping.json The index mapping from Kinetics-400 to MiniKinetics. +โ”‚ โ”œโ”€โ”€ kinetics_200 +โ”‚ โ”‚ โ”œโ”€โ”€ k200_train.txt +โ”‚ โ”‚ โ””โ”€โ”€ k200_val.txt +โ”‚ โ”œโ”€โ”€ kinetics_raw_200 +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_kinetics_raw_200.json Kinetics Raw Clips filtered by the teacher model. +โ”‚ โ””โ”€โ”€ webimage_200 +โ”‚ โ””โ”€โ”€ tsn_8seg_webimage_200_wodup.txt The union of `tsn_8seg_googleimage_200_wodup.txt` and `tsn_8seg_insimage_200_wodup.txt` +โ”œโ”€โ”€ googleimage_200 (10 volumes) +โ”‚ โ”œโ”€โ”€ vol_0.tar +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ vol_9.tar +โ”œโ”€โ”€ insimage_200 (10 volumes) +โ”‚ โ”œโ”€โ”€ vol_0.tar +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ vol_9.tar +โ”œโ”€โ”€ insvideo_200 (20 volumes) +โ”‚ โ”œโ”€โ”€ vol_00.tar +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ vol_19.tar +โ”œโ”€โ”€ kinetics_200_train +โ”‚ โ””โ”€โ”€ kinetics_200_train.tar +โ”œโ”€โ”€ kinetics_200_val +โ”‚ โ””โ”€โ”€ kinetics_200_val.tar +โ””โ”€โ”€ kinetics_raw_200_train (16 volumes) + โ”œโ”€โ”€ vol_0.tar + โ”œโ”€โ”€ ... + โ””โ”€โ”€ vol_15.tar +``` + +## Data Preparation + +For data preparation, you need to first download those data. For `kinetics_200` and 3 web datasets: `googleimage_200`, `insimage_200` and `insvideo_200`, you just need to extract each volume and merge their contents. + +For Kinetics raw videos, since loading long videos is very heavy, you need to first trim it into clips. Here we provide a script named `trim_raw_video.py`. It trims a long video into 10-second clips and remove the original raw video. You can use it to trim the Kinetics raw video. + +The data should be placed in `data/OmniSource/`. When data preparation finished, the folder structure of `data/OmniSource` looks like (We omit the files not needed in training & testing for simplicity): + +``` +data/OmniSource/ +โ”œโ”€โ”€ annotations +โ”‚ โ”œโ”€โ”€ googleimage_200 +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_googleimage_200_wodup.txt Positive file list of images crawled from Google, filtered by the teacher model, after de-duplication. +โ”‚ โ”œโ”€โ”€ insimage_200 +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_insimage_200_wodup.txt +โ”‚ โ”œโ”€โ”€ insvideo_200 +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_insvideo_200_wodup.txt +โ”‚ โ”œโ”€โ”€ kinetics_200 +โ”‚ โ”‚ โ”œโ”€โ”€ k200_train.txt +โ”‚ โ”‚ โ””โ”€โ”€ k200_val.txt +โ”‚ โ”œโ”€โ”€ kinetics_raw_200 +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_kinetics_raw_200.json Kinetics Raw Clips filtered by the teacher model. +โ”‚ โ””โ”€โ”€ webimage_200 +โ”‚ โ””โ”€โ”€ tsn_8seg_webimage_200_wodup.txt The union of `tsn_8seg_googleimage_200_wodup.txt` and `tsn_8seg_insimage_200_wodup.txt` +โ”œโ”€โ”€ googleimage_200 +โ”‚ โ”œโ”€โ”€ 000 +| โ”‚ โ”œโ”€โ”€ 00 +| โ”‚ โ”‚ โ”œโ”€โ”€ 000001.jpg +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ””โ”€โ”€ 000901.jpg +| โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”œโ”€โ”€ 19 +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ 199 +โ”œโ”€โ”€ insimage_200 +โ”‚ โ”œโ”€โ”€ 000 +| โ”‚ โ”œโ”€โ”€ abseil +| โ”‚ โ”‚ โ”œโ”€โ”€ 1J9tKWCNgV_0.jpg +| โ”‚ โ”‚ โ”œโ”€โ”€ ... 
+| โ”‚ โ”‚ โ””โ”€โ”€ 1J9tKWCNgV_0.jpg +| โ”‚ โ”œโ”€โ”€ abseiling +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ 199 +โ”œโ”€โ”€ insvideo_200 +โ”‚ โ”œโ”€โ”€ 000 +| โ”‚ โ”œโ”€โ”€ abseil +| โ”‚ โ”‚ โ”œโ”€โ”€ B00arxogubl.mp4 +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ””โ”€โ”€ BzYsP0HIvbt.mp4 +| โ”‚ โ”œโ”€โ”€ abseiling +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ 199 +โ”œโ”€โ”€ kinetics_200_train +โ”‚ โ”œโ”€โ”€ 0074cdXclLU.mp4 +| โ”œโ”€โ”€ ... +| โ”œโ”€โ”€ zzzlyL61Fyo.mp4 +โ”œโ”€โ”€ kinetics_200_val +โ”‚ โ”œโ”€โ”€ 01fAWEHzudA.mp4 +| โ”œโ”€โ”€ ... +| โ”œโ”€โ”€ zymA_6jZIz4.mp4 +โ””โ”€โ”€ kinetics_raw_200_train +โ”‚ โ”œโ”€โ”€ pref_ +โ”‚ | โ”œโ”€โ”€ ___dTOdxzXY +| โ”‚ โ”‚ โ”œโ”€โ”€ part_0.mp4 +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ”œโ”€โ”€ part_6.mp4 +โ”‚ | โ”œโ”€โ”€ ... +โ”‚ | โ””โ”€โ”€ _zygwGDE2EM +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ prefZ +``` diff --git a/tools/data/omnisource/README_zh-CN.md b/tools/data/omnisource/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..c2aeb5c1c2211d2e7c6002ff560b381e8302a2c2 --- /dev/null +++ b/tools/data/omnisource/README_zh-CN.md @@ -0,0 +1,149 @@ +# ๅ‡†ๅค‡ OmniSource + +## ็ฎ€ไป‹ + + + +```BibTeX +@article{duan2020omni, + title={Omni-sourced Webly-supervised Learning for Video Recognition}, + author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, + journal={arXiv preprint arXiv:2003.13042}, + year={2020} +} +``` + +MMAction2 ไธญๅ‘ๅธƒไบ† OmniSource ็ฝ‘็ปœๆ•ฐๆฎ้›†็š„ไธ€ไธชๅญ้›† (ๆฅ่‡ช่ฎบๆ–‡ [Omni-sourced Webly-supervised Learning for Video Recognition](https://arxiv.org/abs/2003.13042))ใ€‚ +OmniSource ๆ•ฐๆฎ้›†ไธญๆ‰€ๆœ‰็ฑปๅˆซๅ‡ๆฅ่‡ช Kinetics-400ใ€‚MMAction2 ๆ‰€ๆไพ›็š„ๅญ้›†ๅŒ…ๅซๅฑžไบŽ Mini-Kinetics ๆ•ฐๆฎ้›† 200 ็ฑปๅŠจไฝœ็š„็ฝ‘็ปœๆ•ฐๆฎ (Mini-inetics ๆ•ฐๆฎ้›†็”ฑ่ฎบๆ–‡ [Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification](https://arxiv.org/pdf/1712.04851.pdf) ๆๅ‡บ)ใ€‚ + +MMAction2 ๆไพ›ๆ‰€ๆœ‰ๆ•ฐๆฎๆบไธญๅฑžไบŽ Mini-Kinetics 200 ็ฑปๅŠจไฝœ็š„ๆ•ฐๆฎ๏ผŒ่ฟ™ไบ›ๆ•ฐๆฎๆบๅŒ…ๅซ๏ผšKinetics ๆ•ฐๆฎ้›†๏ผŒKinetics ๅŽŸๅง‹ๆ•ฐๆฎ้›†๏ผˆๆœช็ป่ฃๅ‰ช็š„้•ฟ่ง†้ข‘๏ผ‰๏ผŒๆฅ่‡ช Google ๅ’Œ Instagram ็š„็ฝ‘็ปœๅ›พ็‰‡๏ผŒๆฅ่‡ช Instagram ็š„็ฝ‘็ปœ่ง†้ข‘ใ€‚ไธบ่Žทๅ–่ฟ™ไธ€ๆ•ฐๆฎ้›†๏ผŒ็”จๆˆท้œ€ๅ…ˆๅกซๅ†™ [ๆ•ฐๆฎ็”ณ่ฏท่กจ](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link)ใ€‚ๅœจๆŽฅๆ”ถๅˆฐ็”ณ่ฏทๅŽ๏ผŒไธ‹่ฝฝ้“พๆŽฅๅฐ†่ขซๅ‘้€่‡ณ็”จๆˆท้‚ฎ็ฎฑใ€‚็”ฑไบŽๅ‘ๅธƒ็š„ๆ•ฐๆฎ้›†ๅ‡ไธบ็ˆฌๅ–ๆ‰€ๅพ—็š„ๅŽŸๅง‹ๆ•ฐๆฎ๏ผŒๆ•ฐๆฎ้›†่พƒๅคง๏ผŒไธ‹่ฝฝ้œ€่ฆไธ€ๅฎšๆ—ถ้—ดใ€‚ไธ‹่กจไธญๆไพ›ไบ† OmniSource ๆ•ฐๆฎ้›†ๅ„ไธชๅˆ†้‡็š„็ปŸ่ฎกไฟกๆฏใ€‚ + +| ๆ•ฐๆฎ้›†ๅ็งฐ | ๆ ทๆœฌไธชๆ•ฐ | ๆ‰€ๅ ็ฉบ้—ด | ่ฟ‡ๆปคไฝฟ็”จ็š„ Teacher ๆจกๅž‹ | ่ฟ‡ๆปคๅŽ็š„ๆ ทๆœฌไธชๆ•ฐ | ไธŽ k200_val ไธญๆ ทๆœฌ็›ธไผผ๏ผˆ็–‘ไผผ้‡ๅค๏ผ‰็š„ๆ ทๆœฌไธชๆ•ฐ | +| :-------------: | :------: | :------: | :---------------------: | :--------------: | :------------------------------------------: | +| k200_train | 76030 | 45.6G | N/A | N/A | N/A | +| k200_val | 4838 | 2.9G | N/A | N/A | N/A | +| googleimage_200 | 3050880 | 265.5G | TSN-R50-8seg | 1188695 | 967 | +| insimage_200 | 3654650 | 224.4G | TSN-R50-8seg | 879726 | 116 | +| insvideo_200 | 732855 | 1487.6G | SlowOnly-8x8-R50 | 330680 | 956 | +| k200_raw_train | 76027 | 963.5G | SlowOnly-8x8-R50 | N/A | N/A | + +MMAction2 ๆ‰€ๅ‘ๅธƒ็š„ OmniSource ๆ•ฐๆฎ้›†็›ฎๅฝ•็ป“ๆž„ๅฆ‚ไธ‹ๆ‰€็คบ๏ผš + +``` +OmniSource/ +โ”œโ”€โ”€ annotations +โ”‚ โ”œโ”€โ”€ googleimage_200 +โ”‚ โ”‚ โ”œโ”€โ”€ googleimage_200.txt ไปŽ Google ็ˆฌๅ–ๅˆฐ็š„ๆ‰€ๆœ‰ๅ›พ็‰‡ๅˆ—่กจ +โ”‚ โ”‚ โ”œโ”€โ”€ 
tsn_8seg_googleimage_200_duplicate.txt ไปŽ Google ็ˆฌๅ–ๅˆฐ็š„๏ผŒ็–‘ไผผไธŽ k200-val ไธญๆ ทๆœฌ้‡ๅค็š„ๆญฃๆ ทๆœฌๅˆ—่กจ +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_googleimage_200.txt ไปŽ Google ็ˆฌๅ–ๅˆฐ็š„๏ผŒ็ป่ฟ‡ teacher ๆจกๅž‹่ฟ‡ๆปค็š„ๆญฃๆ ทๆœฌๅˆ—่กจ +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_googleimage_200_wodup.txt ไปŽ Google ็ˆฌๅ–ๅˆฐ็š„๏ผŒ็ป่ฟ‡ teacher ๆจกๅž‹่ฟ‡ๆปคๅŠๅŽป้‡็š„ๆญฃๆ ทๆœฌๅˆ—่กจ +โ”‚ โ”œโ”€โ”€ insimage_200 +โ”‚ โ”‚ โ”œโ”€โ”€ insimage_200.txt +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_insimage_200_duplicate.txt +โ”‚ โ”‚ โ”œโ”€โ”€ tsn_8seg_insimage_200.txt +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_insimage_200_wodup.txt +โ”‚ โ”œโ”€โ”€ insvideo_200 +โ”‚ โ”‚ โ”œโ”€โ”€ insvideo_200.txt +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_8x8_insvideo_200_duplicate.txt +โ”‚ โ”‚ โ”œโ”€โ”€ slowonly_8x8_insvideo_200.txt +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_insvideo_200_wodup.txt +โ”‚ โ”œโ”€โ”€ k200_actions.txt MiniKinetics ไธญ 200 ็ฑปๅŠจไฝœ็š„ๅ็งฐ +โ”‚ โ”œโ”€โ”€ K400_to_MiniKinetics_classidx_mapping.json Kinetics ไธญ็š„็ฑป็ดขๅผ•่‡ณ MiniKinetics ไธญ็š„็ฑป็ดขๅผ•็š„ๆ˜ ๅฐ„ +โ”‚ โ”œโ”€โ”€ kinetics_200 +โ”‚ โ”‚ โ”œโ”€โ”€ k200_train.txt +โ”‚ โ”‚ โ””โ”€โ”€ k200_val.txt +โ”‚ โ””โ”€โ”€ kinetics_raw_200 +โ”‚ โ””โ”€โ”€ slowonly_8x8_kinetics_raw_200.json ็ป teacher ๆจกๅž‹่ฟ‡ๆปคๅŽ็š„ Kinetics ๅŽŸๅง‹่ง†้ข‘็‰‡ๆฎต +โ”œโ”€โ”€ googleimage_200 ๅ…ฑ 10 ๅท +โ”‚ โ”œโ”€โ”€ vol_0.tar +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ vol_9.tar +โ”œโ”€โ”€ insimage_200 ๅ…ฑ 10 ๅท +โ”‚ โ”œโ”€โ”€ vol_0.tar +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ vol_9.tar +โ”œโ”€โ”€ insvideo_200 ๅ…ฑ 20 ๅท +โ”‚ โ”œโ”€โ”€ vol_00.tar +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ vol_19.tar +โ”œโ”€โ”€ kinetics_200_train +โ”‚ โ””โ”€โ”€ kinetics_200_train.tar +โ”œโ”€โ”€ kinetics_200_val +โ”‚ โ””โ”€โ”€ kinetics_200_val.tar +โ””โ”€โ”€ kinetics_raw_200_train ๅ…ฑ 16 ๅท + โ”œโ”€โ”€ vol_0.tar + โ”œโ”€โ”€ ... + โ””โ”€โ”€ vol_15.tar +``` + +## ๆ•ฐๆฎๅ‡†ๅค‡ + +็”จๆˆท้œ€่ฆ้ฆ–ๅ…ˆๅฎŒๆˆๆ•ฐๆฎไธ‹่ฝฝ๏ผŒๅฏนไบŽ `kinetics_200` ๅ’Œไธ‰ไธช็ฝ‘็ปœๆ•ฐๆฎ้›† `googleimage_200`, `insimage_200`, `insvideo_200`๏ผŒ็”จๆˆทไป…้œ€่งฃๅŽ‹ๅ„ๅŽ‹็ผฉๅทๅนถๅฐ†ๅ…ถๅˆๅนถ่‡ณไธ€ๅค„ใ€‚ + +ๅฏนไบŽ Kinetics ๅŽŸๅง‹่ง†้ข‘๏ผŒ็”ฑไบŽ็›ดๆŽฅ่ฏปๅ–้•ฟ่ง†้ข‘้žๅธธ่€—ๆ—ถ๏ผŒ็”จๆˆท้œ€่ฆๅ…ˆๅฐ†ๅ…ถๅˆ†ๅ‰ฒไธบๅฐๆฎตใ€‚MMAction2 ๆไพ›ไบ†ๅไธบ `trim_raw_video.py` ็š„่„šๆœฌ๏ผŒ็”จไบŽๅฐ†้•ฟ่ง†้ข‘ๅˆ†ๅ‰ฒ่‡ณ 10 ็ง’็š„ๅฐๆฎต๏ผˆๅˆ†ๅ‰ฒๅฎŒๆˆๅŽๅˆ ้™ค้•ฟ่ง†้ข‘๏ผ‰ใ€‚็”จๆˆทๅฏๅˆฉ็”จ่ฟ™ไธ€่„šๆœฌๅˆ†ๅ‰ฒ้•ฟ่ง†้ข‘ใ€‚ + +ๆ‰€ๆœ‰ๆ•ฐๆฎๅบ”ไฝไบŽ `data/OmniSource/` ็›ฎๅฝ•ไธ‹ใ€‚ๅฎŒๆˆๆ•ฐๆฎๅ‡†ๅค‡ๅŽ๏ผŒ`data/OmniSource/` ็›ฎๅฝ•็š„็ป“ๆž„ๅบ”ๅฆ‚ไธ‹ๆ‰€็คบ๏ผˆไธบ็ฎ€ๆด๏ผŒ็œๅŽปไบ†่ฎญ็ปƒๅŠๆต‹่ฏ•ๆ—ถๆœชไฝฟ็”จ็š„ๆ–‡ไปถ๏ผ‰๏ผš + +``` +data/OmniSource/ +โ”œโ”€โ”€ annotations +โ”‚ โ”œโ”€โ”€ googleimage_200 +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_googleimage_200_wodup.txt Positive file list of images crawled from Google, filtered by the teacher model, after de-duplication. +โ”‚ โ”œโ”€โ”€ insimage_200 +โ”‚ โ”‚ โ””โ”€โ”€ tsn_8seg_insimage_200_wodup.txt +โ”‚ โ”œโ”€โ”€ insvideo_200 +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_insvideo_200_wodup.txt +โ”‚ โ”œโ”€โ”€ kinetics_200 +โ”‚ โ”‚ โ”œโ”€โ”€ k200_train.txt +โ”‚ โ”‚ โ””โ”€โ”€ k200_val.txt +โ”‚ โ”œโ”€โ”€ kinetics_raw_200 +โ”‚ โ”‚ โ””โ”€โ”€ slowonly_8x8_kinetics_raw_200.json Kinetics Raw Clips filtered by the teacher model. +โ”‚ โ””โ”€โ”€ webimage_200 +โ”‚ โ””โ”€โ”€ tsn_8seg_webimage_200_wodup.txt The union of `tsn_8seg_googleimage_200_wodup.txt` and `tsn_8seg_insimage_200_wodup.txt` +โ”œโ”€โ”€ googleimage_200 +โ”‚ โ”œโ”€โ”€ 000 +| โ”‚ โ”œโ”€โ”€ 00 +| โ”‚ โ”‚ โ”œโ”€โ”€ 000001.jpg +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ””โ”€โ”€ 000901.jpg +| โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”œโ”€โ”€ 19 +โ”‚ โ”œโ”€โ”€ ... 
+โ”‚ โ””โ”€โ”€ 199 +โ”œโ”€โ”€ insimage_200 +โ”‚ โ”œโ”€โ”€ 000 +| โ”‚ โ”œโ”€โ”€ abseil +| โ”‚ โ”‚ โ”œโ”€โ”€ 1J9tKWCNgV_0.jpg +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ””โ”€โ”€ 1J9tKWCNgV_0.jpg +| โ”‚ โ”œโ”€โ”€ abseiling +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ 199 +โ”œโ”€โ”€ insvideo_200 +โ”‚ โ”œโ”€โ”€ 000 +| โ”‚ โ”œโ”€โ”€ abseil +| โ”‚ โ”‚ โ”œโ”€โ”€ B00arxogubl.mp4 +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ””โ”€โ”€ BzYsP0HIvbt.mp4 +| โ”‚ โ”œโ”€โ”€ abseiling +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ 199 +โ”œโ”€โ”€ kinetics_200_train +โ”‚ โ”œโ”€โ”€ 0074cdXclLU.mp4 +| โ”œโ”€โ”€ ... +| โ”œโ”€โ”€ zzzlyL61Fyo.mp4 +โ”œโ”€โ”€ kinetics_200_val +โ”‚ โ”œโ”€โ”€ 01fAWEHzudA.mp4 +| โ”œโ”€โ”€ ... +| โ”œโ”€โ”€ zymA_6jZIz4.mp4 +โ””โ”€โ”€ kinetics_raw_200_train +โ”‚ โ”œโ”€โ”€ pref_ +โ”‚ | โ”œโ”€โ”€ ___dTOdxzXY +| โ”‚ โ”‚ โ”œโ”€โ”€ part_0.mp4 +| โ”‚ โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ โ”œโ”€โ”€ part_6.mp4 +โ”‚ | โ”œโ”€โ”€ ... +โ”‚ | โ””โ”€โ”€ _zygwGDE2EM +โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ””โ”€โ”€ prefZ +``` diff --git a/tools/data/omnisource/trim_raw_video.py b/tools/data/omnisource/trim_raw_video.py new file mode 100644 index 0000000000000000000000000000000000000000..a4405f58e0771b26aaef24b10e662ac75af0f972 --- /dev/null +++ b/tools/data/omnisource/trim_raw_video.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import sys +from subprocess import check_output + +import mmengine + + +def get_duration(vid_name): + command = f'ffprobe -i {vid_name} 2>&1 | grep "Duration"' + output = str(check_output(command, shell=True)) + output = output.split(',')[0].split('Duration:')[1].strip() + h, m, s = output.split(':') + duration = int(h) * 3600 + int(m) * 60 + float(s) + return duration + + +def trim(vid_name): + try: + lt = get_duration(vid_name) + except Exception: + print(f'get_duration failed for video {vid_name}', flush=True) + return + + i = 0 + name, _ = osp.splitext(vid_name) + + # We output 10-second clips into the folder `name` + dest = name + mmengine.mkdir_or_exist(dest) + + command_tmpl = ('ffmpeg -y loglevel error -i {} -ss {} -t {} -crf 18 ' + '-c:v libx264 {}/part_{}.mp4') + while i * 10 < lt: + os.system(command_tmpl.format(vid_name, i * 10, 10, dest, i)) + i += 1 + + # remove a raw video after decomposing it into 10-second clip to save space + os.remove(vid_name) + + +if __name__ == '__main__': + vid_name = sys.argv[1] + trim(vid_name) diff --git a/tools/data/parse_file_list.py b/tools/data/parse_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb4e6cecefe42eadb9efd436f5f4473b4d386b9 --- /dev/null +++ b/tools/data/parse_file_list.py @@ -0,0 +1,535 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import csv +import fnmatch +import glob +import json +import os +import os.path as osp + + +def parse_directory(path, + rgb_prefix='img_', + flow_x_prefix='flow_x_', + flow_y_prefix='flow_y_', + level=1): + """Parse directories holding extracted frames from standard benchmarks. + + Args: + path (str): Directory path to parse frames. + rgb_prefix (str): Prefix of generated rgb frames name. + default: 'img_'. + flow_x_prefix (str): Prefix of generated flow x name. + default: `flow_x_`. + flow_y_prefix (str): Prefix of generated flow y name. + default: `flow_y_`. + level (int): Directory level for glob searching. Options are 1 and 2. + default: 1. + + Returns: + dict: frame info dict with video id as key and tuple(path(str), + rgb_num(int), flow_x_num(int)) as value. 
+ """ + print(f'parse frames under directory {path}') + if level == 1: + # Only search for one-level directory + def locate_directory(x): + return osp.basename(x) + + frame_dirs = glob.glob(osp.join(path, '*')) + + elif level == 2: + # search for two-level directory + def locate_directory(x): + return osp.join(osp.basename(osp.dirname(x)), osp.basename(x)) + + frame_dirs = glob.glob(osp.join(path, '*', '*')) + + else: + raise ValueError('level can be only 1 or 2') + + def count_files(directory, prefix_list): + """Count file number with a given directory and prefix. + + Args: + directory (str): Data directory to be search. + prefix_list (list): List or prefix. + + Returns: + list (int): Number list of the file with the prefix. + """ + lst = os.listdir(directory) + cnt_list = [len(fnmatch.filter(lst, x + '*')) for x in prefix_list] + return cnt_list + + # check RGB + frame_dict = {} + for i, frame_dir in enumerate(frame_dirs): + total_num = count_files(frame_dir, + (rgb_prefix, flow_x_prefix, flow_y_prefix)) + dir_name = locate_directory(frame_dir) + + num_x = total_num[1] + num_y = total_num[2] + if num_x != num_y: + raise ValueError(f'x and y direction have different number ' + f'of flow images in video directory: {frame_dir}') + if i % 200 == 0: + print(f'{i} videos parsed') + frame_dict[dir_name] = (frame_dir, total_num[0], num_x) + + print('frame directory analysis done') + return frame_dict + + +def parse_ucf101_splits(level): + """Parse UCF-101 dataset into "train", "val", "test" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of UCF-101. + """ + class_index_file = 'data/ucf101/annotations/classInd.txt' + train_file_template = 'data/ucf101/annotations/trainlist{:02d}.txt' + test_file_template = 'data/ucf101/annotations/testlist{:02d}.txt' + + with open(class_index_file, 'r') as fin: + class_index = [x.strip().split() for x in fin] + class_mapping = {x[1]: int(x[0]) - 1 for x in class_index} + + def line_to_map(line): + """A function to map line string to video and label. + + Args: + line (str): A long directory path, which is a text path. + + Returns: + tuple[str, str]: (video, label), video is the video id, + label is the video label. + """ + items = line.strip().split() + video = osp.splitext(items[0])[0] + if level == 1: + video = osp.basename(video) + label = items[0] + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + label = class_mapping[osp.dirname(items[0])] + return video, label + + splits = [] + for i in range(1, 4): + with open(train_file_template.format(i), 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(test_file_template.format(i), 'r') as fin: + test_list = [line_to_map(x) for x in fin] + splits.append((train_list, test_list)) + + return splits + + +def parse_jester_splits(level): + """Parse Jester into "train", "val" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Jester dataset. 
+ """ + # Read the annotations + class_index_file = 'data/jester/annotations/jester-v1-labels.csv' + train_file = 'data/jester/annotations/jester-v1-train.csv' + val_file = 'data/jester/annotations/jester-v1-validation.csv' + test_file = 'data/jester/annotations/jester-v1-test.csv' + + with open(class_index_file, 'r') as fin: + class_index = [x.strip() for x in fin] + class_mapping = {class_index[idx]: idx for idx in range(len(class_index))} + + def line_to_map(line, test_mode=False): + items = line.strip().split(';') + video = items[0] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + + label = class_mapping[items[1]] + return video, label + + with open(train_file, 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(val_file, 'r') as fin: + val_list = [line_to_map(x) for x in fin] + + with open(test_file, 'r') as fin: + test_list = [line_to_map(x, test_mode=True) for x in fin] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_sthv1_splits(level): + """Parse Something-Something dataset V1 into "train", "val" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Something-Something V1 dataset. + """ + # Read the annotations + # yapf: disable + class_index_file = 'data/sthv1/annotations/something-something-v1-labels.csv' # noqa + # yapf: enable + train_file = 'data/sthv1/annotations/something-something-v1-train.csv' + val_file = 'data/sthv1/annotations/something-something-v1-validation.csv' + test_file = 'data/sthv1/annotations/something-something-v1-test.csv' + + with open(class_index_file, 'r') as fin: + class_index = [x.strip() for x in fin] + class_mapping = {class_index[idx]: idx for idx in range(len(class_index))} + + def line_to_map(line, test_mode=False): + items = line.strip().split(';') + video = items[0] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + + label = class_mapping[items[1]] + return video, label + + with open(train_file, 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(val_file, 'r') as fin: + val_list = [line_to_map(x) for x in fin] + + with open(test_file, 'r') as fin: + test_list = [line_to_map(x, test_mode=True) for x in fin] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_sthv2_splits(level): + """Parse Something-Something dataset V2 into "train", "val" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Something-Something V2 dataset. 
+ """ + # Read the annotations + # yapf: disable + class_index_file = 'data/sthv2/annotations/something-something-v2-labels.json' # noqa + # yapf: enable + train_file = 'data/sthv2/annotations/something-something-v2-train.json' + val_file = 'data/sthv2/annotations/something-something-v2-validation.json' + test_file = 'data/sthv2/annotations/something-something-v2-test.json' + + with open(class_index_file, 'r') as fin: + class_mapping = json.loads(fin.read()) + + def line_to_map(item, test_mode=False): + video = item['id'] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + + template = item['template'].replace('[', '') + template = template.replace(']', '') + label = int(class_mapping[template]) + return video, label + + with open(train_file, 'r') as fin: + items = json.loads(fin.read()) + train_list = [line_to_map(item) for item in items] + + with open(val_file, 'r') as fin: + items = json.loads(fin.read()) + val_list = [line_to_map(item) for item in items] + + with open(test_file, 'r') as fin: + items = json.loads(fin.read()) + test_list = [line_to_map(item, test_mode=True) for item in items] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_mmit_splits(): + """Parse Multi-Moments in Time dataset into "train", "val" splits. + + Returns: + list: "train", "val", "test" splits of Multi-Moments in Time. + """ + + # Read the annotations + def line_to_map(x): + video = osp.splitext(x[0])[0] + labels = [int(digit) for digit in x[1:]] + return video, labels + + csv_reader = csv.reader(open('data/mmit/annotations/trainingSet.csv')) + train_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open('data/mmit/annotations/validationSet.csv')) + val_list = [line_to_map(x) for x in csv_reader] + + test_list = val_list # not test for mit + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_kinetics_splits(level, dataset): + """Parse Kinetics dataset into "train", "val", "test" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + dataset (str): Denotes the version of Kinetics that needs to be parsed, + choices are "kinetics400", "kinetics600" and "kinetics700". + + Returns: + list: "train", "val", "test" splits of Kinetics. + """ + + def convert_label(s, keep_whitespaces=False): + """Convert label name to a formal string. + + Remove redundant '"' and convert whitespace to '_'. + + Args: + s (str): String to be converted. + keep_whitespaces(bool): Whether to keep whitespace. Default: False. + + Returns: + str: Converted string. + """ + if not keep_whitespaces: + return s.replace('"', '').replace(' ', '_') + + return s.replace('"', '') + + def line_to_map(x, test=False): + """A function to map line string to video and label. + + Args: + x (str): A single line from Kinetics csv file. + test (bool): Indicate whether the line comes from test + annotation file. + + Returns: + tuple[str, str]: (video, label), video is the video id, + label is the video label. 
+ """ + if test: + # video = f'{x[0]}_{int(x[1]):06d}_{int(x[2]):06d}' + video = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' + label = -1 # label unknown + return video, label + + video = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' + if level == 2: + video = f'{convert_label(x[0])}/{video}' + else: + assert level == 1 + label = class_mapping[convert_label(x[0])] + return video, label + + train_file = f'data/{dataset}/annotations/kinetics_train.csv' + val_file = f'data/{dataset}/annotations/kinetics_val.csv' + test_file = f'data/{dataset}/annotations/kinetics_test.csv' + + csv_reader = csv.reader(open(train_file)) + # skip the first line + next(csv_reader) + + labels_sorted = sorted({convert_label(row[0]) for row in csv_reader}) + class_mapping = {label: i for i, label in enumerate(labels_sorted)} + + csv_reader = csv.reader(open(train_file)) + next(csv_reader) + train_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open(val_file)) + next(csv_reader) + val_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open(test_file)) + next(csv_reader) + test_list = [line_to_map(x, test=True) for x in csv_reader] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_mit_splits(): + """Parse Moments in Time dataset into "train", "val" splits. + + Returns: + list: "train", "val", "test" splits of Moments in Time. + """ + # Read the annotations + class_mapping = {} + with open('data/mit/annotations/moments_categories.txt') as f_cat: + for line in f_cat.readlines(): + cat, digit = line.rstrip().split(',') + class_mapping[cat] = int(digit) + + def line_to_map(x): + video = osp.splitext(x[0])[0] + label = class_mapping[osp.dirname(x[0])] + return video, label + + csv_reader = csv.reader(open('data/mit/annotations/trainingSet.csv')) + train_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open('data/mit/annotations/validationSet.csv')) + val_list = [line_to_map(x) for x in csv_reader] + + test_list = val_list # no test for mit + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_hmdb51_split(level): + train_file_template = 'data/hmdb51/annotations/trainlist{:02d}.txt' + test_file_template = 'data/hmdb51/annotations/testlist{:02d}.txt' + class_index_file = 'data/hmdb51/annotations/classInd.txt' + + def generate_class_index_file(): + """This function will generate a `ClassInd.txt` for HMDB51 in a format + like UCF101, where class id starts with 1.""" + video_path = 'data/hmdb51/videos' + annotation_dir = 'data/hmdb51/annotations' + + class_list = sorted(os.listdir(video_path)) + class_dict = dict() + if not osp.exists(class_index_file): + with open(class_index_file, 'w') as f: + content = [] + for class_id, class_name in enumerate(class_list): + # like `ClassInd.txt` in UCF-101, + # the class_id begins with 1 + class_dict[class_name] = class_id + 1 + cur_line = ' '.join([str(class_id + 1), class_name]) + content.append(cur_line) + content = '\n'.join(content) + f.write(content) + else: + print(f'{class_index_file} has been generated before.') + class_dict = { + class_name: class_id + 1 + for class_id, class_name in enumerate(class_list) + } + + for i in range(1, 4): + train_content = [] + test_content = [] + for class_name in class_dict: + filename = class_name + f'_test_split{i}.txt' + filename_path = osp.join(annotation_dir, filename) + with open(filename_path, 'r') as fin: + for line in fin: + video_info = line.strip().split() + video_name = video_info[0] + if 
video_info[1] == '1': + target_line = ' '.join([ + osp.join(class_name, video_name), + str(class_dict[class_name]) + ]) + train_content.append(target_line) + elif video_info[1] == '2': + target_line = ' '.join([ + osp.join(class_name, video_name), + str(class_dict[class_name]) + ]) + test_content.append(target_line) + train_content = '\n'.join(train_content) + test_content = '\n'.join(test_content) + with open(train_file_template.format(i), 'w') as fout: + fout.write(train_content) + with open(test_file_template.format(i), 'w') as fout: + fout.write(test_content) + + generate_class_index_file() + + with open(class_index_file, 'r') as fin: + class_index = [x.strip().split() for x in fin] + class_mapping = {x[1]: int(x[0]) - 1 for x in class_index} + + def line_to_map(line): + items = line.strip().split() + video = osp.splitext(items[0])[0] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + label = class_mapping[osp.dirname(items[0])] + return video, label + + splits = [] + for i in range(1, 4): + with open(train_file_template.format(i), 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(test_file_template.format(i), 'r') as fin: + test_list = [line_to_map(x) for x in fin] + splits.append((train_list, test_list)) + + return splits + + +def parse_diving48_splits(): + + train_file = 'data/diving48/annotations/Diving48_V2_train.json' + test_file = 'data/diving48/annotations/Diving48_V2_test.json' + + train = json.load(open(train_file)) + test = json.load(open(test_file)) + + # class_index_file = 'data/diving48/annotations/Diving48_vocab.json' + # class_list = json.load(open(class_index_file)) + + train_list = [] + test_list = [] + + for item in train: + vid_name = item['vid_name'] + label = item['label'] + train_list.append((vid_name, label)) + + for item in test: + vid_name = item['vid_name'] + label = item['label'] + test_list.append((vid_name, label)) + + splits = ((train_list, test_list), ) + return splits diff --git a/tools/data/resize_videos.py b/tools/data/resize_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..ec1170040a09f62944406741a8742331a9a74aba --- /dev/null +++ b/tools/data/resize_videos.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import sys +from multiprocessing import Pool + + +def resize_videos(vid_item): + """Generate resized video cache. + + Args: + vid_item (list): Video item containing video full path, + video relative path. + + Returns: + bool: Whether generate video cache successfully. 
+ """ + full_path, vid_path = vid_item + # Change the output video extension to .mp4 if '--to-mp4' flag is set + if args.to_mp4: + vid_path = vid_path.split('.') + assert len(vid_path) == 2, \ + f"Video path '{vid_path}' contain more than one dot" + vid_path = vid_path[0] + '.mp4' + out_full_path = osp.join(args.out_dir, vid_path) + dir_name = osp.dirname(vid_path) + out_dir = osp.join(args.out_dir, dir_name) + if not osp.exists(out_dir): + os.makedirs(out_dir) + result = os.popen( + f'ffprobe -hide_banner -loglevel error -select_streams v:0 -show_entries stream=width,height -of csv=p=0 {full_path}' # noqa:E501 + ) + w, h = [int(d) for d in result.readline().rstrip().split(',')] + if w > h: + cmd = (f'ffmpeg -hide_banner -loglevel error -i {full_path} ' + f'-vf {"mpdecimate," if args.remove_dup else ""}' + f'scale=-2:{args.scale} ' + f'{"-vsync vfr" if args.remove_dup else ""} ' + f'-c:v libx264 {"-g 16" if args.dense else ""} ' + f'-an {out_full_path} -y') + else: + cmd = (f'ffmpeg -hide_banner -loglevel error -i {full_path} ' + f'-vf {"mpdecimate," if args.remove_dup else ""}' + f'scale={args.scale}:-2 ' + f'{"-vsync vfr" if args.remove_dup else ""} ' + f'-c:v libx264 {"-g 16" if args.dense else ""} ' + f'-an {out_full_path} -y') + os.popen(cmd) + print(f'{vid_path} done') + sys.stdout.flush() + return True + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Generate the resized cache of original videos') + parser.add_argument('src_dir', type=str, help='source video directory') + parser.add_argument('out_dir', type=str, help='output video directory') + parser.add_argument( + '--dense', + action='store_true', + help='whether to generate a faster cache') + parser.add_argument( + '--level', + type=int, + choices=[1, 2], + default=2, + help='directory level of data') + parser.add_argument( + '--remove-dup', + action='store_true', + help='whether to remove duplicated frames') + parser.add_argument( + '--ext', + type=str, + default='mp4', + choices=['avi', 'mp4', 'webm', 'mkv'], + help='video file extensions') + parser.add_argument( + '--to-mp4', + action='store_true', + help='whether to output videos in mp4 format') + parser.add_argument( + '--scale', + type=int, + default=256, + help='resize image short side length keeping ratio') + parser.add_argument( + '--num-worker', type=int, default=8, help='number of workers') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + if not osp.isdir(args.out_dir): + print(f'Creating folder: {args.out_dir}') + os.makedirs(args.out_dir) + + print('Reading videos from folder: ', args.src_dir) + print('Extension of videos: ', args.ext) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level + '.' 
+ + args.ext) + done_fullpath_list = glob.glob(args.out_dir + '/*' * args.level + args.ext) + print('Total number of videos found: ', len(fullpath_list)) + print('Total number of videos transfer finished: ', + len(done_fullpath_list)) + if args.level == 2: + vid_list = list( + map( + lambda p: osp.join( + osp.basename(osp.dirname(p)), osp.basename(p)), + fullpath_list)) + elif args.level == 1: + vid_list = list(map(osp.basename, fullpath_list)) + pool = Pool(args.num_worker) + pool.map(resize_videos, zip(fullpath_list, vid_list)) diff --git a/tools/data/skeleton/NTU_RGBD120_samples_with_missing_skeletons.txt b/tools/data/skeleton/NTU_RGBD120_samples_with_missing_skeletons.txt new file mode 100644 index 0000000000000000000000000000000000000000..d321ffdfe593581136c912d7c6eef195f73e7046 --- /dev/null +++ b/tools/data/skeleton/NTU_RGBD120_samples_with_missing_skeletons.txt @@ -0,0 +1,535 @@ +S001C002P005R002A008 +S001C002P006R001A008 +S001C003P002R001A055 +S001C003P002R002A012 +S001C003P005R002A004 +S001C003P005R002A005 +S001C003P005R002A006 +S001C003P006R002A008 +S002C002P011R002A030 +S002C003P008R001A020 +S002C003P010R002A010 +S002C003P011R002A007 +S002C003P011R002A011 +S002C003P014R002A007 +S003C001P019R001A055 +S003C002P002R002A055 +S003C002P018R002A055 +S003C003P002R001A055 +S003C003P016R001A055 +S003C003P018R002A024 +S004C002P003R001A013 +S004C002P008R001A009 +S004C002P020R001A003 +S004C002P020R001A004 +S004C002P020R001A012 +S004C002P020R001A020 +S004C002P020R001A021 +S004C002P020R001A036 +S005C002P004R001A001 +S005C002P004R001A003 +S005C002P010R001A016 +S005C002P010R001A017 +S005C002P010R001A048 +S005C002P010R001A049 +S005C002P016R001A009 +S005C002P016R001A010 +S005C002P018R001A003 +S005C002P018R001A028 +S005C002P018R001A029 +S005C003P016R002A009 +S005C003P018R002A013 +S005C003P021R002A057 +S006C001P001R002A055 +S006C002P007R001A005 +S006C002P007R001A006 +S006C002P016R001A043 +S006C002P016R001A051 +S006C002P016R001A052 +S006C002P022R001A012 +S006C002P023R001A020 +S006C002P023R001A021 +S006C002P023R001A022 +S006C002P023R001A023 +S006C002P024R001A018 +S006C002P024R001A019 +S006C003P001R002A013 +S006C003P007R002A009 +S006C003P007R002A010 +S006C003P007R002A025 +S006C003P016R001A060 +S006C003P017R001A055 +S006C003P017R002A013 +S006C003P017R002A014 +S006C003P017R002A015 +S006C003P022R002A013 +S007C001P018R002A050 +S007C001P025R002A051 +S007C001P028R001A050 +S007C001P028R001A051 +S007C001P028R001A052 +S007C002P008R002A008 +S007C002P015R002A055 +S007C002P026R001A008 +S007C002P026R001A009 +S007C002P026R001A010 +S007C002P026R001A011 +S007C002P026R001A012 +S007C002P026R001A050 +S007C002P027R001A011 +S007C002P027R001A013 +S007C002P028R002A055 +S007C003P007R001A002 +S007C003P007R001A004 +S007C003P019R001A060 +S007C003P027R002A001 +S007C003P027R002A002 +S007C003P027R002A003 +S007C003P027R002A004 +S007C003P027R002A005 +S007C003P027R002A006 +S007C003P027R002A007 +S007C003P027R002A008 +S007C003P027R002A009 +S007C003P027R002A010 +S007C003P027R002A011 +S007C003P027R002A012 +S007C003P027R002A013 +S008C002P001R001A009 +S008C002P001R001A010 +S008C002P001R001A014 +S008C002P001R001A015 +S008C002P001R001A016 +S008C002P001R001A018 +S008C002P001R001A019 +S008C002P008R002A059 +S008C002P025R001A060 +S008C002P029R001A004 +S008C002P031R001A005 +S008C002P031R001A006 +S008C002P032R001A018 +S008C002P034R001A018 +S008C002P034R001A019 +S008C002P035R001A059 +S008C002P035R002A002 +S008C002P035R002A005 +S008C003P007R001A009 +S008C003P007R001A016 +S008C003P007R001A017 +S008C003P007R001A018 +S008C003P007R001A019 
+S008C003P007R001A020 +S008C003P007R001A021 +S008C003P007R001A022 +S008C003P007R001A023 +S008C003P007R001A025 +S008C003P007R001A026 +S008C003P007R001A028 +S008C003P007R001A029 +S008C003P007R002A003 +S008C003P008R002A050 +S008C003P025R002A002 +S008C003P025R002A011 +S008C003P025R002A012 +S008C003P025R002A016 +S008C003P025R002A020 +S008C003P025R002A022 +S008C003P025R002A023 +S008C003P025R002A030 +S008C003P025R002A031 +S008C003P025R002A032 +S008C003P025R002A033 +S008C003P025R002A049 +S008C003P025R002A060 +S008C003P031R001A001 +S008C003P031R002A004 +S008C003P031R002A014 +S008C003P031R002A015 +S008C003P031R002A016 +S008C003P031R002A017 +S008C003P032R002A013 +S008C003P033R002A001 +S008C003P033R002A011 +S008C003P033R002A012 +S008C003P034R002A001 +S008C003P034R002A012 +S008C003P034R002A022 +S008C003P034R002A023 +S008C003P034R002A024 +S008C003P034R002A044 +S008C003P034R002A045 +S008C003P035R002A016 +S008C003P035R002A017 +S008C003P035R002A018 +S008C003P035R002A019 +S008C003P035R002A020 +S008C003P035R002A021 +S009C002P007R001A001 +S009C002P007R001A003 +S009C002P007R001A014 +S009C002P008R001A014 +S009C002P015R002A050 +S009C002P016R001A002 +S009C002P017R001A028 +S009C002P017R001A029 +S009C003P017R002A030 +S009C003P025R002A054 +S010C001P007R002A020 +S010C002P016R002A055 +S010C002P017R001A005 +S010C002P017R001A018 +S010C002P017R001A019 +S010C002P019R001A001 +S010C002P025R001A012 +S010C003P007R002A043 +S010C003P008R002A003 +S010C003P016R001A055 +S010C003P017R002A055 +S011C001P002R001A008 +S011C001P018R002A050 +S011C002P008R002A059 +S011C002P016R002A055 +S011C002P017R001A020 +S011C002P017R001A021 +S011C002P018R002A055 +S011C002P027R001A009 +S011C002P027R001A010 +S011C002P027R001A037 +S011C003P001R001A055 +S011C003P002R001A055 +S011C003P008R002A012 +S011C003P015R001A055 +S011C003P016R001A055 +S011C003P019R001A055 +S011C003P025R001A055 +S011C003P028R002A055 +S012C001P019R001A060 +S012C001P019R002A060 +S012C002P015R001A055 +S012C002P017R002A012 +S012C002P025R001A060 +S012C003P008R001A057 +S012C003P015R001A055 +S012C003P015R002A055 +S012C003P016R001A055 +S012C003P017R002A055 +S012C003P018R001A055 +S012C003P018R001A057 +S012C003P019R002A011 +S012C003P019R002A012 +S012C003P025R001A055 +S012C003P027R001A055 +S012C003P027R002A009 +S012C003P028R001A035 +S012C003P028R002A055 +S013C001P015R001A054 +S013C001P017R002A054 +S013C001P018R001A016 +S013C001P028R001A040 +S013C002P015R001A054 +S013C002P017R002A054 +S013C002P028R001A040 +S013C003P008R002A059 +S013C003P015R001A054 +S013C003P017R002A054 +S013C003P025R002A022 +S013C003P027R001A055 +S013C003P028R001A040 +S014C001P027R002A040 +S014C002P015R001A003 +S014C002P019R001A029 +S014C002P025R002A059 +S014C002P027R002A040 +S014C002P039R001A050 +S014C003P007R002A059 +S014C003P015R002A055 +S014C003P019R002A055 +S014C003P025R001A048 +S014C003P027R002A040 +S015C001P008R002A040 +S015C001P016R001A055 +S015C001P017R001A055 +S015C001P017R002A055 +S015C002P007R001A059 +S015C002P008R001A003 +S015C002P008R001A004 +S015C002P008R002A040 +S015C002P015R001A002 +S015C002P016R001A001 +S015C002P016R002A055 +S015C003P008R002A007 +S015C003P008R002A011 +S015C003P008R002A012 +S015C003P008R002A028 +S015C003P008R002A040 +S015C003P025R002A012 +S015C003P025R002A017 +S015C003P025R002A020 +S015C003P025R002A021 +S015C003P025R002A030 +S015C003P025R002A033 +S015C003P025R002A034 +S015C003P025R002A036 +S015C003P025R002A037 +S015C003P025R002A044 +S016C001P019R002A040 +S016C001P025R001A011 +S016C001P025R001A012 +S016C001P025R001A060 +S016C001P040R001A055 +S016C001P040R002A055 +S016C002P008R001A011 
+S016C002P019R002A040 +S016C002P025R002A012 +S016C003P008R001A011 +S016C003P008R002A002 +S016C003P008R002A003 +S016C003P008R002A004 +S016C003P008R002A006 +S016C003P008R002A009 +S016C003P019R002A040 +S016C003P039R002A016 +S017C001P016R002A031 +S017C002P007R001A013 +S017C002P008R001A009 +S017C002P015R001A042 +S017C002P016R002A031 +S017C002P016R002A055 +S017C003P007R002A013 +S017C003P008R001A059 +S017C003P016R002A031 +S017C003P017R001A055 +S017C003P020R001A059 +S019C001P046R001A075 +S019C002P042R001A094 +S019C002P042R001A095 +S019C002P042R001A096 +S019C002P042R001A097 +S019C002P042R001A098 +S019C002P042R001A099 +S019C002P042R001A100 +S019C002P042R001A101 +S019C002P042R001A102 +S019C002P049R002A074 +S019C002P049R002A079 +S019C002P051R001A061 +S019C003P046R001A061 +S019C003P046R002A061 +S019C003P046R002A062 +S020C002P041R001A063 +S020C002P041R001A064 +S020C002P044R001A063 +S020C002P044R001A064 +S020C002P044R001A066 +S020C002P044R001A084 +S020C002P054R001A081 +S021C001P059R001A108 +S021C002P055R001A065 +S021C002P055R001A092 +S021C002P055R001A093 +S021C002P057R001A064 +S021C002P058R001A063 +S021C002P058R001A064 +S021C002P059R001A074 +S021C002P059R001A075 +S021C002P059R001A076 +S021C002P059R001A077 +S021C002P059R001A078 +S021C002P059R001A079 +S021C003P057R002A078 +S021C003P057R002A079 +S021C003P057R002A094 +S022C002P061R001A113 +S022C003P061R002A061 +S022C003P061R002A062 +S022C003P063R002A061 +S022C003P063R002A062 +S022C003P063R002A063 +S022C003P063R002A064 +S022C003P063R002A078 +S022C003P064R002A061 +S022C003P064R002A062 +S022C003P065R002A061 +S022C003P065R002A062 +S022C003P065R002A119 +S022C003P067R002A064 +S023C002P055R001A114 +S023C002P055R002A092 +S023C002P059R001A075 +S023C002P063R001A075 +S023C003P055R002A093 +S023C003P055R002A094 +S023C003P061R002A061 +S023C003P064R001A092 +S024C001P063R001A109 +S024C002P062R002A074 +S024C002P067R001A100 +S024C002P067R001A101 +S024C002P067R001A102 +S024C002P067R001A103 +S024C003P062R002A074 +S024C003P063R002A061 +S024C003P063R002A062 +S025C001P055R002A119 +S025C003P056R002A119 +S025C003P059R002A115 +S026C002P044R001A061 +S026C002P044R001A062 +S026C002P070R001A092 +S026C003P069R002A075 +S026C003P074R002A061 +S026C003P074R002A062 +S026C003P075R001A117 +S026C003P075R001A118 +S027C001P082R001A063 +S027C002P044R002A092 +S027C002P079R001A061 +S027C002P079R001A062 +S027C002P079R001A063 +S027C002P079R001A064 +S027C002P082R001A092 +S027C002P084R001A061 +S027C002P084R001A062 +S027C002P086R001A061 +S027C003P041R002A087 +S027C003P080R002A061 +S027C003P082R002A061 +S027C003P082R002A062 +S027C003P086R002A061 +S027C003P086R002A062 +S028C001P087R001A061 +S028C002P041R001A091 +S028C002P087R001A061 +S028C003P042R002A064 +S028C003P046R002A063 +S028C003P046R002A066 +S028C003P046R002A067 +S028C003P046R002A068 +S028C003P046R002A069 +S028C003P046R002A070 +S028C003P046R002A071 +S028C003P046R002A072 +S028C003P046R002A074 +S028C003P046R002A075 +S028C003P046R002A077 +S028C003P046R002A081 +S028C003P046R002A082 +S028C003P046R002A083 +S028C003P046R002A084 +S028C003P048R002A061 +S028C003P048R002A062 +S028C003P048R002A073 +S028C003P073R002A073 +S028C003P087R001A061 +S028C003P087R002A061 +S028C003P087R002A062 +S029C001P043R002A092 +S029C001P044R002A092 +S029C001P048R001A073 +S029C001P089R001A063 +S029C002P041R001A074 +S029C002P041R001A084 +S029C002P044R001A091 +S029C002P048R001A075 +S029C002P048R001A081 +S029C002P074R001A081 +S029C002P074R001A095 +S029C002P074R001A096 +S029C002P080R001A091 +S029C002P088R001A066 +S029C002P089R001A065 +S029C002P090R001A067 +S029C003P008R002A065 
+S029C003P008R002A067 +S029C003P041R001A089 +S029C003P043R001A080 +S029C003P043R001A092 +S029C003P043R001A105 +S029C003P043R002A085 +S029C003P043R002A086 +S029C003P044R002A106 +S029C003P048R001A065 +S029C003P048R002A073 +S029C003P048R002A074 +S029C003P048R002A075 +S029C003P048R002A076 +S029C003P048R002A092 +S029C003P048R002A094 +S029C003P051R002A073 +S029C003P051R002A074 +S029C003P051R002A075 +S029C003P051R002A076 +S029C003P051R002A077 +S029C003P051R002A078 +S029C003P051R002A079 +S029C003P051R002A080 +S029C003P051R002A081 +S029C003P051R002A082 +S029C003P051R002A083 +S029C003P051R002A084 +S029C003P051R002A085 +S029C003P051R002A086 +S029C003P051R002A110 +S029C003P067R001A098 +S029C003P074R002A110 +S029C003P080R002A066 +S029C003P088R002A078 +S029C003P089R001A075 +S029C003P089R002A061 +S029C003P089R002A062 +S029C003P089R002A063 +S029C003P090R002A092 +S029C003P090R002A095 +S030C002P091R002A091 +S030C002P091R002A092 +S030C002P091R002A093 +S030C002P091R002A094 +S030C002P091R002A095 +S030C002P091R002A096 +S030C002P091R002A097 +S030C002P091R002A098 +S030C002P091R002A099 +S030C002P091R002A100 +S030C002P091R002A101 +S030C002P091R002A102 +S030C002P091R002A103 +S030C002P091R002A104 +S030C002P091R002A105 +S030C003P044R002A065 +S030C003P044R002A081 +S030C003P044R002A084 +S031C002P042R001A111 +S031C002P051R001A061 +S031C002P051R001A062 +S031C002P067R001A067 +S031C002P067R001A068 +S031C002P067R001A069 +S031C002P067R001A070 +S031C002P067R001A071 +S031C002P067R001A072 +S031C002P082R001A075 +S031C002P082R002A117 +S031C002P097R001A061 +S031C002P097R001A062 +S031C003P043R002A074 +S031C003P043R002A075 +S031C003P044R002A094 +S031C003P082R002A067 +S031C003P082R002A068 +S031C003P082R002A069 +S031C003P082R002A070 +S031C003P082R002A071 +S031C003P082R002A072 +S031C003P082R002A073 +S031C003P082R002A075 +S031C003P082R002A076 +S031C003P082R002A077 +S031C003P082R002A084 +S031C003P082R002A085 +S031C003P082R002A086 +S032C002P067R001A092 +S032C003P067R002A066 +S032C003P067R002A067 +S032C003P067R002A075 +S032C003P067R002A076 +S032C003P067R002A077 diff --git a/tools/data/skeleton/NTU_RGBD_samples_with_missing_skeletons.txt b/tools/data/skeleton/NTU_RGBD_samples_with_missing_skeletons.txt new file mode 100644 index 0000000000000000000000000000000000000000..375050d46a665ffcb2c165e1d84c6fb2405674f2 --- /dev/null +++ b/tools/data/skeleton/NTU_RGBD_samples_with_missing_skeletons.txt @@ -0,0 +1,302 @@ +S001C002P005R002A008 +S001C002P006R001A008 +S001C003P002R001A055 +S001C003P002R002A012 +S001C003P005R002A004 +S001C003P005R002A005 +S001C003P005R002A006 +S001C003P006R002A008 +S002C002P011R002A030 +S002C003P008R001A020 +S002C003P010R002A010 +S002C003P011R002A007 +S002C003P011R002A011 +S002C003P014R002A007 +S003C001P019R001A055 +S003C002P002R002A055 +S003C002P018R002A055 +S003C003P002R001A055 +S003C003P016R001A055 +S003C003P018R002A024 +S004C002P003R001A013 +S004C002P008R001A009 +S004C002P020R001A003 +S004C002P020R001A004 +S004C002P020R001A012 +S004C002P020R001A020 +S004C002P020R001A021 +S004C002P020R001A036 +S005C002P004R001A001 +S005C002P004R001A003 +S005C002P010R001A016 +S005C002P010R001A017 +S005C002P010R001A048 +S005C002P010R001A049 +S005C002P016R001A009 +S005C002P016R001A010 +S005C002P018R001A003 +S005C002P018R001A028 +S005C002P018R001A029 +S005C003P016R002A009 +S005C003P018R002A013 +S005C003P021R002A057 +S006C001P001R002A055 +S006C002P007R001A005 +S006C002P007R001A006 +S006C002P016R001A043 +S006C002P016R001A051 +S006C002P016R001A052 +S006C002P022R001A012 +S006C002P023R001A020 +S006C002P023R001A021 +S006C002P023R001A022 
+S006C002P023R001A023 +S006C002P024R001A018 +S006C002P024R001A019 +S006C003P001R002A013 +S006C003P007R002A009 +S006C003P007R002A010 +S006C003P007R002A025 +S006C003P016R001A060 +S006C003P017R001A055 +S006C003P017R002A013 +S006C003P017R002A014 +S006C003P017R002A015 +S006C003P022R002A013 +S007C001P018R002A050 +S007C001P025R002A051 +S007C001P028R001A050 +S007C001P028R001A051 +S007C001P028R001A052 +S007C002P008R002A008 +S007C002P015R002A055 +S007C002P026R001A008 +S007C002P026R001A009 +S007C002P026R001A010 +S007C002P026R001A011 +S007C002P026R001A012 +S007C002P026R001A050 +S007C002P027R001A011 +S007C002P027R001A013 +S007C002P028R002A055 +S007C003P007R001A002 +S007C003P007R001A004 +S007C003P019R001A060 +S007C003P027R002A001 +S007C003P027R002A002 +S007C003P027R002A003 +S007C003P027R002A004 +S007C003P027R002A005 +S007C003P027R002A006 +S007C003P027R002A007 +S007C003P027R002A008 +S007C003P027R002A009 +S007C003P027R002A010 +S007C003P027R002A011 +S007C003P027R002A012 +S007C003P027R002A013 +S008C002P001R001A009 +S008C002P001R001A010 +S008C002P001R001A014 +S008C002P001R001A015 +S008C002P001R001A016 +S008C002P001R001A018 +S008C002P001R001A019 +S008C002P008R002A059 +S008C002P025R001A060 +S008C002P029R001A004 +S008C002P031R001A005 +S008C002P031R001A006 +S008C002P032R001A018 +S008C002P034R001A018 +S008C002P034R001A019 +S008C002P035R001A059 +S008C002P035R002A002 +S008C002P035R002A005 +S008C003P007R001A009 +S008C003P007R001A016 +S008C003P007R001A017 +S008C003P007R001A018 +S008C003P007R001A019 +S008C003P007R001A020 +S008C003P007R001A021 +S008C003P007R001A022 +S008C003P007R001A023 +S008C003P007R001A025 +S008C003P007R001A026 +S008C003P007R001A028 +S008C003P007R001A029 +S008C003P007R002A003 +S008C003P008R002A050 +S008C003P025R002A002 +S008C003P025R002A011 +S008C003P025R002A012 +S008C003P025R002A016 +S008C003P025R002A020 +S008C003P025R002A022 +S008C003P025R002A023 +S008C003P025R002A030 +S008C003P025R002A031 +S008C003P025R002A032 +S008C003P025R002A033 +S008C003P025R002A049 +S008C003P025R002A060 +S008C003P031R001A001 +S008C003P031R002A004 +S008C003P031R002A014 +S008C003P031R002A015 +S008C003P031R002A016 +S008C003P031R002A017 +S008C003P032R002A013 +S008C003P033R002A001 +S008C003P033R002A011 +S008C003P033R002A012 +S008C003P034R002A001 +S008C003P034R002A012 +S008C003P034R002A022 +S008C003P034R002A023 +S008C003P034R002A024 +S008C003P034R002A044 +S008C003P034R002A045 +S008C003P035R002A016 +S008C003P035R002A017 +S008C003P035R002A018 +S008C003P035R002A019 +S008C003P035R002A020 +S008C003P035R002A021 +S009C002P007R001A001 +S009C002P007R001A003 +S009C002P007R001A014 +S009C002P008R001A014 +S009C002P015R002A050 +S009C002P016R001A002 +S009C002P017R001A028 +S009C002P017R001A029 +S009C003P017R002A030 +S009C003P025R002A054 +S010C001P007R002A020 +S010C002P016R002A055 +S010C002P017R001A005 +S010C002P017R001A018 +S010C002P017R001A019 +S010C002P019R001A001 +S010C002P025R001A012 +S010C003P007R002A043 +S010C003P008R002A003 +S010C003P016R001A055 +S010C003P017R002A055 +S011C001P002R001A008 +S011C001P018R002A050 +S011C002P008R002A059 +S011C002P016R002A055 +S011C002P017R001A020 +S011C002P017R001A021 +S011C002P018R002A055 +S011C002P027R001A009 +S011C002P027R001A010 +S011C002P027R001A037 +S011C003P001R001A055 +S011C003P002R001A055 +S011C003P008R002A012 +S011C003P015R001A055 +S011C003P016R001A055 +S011C003P019R001A055 +S011C003P025R001A055 +S011C003P028R002A055 +S012C001P019R001A060 +S012C001P019R002A060 +S012C002P015R001A055 +S012C002P017R002A012 +S012C002P025R001A060 +S012C003P008R001A057 +S012C003P015R001A055 +S012C003P015R002A055 
+S012C003P016R001A055 +S012C003P017R002A055 +S012C003P018R001A055 +S012C003P018R001A057 +S012C003P019R002A011 +S012C003P019R002A012 +S012C003P025R001A055 +S012C003P027R001A055 +S012C003P027R002A009 +S012C003P028R001A035 +S012C003P028R002A055 +S013C001P015R001A054 +S013C001P017R002A054 +S013C001P018R001A016 +S013C001P028R001A040 +S013C002P015R001A054 +S013C002P017R002A054 +S013C002P028R001A040 +S013C003P008R002A059 +S013C003P015R001A054 +S013C003P017R002A054 +S013C003P025R002A022 +S013C003P027R001A055 +S013C003P028R001A040 +S014C001P027R002A040 +S014C002P015R001A003 +S014C002P019R001A029 +S014C002P025R002A059 +S014C002P027R002A040 +S014C002P039R001A050 +S014C003P007R002A059 +S014C003P015R002A055 +S014C003P019R002A055 +S014C003P025R001A048 +S014C003P027R002A040 +S015C001P008R002A040 +S015C001P016R001A055 +S015C001P017R001A055 +S015C001P017R002A055 +S015C002P007R001A059 +S015C002P008R001A003 +S015C002P008R001A004 +S015C002P008R002A040 +S015C002P015R001A002 +S015C002P016R001A001 +S015C002P016R002A055 +S015C003P008R002A007 +S015C003P008R002A011 +S015C003P008R002A012 +S015C003P008R002A028 +S015C003P008R002A040 +S015C003P025R002A012 +S015C003P025R002A017 +S015C003P025R002A020 +S015C003P025R002A021 +S015C003P025R002A030 +S015C003P025R002A033 +S015C003P025R002A034 +S015C003P025R002A036 +S015C003P025R002A037 +S015C003P025R002A044 +S016C001P019R002A040 +S016C001P025R001A011 +S016C001P025R001A012 +S016C001P025R001A060 +S016C001P040R001A055 +S016C001P040R002A055 +S016C002P008R001A011 +S016C002P019R002A040 +S016C002P025R002A012 +S016C003P008R001A011 +S016C003P008R002A002 +S016C003P008R002A003 +S016C003P008R002A004 +S016C003P008R002A006 +S016C003P008R002A009 +S016C003P019R002A040 +S016C003P039R002A016 +S017C001P016R002A031 +S017C002P007R001A013 +S017C002P008R001A009 +S017C002P015R001A042 +S017C002P016R002A031 +S017C002P016R002A055 +S017C003P007R002A013 +S017C003P008R001A059 +S017C003P016R002A031 +S017C003P017R001A055 +S017C003P020R001A059 diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md new file mode 100644 index 0000000000000000000000000000000000000000..00c1814d16c37cff82526197eedca1855583d8c9 --- /dev/null +++ b/tools/data/skeleton/README.md @@ -0,0 +1,129 @@ +# Preparing Skeleton Dataset + + + +```BibTeX +@misc{duan2021revisiting, + title={Revisiting Skeleton-based Action Recognition}, + author={Haodong Duan and Yue Zhao and Kai Chen and Dian Shao and Dahua Lin and Bo Dai}, + year={2021}, + eprint={2104.13586}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Introduction + +We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. + +## Prepare Annotations + +We provide links to the pre-processed skeleton annotations, you can directly download them and use them for training & testing. 
+
+- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl
+- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl
+- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl
+- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl
+- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl
+  - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded from this [link](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl). Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project.
+- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl
+- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl
+- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl
+- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations)
+
+For Kinetics400, since the skeleton annotations are large, we do not provide direct download links on aliyun. Please use the following link to download `k400_kpfiles_2d.zip` and extract it under `$MMACTION2/data/skeleton/kpfiles` for Kinetics400 training & testing: https://openxlab.org.cn/datasets/OpenMMLab/Kinetics400-skeleton
+
+If you want to generate 2D skeleton annotations for a specific video, please install mmdetection and mmpose first, then use the following command to extract skeleton annotations from an NTURGB+D video:
+
+```shell
+python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl
+```
+
+Please note that, due to upgrades of mmpose, the inference results may differ slightly from the provided skeleton annotations.
+
+## The Format of Annotations
+
+Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations`.
+
+1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific split.
+2. Annotations: The value of the `annotations` field is a list of skeleton annotations. Each skeleton annotation is a dictionary containing the following fields:
+   1. `frame_dir` (str): The identifier of the corresponding video.
+   2. `total_frames` (int): The number of frames in this video.
+   3. `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of (height, width). Only required for 2D skeletons.
+   4. `original_shape` (tuple\[int\]): Same as `img_shape`.
+   5. `label` (int): The action label.
+   6. `keypoint` (np.ndarray, with shape \[M x T x V x C\]): The keypoint annotation. M: number of persons; T: number of frames (same as `total_frames`); V: number of keypoints (25 for NTURGB+D 3D skeletons, 17 for COCO, 18 for OpenPose, etc.); C: number of dimensions for keypoint coordinates (C=2 for 2D keypoints, C=3 for 3D keypoints).
+   7. `keypoint_score` (np.ndarray, with shape \[M x T x V\]): The confidence scores of keypoints. Only required for 2D skeletons.
+
+## Visualization
+
+For skeleton data visualization, you also need to prepare the RGB videos.
Please refer to \[visualize_heatmap_volume\] for the detailed process. Here we provide some visualization examples from NTU-60 and FineGYM:
+
+- Pose Estimation Results
+- Keypoint Heatmap Volume Visualization
+- Limb Heatmap Volume Visualization
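+
+As a quick sanity check, you can load one of the downloaded pickles and confirm that it matches the annotation format described above. The snippet below is only a sketch: it assumes `ntu60_2d.pkl` has already been downloaded to `data/skeleton/`, and the printed shapes follow the field definitions listed in the format section.
+
+```python
+# Minimal inspection sketch (assumes data/skeleton/ntu60_2d.pkl exists).
+import mmengine
+
+data = mmengine.load('data/skeleton/ntu60_2d.pkl')
+print(data['split'].keys())            # split names, e.g. xsub_train / xsub_val
+
+sample = data['annotations'][0]
+print(sample['frame_dir'], sample['label'], sample['total_frames'])
+print(sample['keypoint'].shape)        # (M, T, V, 2) for 2D skeletons
+print(sample['keypoint_score'].shape)  # (M, T, V), 2D skeletons only
+```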
+ +## Convert the NTU RGB+D raw skeleton data to our format (only applicable to GCN backbones) + +Here we also provide the script for converting the NTU RGB+D raw skeleton data to our format. +First, download the raw skeleton data of NTU-RGBD 60 and NTU-RGBD 120 from https://github.com/shahroudy/NTURGB-D. + +For NTU-RGBD 60, preprocess data and convert the data format with + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd60_skeleton_path --ignored-sample-path NTU_RGBD_samples_with_missing_skeletons.txt --out-folder your_nturgbd60_output_path --task ntu60 +``` + +For NTU-RGBD 120, preprocess data and convert the data format with + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd120_skeleton_path --ignored-sample-path NTU_RGBD120_samples_with_missing_skeletons.txt --out-folder your_nturgbd120_output_path --task ntu120 +``` + +## Convert annotations from third-party projects + +We provide scripts to convert skeleton annotations from third-party projects to MMAction2 formats: + +- BABEL: `babel2mma2.py` + +**TODO**: + +- [x] FineGYM +- [x] NTU60_XSub +- [x] NTU120_XSub +- [x] NTU60_XView +- [x] NTU120_XSet +- [x] UCF101 +- [x] HMDB51 +- [x] Kinetics diff --git a/tools/data/skeleton/README_zh-CN.md b/tools/data/skeleton/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..19973f73240d81833a3e368400160266f107e50b --- /dev/null +++ b/tools/data/skeleton/README_zh-CN.md @@ -0,0 +1,142 @@ +# ๅ‡†ๅค‡้ชจๆžถๆ•ฐๆฎ้›† + +```BibTeX +@misc{duan2021revisiting, + title={Revisiting Skeleton-based Action Recognition}, + author={Haodong Duan and Yue Zhao and Kai Chen and Dian Shao and Dahua Lin and Bo Dai}, + year={2021}, + eprint={2104.13586}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## ็ฎ€ไป‹ + +MMAction2 ๅ‘ๅธƒ [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586) ่ฎบๆ–‡ไธญๆ‰€ไฝฟ็”จ็š„้ชจๆžถๆ ‡ๆณจใ€‚ +้ป˜่ฎคไฝฟ็”จ [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) ไฝœไธบไบบไฝ“ๆฃ€ๆต‹ๅ™จ๏ผŒ +ไฝฟ็”จ [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/top_down/hrnet/coco/hrnet_w32_coco_256x192.py) ไฝœไธบๅ•ไบบๅงฟๆ€ไผฐ่ฎกๆจกๅž‹ใ€‚ +ๅฏนไบŽ FineGYM ๆ•ฐๆฎ้›†๏ผŒMMAction2 ไฝฟ็”จ็š„ๆ˜ฏ่ฟๅŠจๅ‘˜็š„็œŸๅฎžๆก†ๆ ‡ๆณจ๏ผŒ่€Œ้žๆฃ€ๆต‹ๅ™จๆ‰€ๅ‡บ็š„ๆก†ใ€‚็›ฎๅ‰๏ผŒMMAction2 ๅทฒๅ‘ๅธƒ FineGYM ๅ’Œ NTURGB-D Xsub ้ƒจๅˆ†็š„้ชจๆžถๆ ‡ๆณจ๏ผŒๅ…ถไป–ๆ•ฐๆฎ้›†็š„ๆ ‡ๆณจไนŸๅฐ†ๅพˆๅฟซๅ‘ๅธƒใ€‚ + +## ๅ‡†ๅค‡ๆ ‡ๆณจๆ–‡ไปถ + +็›ฎๅ‰๏ผŒMMAction2 ๆ”ฏๆŒ HMDB51, UCF101, FineGYM ๅ’Œ NTURGB+D ๆ•ฐๆฎ้›†ใ€‚ๅฏนไบŽ FineGYM ๆ•ฐๆฎ้›†๏ผŒ็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹่„šๆœฌไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ + +```shell +bash download_annotations.sh ${DATASET} +``` + +็”ฑไบŽ NTURGB+D ๆ•ฐๆฎ้›†็š„ [ไฝฟ็”จๆกไพ‹](http://rose1.ntu.edu.sg/Datasets/actionRecognition.asp)๏ผŒMMAction2 ๅนถๆœช็›ดๆŽฅๅ‘ๅธƒๅฎž้ชŒไธญๆ‰€ไฝฟ็”จ็š„ๆ ‡ๆณจๆ–‡ไปถใ€‚ +ๅ› ๆญค๏ผŒ่ฟ™้‡Œๆไพ›็”Ÿๆˆ NTURGB+D ๆ•ฐๆฎ้›†ไธญ่ง†้ข‘็š„ๅงฟๆ€ๆ ‡ๆณจๆ–‡ไปถ๏ผŒ่ฟ™ๅฐ†็”Ÿๆˆไธ€ไธช dict ๆ•ฐๆฎๅนถๅฐ†ๅ…ถไฟๅญ˜ไธบไธ€ไธช pickle ๆ–‡ไปถใ€‚ +็”จๆˆทๅฏไปฅ็”Ÿๆˆไธ€ไธช list ็”จไปฅๅŒ…ๅซๅฏนๅบ”่ง†้ข‘็š„ dict ๆ•ฐๆฎ๏ผŒๅนถๅฐ†ๅ…ถไฟๅญ˜ไธบไธ€ไธช pickle ๆ–‡ไปถใ€‚ +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅ่Žทๅพ— `ntu60_xsub_train.pkl`, `ntu60_xsub_val.pkl`, `ntu120_xsub_train.pkl`, `ntu120_xsub_val.pkl` ๆ–‡ไปถ็”จไบŽ่ฎญ็ปƒใ€‚ + +ๅฏนไบŽๆ— ๆณ•่ฟ›่กŒๅงฟๆ€ๆๅ–็š„็”จๆˆท๏ผŒ่ฟ™้‡Œๆไพ›ไบ†ไธŠ่ฟฐๆต็จ‹็š„่พ“ๅ‡บ็ป“ๆžœ๏ผŒๅˆ†ๅˆซๅฏนๅบ” NTURGB-D ๆ•ฐๆฎ้›†็š„ 4 ไธช้ƒจๅˆ†๏ผš + +- NTURGB+D \[2D Skeleton\]: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D ๅงฟๆ€ๆ ‡ๆณจๆ–‡ไปถๆ˜ฏๅŸบไบŽ่ฟๅŠจๅ‘˜็š„็œŸๅฎžๆ ‡ๆณจๆก†็”Ÿๆˆ็š„๏ผŒ็”จๆˆทๅฏไปฅไปŽ่ฟ™ไธช[้“พๆŽฅ](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl)ไธ‹่ฝฝ็œŸๅฎžๆ ‡ๆณจๆก†ใ€‚ๅฆ‚ๆžœไฝ ๅœจ้กน็›ฎไธญไฝฟ็”จไบ†่ฏฅๆ•ฐๆฎ๏ผŒ่ฏทๅผ•็”จ [PoseConv3D](https://arxiv.org/abs/2104.13586) +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (ๅชๅŒ…ๅซๆ•ฐๆฎๅˆ—่กจ๏ผŒๆฒกๆœ‰ๅงฟๆ€ๆ ‡ๆณจๆ–‡ไปถ) + +็”ฑไบŽ Kinetics400 ๆ•ฐๆฎ้›†ๅงฟๆ€ๆ ‡ๆณจๆ–‡ไปถ่ฟ‡ๅคง๏ผŒๆˆ‘ไปฌไธๆไพ›้˜ฟ้‡Œไบ‘็š„ไธ‹่ฝฝ้“พๆŽฅ๏ผŒ่ฏทไฝฟ็”จๆญค[้“พๆŽฅ](https://openxlab.org.cn/datasets/OpenMMLab/Kinetics400-skeleton)ไธ‹่ฝฝ `k400_kpfiles_2d.zip`๏ผŒ่งฃๅŽ‹ๅˆฐ `$MMACTION2/data/skeleton/kpfiles` ็›ฎๅฝ•ไธ‹๏ผŒ็”จไบŽ Kinetics400 ็š„่ฎญ็ปƒๅ’Œๆต‹่ฏ•ใ€‚ + +่‹ฅๆƒณ็”Ÿๆˆๅ•ไธช่ง†้ข‘็š„ 2D ๅงฟๆ€ๆ ‡ๆณจๆ–‡ไปถ๏ผŒ็”จๆˆทๅœจๅฎ‰่ฃ… mmdetection ๅ’Œ mmpose ไน‹ๅŽ๏ผŒๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌ่ฟ›่กŒ NTURGB+D ่ง†้ข‘็š„ๅงฟๆ€ๆๅ–๏ผš + +```python +python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl +``` + +่ฏทๆณจๆ„๏ผŒ็”ฑไบŽ mmpose ็ฎ—ๆณ•ๅบ“ๅ‡็บง๏ผŒๆญค่„šๆœฌ็š„ๆŽจ็†็ป“ๆžœไธŽๆไพ›็š„ๅงฟๆ€็‚นๆ•ฐๆฎ้›†ๅฏ่ƒฝ็•ฅๆœ‰ๅทฎๅผ‚ใ€‚ + +ๅœจ็”จๆˆท่Žทๅพ—ๆ•ฐๆฎ้›†ๆŸ้ƒจๅˆ†ๆ‰€ๆœ‰่ง†้ข‘็š„ๅงฟๆ€ๆ ‡ๆณจๆ–‡ไปถ๏ผˆๅฆ‚ `ntu60_xsub_val`๏ผ‰ๅŽ๏ผŒๅฏไปฅๅฐ†ๅ…ถ้›†ๅˆๆˆไธ€ไธช list ๆ•ฐๆฎๅนถไฟๅญ˜ไธบ `ntu60_xsub_val.pkl`ใ€‚็”จๆˆทๅฏ็”จ่ฟ™ไบ›ๅคงๅž‹ pickle ๆ–‡ไปถ่ฟ›่กŒ่ฎญ็ปƒๅ’Œๆต‹่ฏ•ใ€‚ + +## PoseC3D ็š„ๆ ‡ๆณจๆ–‡ไปถๆ ผๅผ + +่ฟ™้‡Œ็ฎ€ๅ•ไป‹็ป PoseC3D ็š„ๆ ‡ๆณจๆ–‡ไปถๆ ผๅผใ€‚ไปฅ `gym_train.pkl` ไธบไพ‹๏ผš`gym_train.pkl` ๅญ˜ๅ‚จไธ€ไธช้•ฟๅบฆไธบ 20484 ็š„ list๏ผŒlist ็š„ๆฏไธ€้กนไธบๅ•ไธช่ง†้ข‘็š„้ชจๆžถๆ ‡ๆณจ dictใ€‚ๆฏไธช dict ็š„ๅ†…ๅฎนๅฆ‚ไธ‹๏ผš + +- keypoint๏ผšๅ…ณ้”ฎ็‚นๅๆ ‡๏ผŒๅคงๅฐไธบ N๏ผˆ#ไบบๆ•ฐ๏ผ‰x T๏ผˆๆ—ถๅบ้•ฟๅบฆ๏ผ‰x K๏ผˆ#ๅ…ณ้”ฎ็‚น, ่ฟ™้‡Œไธบ17๏ผ‰x 2 ๏ผˆx๏ผŒy ๅๆ ‡๏ผ‰็š„ numpy array ๆ•ฐๆฎ็ฑปๅž‹ +- keypoint_score๏ผšๅ…ณ้”ฎ็‚น็š„็ฝฎไฟกๅˆ†ๆ•ฐ๏ผŒๅคงๅฐไธบ N๏ผˆ#ไบบๆ•ฐ๏ผ‰x T๏ผˆๆ—ถๅบ้•ฟๅบฆ๏ผ‰x K๏ผˆ#ๅ…ณ้”ฎ็‚น, ่ฟ™้‡Œไธบ17๏ผ‰็š„ numpy array ๆ•ฐๆฎ็ฑปๅž‹ +- frame_dir: ๅฏนๅบ”่ง†้ข‘ๅ +- label: ๅŠจไฝœ็ฑปๅˆซ +- img_shape: ๆฏไธ€ๅธงๅ›พๅƒ็š„ๅคงๅฐ +- original_shape: ๅŒ `img_shape` +- total_frames: ่ง†้ข‘ๆ—ถๅบ้•ฟๅบฆ + +ๅฆ‚็”จๆˆทๆƒณไฝฟ็”จ่‡ชๅทฑ็š„ๆ•ฐๆฎ้›†่ฎญ็ปƒ PoseC3D๏ผŒๅฏไปฅๅ‚่€ƒ [Custom Dataset Training](https://github.com/open-mmlab/mmaction2/blob/master/configs/skeleton/posec3d/custom_dataset_training.md)ใ€‚ + +## ๅฏ่ง†ๅŒ– + +ไธบไบ†ๅฏ่ง†ๅŒ–้ชจๆžถๆ•ฐๆฎ๏ผŒ็”จๆˆท้œ€่ฆๅ‡†ๅค‡ RGB ็š„่ง†้ข‘ใ€‚่ฏฆๆƒ…ๅฏๅ‚่€ƒ \[visualize_heatmap_volume\]ใ€‚่ฟ™้‡Œๆไพ›ไธ€ไบ› NTU-60 ๅ’Œ FineGYM ไธŠ็š„ไพ‹ๅญ + + + + + + + + + +
+- ๅงฟๆ€ไผฐ่ฎก็ป“ๆžœ
+- ๅ…ณ้”ฎ็‚น็ƒญๅŠ›ๅ›พไธ‰็ปดๅฏ่ง†ๅŒ–
+- ่‚ขไฝ“็ƒญๅŠ›ๅ›พไธ‰็ปดๅฏ่ง†ๅŒ–
+ +## ๅฆ‚ไฝ•ๅฐ† NTU RGB+D ๅŽŸๅง‹ๆ•ฐๆฎ่ฝฌๅŒ–ไธบ MMAction2 ๆ ผๅผ ๏ผˆ่ฝฌๆขๅฅฝ็š„ๆ ‡ๆณจๆ–‡ไปถ็›ฎๅ‰ไป…้€‚็”จไบŽ GCN ๆจกๅž‹๏ผ‰ + +่ฟ™้‡Œไป‹็ปๅฆ‚ไฝ•ๅฐ† NTU RGB+D ๅŽŸๅง‹ๆ•ฐๆฎ่ฝฌๅŒ–ไธบ MMAction2 ๆ ผๅผใ€‚้ฆ–ๅ…ˆ๏ผŒ้œ€่ฆไปŽ https://github.com/shahroudy/NTURGB-D ไธ‹่ฝฝๅŽŸๅง‹ NTU-RGBD 60 ๅ’Œ NTU-RGBD 120 ๆ•ฐๆฎ้›†็š„ๅŽŸๅง‹้ชจๆžถๆ•ฐๆฎใ€‚ + +ๅฏนไบŽ NTU-RGBD 60 ๆ•ฐๆฎ้›†๏ผŒๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌ + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd60_skeleton_path --ignored-sample-path NTU_RGBD_samples_with_missing_skeletons.txt --out-folder your_nturgbd60_output_path --task ntu60 +``` + +ๅฏนไบŽ NTU-RGBD 120 ๆ•ฐๆฎ้›†๏ผŒๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌ + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd120_skeleton_path --ignored-sample-path NTU_RGBD120_samples_with_missing_skeletons.txt --out-folder your_nturgbd120_output_path --task ntu120 +``` + +## ่ฝฌๆขๅ…ถไป–็ฌฌไธ‰ๆ–น้กน็›ฎ็š„้ชจ้ชผๆ ‡ๆณจ + +MMAction2 ๆไพ›่„šๆœฌไปฅๅฐ†ๅ…ถไป–็ฌฌไธ‰ๆ–น้กน็›ฎ็š„้ชจ้ชผๆ ‡ๆณจ่ฝฌ่‡ณ MMAction2 ๆ ผๅผ๏ผŒๅฆ‚๏ผš + +- BABEL: `babel2mma2.py` + +**ๅพ…ๅŠž้กน**๏ผš + +- [x] FineGYM +- [x] NTU60_XSub +- [x] NTU120_XSub +- [x] NTU60_XView +- [x] NTU120_XSet +- [x] UCF101 +- [x] HMDB51 +- [x] Kinetics diff --git a/tools/data/skeleton/S001C001P001R001A001_rgb.avi b/tools/data/skeleton/S001C001P001R001A001_rgb.avi new file mode 100644 index 0000000000000000000000000000000000000000..62b6258288262fc35a36ea5f344fda03f7d5d044 --- /dev/null +++ b/tools/data/skeleton/S001C001P001R001A001_rgb.avi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cea2b398b79fc619d545a042cd0bce8ace9aa4d482b8c1e30482311f204c4a +size 987146 diff --git a/tools/data/skeleton/babel2mma2.py b/tools/data/skeleton/babel2mma2.py new file mode 100644 index 0000000000000000000000000000000000000000..67bcf6e30a3e45d2985dd901c92ea0a059d598e3 --- /dev/null +++ b/tools/data/skeleton/babel2mma2.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# In this example, we convert babel120_train to MMAction2 format +# The required files can be downloaded from the homepage of BABEL project +import numpy as np +from mmcv import dump, load + + +def gen_babel(x, y): + data = [] + for i, xx in enumerate(x): + sample = dict() + sample['keypoint'] = xx.transpose(3, 1, 2, 0).astype(np.float16) + sample['label'] = y[1][0][i] + names = [y[0][i], y[1][1][i], y[1][2][i], y[1][3][i]] + sample['frame_dir'] = '_'.join([str(k) for k in names]) + sample['total_frames'] = 150 + data.append(sample) + return data + + +x = np.load('train_ntu_sk_120.npy') +y = load('train_label_120.pkl') + +data = gen_babel(x, y) +dump(data, 'babel120_train.pkl') diff --git a/tools/data/skeleton/compress_nturgbd.py b/tools/data/skeleton/compress_nturgbd.py new file mode 100644 index 0000000000000000000000000000000000000000..e021f476777c7ccb1e722842e83bf0bef28b31b2 --- /dev/null +++ b/tools/data/skeleton/compress_nturgbd.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
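+# Helper script to compress the NTU RGB+D raw videos: it reads every .avi
+# file under data/nturgbd_raw/, rescales the shorter side to 540 pixels with
+# ffmpeg (libx264), and writes the result as .mp4 files under
+# data/nturgbd_videos/, using 32 worker processes.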
+import multiprocessing as mp +import os +import os.path as osp +import subprocess + + +def get_shape(vid): + cmd = 'ffprobe -v error -select_streams v:0 -show_entries ' \ + 'stream=width,height -of csv=s=x:p=0 \"{}\"'.format(vid) + w, h = subprocess.check_output(cmd, shell=True).decode('utf-8').split('x') + return int(w), int(h) + + +def compress(src, dest, shape=None, target_size=540, fps=-1): + if shape is None: + shape = get_shape(src) + w, h = shape + scale_str = f'-vf scale=-2:{target_size}' if w >= h else \ + f'-vf scale={target_size}:-2' + fps_str = f'-r {fps}' if fps > 0 else '' + quality_str = '-q:v 1' + vcodec_str = '-c:v libx264' + cmd = f'ffmpeg -y -loglevel error -i {src} -threads 1 ' \ + f'{quality_str} {scale_str} {fps_str} {vcodec_str} {dest}' + os.system(cmd) + + +def compress_nturgbd(name): + src = name + dest = src.replace('nturgbd_raw', + 'nturgbd_videos').replace('_rgb.avi', '.mp4') + shape = (1920, 1080) + compress(src, dest, shape) + + +src_dir = 'data/nturgbd_raw' +tgt_dir = 'data/nturgbd_videos' +os.makedirs(tgt_dir, exist_ok=True) +files = [osp.join(src_dir, x) for x in os.listdir(src_dir) if '.avi' in x] +pool = mp.Pool(32) +pool.map(compress_nturgbd, files) diff --git a/tools/data/skeleton/gen_ntu_rgbd_raw.py b/tools/data/skeleton/gen_ntu_rgbd_raw.py new file mode 100644 index 0000000000000000000000000000000000000000..57d6e35b110480a2aaf6af00e0660548b7465d6b --- /dev/null +++ b/tools/data/skeleton/gen_ntu_rgbd_raw.py @@ -0,0 +1,217 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp +from typing import Dict, List, Optional, Tuple + +import mmengine +import numpy as np + +training_subjects_60 = [ + 1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34, 35, 38 +] +training_cameras_60 = [2, 3] +training_subjects_120 = [ + 1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34, 35, 38, + 45, 46, 47, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 70, 74, 78, 80, 81, 82, + 83, 84, 85, 86, 89, 91, 92, 93, 94, 95, 97, 98, 100, 103 +] +training_setups_120 = [ + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 +] +max_body_true = 2 +max_body_kinect = 4 +num_joint = 25 +max_frame = 300 + + +def read_skeleton_filter(file: str) -> Dict: + with open(file, 'r') as f: + skeleton_sequence = {'num_frame': int(f.readline()), 'frameInfo': []} + + for t in range(skeleton_sequence['num_frame']): + frame_info = {'numBody': int(f.readline()), 'bodyInfo': []} + + for m in range(frame_info['numBody']): + body_info_key = [ + 'bodyID', 'clipedEdges', 'handLeftConfidence', + 'handLeftState', 'handRightConfidence', 'handRightState', + 'isResticted', 'leanX', 'leanY', 'trackingState' + ] + body_info = { + k: float(v) + for k, v in zip(body_info_key, + f.readline().split()) + } + body_info['numJoint'] = int(f.readline()) + body_info['jointInfo'] = [] + for v in range(body_info['numJoint']): + joint_info_key = [ + 'x', 'y', 'z', 'depthX', 'depthY', 'colorX', 'colorY', + 'orientationW', 'orientationX', 'orientationY', + 'orientationZ', 'trackingState' + ] + joint_info = { + k: float(v) + for k, v in zip(joint_info_key, + f.readline().split()) + } + body_info['jointInfo'].append(joint_info) + frame_info['bodyInfo'].append(body_info) + skeleton_sequence['frameInfo'].append(frame_info) + + return skeleton_sequence + + +def get_nonzero_std(s: np.ndarray) -> float: # T V C + index = s.sum(-1).sum(-1) != 0 + s = s[index] + if len(s) != 0: + s = s[:, :, 0].std() + \ + s[:, :, 1].std() + \ + s[:, :, 2].std() # three channels + 
else: + s = 0 + return s + + +def read_xyz(file: str, max_body: int = 4, num_joint: int = 25) -> np.ndarray: + seq_info = read_skeleton_filter(file) + data = np.zeros((max_body, seq_info['num_frame'], num_joint, 3)) + for n, f in enumerate(seq_info['frameInfo']): + for m, b in enumerate(f['bodyInfo']): + for j, v in enumerate(b['jointInfo']): + if m < max_body and j < num_joint: + data[m, n, j, :] = [v['x'], v['y'], v['z']] + else: + pass + + # select two max energy body + energy = np.array([get_nonzero_std(x) for x in data]) + index = energy.argsort()[::-1][0:max_body_true] + data = data[index] + + # filter padding body + data = data[data.sum((1, 2, 3)) != 0] + return data + + +def get_names_and_labels(data_path: str, + task: str, + benchmark: str, + ignored_samples: Optional[List[str]] = None) -> Tuple: + train_names = [] + train_labels = [] + val_names = [] + val_labels = [] + + for filename in os.listdir(data_path): + if ignored_samples is not None and filename in ignored_samples: + continue + + setup_number = int(filename[filename.find('S') + 1:filename.find('S') + + 4]) + action_class = int(filename[filename.find('A') + 1:filename.find('A') + + 4]) + subject_id = int(filename[filename.find('P') + 1:filename.find('P') + + 4]) + camera_id = int(filename[filename.find('C') + 1:filename.find('C') + + 4]) + + if benchmark == 'xsub': + if task == 'ntu60': + istraining = (subject_id in training_subjects_60) + else: + istraining = (subject_id in training_subjects_120) + elif benchmark == 'xview': + istraining = (camera_id in training_cameras_60) + elif benchmark == 'xset': + istraining = (setup_number in training_setups_120) + else: + raise ValueError() + + if istraining: + train_names.append(filename) + train_labels.append(action_class - 1) + else: + val_names.append(filename) + val_labels.append(action_class - 1) + + return train_names, train_labels, val_names, val_labels + + +def gendata(data_path: str, + out_path: str, + ignored_sample_path: Optional[str] = None, + task: str = 'ntu60') -> None: + split = dict() + + if ignored_sample_path is not None: + with open(ignored_sample_path, 'r') as f: + ignored_samples = [ + line.strip() + '.skeleton' for line in f.readlines() + ] + else: + ignored_samples = [] + + if task == 'ntu60': + benchmarks = ['xsub', 'xview'] + else: + benchmarks = ['xsub', 'xset'] + + names = None + labels = None + for benchmark in benchmarks: + train_names, train_labels, val_names, val_labels = \ + get_names_and_labels(data_path, task, benchmark, ignored_samples) + split[f'{benchmark}_train'] = [osp.splitext(s)[0] for s in train_names] + split[f'{benchmark}_val'] = [osp.splitext(s)[0] for s in val_names] + + if names is None and labels is None: + names = train_names + val_names + labels = train_labels + val_labels + + results = [] + + prog_bar = mmengine.ProgressBar(len(names)) + for i, s in enumerate(names): + ske = read_xyz( + osp.join(data_path, s), + max_body=max_body_kinect, + num_joint=num_joint).astype(np.float16) + + anno = dict() + anno['frame_dir'] = osp.splitext(s)[0] + anno['label'] = labels[i] + anno['keypoint'] = ske + anno['total_frames'] = ske.shape[1] + results.append(anno) + prog_bar.update() + + annotations = {'split': split, 'annotations': results} + mmengine.dump(annotations, f'{out_path}/{task}_3d.pkl') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate Pose Annotation for NTURGB-D raw skeleton data') + parser.add_argument( + '--data-path', + type=str, + help='raw skeleton data path', + 
default='../../../data/ntu60/nturgb+d_skeletons/') + parser.add_argument( + '--ignored-sample-path', + type=str, + default='NTU_RGBD_samples_with_missing_skeletons.txt') + parser.add_argument( + '--out-folder', type=str, default='../../../data/skeleton/') + parser.add_argument('--task', type=str, default='ntu60') + args = parser.parse_args() + + assert args.task in ['ntu60', 'ntu120'] + + mmengine.mkdir_or_exist(args.out_folder) + + gendata(args.data_path, args.out_folder, args.ignored_sample_path, + args.task) diff --git a/tools/data/skeleton/label_map_gym99.txt b/tools/data/skeleton/label_map_gym99.txt new file mode 100644 index 0000000000000000000000000000000000000000..79f4010816b30047a74b1b71786c7d94770ebdbb --- /dev/null +++ b/tools/data/skeleton/label_map_gym99.txt @@ -0,0 +1,99 @@ +(VT) round-off, flic-flac with 0.5 turn on, stretched salto forward with 0.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2.5 turn off +(VT) round-off, flic-flac on, stretched salto backward off +(FX) switch leap with 0.5 turn +(FX) switch leap with 1 turn +(FX) split leap with 1 turn +(FX) split leap with 1.5 turn or more +(FX) switch leap (leap forward with leg change to cross split) +(FX) split jump with 1 turn +(FX) split jump (leg separation 180 degree parallel to the floor) +(FX) johnson with additional 0.5 turn +(FX) straddle pike or side split jump with 1 turn +(FX) switch leap to ring position +(FX) stag jump +(FX) 2 turn with free leg held upward in 180 split position throughout turn +(FX) 2 turn in tuck stand on one leg, free leg straight throughout turn +(FX) 3 turn on one leg, free leg optional below horizontal +(FX) 2 turn on one leg, free leg optional below horizontal +(FX) 1 turn on one leg, free leg optional below horizontal +(FX) 2 turn or more with heel of free leg forward at horizontal throughout turn +(FX) 1 turn with heel of free leg forward at horizontal throughout turn +(FX) arabian double salto tucked +(FX) salto forward tucked +(FX) aerial walkover forward +(FX) salto forward stretched with 2 twist +(FX) salto forward stretched with 1 twist +(FX) salto forward stretched with 1.5 twist +(FX) salto forward stretched, feet land together +(FX) double salto backward stretched +(FX) salto backward stretched with 3 twist +(FX) salto backward stretched with 2 twist +(FX) salto backward stretched with 2.5 twist +(FX) salto backward stretched with 1.5 twist +(FX) double salto backward tucked with 2 twist +(FX) double salto backward tucked with 1 twist +(FX) double salto backward tucked +(FX) double salto backward piked with 1 twist +(FX) double salto backward piked +(BB) sissone (leg separation 180 degree on the diagonal to the floor, take off two feet, land on one foot) +(BB) split jump with 0.5 turn in side position +(BB) split jump +(BB) straddle pike jump or side split jump +(BB) split ring jump (ring jump with front leg horizontal to the floor) +(BB) switch leap with 0.5 turn +(BB) switch leap (leap forward with leg change) +(BB) split leap forward +(BB) johnson (leap forward with leg change and 0.25 turn to side split or straddle pike position) +(BB) switch leap to ring position +(BB) sheep jump (jump with upper back arch and head release with feet to head height/closed Ring) +(BB) wolf hop or jump (hip angle at 45, knees together) +(BB) 1 turn 
with heel of free leg forward at horizontal throughout turn +(BB) 2 turn on one leg, free leg optional below horizontal +(BB) 1 turn on one leg, free leg optional below horizontal +(BB) 2 turn in tuck stand on one leg, free leg optional +(BB) salto backward tucked with 1 twist +(BB) salto backward tucked +(BB) salto backward stretched-step out (feet land successively) +(BB) salto backward stretched with legs together +(BB) salto sideward tucked, take off from one leg to side stand +(BB) free aerial cartwheel landing in cross position +(BB) salto forward tucked to cross stand +(BB) free aerial walkover forward, landing on one or both feet +(BB) jump backward, flic-flac take-off with 0.5 twist through handstand to walkover forward, also with support on one arm +(BB) flic-flac to land on both feet +(BB) flic-flac with step-out, also with support on one arm +(BB) round-off +(BB) double salto backward tucked +(BB) salto backward tucked +(BB) double salto backward piked +(BB) salto backward stretched with 2 twist +(BB) salto backward stretched with 2.5 twist +(UB) pike sole circle backward with 1 turn to handstand +(UB) pike sole circle backward with 0.5 turn to handstand +(UB) pike sole circle backward to handstand +(UB) giant circle backward with 1 turn to handstand +(UB) giant circle backward with 0.5 turn to handstand +(UB) giant circle backward +(UB) giant circle forward with 1 turn on one arm before handstand phase +(UB) giant circle forward with 0.5 turn to handstand +(UB) giant circle forward +(UB) clear hip circle backward to handstand +(UB) clear pike circle backward with 1 turn to handstand +(UB) clear pike circle backward with 0.5 turn to handstand +(UB) clear pike circle backward to handstand +(UB) stalder backward with 1 turn to handstand +(UB) stalder backward to handstand +(UB) counter straddle over high bar to hang +(UB) counter piked over high bar to hang +(UB) (swing backward or front support) salto forward straddled to hang on high bar +(UB) (swing backward) salto forward piked to hang on high bar +(UB) (swing forward or hip circle backward) salto backward with 0.5 turn piked to hang on high bar +(UB) transition flight from high bar to low bar +(UB) transition flight from low bar to high bar +(UB) (swing forward) double salto backward tucked with 1 turn +(UB) (swing backward) double salto forward tucked +(UB) (swing forward) double salto backward stretched diff --git a/tools/data/skeleton/label_map_ntu60.txt b/tools/data/skeleton/label_map_ntu60.txt new file mode 100644 index 0000000000000000000000000000000000000000..a39bae255609734f7c77bb053179c40fcfbef75d --- /dev/null +++ b/tools/data/skeleton/label_map_ntu60.txt @@ -0,0 +1,60 @@ +drink water +eat meal/snack +brushing teeth +brushing hair +drop +pickup +throw +sitting down +standing up (from sitting position) +clapping +reading +writing +tear up paper +wear jacket +take off jacket +wear a shoe +take off a shoe +wear on glasses +take off glasses +put on a hat/cap +take off a hat/cap +cheer up +hand waving +kicking something +reach into pocket +hopping (one foot jumping) +jump up +make a phone call/answer phone +playing with phone/tablet +typing on a keyboard +pointing to something with finger +taking a selfie +check time (from watch) +rub two hands together +nod head/bow +shake head +wipe face +salute +put the palms together +cross hands in front (say stop) +sneeze/cough +staggering +falling +touch head (headache) +touch chest (stomachache/heart pain) +touch back (backache) +touch neck (neckache) +nausea or vomiting 
condition +use a fan (with hand or paper)/feeling warm +punching/slapping other person +kicking other person +pushing other person +pat on back of other person +point finger at the other person +hugging other person +giving something to other person +touch other person's pocket +handshaking +walking towards each other +walking apart from each other diff --git a/tools/data/skeleton/ntu_pose_extraction.py b/tools/data/skeleton/ntu_pose_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee57874903951eeffd24bc64a77b70e594d3dab --- /dev/null +++ b/tools/data/skeleton/ntu_pose_extraction.py @@ -0,0 +1,305 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import abc +import argparse +import os.path as osp +from collections import defaultdict +from tempfile import TemporaryDirectory + +import mmengine +import numpy as np + +from mmaction.apis import detection_inference, pose_inference +from mmaction.utils import frame_extract + +args = abc.abstractproperty() +args.det_config = 'demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py' # noqa: E501 +args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 +args.det_score_thr = 0.5 +args.pose_config = 'demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' # noqa: E501 +args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501 + + +def intersection(b0, b1): + l, r = max(b0[0], b1[0]), min(b0[2], b1[2]) + u, d = max(b0[1], b1[1]), min(b0[3], b1[3]) + return max(0, r - l) * max(0, d - u) + + +def iou(b0, b1): + i = intersection(b0, b1) + u = area(b0) + area(b1) - i + return i / u + + +def area(b): + return (b[2] - b[0]) * (b[3] - b[1]) + + +def removedup(bbox): + + def inside(box0, box1, threshold=0.8): + return intersection(box0, box1) / area(box0) > threshold + + num_bboxes = bbox.shape[0] + if num_bboxes == 1 or num_bboxes == 0: + return bbox + valid = [] + for i in range(num_bboxes): + flag = True + for j in range(num_bboxes): + if i != j and inside(bbox[i], + bbox[j]) and bbox[i][4] <= bbox[j][4]: + flag = False + break + if flag: + valid.append(i) + return bbox[valid] + + +def is_easy_example(det_results, num_person): + threshold = 0.95 + + def thre_bbox(bboxes, threshold=threshold): + shape = [sum(bbox[:, -1] > threshold) for bbox in bboxes] + ret = np.all(np.array(shape) == shape[0]) + return shape[0] if ret else -1 + + if thre_bbox(det_results) == num_person: + det_results = [x[x[..., -1] > 0.95] for x in det_results] + return True, np.stack(det_results) + return False, thre_bbox(det_results) + + +def bbox2tracklet(bbox): + iou_thre = 0.6 + tracklet_id = -1 + tracklet_st_frame = {} + tracklets = defaultdict(list) + for t, box in enumerate(bbox): + for idx in range(box.shape[0]): + matched = False + for tlet_id in range(tracklet_id, -1, -1): + cond1 = iou(tracklets[tlet_id][-1][-1], box[idx]) >= iou_thre + cond2 = ( + t - tracklet_st_frame[tlet_id] - len(tracklets[tlet_id]) < + 10) + cond3 = tracklets[tlet_id][-1][0] != t + if cond1 and cond2 and cond3: + matched = True + tracklets[tlet_id].append((t, box[idx])) + break + if not matched: + tracklet_id += 1 + tracklet_st_frame[tracklet_id] = t + tracklets[tracklet_id].append((t, box[idx])) + return tracklets + + +def drop_tracklet(tracklet): + tracklet = {k: v for k, v in tracklet.items() if len(v) > 5} + + def 
meanarea(track): + boxes = np.stack([x[1] for x in track]).astype(np.float32) + areas = (boxes[..., 2] - boxes[..., 0]) * ( + boxes[..., 3] - boxes[..., 1]) + return np.mean(areas) + + tracklet = {k: v for k, v in tracklet.items() if meanarea(v) > 5000} + return tracklet + + +def distance_tracklet(tracklet): + dists = {} + for k, v in tracklet.items(): + bboxes = np.stack([x[1] for x in v]) + c_x = (bboxes[..., 2] + bboxes[..., 0]) / 2. + c_y = (bboxes[..., 3] + bboxes[..., 1]) / 2. + c_x -= 480 + c_y -= 270 + c = np.concatenate([c_x[..., None], c_y[..., None]], axis=1) + dist = np.linalg.norm(c, axis=1) + dists[k] = np.mean(dist) + return dists + + +def tracklet2bbox(track, num_frame): + # assign_prev + bbox = np.zeros((num_frame, 5)) + trackd = {} + for k, v in track: + bbox[k] = v + trackd[k] = v + for i in range(num_frame): + if bbox[i][-1] <= 0.5: + mind = np.Inf + for k in trackd: + if np.abs(k - i) < mind: + mind = np.abs(k - i) + bbox[i] = bbox[k] + return bbox + + +def tracklets2bbox(tracklet, num_frame): + dists = distance_tracklet(tracklet) + sorted_inds = sorted(dists, key=lambda x: dists[x]) + dist_thre = np.Inf + for i in sorted_inds: + if len(tracklet[i]) >= num_frame / 2: + dist_thre = 2 * dists[i] + break + + dist_thre = max(50, dist_thre) + + bbox = np.zeros((num_frame, 5)) + bboxd = {} + for idx in sorted_inds: + if dists[idx] < dist_thre: + for k, v in tracklet[idx]: + if bbox[k][-1] < 0.01: + bbox[k] = v + bboxd[k] = v + bad = 0 + for idx in range(num_frame): + if bbox[idx][-1] < 0.01: + bad += 1 + mind = np.Inf + mink = None + for k in bboxd: + if np.abs(k - idx) < mind: + mind = np.abs(k - idx) + mink = k + bbox[idx] = bboxd[mink] + return bad, bbox[:, None, :] + + +def bboxes2bbox(bbox, num_frame): + ret = np.zeros((num_frame, 2, 5)) + for t, item in enumerate(bbox): + if item.shape[0] <= 2: + ret[t, :item.shape[0]] = item + else: + inds = sorted( + list(range(item.shape[0])), key=lambda x: -item[x, -1]) + ret[t] = item[inds[:2]] + for t in range(num_frame): + if ret[t, 0, -1] <= 0.01: + ret[t] = ret[t - 1] + elif ret[t, 1, -1] <= 0.01: + if t: + if ret[t - 1, 0, -1] > 0.01 and ret[t - 1, 1, -1] > 0.01: + if iou(ret[t, 0], ret[t - 1, 0]) > iou( + ret[t, 0], ret[t - 1, 1]): + ret[t, 1] = ret[t - 1, 1] + else: + ret[t, 1] = ret[t - 1, 0] + return ret + + +def ntu_det_postproc(vid, det_results): + det_results = [removedup(x) for x in det_results] + label = int(vid.split('/')[-1].split('A')[1][:3]) + mpaction = list(range(50, 61)) + list(range(106, 121)) + n_person = 2 if label in mpaction else 1 + is_easy, bboxes = is_easy_example(det_results, n_person) + if is_easy: + print('\nEasy Example') + return bboxes + + tracklets = bbox2tracklet(det_results) + tracklets = drop_tracklet(tracklets) + + print(f'\nHard {n_person}-person Example, found {len(tracklets)} tracklet') + if n_person == 1: + if len(tracklets) == 1: + tracklet = list(tracklets.values())[0] + det_results = tracklet2bbox(tracklet, len(det_results)) + return np.stack(det_results) + else: + bad, det_results = tracklets2bbox(tracklets, len(det_results)) + return det_results + # n_person is 2 + if len(tracklets) <= 2: + tracklets = list(tracklets.values()) + bboxes = [] + for tracklet in tracklets: + bboxes.append(tracklet2bbox(tracklet, len(det_results))[:, None]) + bbox = np.concatenate(bboxes, axis=1) + return bbox + else: + return bboxes2bbox(det_results, len(det_results)) + + +def pose_inference_with_align(args, frame_paths, det_results): + # filter frame without det bbox + det_results = [ + frm_dets for 
frm_dets in det_results if frm_dets.shape[0] > 0 + ] + + pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint, + frame_paths, det_results, args.device) + # align the num_person among frames + num_persons = max([pose['keypoints'].shape[0] for pose in pose_results]) + num_points = pose_results[0]['keypoints'].shape[1] + num_frames = len(pose_results) + keypoints = np.zeros((num_persons, num_frames, num_points, 2), + dtype=np.float32) + scores = np.zeros((num_persons, num_frames, num_points), dtype=np.float32) + + for f_idx, frm_pose in enumerate(pose_results): + frm_num_persons = frm_pose['keypoints'].shape[0] + for p_idx in range(frm_num_persons): + keypoints[p_idx, f_idx] = frm_pose['keypoints'][p_idx] + scores[p_idx, f_idx] = frm_pose['keypoint_scores'][p_idx] + + return keypoints, scores + + +def ntu_pose_extraction(vid, skip_postproc=False): + tmp_dir = TemporaryDirectory() + frame_paths, _ = frame_extract(vid, out_dir=tmp_dir.name) + det_results, _ = detection_inference( + args.det_config, + args.det_checkpoint, + frame_paths, + args.det_score_thr, + device=args.device, + with_score=True) + + if not skip_postproc: + det_results = ntu_det_postproc(vid, det_results) + + anno = dict() + + keypoints, scores = pose_inference_with_align(args, frame_paths, + det_results) + anno['keypoint'] = keypoints + anno['keypoint_score'] = scores + anno['frame_dir'] = osp.splitext(osp.basename(vid))[0] + anno['img_shape'] = (1080, 1920) + anno['original_shape'] = (1080, 1920) + anno['total_frames'] = keypoints.shape[1] + anno['label'] = int(osp.basename(vid).split('A')[1][:3]) - 1 + tmp_dir.cleanup() + + return anno + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Generate Pose Annotation for a single NTURGB-D video') + parser.add_argument('video', type=str, help='source video') + parser.add_argument('output', type=str, help='output pickle name') + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--skip-postproc', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + global_args = parse_args() + args.device = global_args.device + args.video = global_args.video + args.output = global_args.output + args.skip_postproc = global_args.skip_postproc + anno = ntu_pose_extraction(args.video, args.skip_postproc) + mmengine.dump(anno, args.output) diff --git a/tools/data/sthv1/README.md b/tools/data/sthv1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d8611bdfd987d8fb8e76e66ee2b7b5697c888a0 --- /dev/null +++ b/tools/data/sthv1/README.md @@ -0,0 +1,144 @@ +# Preparing Something-Something V1 + +## Introduction + + + +```BibTeX +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyล„ska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, you can refer to the dataset [paper](https://arxiv.org/pdf/1706.04261.pdf). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/sthv1/`. + +## Step 1. 
Prepare Annotations + +Since the official [website](https://20bn.com/datasets/something-something/v1) of Something-Something V1 is currently unavailable, you can download the annotations from third-part source to `$MMACTION2/data/sthv1/` . + +## Step 2. Prepare RGB Frames + +Since the official dataset doesn't provide the original video data and only extracted RGB frames are available, you have to directly download RGB frames. + +You can download all compressed file parts from third-part source to `$MMACTION2/data/sthv1/` and use the following command to uncompress. + +```shell +cd $MMACTION2/data/sthv1/ +cat 20bn-something-something-v1-?? | tar zx +cd $MMACTION2/tools/data/sthv1/ +``` + +For users who only want to use RGB frames, you can skip to step 5 to generate file lists in the format of rawframes. +Since the prefix of official JPGs is "%05d.jpg" (e.g., "00001.jpg"), users need to add `"filename_tmpl='{:05}.jpg'"` to the dict of `data.train`, `data.val` and `data.test` in the config files related with sthv1 like this: + +``` +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## Step 3. Extract Flow + +This part is **optional** if you only want to use RGB frames. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/sthv1_extracted/ +ln -s /mnt/SSD/sthv1_extracted/ ../../../data/sthv1/rawframes +``` + +Then, you can run the following script to extract optical flow based on RGB frames. + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash extract_flow.sh +``` + +## Step 4. Encode Videos + +This part is **optional** if you only want to use RGB frames. + +You can run the following script to encode videos. + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash encode_videos.sh +``` + +## Step 5. Generate File List + +You can run the follow script to generate file list in the format of rawframes and videos. + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## Step 6. Check Directory Structure + +After the whole data process for Something-Something V1 preparation, +you will get the rawframes (RGB + Flow), and annotation files for Something-Something V1. + +In the context of the whole project (for Something-Something V1 only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ sthv1 +โ”‚ โ”‚ โ”œโ”€โ”€ sthv1_{train,val}_list_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ sthv1_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ 1.mp4 +โ”‚ | | โ”œโ”€โ”€ 2.mp4 +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ rawframes +โ”‚ | | โ”œโ”€โ”€ 1 +โ”‚ | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... 
+โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2 +โ”‚ | | โ”œโ”€โ”€ ... + +``` + +For training and evaluating on Something-Something V1, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/sthv1/README_zh-CN.md b/tools/data/sthv1/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..b8a37d63ef81777846aa5f03cc7539b3825c7d99 --- /dev/null +++ b/tools/data/sthv1/README_zh-CN.md @@ -0,0 +1,142 @@ +# ๅ‡†ๅค‡ Something-Something V1 + +## ็ฎ€ไป‹ + +``` +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyล„ska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](https://20bn.com/datasets/something-something/v1)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/sthv1/`ใ€‚ + +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +็”ฑไบŽ Something-Something V1 ็š„ๅฎ˜ๆ–น็ฝ‘็ซ™ๅทฒ็ปๅคฑๆ•ˆ๏ผŒ็”จๆˆท้œ€่ฆ้€š่ฟ‡็ฌฌไธ‰ๆ–นๆบไธ‹่ฝฝๅŽŸๅง‹ๆ•ฐๆฎ้›†ใ€‚ไธ‹่ฝฝๅฅฝ็š„ๆ ‡ๆณจๆ–‡ไปถ้œ€่ฆๆ”พๅœจ `$MMACTION2/data/sthv1/annotations` ๆ–‡ไปถๅคนไธ‹ใ€‚ + +## ๆญฅ้ชค 2. ๅ‡†ๅค‡ RGB ๅธง + +ๅฎ˜ๆ–นๆ•ฐๆฎ้›†ๅนถๆœชๆไพ›ๅŽŸๅง‹่ง†้ข‘ๆ–‡ไปถ๏ผŒๅชๆไพ›ไบ†ๅฏนๅŽŸ่ง†้ข‘ๆ–‡ไปถ่ฟ›่กŒๆŠฝๅ–ๅพ—ๅˆฐ็š„ RGB ๅธง๏ผŒ็”จๆˆทๅฏๅœจ็ฌฌไธ‰ๆ–นๆบ็›ดๆŽฅไธ‹่ฝฝ่ง†้ข‘ๅธงใ€‚ + +ๅฐ†ไธ‹่ฝฝๅฅฝ็š„ๅŽ‹็ผฉๆ–‡ไปถๆ”พๅœจ `$MMACTION2/data/sthv1/` ๆ–‡ไปถๅคนไธ‹๏ผŒๅนถไฝฟ็”จไปฅไธ‹่„šๆœฌ่ฟ›่กŒ่งฃๅŽ‹ใ€‚ + +```shell +cd $MMACTION2/data/sthv1/ +cat 20bn-something-something-v1-?? | tar zx +cd $MMACTION2/tools/data/sthv1/ +``` + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ RGB ๅธง๏ผŒๅˆ™ๅฏไปฅ่ทณ่ฟ‡ไธญ้—ดๆญฅ้ชค่‡ณๆญฅ้ชค 5 ไปฅ็›ดๆŽฅ็”Ÿๆˆ่ง†้ข‘ๅธง็š„ๆ–‡ไปถๅˆ—่กจใ€‚ +็”ฑไบŽๅฎ˜็ฝ‘็š„ JPG ๆ–‡ไปถๅๅฝขๅฆ‚ "%05d.jpg" ๏ผˆๆฏ”ๅฆ‚๏ผŒ"00001.jpg"๏ผ‰๏ผŒ้œ€่ฆๅœจ้…็ฝฎๆ–‡ไปถ็š„ `data.train`, `data.val` ๅ’Œ `data.test` ๅค„ๆทปๅŠ  `"filename_tmpl='{:05}.jpg'"` ไปฃ็ ๏ผŒไปฅไฟฎๆ”นๆ–‡ไปถๅๆจกๆฟใ€‚ + +``` +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## ๆญฅ้ชค 3. 
ๆŠฝๅ–ๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จๅŽŸ RGB ๅธงๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœๆ‹ฅๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธญใ€‚ + +ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒ่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"๏ผ‰ +mkdir /mnt/SSD/sthv1_extracted/ +ln -s /mnt/SSD/sthv1_extracted/ ../../../data/sthv1/rawframes +``` + +ๅฆ‚ๆžœๆƒณๆŠฝๅ–ๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌไปŽ RGB ๅธงไธญๆŠฝๅ–ๅ‡บๅ…‰ๆตใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash extract_flow.sh +``` + +## ๆญฅ้ชค 4: ็ผ–็ ่ง†้ข‘ + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ RGB ๅธงๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +็”จๆˆทๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค่ฟ›่กŒ่ง†้ข‘็ผ–็ ใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash encode_videos.sh +``` + +## ๆญฅ้ชค 5. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## ๆญฅ้ชค 6. ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅœจๅฎŒๆˆๆ‰€ๆœ‰ Something-Something V1 ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ +็”จๆˆทๅฏไปฅ่Žทๅพ—ๅฏนๅบ”็š„ RGB + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒSomething-Something V1 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ sthv1 +โ”‚ โ”‚ โ”œโ”€โ”€ sthv1_{train,val}_list_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ sthv1_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ 1.mp4 +โ”‚ | | โ”œโ”€โ”€ 2.mp4 +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ rawframes +โ”‚ | | โ”œโ”€โ”€ 1 +โ”‚ | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2 +โ”‚ | | โ”œโ”€โ”€ ... 
+ +``` + +ๅ…ณไบŽๅฏน Something-Something V1 ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/sthv1/encode_videos.sh b/tools/data/sthv1/encode_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..98b0166bbe3bd38cc368466f5089ee137662d257 --- /dev/null +++ b/tools/data/sthv1/encode_videos.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_videos.py ../../data/sthv1/rawframes/ ../../data/sthv1/videos/ --fps 12 --level 1 --start-idx 1 --filename-tmpl '%05d' +echo "Encode videos" + +cd sthv1/ diff --git a/tools/data/sthv1/extract_flow.sh b/tools/data/sthv1/extract_flow.sh new file mode 100644 index 0000000000000000000000000000000000000000..04925200f13a4c26507a5ef6c87e5ded00eb1ede --- /dev/null +++ b/tools/data/sthv1/extract_flow.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv1/rawframes/ ../../data/sthv1/rawframes/ --task flow --level 1 --flow-type tvl1 --input-frames +echo "Flow (tv-l1) Generated" +cd sthv1/ diff --git a/tools/data/sthv1/generate_rawframes_filelist.sh b/tools/data/sthv1/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..090695e4c4f28a2c5622a7b9579cc8da673a9a64 --- /dev/null +++ b/tools/data/sthv1/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/sthv1/ diff --git a/tools/data/sthv1/generate_videos_filelist.sh b/tools/data/sthv1/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd59fb31042e10d1e43bb87ed853d656a2257bd4 --- /dev/null +++ b/tools/data/sthv1/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/sthv1/ diff --git a/tools/data/sthv1/label_map.txt b/tools/data/sthv1/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4f8782c06781e6fd5366a1c877e0a8ae53559b6 --- /dev/null +++ b/tools/data/sthv1/label_map.txt @@ -0,0 +1,174 @@ +Holding something +Turning something upside down +Turning the camera left while filming something +Stacking number of something +Turning the camera right while filming something +Opening something +Approaching something with your camera +Picking something up +Pushing something so that it almost falls off but doesn't +Folding something +Moving something away from the camera +Closing something +Moving away from something with your camera +Turning the camera downwards while filming something +Pushing something so that it slightly moves +Turning the camera upwards while filming something +Pretending to pick something up +Showing something to the camera +Moving something up +Plugging something into something +Unfolding something +Putting something onto something +Showing that something is empty +Pretending to put something on a surface +Taking something from somewhere +Putting something next to something +Moving something towards the camera +Showing a photo of something to the camera +Pushing something with something +Throwing something +Pushing something from left to right +Something falling like a feather or paper +Throwing something in the air and letting it fall +Throwing something against something +Lifting something with something on it +Taking one of many similar things on the table +Showing something behind something +Putting something into something +Tearing something just a little bit +Moving something away from something +Tearing something into two pieces +Pushing something from right to left +Holding something next to something +Putting something, something and something on the table +Pretending to take something from somewhere +Moving something closer to something +Pretending to put something next to something +Uncovering something +Something falling like a rock +Putting something and something on the table +Pouring something into something +Moving something down +Pulling something from right to left +Throwing something in the air and catching it +Tilting something with something on it until it falls off +Putting something in front of something +Pretending to turn something upside down +Putting something on a surface +Pretending to throw something +Showing something on top of something +Covering something with something +Squeezing something +Putting something similar to other things that are already on the table +Lifting up one end of something, then letting it drop down +Taking something out of something +Moving part of something +Pulling something from left to right +Lifting something up completely without letting it drop down +Attaching something to something +Putting something behind something +Moving something and something closer to each other +Holding something in front of something +Pushing something so that it falls off the table +Holding something over something +Pretending to open something without actually opening it +Removing something, revealing something behind +Hitting something with something +Moving something and something away from each other +Touching (without moving) part of something +Pretending to put something into something +Showing that something is inside something +Lifting something up completely, then letting it drop down +Pretending to take something out of something +Holding 
something behind something +Laying something on the table on its side, not upright +Poking something so it slightly moves +Pretending to close something without actually closing it +Putting something upright on the table +Dropping something in front of something +Dropping something behind something +Lifting up one end of something without letting it drop down +Rolling something on a flat surface +Throwing something onto a surface +Showing something next to something +Dropping something onto something +Stuffing something into something +Dropping something into something +Piling something up +Letting something roll along a flat surface +Twisting something +Spinning something that quickly stops spinning +Putting number of something onto something +Putting something underneath something +Moving something across a surface without it falling down +Plugging something into something but pulling it right out as you remove your hand +Dropping something next to something +Poking something so that it falls over +Spinning something so it continues spinning +Poking something so lightly that it doesn't or almost doesn't move +Wiping something off of something +Moving something across a surface until it falls down +Pretending to poke something +Putting something that cannot actually stand upright upright on the table, so it falls on its side +Pulling something out of something +Scooping something up with something +Pretending to be tearing something that is not tearable +Burying something in something +Tipping something over +Tilting something with something on it slightly so it doesn't fall down +Pretending to put something onto something +Bending something until it breaks +Letting something roll down a slanted surface +Trying to bend something unbendable so nothing happens +Bending something so that it deforms +Digging something out of something +Pretending to put something underneath something +Putting something on a flat surface without letting it roll +Putting something on the edge of something so it is not supported and falls down +Spreading something onto something +Pretending to put something behind something +Sprinkling something onto something +Something colliding with something and both come to a halt +Pushing something off of something +Putting something that can't roll onto a slanted surface, so it stays where it is +Lifting a surface with something on it until it starts sliding down +Pretending or failing to wipe something off of something +Trying but failing to attach something to something because it doesn't stick +Pulling something from behind of something +Pushing something so it spins +Pouring something onto something +Pulling two ends of something but nothing happens +Moving something and something so they pass each other +Pretending to sprinkle air onto something +Putting something that can't roll onto a slanted surface, so it slides down +Something colliding with something and both are being deflected +Pretending to squeeze something +Pulling something onto something +Putting something onto something else that cannot support it so it falls down +Lifting a surface with something on it but not enough for it to slide down +Pouring something out of something +Moving something and something so they collide with each other +Tipping something with something in it over, so something in it falls out +Letting something roll up a slanted surface, so it rolls back down +Pretending to scoop something up with something +Pretending to pour something out of something, but something is empty +Pulling 
two ends of something so that it gets stretched +Failing to put something into something because something does not fit +Pretending or trying and failing to twist something +Trying to pour something into something, but missing so it spills next to it +Something being deflected from something +Poking a stack of something so the stack collapses +Spilling something onto something +Pulling two ends of something so that it separates into two pieces +Pouring something into something until it overflows +Pretending to spread air onto something +Twisting (wringing) something wet until water comes out +Poking a hole into something soft +Spilling something next to something +Poking a stack of something without the stack collapsing +Putting something onto a slanted surface but it doesn't glide down +Pushing something onto something +Poking something so that it spins around +Spilling something behind something +Poking a hole into some substance diff --git a/tools/data/sthv2/README.md b/tools/data/sthv2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ea2f7dc1c7c8a394c4b70c74f4c06660fe3251c --- /dev/null +++ b/tools/data/sthv2/README.md @@ -0,0 +1,135 @@ +# Preparing Something-Something V2 + +## Introduction + + + +```BibTeX +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyล„ska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://developer.qualcomm.com/software/ai-datasets/something-something). + +`````{tabs} + +````{group-tab} Download by MIM +MIM supports downloading from OpenDataLab and preprocessing Something-Something V2 dataset with one command line. +```Bash +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login +# download and preprocess by MIM +mim download mmaction2 --dataset sthv2 +``` + +```` + +## Step 1. Prepare Annotations + +First of all, you have to sign in and download annotations to `$MMACTION2/data/sthv2/annotations` on the official [website](https://20bn.com/datasets/something-something/v2). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/sthv2/`. +## Step 2. Prepare Videos + +Then, you can download all data parts to `$MMACTION2/data/sthv2/` and use the following command to uncompress. + +```shell +cd $MMACTION2/data/sthv2/ +cat 20bn-something-something-v2-?? | tar zx +cd $MMACTION2/tools/data/sthv2/ +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. 
+ +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/sthv2_extracted/ +ln -s /mnt/SSD/sthv2_extracted/ ../../../data/sthv2/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_frames.sh +``` + +## Step 4. Generate File List + +you can run the follow script to generate file list in the format of rawframes and videos. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash generate_{rawframes, videos}_filelist.sh +``` + +```` +````` + +### Check Directory Structure + +After the whole data process for Something-Something V2 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Something-Something V2. + +In the context of the whole project (for Something-Something V2 only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ sthv2 +โ”‚ โ”‚ โ”œโ”€โ”€ sthv2_{train,val}_list_rawframes.txt(Optional) +โ”‚ โ”‚ โ”œโ”€โ”€ sthv2_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations(Optional) +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ 1.mp4 +โ”‚ | | โ”œโ”€โ”€ 2.mp4 +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ rawframes(Optional) +โ”‚ | | โ”œโ”€โ”€ 1 +โ”‚ | | | โ”œโ”€โ”€ img_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ img_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2 +โ”‚ | | โ”œโ”€โ”€ ... + +``` + +For training and evaluating on Something-Something V2, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
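+
+For reference, each line in the generated lists pairs one sample with its label index. The video-format list typically follows `<video filename> <label>`, and the rawframes-format list `<frame directory> <total frames> <label>`; the sketch below is illustrative only and uses placeholder values, not output copied from the tool:
+
+```
+# sthv2_train_list_videos.txt (illustrative layout)
+1.mp4 <label_id>
+2.mp4 <label_id>
+
+# sthv2_train_list_rawframes.txt (illustrative layout)
+1 <total_frames> <label_id>
+2 <total_frames> <label_id>
+```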
diff --git a/tools/data/sthv2/README_zh-CN.md b/tools/data/sthv2/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..da8b081f20bdfdd630c436e8f6661b695512ea9c --- /dev/null +++ b/tools/data/sthv2/README_zh-CN.md @@ -0,0 +1,137 @@ +# ๅ‡†ๅค‡ Something-Something V2 + +## ็ฎ€ไป‹ + + + +```BibTeX +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyล„ska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](https://developer.qualcomm.com/software/ai-datasets/something-something)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ + +`````{tabs} + +````{group-tab} ไฝฟ็”จ MIM ไธ‹่ฝฝ +# MIM ๆ”ฏๆŒไธ‹่ฝฝ Something-Something V2 ๆ•ฐๆฎ้›†ใ€‚็”จๆˆทๅฏไปฅ้€š่ฟ‡ไธ€่กŒๅ‘ฝไปค๏ผŒไปŽ OpenDataLab ่ฟ›่กŒไธ‹่ฝฝ๏ผŒๅนถ่ฟ›่กŒ้ข„ๅค„็†ใ€‚ +```Bash +# ๅฎ‰่ฃ… OpenXLab CLI ๅทฅๅ…ท +pip install -U openxlab +# ็™ปๅฝ• OpenXLab +openxlab login +# ้€š่ฟ‡ MIM ่ฟ›่กŒๆ•ฐๆฎ้›†ไธ‹่ฝฝ๏ผŒ้ข„ๅค„็†ใ€‚ๆณจๆ„่ฟ™ๅฐ†่Šฑ่ดน่พƒ้•ฟๆ—ถ้—ด +mim download mmaction2 --dataset sthv2 +``` + +```` + +````{group-tab} ไปŽๅฎ˜ๆ–นๆบไธ‹่ฝฝ +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆท้œ€่ฆๅœจ [ๅฎ˜็ฝ‘](https://developer.qualcomm.com/software/ai-datasets/something-something) ๅฎŒๆˆๆณจๅ†Œ๏ผŒๆ‰่ƒฝไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ไธ‹่ฝฝๅฅฝ็š„ๆ ‡ๆณจๆ–‡ไปถ้œ€่ฆๆ”พๅœจ `$MMACTION2/data/sthv2/annotations` ๆ–‡ไปถๅคนไธ‹ใ€‚ +็”จๆˆทๅฏไปฅไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/sthv2/`ใ€‚ + +## ๆญฅ้ชค 2. ๅ‡†ๅค‡่ง†้ข‘ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏๅฐ†ไธ‹่ฝฝๅฅฝ็š„ๅŽ‹็ผฉๆ–‡ไปถๆ”พๅœจ `$MMACTION2/data/sthv2/` ๆ–‡ไปถๅคนไธ‹๏ผŒๅนถไธ”ไฝฟ็”จไปฅไธ‹ๆŒ‡ไปค่ฟ›่กŒ่งฃๅŽ‹ใ€‚ + +```shell +cd $MMACTION2/data/sthv2/ +cat 20bn-something-something-v2-?? | tar zx +cd $MMACTION2/tools/data/sthv2/ +``` + +## ๆญฅ้ชค 3. ๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœๆ‹ฅๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธญใ€‚ + +ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒ่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"๏ผ‰ +mkdir /mnt/SSD/sthv2_extracted/ +ln -s /mnt/SSD/sthv2_extracted/ ../../../data/sthv2/rawframes +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌ่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_frames.sh +``` + +## ๆญฅ้ชค 4. 
็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash generate_{rawframes, videos}_filelist.sh +``` + +```` +````` + +### ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅœจๅฎŒๆˆๆ‰€ๆœ‰ Something-Something V2 ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ +็”จๆˆทๅฏไปฅ่Žทๅพ—ๅฏนๅบ”็š„ RGB + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒSomething-Something V2 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ sthv2 +โ”‚ โ”‚ โ”œโ”€โ”€ sthv2_{train,val}_list_rawframes.txt๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ โ”‚ โ”œโ”€โ”€ sthv2_{train,val}_list_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ | โ”œโ”€โ”€ videos +โ”‚ | | โ”œโ”€โ”€ 1.mp4 +โ”‚ | | โ”œโ”€โ”€ 2.mp4 +โ”‚ | | โ”œโ”€โ”€... +โ”‚ | โ”œโ”€โ”€ rawframes๏ผˆๅฏ้€‰๏ผ‰ +โ”‚ | | โ”œโ”€โ”€ 1 +โ”‚ | | | โ”œโ”€โ”€ img_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ img_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ | | | โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ 2 +โ”‚ | | โ”œโ”€โ”€ ... + +``` + +ๅ…ณไบŽๅฏน Something-Something V2 ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/sthv2/extract_frames.sh b/tools/data/sthv2/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..575a132b7cd464a242881cbe13e1a86088a9e0ed --- /dev/null +++ b/tools/data/sthv2/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv2/videos/ ../../data/sthv2/rawframes/ --task both --level 1 --flow-type tvl1 --ext webm +echo "Raw frames (RGB and tv-l1) Generated" +cd sthv2/ diff --git a/tools/data/sthv2/extract_rgb_frames.sh b/tools/data/sthv2/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0da858b318b12eb02b16c21b06e6e71e9a7df40 --- /dev/null +++ b/tools/data/sthv2/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv2/videos/ ../../data/sthv2/rawframes/ --task rgb --level 1 --ext webm +echo "Genearte raw frames (RGB only)" + +cd sthv2/ diff --git a/tools/data/sthv2/extract_rgb_frames_opencv.sh b/tools/data/sthv2/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..53cca00fa0f5e144df45613f8088ae7d725ab296 --- /dev/null +++ b/tools/data/sthv2/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv2/videos/ ../../data/sthv2/rawframes/ --task rgb --level 1 --ext webm --use-opencv +echo "Genearte raw frames (RGB only)" + +cd sthv2/ diff --git a/tools/data/sthv2/generate_rawframes_filelist.sh b/tools/data/sthv2/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..e782f981b8020c8109106f5414fc631c3d207222 --- /dev/null +++ b/tools/data/sthv2/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/rawframes/ --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/rawframes/ --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." 
+ +cd tools/data/sthv2/ diff --git a/tools/data/sthv2/generate_videos_filelist.sh b/tools/data/sthv2/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..86101ac2068cb154cbec239001c9623e51388026 --- /dev/null +++ b/tools/data/sthv2/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." + +cd tools/data/sthv2/ diff --git a/tools/data/sthv2/label_map.txt b/tools/data/sthv2/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..8202a005d3a3fdb58ba8424c83d136cfe80e6cdc --- /dev/null +++ b/tools/data/sthv2/label_map.txt @@ -0,0 +1,174 @@ +Approaching something with your camera +Attaching something to something +Bending something so that it deforms +Bending something until it breaks +Burying something in something +Closing something +Covering something with something +Digging something out of something +Dropping something behind something +Dropping something in front of something +Dropping something into something +Dropping something next to something +Dropping something onto something +Failing to put something into something because something does not fit +Folding something +Hitting something with something +Holding something +Holding something behind something +Holding something in front of something +Holding something next to something +Holding something over something +Laying something on the table on its side, not upright +Letting something roll along a flat surface +Letting something roll down a slanted surface +Letting something roll up a slanted surface, so it rolls back down +Lifting a surface with something on it but not enough for it to slide down +Lifting a surface with something on it until it starts sliding down +Lifting something up completely without letting it drop down +Lifting something up completely, then letting it drop down +Lifting something with something on it +Lifting up one end of something without letting it drop down +Lifting up one end of something, then letting it drop down +Moving away from something with your camera +Moving part of something +Moving something across a surface until it falls down +Moving something across a surface without it falling down +Moving something and something away from each other +Moving something and something closer to each other +Moving something and something so they collide with each other +Moving something and something so they pass each other +Moving something away from something +Moving something away from the camera +Moving something closer to something +Moving something down +Moving something towards the camera +Moving something up +Opening something +Picking something up +Piling something up +Plugging something into something +Plugging something into something but pulling it right out as you remove your hand +Poking a hole into some substance +Poking a hole into something soft +Poking a stack of something so the stack collapses +Poking a stack of something without the stack collapsing +Poking something so it slightly moves +Poking something so lightly that it doesn't or almost doesn't move +Poking something so that it falls over +Poking something so that it spins around +Pouring something into something +Pouring 
something into something until it overflows +Pouring something onto something +Pouring something out of something +Pretending or failing to wipe something off of something +Pretending or trying and failing to twist something +Pretending to be tearing something that is not tearable +Pretending to close something without actually closing it +Pretending to open something without actually opening it +Pretending to pick something up +Pretending to poke something +Pretending to pour something out of something, but something is empty +Pretending to put something behind something +Pretending to put something into something +Pretending to put something next to something +Pretending to put something on a surface +Pretending to put something onto something +Pretending to put something underneath something +Pretending to scoop something up with something +Pretending to spread air onto something +Pretending to sprinkle air onto something +Pretending to squeeze something +Pretending to take something from somewhere +Pretending to take something out of something +Pretending to throw something +Pretending to turn something upside down +Pulling something from behind of something +Pulling something from left to right +Pulling something from right to left +Pulling something onto something +Pulling something out of something +Pulling two ends of something but nothing happens +Pulling two ends of something so that it gets stretched +Pulling two ends of something so that it separates into two pieces +Pushing something from left to right +Pushing something from right to left +Pushing something off of something +Pushing something onto something +Pushing something so it spins +Pushing something so that it almost falls off but doesn't +Pushing something so that it falls off the table +Pushing something so that it slightly moves +Pushing something with something +Putting number of something onto something +Putting something and something on the table +Putting something behind something +Putting something in front of something +Putting something into something +Putting something next to something +Putting something on a flat surface without letting it roll +Putting something on a surface +Putting something on the edge of something so it is not supported and falls down +Putting something onto a slanted surface but it doesn't glide down +Putting something onto something +Putting something onto something else that cannot support it so it falls down +Putting something similar to other things that are already on the table +Putting something that can't roll onto a slanted surface, so it slides down +Putting something that can't roll onto a slanted surface, so it stays where it is +Putting something that cannot actually stand upright upright on the table, so it falls on its side +Putting something underneath something +Putting something upright on the table +Putting something, something and something on the table +Removing something, revealing something behind +Rolling something on a flat surface +Scooping something up with something +Showing a photo of something to the camera +Showing something behind something +Showing something next to something +Showing something on top of something +Showing something to the camera +Showing that something is empty +Showing that something is inside something +Something being deflected from something +Something colliding with something and both are being deflected +Something colliding with something and both come to a halt +Something falling like a feather or paper +Something falling like 
a rock +Spilling something behind something +Spilling something next to something +Spilling something onto something +Spinning something so it continues spinning +Spinning something that quickly stops spinning +Spreading something onto something +Sprinkling something onto something +Squeezing something +Stacking number of something +Stuffing something into something +Taking one of many similar things on the table +Taking something from somewhere +Taking something out of something +Tearing something into two pieces +Tearing something just a little bit +Throwing something +Throwing something against something +Throwing something in the air and catching it +Throwing something in the air and letting it fall +Throwing something onto a surface +Tilting something with something on it slightly so it doesn't fall down +Tilting something with something on it until it falls off +Tipping something over +Tipping something with something in it over, so something in it falls out +Touching (without moving) part of something +Trying but failing to attach something to something because it doesn't stick +Trying to bend something unbendable so nothing happens +Trying to pour something into something, but missing so it spills next to it +Turning something upside down +Turning the camera downwards while filming something +Turning the camera left while filming something +Turning the camera right while filming something +Turning the camera upwards while filming something +Twisting (wringing) something wet until water comes out +Twisting something +Uncovering something +Unfolding something +Wiping something off of something diff --git a/tools/data/sthv2/preprocss.sh b/tools/data/sthv2/preprocss.sh new file mode 100644 index 0000000000000000000000000000000000000000..888ebd335fe9fdd9b7fd102f5f5f91951351eccb --- /dev/null +++ b/tools/data/sthv2/preprocss.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenDataLab___sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT) +tar -xvf $DATA_ROOT/sthv2.tar -C $(dirname $DATA_ROOT) +rm $DATA_ROOT/sthv2.tar diff --git a/tools/data/thumos14/README.md b/tools/data/thumos14/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c76d82c98c1f8f52f9b93bb2850eef753affd722 --- /dev/null +++ b/tools/data/thumos14/README.md @@ -0,0 +1,142 @@ +# Preparing THUMOS'14 + +## Introduction + + + +```BibTeX +@misc{THUMOS14, + author = {Jiang, Y.-G. and Liu, J. and Roshan Zamir, A. and Toderici, G. and Laptev, + I. and Shah, M. and Sukthankar, R.}, + title = {{THUMOS} Challenge: Action Recognition with a Large + Number of Classes}, + howpublished = "\url{http://crcv.ucf.edu/THUMOS14/}", + Year = {2014} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://www.crcv.ucf.edu/THUMOS14/download.html). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/thumos14/`. + +## Step 1. Prepare Annotations + +First of all, run the following script to prepare annotations. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_videos.sh +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. 
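+If you plan to skip frame extraction and train directly from videos, you may want to first confirm that every downloaded `.mp4` file decodes. The snippet below is a minimal OpenCV-based sketch (it requires `opencv-python` and assumes the `videos/{val,test}` layout created in Step 2).
+
+```python
+# Quick decode check for downloaded THUMOS'14 videos (sketch).
+# Run from the mmaction2 root directory.
+import glob
+
+import cv2
+
+for split in ('val', 'test'):
+    bad = []
+    videos = glob.glob(f'data/thumos14/videos/{split}/*.mp4')
+    for path in videos:
+        cap = cv2.VideoCapture(path)
+        ok, _ = cap.read()  # try to decode the first frame
+        if not ok:
+            bad.append(path)
+        cap.release()
+    print(f'{split}: {len(videos)} videos, {len(bad)} failed to decode')
+```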
+ +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/thumos14_extracted/ +ln -s /mnt/SSD/thumos14_extracted/ ../data/thumos14/rawframes/ +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_frames.sh tvl1 +``` + +## Step 4. Fetch File List + +This part is **optional** if you do not use SSN model. + +You can run the follow script to fetch pre-computed tag proposals. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash fetch_tag_proposals.sh +``` + +## Step 5. Denormalize Proposal File + +This part is **optional** if you do not use SSN model. + +You can run the follow script to denormalize pre-computed tag proposals according to +actual number of local rawframes. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash denormalize_proposal_file.sh +``` + +## Step 6. Check Directory Structure + +After the whole data process for THUMOS'14 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for THUMOS'14. + +In the context of the whole project (for THUMOS'14 only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ thumos14 +โ”‚ โ”‚ โ”œโ”€โ”€ proposals +โ”‚ โ”‚ | โ”œโ”€โ”€ thumos14_tag_val_normalized_proposal_list.txt +โ”‚ โ”‚ | โ”œโ”€โ”€ thumos14_tag_test_normalized_proposal_list.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations_val +โ”‚ โ”‚ โ”œโ”€โ”€ annotations_test +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ val +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_validation_0000001.mp4 +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ ... +โ”‚ โ”‚ | โ”œโ”€โ”€ test +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_test_0000001.mp4 +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ val +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_validation_0000001 +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ img_00001.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ img_00002.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_x_00001.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_x_00002.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_y_00001.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_y_00002.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ ... +โ”‚ โ”‚ | โ”œโ”€โ”€ test +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_test_0000001 +``` + +For training and evaluating on THUMOS'14, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
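+As an additional check after Step 6, you can verify that the number of extracted flow images matches the RGB frames for each video, which is a common source of silent errors during training. The snippet below is only a sketch and assumes the rawframes layout shown above.
+
+```python
+# Verify RGB / flow frame counts under data/thumos14/rawframes (sketch).
+# Run from the mmaction2 root directory.
+import glob
+import os.path as osp
+
+for split in ('val', 'test'):
+    root = f'data/thumos14/rawframes/{split}'
+    for video_dir in sorted(glob.glob(osp.join(root, '*'))):
+        n_rgb = len(glob.glob(osp.join(video_dir, 'img_*.jpg')))
+        n_flow_x = len(glob.glob(osp.join(video_dir, 'flow_x_*.jpg')))
+        n_flow_y = len(glob.glob(osp.join(video_dir, 'flow_y_*.jpg')))
+        # flow extraction usually yields one image fewer than the RGB frames,
+        # so only flag larger mismatches
+        if n_flow_x != n_flow_y or abs(n_rgb - n_flow_x) > 1:
+            print(f'{video_dir}: rgb={n_rgb} flow_x={n_flow_x} flow_y={n_flow_y}')
+```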
diff --git a/tools/data/thumos14/README_zh-CN.md b/tools/data/thumos14/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ec4fd86624df6ce9b8fe8a2010dcd19d56bd676d --- /dev/null +++ b/tools/data/thumos14/README_zh-CN.md @@ -0,0 +1,139 @@ +# ๅ‡†ๅค‡ THUMOS'14 + +## ็ฎ€ไป‹ + + + +```BibTex +@misc{THUMOS14, + author = {Jiang, Y.-G. and Liu, J. and Roshan Zamir, A. and Toderici, G. and Laptev, + I. and Shah, M. and Sukthankar, R.}, + title = {{THUMOS} Challenge: Action Recognition with a Large + Number of Classes}, + howpublished = "\url{http://crcv.ucf.edu/THUMOS14/}", + Year = {2014} +} +``` + +็”จๆˆทๅฏไปฅๅ‚็…งๆ•ฐๆฎ้›† [ๅฎ˜็ฝ‘](https://www.crcv.ucf.edu/THUMOS14/download.html)๏ผŒ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๅ‡†ๅค‡ๆ•ฐๆฎ้›†ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/thumos14/`ใ€‚ + +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_annotations.sh +``` + +## ๆญฅ้ชค 2. ไธ‹่ฝฝ่ง†้ข‘ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๆŒ‡ไปคไธ‹่ฝฝ่ง†้ข‘ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_videos.sh +``` + +## ๆญฅ้ชค 3. ๆŠฝๅ–ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœ็”จๆˆทๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธŠใ€‚ +็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒๆŒ‡ไปค่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"ไธŠ๏ผ‰ +mkdir /mnt/SSD/thumos14_extracted/ +ln -s /mnt/SSD/thumos14_extracted/ ../data/thumos14/rawframes/ +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌ่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_frames.sh tvl1 +``` + +## ๆญฅ้ชค 4. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +ๅฆ‚ๆžœ็”จๆˆทไธไฝฟ็”จ SSN ๆจกๅž‹๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅฏไฝฟ็”จ่ฟ่กŒไปฅไธ‹่„šๆœฌไธ‹่ฝฝ้ข„ๅ…ˆ่ฎก็ฎ—็š„ๅ€™้€‰ๆ ‡็ญพใ€‚ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash fetch_tag_proposals.sh +``` + +## ๆญฅ้ชค 5. ๅŽป่ง„่ŒƒๅŒ–ๅ€™้€‰ๆ–‡ไปถ + +ๅฆ‚ๆžœ็”จๆˆทไธไฝฟ็”จ SSN ๆจกๅž‹๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅฏ่ฟ่กŒไปฅไธ‹่„šๆœฌ๏ผŒๆฅๆ นๆฎๆœฌๅœฐๅŽŸๅง‹ๅธง็š„ๅฎž้™…ๆ•ฐ้‡๏ผŒๅŽป่ง„่ŒƒๅŒ–้ข„ๅ…ˆ่ฎก็ฎ—็š„ๅ€™้€‰ๆ ‡็ญพใ€‚ + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash denormalize_proposal_file.sh +``` + +## ๆญฅ้ชค 6. 
ๆฃ€ๆŸฅ็›ฎๅฝ•็ป“ๆž„ + +ๅœจๅฎŒๆˆ THUMOS'14 ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ็”จๆˆทๅฏไปฅๅพ—ๅˆฐ THUMOS'14 ็š„ RGB ๅธง + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒTHUMOS'14 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ thumos14 +โ”‚ โ”‚ โ”œโ”€โ”€ proposals +โ”‚ โ”‚ | โ”œโ”€โ”€ thumos14_tag_val_normalized_proposal_list.txt +โ”‚ โ”‚ | โ”œโ”€โ”€ thumos14_tag_test_normalized_proposal_list.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations_val +โ”‚ โ”‚ โ”œโ”€โ”€ annotations_test +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ val +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_validation_0000001.mp4 +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ ... +โ”‚ โ”‚ | โ”œโ”€โ”€ test +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_test_0000001.mp4 +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ val +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_validation_0000001 +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ img_00001.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ img_00002.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_x_00001.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_x_00002.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ ... +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_y_00001.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ flow_y_00002.jpg +| โ”‚ โ”‚ | โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ ... +โ”‚ โ”‚ | โ”œโ”€โ”€ test +โ”‚ โ”‚ โ”‚ | โ”œโ”€โ”€ video_test_0000001 +``` + +ๅ…ณไบŽๅฏน THUMOS'14 ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒๅฏไปฅๅ‚็…ง [่ฎญ็ปƒๆ•™็จ‹](/docs/zh_cn/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/thumos14/denormalize_proposal_file.sh b/tools/data/thumos14/denormalize_proposal_file.sh new file mode 100644 index 0000000000000000000000000000000000000000..c31f11bb3ad3a5b993c7e08c9ae44f9841c8e96e --- /dev/null +++ b/tools/data/thumos14/denormalize_proposal_file.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/denormalize_proposal_file.py thumos14 --norm-proposal-file data/thumos14/proposals/thumos14_tag_val_normalized_proposal_list.txt --data-prefix data/thumos14/rawframes/val/ +echo "Proposal file denormalized for val set" + +PYTHONPATH=. python tools/data/denormalize_proposal_file.py thumos14 --norm-proposal-file data/thumos14/proposals/thumos14_tag_test_normalized_proposal_list.txt --data-prefix data/thumos14/rawframes/test/ +echo "Proposal file denormalized for test set" + +cd tools/data/thumos14/ diff --git a/tools/data/thumos14/download_annotations.sh b/tools/data/thumos14/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ad9d9d450f86a3778d7422b44675fe749160f13 --- /dev/null +++ b/tools/data/thumos14/download_annotations.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/thumos14/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi +cd ${DATA_DIR} + +wget http://crcv.ucf.edu/THUMOS14/Validation_set/TH14_Temporal_annotations_validation.zip --no-check-certificate +wget http://crcv.ucf.edu/THUMOS14/test_set/TH14_Temporal_annotations_test.zip --no-check-certificate + +if [ ! -d "./annotations_val" ]; then + mkdir ./annotations_val +fi +unzip -j TH14_Temporal_annotations_validation.zip -d annotations_val + +if [ ! 
-d "./annotations_test" ]; then + mkdir ./annotations_test +fi +unzip -j TH14_Temporal_annotations_test.zip -d annotations_test + +rm TH14_Temporal_annotations_validation.zip +rm TH14_Temporal_annotations_test.zip + +cd "../../tools/data/thumos14/" diff --git a/tools/data/thumos14/download_videos.sh b/tools/data/thumos14/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..571b080cf987737f580631bffc97a68060bd99b8 --- /dev/null +++ b/tools/data/thumos14/download_videos.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/thumos14/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget https://storage.googleapis.com/thumos14_files/TH14_validation_set_mp4.zip +wget https://storage.googleapis.com/thumos14_files/TH14_Test_set_mp4.zip + +if [ ! -d "./videos/val" ]; then + mkdir -p ./videos/val +fi +unzip -j TH14_validation_set_mp4.zip -d videos/val + +if [ ! -d "./videos/test" ]; then + mkdir -p ./videos/test +fi +unzip -P "THUMOS14_REGISTERED" -j TH14_Test_set_mp4.zip -d videos/test + +cd "../../tools/data/thumos14/" diff --git a/tools/data/thumos14/extract_frames.sh b/tools/data/thumos14/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..827708691d30a46215aeb1f8fd46fec0f52c6de8 --- /dev/null +++ b/tools/data/thumos14/extract_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/thumos14/videos/val/ ../../data/thumos14/rawframes/val/ --level 1 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for val set" + +python build_rawframes.py ../../data/thumos14/videos/test/ ../../data/thumos14/rawframes/test/ --level 1 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for test set" + +cd thumos14/ diff --git a/tools/data/thumos14/extract_rgb_frames.sh b/tools/data/thumos14/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c92065aefdd088186146455e10847085d19577c --- /dev/null +++ b/tools/data/thumos14/extract_rgb_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/thumos14/videos/val/ ../../data/thumos14/rawframes/val/ --level 1 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for val set" + +python build_rawframes.py ../../data/thumos14/videos/test/ ../../data/thumos14/rawframes/test/ --level 1 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for test set" + +cd thumos14/ diff --git a/tools/data/thumos14/extract_rgb_frames_opencv.sh b/tools/data/thumos14/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..4048bbd7b9c9995d0e247155557790e3f50daa89 --- /dev/null +++ b/tools/data/thumos14/extract_rgb_frames_opencv.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/thumos14/videos/val/ ../../data/thumos14/rawframes/val/ --level 1 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for val set" + +python build_rawframes.py ../../data/thumos14/videos/test/ ../../data/thumos14/rawframes/test/ --level 1 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for test set" + +cd thumos14/ diff --git a/tools/data/thumos14/fetch_tag_proposals.sh b/tools/data/thumos14/fetch_tag_proposals.sh new file mode 100644 index 0000000000000000000000000000000000000000..4a3654523952552eefc39cf033c2abd10ad0dfa6 --- /dev/null +++ 
b/tools/data/thumos14/fetch_tag_proposals.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +PROP_DIR="../../../data/thumos14/proposals" + +if [[ ! -d "${PROP_DIR}" ]]; then + echo "${PROP_DIR} does not exist. Creating"; + mkdir -p ${PROP_DIR} +fi + +wget https://download.openmmlab.com/mmaction/dataset/thumos14/thumos14_tag_val_normalized_proposal_list.txt -P ${PROP_DIR} +wget https://download.openmmlab.com/mmaction/dataset/thumos14/thumos14_tag_test_normalized_proposal_list.txt -P ${PROP_DIR} diff --git a/tools/data/ucf101/README.md b/tools/data/ucf101/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cfe741f6a47085fab171ac9b11afb6875b7beb50 --- /dev/null +++ b/tools/data/ucf101/README.md @@ -0,0 +1,127 @@ +# Preparing UCF-101 + +## Introduction + + + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://www.crcv.ucf.edu/research/data-sets/ucf101/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ucf101/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. + +```shell +bash download_videos.sh +``` + +For better decoding speed, you can resize the original videos into smaller sized, densely encoded version by: + +``` +python ../resize_videos.py ../../../data/ucf101/videos/ ../../../data/ucf101/videos_256p_dense_cache --dense --level 2 --ext avi +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. The extracted frames (RGB + Flow) will take up about 100GB. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/ucf101_extracted/ +ln -s /mnt/SSD/ucf101_extracted/ ../../../data/ucf101/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If Optical Flow is also required, run the following script to extract flow using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 4. Generate File List + +you can run the follow script to generate file list in the format of rawframes and videos. + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +## Step 5. Check Directory Structure + +After the whole data process for UCF-101 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for UCF-101. 
+ +In the context of the whole project (for UCF-101 only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ucf101 +โ”‚ โ”‚ โ”œโ”€โ”€ ucf101_{train,val}_split_{1,2,3}_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ ucf101_{train,val}_split_{1,2,3}_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ApplyEyeMakeup +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_ApplyEyeMakeup_g01_c01.avi + +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ YoYo +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_YoYo_g25_c05.avi +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ApplyEyeMakeup +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_ApplyEyeMakeup_g01_c01 +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ YoYo +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_YoYo_g01_c01 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_YoYo_g25_c05 + +``` + +For training and evaluating on UCF-101, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/ucf101/README_zh-CN.md b/tools/data/ucf101/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..eb9ed7bc6f0d9e5e9e63753b48c671aa13e869d8 --- /dev/null +++ b/tools/data/ucf101/README_zh-CN.md @@ -0,0 +1,125 @@ +# ๅ‡†ๅค‡ UCF-101 + +## ็ฎ€ไป‹ + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](https://www.crcv.ucf.edu/research/data-sets/ucf101/)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/ucf101/`ใ€‚ + +## ๆญฅ้ชค 1. ไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถ + +้ฆ–ๅ…ˆ๏ผŒ็”จๆˆทๅฏ่ฟ่กŒไปฅไธ‹่„šๆœฌไธ‹่ฝฝๆ ‡ๆณจๆ–‡ไปถใ€‚ + +```shell +bash download_annotations.sh +``` + +## ๆญฅ้ชค 2. ๅ‡†ๅค‡่ง†้ข‘ๆ–‡ไปถ + +ไน‹ๅŽ๏ผŒ็”จๆˆทๅฏ่ฟ่กŒไปฅไธ‹่„šๆœฌๅ‡†ๅค‡่ง†้ข‘ๆ–‡ไปถใ€‚ + +```shell +bash download_videos.sh +``` + +็”จๆˆทๅฏไฝฟ็”จไปฅไธ‹่„šๆœฌ๏ผŒๅฏนๅŽŸ่ง†้ข‘่ฟ›่กŒ่ฃๅ‰ช๏ผŒๅพ—ๅˆฐๅฏ†้›†็ผ–็ ไธ”ๆ›ดๅฐๅฐบๅฏธ็š„่ง†้ข‘ใ€‚ + +``` +python ../resize_videos.py ../../../data/ucf101/videos/ ../../../data/ucf101/videos_256p_dense_cache --dense --level 2 --ext avi +``` + +## ๆญฅ้ชค 3. 
ๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆต + +ๅฆ‚ๆžœ็”จๆˆทๅชๆƒณไฝฟ็”จ่ง†้ข‘ๅŠ ่ฝฝ่ฎญ็ปƒ๏ผŒๅˆ™่ฏฅ้ƒจๅˆ†ๆ˜ฏ **ๅฏ้€‰้กน**ใ€‚ + +ๅœจๆŠฝๅ–่ง†้ข‘ๅธงๅ’Œๅ…‰ๆตไน‹ๅ‰๏ผŒ่ฏทๅ‚่€ƒ [ๅฎ‰่ฃ…ๆŒ‡ๅ—](/docs/zh_cn/get_started/installation.md) ๅฎ‰่ฃ… [denseflow](https://github.com/open-mmlab/denseflow)ใ€‚ + +ๅฆ‚ๆžœๆ‹ฅๆœ‰ๅคง้‡็š„ SSD ๅญ˜ๅ‚จ็ฉบ้—ด๏ผŒๅˆ™ๆŽจ่ๅฐ†ๆŠฝๅ–็š„ๅธงๅญ˜ๅ‚จ่‡ณ I/O ๆ€ง่ƒฝๆ›ดไผ˜็ง€็š„ SSD ไธญใ€‚ๆ‰€ๆŠฝๅ–็š„่ง†้ข‘ๅธงๅ’Œๅ…‰ๆต็บฆๅ ๆฎ 100 GB ็š„ๅญ˜ๅ‚จ็ฉบ้—ดใ€‚ + +ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไธบ SSD ๅปบ็ซ‹่ฝฏ้“พๆŽฅใ€‚ + +```shell +# ๆ‰ง่กŒ่ฟ™ไธค่กŒ่ฟ›่กŒๆŠฝๅ–๏ผˆๅ‡่ฎพ SSD ๆŒ‚่ฝฝๅœจ "/mnt/SSD/"๏ผ‰ +mkdir /mnt/SSD/ucf101_extracted/ +ln -s /mnt/SSD/ucf101_extracted/ ../../../data/ucf101/rawframes +``` + +ๅฆ‚ๆžœ็”จๆˆท้œ€่ฆๆŠฝๅ– RGB ๅธง๏ผˆๅ› ไธบๆŠฝๅ–ๅ…‰ๆต็š„่ฟ‡็จ‹ๅๅˆ†่€—ๆ—ถ๏ผ‰๏ผŒๅฏไปฅ่€ƒ่™‘่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ denseflow **ๅชๆŠฝๅ– RGB ๅธง**ใ€‚ + +```shell +bash extract_rgb_frames.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆฒกๆœ‰ๅฎ‰่ฃ… denseflow๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹ๅ‘ฝไปคไฝฟ็”จ OpenCV ๆŠฝๅ– RGB ๅธงใ€‚็„ถ่€Œ๏ผŒ่ฏฅๆ–นๆณ•ๅช่ƒฝๆŠฝๅ–ไธŽๅŽŸๅง‹่ง†้ข‘ๅˆ†่พจ็އ็›ธๅŒ็š„ๅธงใ€‚ + +```shell +bash extract_rgb_frames_opencv.sh +``` + +ๅฆ‚ๆžœ็”จๆˆทๆƒณๆŠฝๅ– RGB ๅธงๅ’Œๅ…‰ๆต๏ผŒๅˆ™ๅฏไปฅ่ฟ่กŒไปฅไธ‹่„šๆœฌไฝฟ็”จ "tvl1" ็ฎ—ๆณ•่ฟ›่กŒๆŠฝๅ–ใ€‚ + +```shell +bash extract_frames.sh +``` + +## ๆญฅ้ชค 4. ็”Ÿๆˆๆ–‡ไปถๅˆ—่กจ + +็”จๆˆทๅฏไปฅ้€š่ฟ‡่ฟ่กŒไปฅไธ‹ๅ‘ฝไปค็”Ÿๆˆๅธงๅ’Œ่ง†้ข‘ๆ ผๅผ็š„ๆ–‡ไปถๅˆ—่กจใ€‚ + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +## ๆญฅ้ชค 5. ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +ๅœจๅฎŒๆˆๆ‰€ๆœ‰ UCF-101 ๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๆต็จ‹ๅŽ๏ผŒ +็”จๆˆทๅฏไปฅ่Žทๅพ—ๅฏนๅบ”็š„ RGB + ๅ…‰ๆตๆ–‡ไปถ๏ผŒ่ง†้ข‘ๆ–‡ไปถไปฅๅŠๆ ‡ๆณจๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒUCF-101 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ucf101 +โ”‚ โ”‚ โ”œโ”€โ”€ ucf101_{train,val}_split_{1,2,3}_rawframes.txt +โ”‚ โ”‚ โ”œโ”€โ”€ ucf101_{train,val}_split_{1,2,3}_videos.txt +โ”‚ โ”‚ โ”œโ”€โ”€ annotations +โ”‚ โ”‚ โ”œโ”€โ”€ videos +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ApplyEyeMakeup +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_ApplyEyeMakeup_g01_c01.avi + +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ YoYo +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_YoYo_g25_c05.avi +โ”‚ โ”‚ โ”œโ”€โ”€ rawframes +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ApplyEyeMakeup +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_ApplyEyeMakeup_g01_c01 +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ img_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_x_00002.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00001.jpg +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ flow_y_00002.jpg +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ YoYo +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_YoYo_g01_c01 +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ v_YoYo_g25_c05 + +``` + +ๅ…ณไบŽๅฏน UCF-101 ่ฟ›่กŒ่ฎญ็ปƒๅ’Œ้ชŒ่ฏ๏ผŒ่ฏทๅ‚่€ƒ [่ฎญ็ปƒๅ’Œๆต‹่ฏ•ๆ•™็จ‹](/docs/en/user_guides/train_test.md)ใ€‚ diff --git a/tools/data/ucf101/download_annotations.sh b/tools/data/ucf101/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3dc2a90db52a05d9ab92396bb93cd50de99c869 --- /dev/null +++ b/tools/data/ucf101/download_annotations.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/ucf101/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://www.crcv.ucf.edu/wp-content/uploads/2019/03/UCF101TrainTestSplits-RecognitionTask.zip --no-check-certificate + +unzip -j UCF101TrainTestSplits-RecognitionTask.zip -d ${DATA_DIR}/ +rm UCF101TrainTestSplits-RecognitionTask.zip diff --git a/tools/data/ucf101/download_videos.sh b/tools/data/ucf101/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..a7d7600e1417a490c1cd8ec5c914116aba80ae32 --- /dev/null +++ b/tools/data/ucf101/download_videos.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/ucf101/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget https://www.crcv.ucf.edu/datasets/human-actions/ucf101/UCF101.rar --no-check-certificate +unrar x UCF101.rar +mv ./UCF-101 ./videos + +cd "../../tools/data/ucf101" diff --git a/tools/data/ucf101/extract_frames.sh b/tools/data/ucf101/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..44da782c4e6d4b59e321f43d1c109dda4b07bb0d --- /dev/null +++ b/tools/data/ucf101/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ucf101/videos/ ../../data/ucf101/rawframes/ --task both --level 2 --flow-type tvl1 +echo "Raw frames (RGB and Flow) Generated" +cd ucf101/ diff --git a/tools/data/ucf101/extract_rgb_frames.sh b/tools/data/ucf101/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..71916e28e0574de1d66c436d456f4990b4c44254 --- /dev/null +++ b/tools/data/ucf101/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ucf101/videos/ ../../data/ucf101/rawframes/ --task rgb --level 2 --ext avi +echo "Genearte raw frames (RGB only)" + +cd ucf101/ diff --git a/tools/data/ucf101/extract_rgb_frames_opencv.sh b/tools/data/ucf101/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..7cc3de219499900f0777ef3e3508aac3422cbcd7 --- /dev/null +++ b/tools/data/ucf101/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ucf101/videos/ ../../data/ucf101/rawframes/ --task rgb --level 2 --ext avi --use-opencv +echo "Genearte raw frames (RGB only)" + +cd ucf101/ diff --git a/tools/data/ucf101/generate_rawframes_filelist.sh b/tools/data/ucf101/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b0bcd20b457b2655918d1a25ea378b014f467f1 --- /dev/null +++ b/tools/data/ucf101/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py ucf101 data/ucf101/rawframes/ --level 2 --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/ucf101/ diff --git a/tools/data/ucf101/generate_videos_filelist.sh b/tools/data/ucf101/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..ef72ca9e3f65c52c206a6fdbd70622401ca30ccb --- /dev/null +++ b/tools/data/ucf101/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py ucf101 data/ucf101/videos/ --level 2 --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/ucf101/ diff --git a/tools/data/ucf101/label_map.txt b/tools/data/ucf101/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5c18b454e876330e19a5c8b09663dee4d8f9ad4 --- /dev/null +++ b/tools/data/ucf101/label_map.txt @@ -0,0 +1,101 @@ +ApplyEyeMakeup +ApplyLipstick +Archery +BabyCrawling +BalanceBeam +BandMarching +BaseballPitch +Basketball +BasketballDunk +BenchPress +Biking +Billiards +BlowDryHair +BlowingCandles +BodyWeightSquats +Bowling +BoxingPunchingBag +BoxingSpeedBag +BreastStroke +BrushingTeeth +CleanAndJerk +CliffDiving +CricketBowling +CricketShot +CuttingInKitchen +Diving +Drumming +Fencing +FieldHockeyPenalty +FloorGymnastics +FrisbeeCatch +FrontCrawl +GolfSwing +Haircut +Hammering +HammerThrow +HandstandPushups +HandstandWalking +HeadMassage +HighJump +HorseRace +HorseRiding +HulaHoop +IceDancing +JavelinThrow +JugglingBalls +JumpingJack +JumpRope +Kayaking +Knitting +LongJump +Lunges +MilitaryParade +Mixing +MoppingFloor +Nunchucks +ParallelBars +PizzaTossing +PlayingCello +PlayingDaf +PlayingDhol +PlayingFlute +PlayingGuitar +PlayingPiano +PlayingSitar +PlayingTabla +PlayingViolin +PoleVault +PommelHorse +PullUps +Punch +PushUps +Rafting +RockClimbingIndoor +RopeClimbing +Rowing +SalsaSpin +ShavingBeard +Shotput +SkateBoarding +Skiing +Skijet +SkyDiving +SoccerJuggling +SoccerPenalty +StillRings +SumoWrestling +Surfing +Swing +TableTennisShot +TaiChi +TennisSwing +ThrowDiscus +TrampolineJumping +Typing +UnevenBars +VolleyballSpiking +WalkingWithDog +WallPushups +WritingOnBoard +YoYo diff --git a/tools/data/ucf101_24/README.md b/tools/data/ucf101_24/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3418ce8d5bb0ca126c7490de98ce96b59424a6fc --- /dev/null +++ b/tools/data/ucf101_24/README.md @@ -0,0 +1,89 @@ +# Preparing UCF101-24 + +## Introduction + + + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +For basic dataset information, you can refer to the dataset [website](http://www.thumos.info/download.html). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ucf101_24/`. + +## Download and Extract + +You can download the RGB frames, optical flow and ground truth annotations from [google drive](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct). +The data are provided from [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md), which is adapted from [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector) and [corrected-UCF101-Annots](https://github.com/gurkirt/corrected-UCF101-Annots). + +:::{note} +The annotation of this UCF101-24 is from [here](https://github.com/gurkirt/corrected-UCF101-Annots), which is more correct. +::: + +After downloading the `UCF101_v2.tar.gz` file and put it in `$MMACTION2/tools/data/ucf101_24/`, you can run the following command to uncompress. + +```shell +tar -zxvf UCF101_v2.tar.gz +``` + +## Check Directory Structure + +After uncompressing, you will get the `rgb-images` directory, `brox-images` directory and `UCF101v2-GT.pkl` for UCF101-24. 
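+A quick way to get familiar with the annotation file is to load it and print a few entries. The snippet below is only a sketch: it assumes `UCF101v2-GT.pkl` has been moved to `data/ucf101_24/` as shown in the directory structure that follows, and that it loads as a plain dictionary keyed by the six field names described at the end of this README.
+
+```python
+# Peek into UCF101v2-GT.pkl (sketch). Run from the mmaction2 root directory.
+import pickle
+
+with open('data/ucf101_24/UCF101v2-GT.pkl', 'rb') as f:
+    # if the pickle was created with Python 2, pass encoding='latin1'
+    gt = pickle.load(f)
+
+print(sorted(gt.keys()))                 # the six fields described below
+print(len(gt['labels']), 'classes')      # 24 action classes
+video = gt['train_videos'][0][0]         # first video of the single split
+print(video, gt['nframes'][video], gt['resolution'][video])
+for label_idx, tubes in gt['gttubes'][video].items():
+    # each tube is a numpy array with one row per frame and 5 columns
+    print(gt['labels'][label_idx], len(tubes), 'tube(s)')
+```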
+ +In the context of the whole project (for UCF101-24 only), the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ucf101_24 +โ”‚ | โ”œโ”€โ”€ brox-images +โ”‚ | | โ”œโ”€โ”€ Basketball +โ”‚ | | | โ”œโ”€โ”€ v_Basketball_g01_c01 +โ”‚ | | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00140.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00141.jpg +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ WalkingWithDog +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g01_c01 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g25_c04 +โ”‚ | โ”œโ”€โ”€ rgb-images +โ”‚ | | โ”œโ”€โ”€ Basketball +โ”‚ | | | โ”œโ”€โ”€ v_Basketball_g01_c01 +โ”‚ | | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00140.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00141.jpg +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ WalkingWithDog +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g01_c01 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g25_c04 +โ”‚ | โ”œโ”€โ”€ UCF101v2-GT.pkl + +``` + +:::{note} +The `UCF101v2-GT.pkl` exists as a cache, it contains 6 items as follows: +::: + +1. `labels` (list): List of the 24 labels. +2. `gttubes` (dict): Dictionary that contains the ground truth tubes for each video. + A **gttube** is dictionary that associates with each index of label and a list of tubes. + A **tube** is a numpy array with `nframes` rows and 5 columns, each col is in format like ` `. +3. `nframes` (dict): Dictionary that contains the number of frames for each video, like `'HorseRiding/v_HorseRiding_g05_c02': 151`. +4. `train_videos` (list): A list with `nsplits=1` elements, each one containing the list of training videos. +5. `test_videos` (list): A list with `nsplits=1` elements, each one containing the list of testing videos. +6. `resolution` (dict): Dictionary that outputs a tuple (h,w) of the resolution for each video, like `'FloorGymnastics/v_FloorGymnastics_g09_c03': (240, 320)`. diff --git a/tools/data/ucf101_24/README_zh-CN.md b/tools/data/ucf101_24/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..8e6da66bfe5ab165e560aa9694f239ac5c3e6ab6 --- /dev/null +++ b/tools/data/ucf101_24/README_zh-CN.md @@ -0,0 +1,84 @@ +# ๅ‡†ๅค‡ UCF101-24 + +## ็ฎ€ไป‹ + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. 
Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„ [ๅฎ˜็ฝ‘](http://www.thumos.info/download.html)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚ +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/ucf101_24/`ใ€‚ + +## ไธ‹่ฝฝๅ’Œ่งฃๅŽ‹ + +็”จๆˆทๅฏไปฅไปŽ [่ฟ™้‡Œ](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct) ไธ‹่ฝฝ RGB ๅธง๏ผŒๅ…‰ๆตๅ’Œๆ ‡ๆณจๆ–‡ไปถใ€‚ +่ฏฅๆ•ฐๆฎ็”ฑ [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md) ไปฃ็ ๅบ“ๆไพ›๏ผŒ +ๅ‚่€ƒ่‡ช [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector) ๅ’Œ [corrected-UCF101-Annots](https://github.com/gurkirt/corrected-UCF101-Annots)ใ€‚ + +**ๆณจๆ„**๏ผšUCF101-24 ็š„ๆ ‡ๆณจๆ–‡ไปถๆฅ่‡ชไบŽ [่ฟ™้‡Œ](https://github.com/gurkirt/corrected-UCF101-Annots)๏ผŒ่ฏฅๆ ‡ๆณจๆ–‡ไปถ็›ธๅฏนไบŽๅ…ถไป–ๆ ‡ๆณจๆ–‡ไปถๆ›ดๅŠ ๅ‡†็กฎใ€‚ + +็”จๆˆทๅœจไธ‹่ฝฝ `UCF101_v2.tar.gz` ๆ–‡ไปถๅŽ๏ผŒ้œ€ๅฐ†ๅ…ถๆ”พ็ฝฎๅœจ `$MMACTION2/tools/data/ucf101_24/` ็›ฎๅฝ•ไธ‹๏ผŒๅนถไฝฟ็”จไปฅไธ‹ๆŒ‡ไปค่ฟ›่กŒ่งฃๅŽ‹๏ผš + +```shell +tar -zxvf UCF101_v2.tar.gz +``` + +## ๆฃ€ๆŸฅๆ–‡ไปถๅคน็ป“ๆž„ + +็ป่ฟ‡่งฃๅŽ‹ๅŽ๏ผŒ็”จๆˆทๅฐ†ๅพ—ๅˆฐ `rgb-images` ๆ–‡ไปถๅคน๏ผŒ`brox-images` ๆ–‡ไปถๅคนๅ’Œ `UCF101v2-GT.pkl` ๆ–‡ไปถใ€‚ + +ๅœจๆ•ดไธช MMAction2 ๆ–‡ไปถๅคนไธ‹๏ผŒUCF101_24 ็š„ๆ–‡ไปถ็ป“ๆž„ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ ucf101_24 +โ”‚ | โ”œโ”€โ”€ brox-images +โ”‚ | | โ”œโ”€โ”€ Basketball +โ”‚ | | | โ”œโ”€โ”€ v_Basketball_g01_c01 +โ”‚ | | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00140.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00141.jpg +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ WalkingWithDog +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g01_c01 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g25_c04 +โ”‚ | โ”œโ”€โ”€ rgb-images +โ”‚ | | โ”œโ”€โ”€ Basketball +โ”‚ | | | โ”œโ”€โ”€ v_Basketball_g01_c01 +โ”‚ | | | | โ”œโ”€โ”€ 00001.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00002.jpg +โ”‚ | | | | โ”œโ”€โ”€ ... +โ”‚ | | | | โ”œโ”€โ”€ 00140.jpg +โ”‚ | | | | โ”œโ”€โ”€ 00141.jpg +โ”‚ | | โ”œโ”€โ”€ ... +โ”‚ | | โ”œโ”€โ”€ WalkingWithDog +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g01_c01 +โ”‚ | | | โ”œโ”€โ”€ ... +โ”‚ | | | โ”œโ”€โ”€ v_WalkingWithDog_g25_c04 +โ”‚ | โ”œโ”€โ”€ UCF101v2-GT.pkl + +``` + +**ๆณจๆ„**๏ผš`UCF101v2-GT.pkl` ไฝœไธบไธ€ไธช็ผ“ๅญ˜ๆ–‡ไปถ๏ผŒๅฎƒๅŒ…ๅซ 6 ไธช้กน็›ฎ๏ผš + +1. `labels` (list)๏ผš24 ไธช่กŒไธบ็ฑปๅˆซๅ็งฐ็ป„ๆˆ็š„ๅˆ—่กจ +2. `gttubes` (dict)๏ผšๆฏไธช่ง†้ข‘ๅฏนๅบ”็š„ๅŸบๅ‡† tubes ็ป„ๆˆ็š„ๅญ—ๅ…ธ + **gttube** ๆ˜ฏ็”ฑๆ ‡็ญพ็ดขๅผ•ๅ’Œ tube ๅˆ—่กจ็ป„ๆˆ็š„ๅญ—ๅ…ธ + **tube** ๆ˜ฏไธ€ไธช `nframes` ่กŒๅ’Œ 5 ๅˆ—็š„ numpy array๏ผŒๆฏไธ€ๅˆ—็š„ๅฝขๅผๅฆ‚ ` ` +3. `nframes` (dict)๏ผš็”จไปฅ่กจ็คบๆฏไธช่ง†้ข‘ๅฏนๅบ”็š„ๅธงๆ•ฐ๏ผŒๅฆ‚ `'HorseRiding/v_HorseRiding_g05_c02': 151` +4. `train_videos` (list)๏ผšๅŒ…ๅซ `nsplits=1` ็š„ๅ…ƒ็ด ๏ผŒๆฏไธ€้กน้ƒฝๅŒ…ๅซไบ†่ฎญ็ปƒ่ง†้ข‘็š„ๅˆ—่กจ +5. `test_videos` (list)๏ผšๅŒ…ๅซ `nsplits=1` ็š„ๅ…ƒ็ด ๏ผŒๆฏไธ€้กน้ƒฝๅŒ…ๅซไบ†ๆต‹่ฏ•่ง†้ข‘็š„ๅˆ—่กจ +6. 
`resolution` (dict)๏ผšๆฏไธช่ง†้ข‘ๅฏนๅบ”็š„ๅˆ†่พจ็އ๏ผˆๅฝขๅฆ‚ (h,w)๏ผ‰๏ผŒๅฆ‚ `'FloorGymnastics/v_FloorGymnastics_g09_c03': (240, 320)` diff --git a/tools/data/video_retrieval/README.md b/tools/data/video_retrieval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..09c35491dcccc135a81a1d6e32a9da63efd11d68 --- /dev/null +++ b/tools/data/video_retrieval/README.md @@ -0,0 +1,83 @@ +# Preparing Video Retrieval Datasets + +## Introduction + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +```BibTeX +@inproceedings{chen2011collecting, + title={Collecting highly parallel data for paraphrase evaluation}, + author={Chen, David and Dolan, William B}, + booktitle={ACL}, + pages={190--200}, + year={2011} +} +``` + +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/video_retrieval/`. + +## Preparing MSRVTT dataset + +For basic dataset information, you can refer to the MSRVTT dataset [website](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/). Run the following command to prepare the MSRVTT dataset: + +```shell +bash prepare_msrvtt.sh +``` + +After preparation, the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ video_retrieval +โ”‚ โ”‚ โ””โ”€โ”€ msrvtt +โ”‚ โ”‚ โ”œโ”€โ”€ train_9k.json +โ”‚ โ”‚ โ”œโ”€โ”€ train_7k.json +โ”‚ โ”‚ โ”œโ”€โ”€ test_JSFUSION.json +โ”‚ โ”‚ โ””โ”€โ”€โ”€ videos +โ”‚ โ”‚ โ”œโ”€โ”€ video0.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ video1.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ””โ”€โ”€ video9999.mp4 +``` + +## Preparing MSVD dataset + +For basic dataset information, you can refer to the MSVD dataset [website](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/). Run the following command to prepare the MSVD dataset: + +```shell +bash prepare_msvd.sh +``` + +After preparation, the folder structure will look like: + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ video_retrieval +โ”‚ โ”‚ โ””โ”€โ”€ msrvd +โ”‚ โ”‚ โ”œโ”€โ”€ train.json +โ”‚ โ”‚ โ”œโ”€โ”€ test.json +โ”‚ โ”‚ โ”œโ”€โ”€ val.json +โ”‚ โ”‚ โ””โ”€โ”€โ”€ videos +โ”‚ โ”‚ โ”œโ”€โ”€ xxx.avi +โ”‚ โ”‚ โ”œโ”€โ”€ xxx.avi +โ”‚ โ”‚ โ”œโ”€โ”€ ... 
+โ”‚ โ”‚ โ””โ”€โ”€ xxx.avi +``` diff --git a/tools/data/video_retrieval/README_zh-CN.md b/tools/data/video_retrieval/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..c19b84a2777ec22cca67bec001e29ecd30fd06c1 --- /dev/null +++ b/tools/data/video_retrieval/README_zh-CN.md @@ -0,0 +1,83 @@ +# ๅ‡†ๅค‡่ง†้ข‘ๆฃ€็ดขๆ•ฐๆฎ้›† + +## ็ฎ€ไป‹ + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +```BibTeX +@inproceedings{chen2011collecting, + title={Collecting highly parallel data for paraphrase evaluation}, + author={Chen, David and Dolan, William B}, + booktitle={ACL}, + pages={190--200}, + year={2011} +} +``` + +ๅœจๆ•ฐๆฎ้›†ๅ‡†ๅค‡ๅ‰๏ผŒ่ฏท็กฎไฟๅ‘ฝไปค่กŒๅฝ“ๅ‰่ทฏๅพ„ไธบ `$MMACTION2/tools/data/video_retrieval/`ใ€‚ + +## ๅ‡†ๅค‡ MSRVTT ๆ•ฐๆฎ้›† + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„[ๅฎ˜็ฝ‘](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚่ฟ่กŒไธ‹้ข็š„ๅ‘ฝไปคๅ‡†ๅค‡ MSRVTT ๆ•ฐๆฎ้›†๏ผš + +```shell +bash prepare_msrvtt.sh +``` + +ๅฎŒๆˆไธŠ่ฟฐๅ‡†ๅค‡ๆญฅ้ชคๅŽ๏ผŒๆ–‡ไปถ็›ฎๅฝ•ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ video_retrieval +โ”‚ โ”‚ โ””โ”€โ”€ msrvtt +โ”‚ โ”‚ โ”œโ”€โ”€ train_9k.json +โ”‚ โ”‚ โ”œโ”€โ”€ train_7k.json +โ”‚ โ”‚ โ”œโ”€โ”€ test_JSFUSION.json +โ”‚ โ”‚ โ””โ”€โ”€โ”€ videos +โ”‚ โ”‚ โ”œโ”€โ”€ video0.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ video1.mp4 +โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ””โ”€โ”€ video9999.mp4 +``` + +## ๅ‡†ๅค‡ MSVD ๆ•ฐๆฎ้›† + +็”จๆˆทๅฏๅ‚่€ƒ่ฏฅๆ•ฐๆฎ้›†็š„[ๅฎ˜็ฝ‘](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/)๏ผŒไปฅ่Žทๅ–ๆ•ฐๆฎ้›†็›ธๅ…ณ็š„ๅŸบๆœฌไฟกๆฏใ€‚่ฟ่กŒไธ‹้ข็š„ๅ‘ฝไปคๅ‡†ๅค‡ MSVD ๆ•ฐๆฎ้›†๏ผš + +```shell +bash prepare_msvd.sh +``` + +ๅฎŒๅœบไธŠ่ฟฐๅ‡†ๅค‡ๆญฅ้ชคๅŽ๏ผŒๆ–‡ไปถ็›ฎๅฝ•ๅฆ‚ไธ‹๏ผš + +``` +mmaction2 +โ”œโ”€โ”€ mmaction +โ”œโ”€โ”€ tools +โ”œโ”€โ”€ configs +โ”œโ”€โ”€ data +โ”‚ โ”œโ”€โ”€ video_retrieval +โ”‚ โ”‚ โ””โ”€โ”€ msvd +โ”‚ โ”‚ โ”œโ”€โ”€ train.json +โ”‚ โ”‚ โ”œโ”€โ”€ text.json +โ”‚ โ”‚ โ”œโ”€โ”€ val.json +โ”‚ โ”‚ โ””โ”€โ”€โ”€ videos +โ”‚ โ”‚ โ”œโ”€โ”€ xxx.avi +โ”‚ โ”‚ โ”œโ”€โ”€ xxx.avi +โ”‚ โ”‚ โ”œโ”€โ”€ ... +โ”‚ โ”‚ โ””โ”€โ”€ xxx.avi +``` diff --git a/tools/data/video_retrieval/prepare_msrvtt.py b/tools/data/video_retrieval/prepare_msrvtt.py new file mode 100644 index 0000000000000000000000000000000000000000..09fac7659d49c1b095fb06388df959dce56c8617 --- /dev/null +++ b/tools/data/video_retrieval/prepare_msrvtt.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
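+# This script converts the raw MSRVTT annotations downloaded by
+# prepare_msrvtt.sh into the JSON files used for video retrieval: it collects
+# all captions per video id from MSRVTT_data.json, then writes
+# train_9k.json / train_7k.json (from the two train CSV splits) and
+# test_JSFUSION.json, each mapping "<video_id>.mp4" to a list of captions.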
+import json +import os.path as osp + +import pandas as pd + +DATA_DIR = '../../../data/video_retrieval/msrvtt' +SUFFIX = '.mp4' + +raw_data_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_data.json') +train_csv_path = [ + osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.9k.csv'), + osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.7k.csv') +] +test_csv_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_JSFUSION_test.csv') +train_json_path = [ + osp.join(DATA_DIR, 'train_9k.json'), + osp.join(DATA_DIR, 'train_7k.json') +] +test_json_path = osp.join(DATA_DIR, 'test_JSFUSION.json') + +with open(raw_data_path, 'r') as f: + data = json.load(f) + +sentences = data['sentences'] +video_dict = {} +for sentence in sentences: + caption = sentence['caption'] + video_id = sentence['video_id'] + if video_id not in video_dict: + video_dict[video_id] = [] + video_dict[video_id].append(caption) + +for ip, op in zip(train_csv_path, train_json_path): + train_csv = pd.read_csv(ip) + train_video_ids = list(train_csv['video_id'].values) + train_video_dict = {} + for video_id in train_video_ids: + train_video_dict[video_id + SUFFIX] = video_dict[video_id] + + with open(op, 'w') as f: + json.dump(train_video_dict, f) + +test_data = pd.read_csv(test_csv_path) + +test_video_dict = {} +for video_id, sentence in zip(test_data['video_id'], test_data['sentence']): + test_video_dict[video_id + SUFFIX] = [sentence] + +with open(test_json_path, 'w') as f: + json.dump(test_video_dict, f) diff --git a/tools/data/video_retrieval/prepare_msrvtt.sh b/tools/data/video_retrieval/prepare_msrvtt.sh new file mode 100644 index 0000000000000000000000000000000000000000..c0ee97d56ad3d28831b40fadf5bb8fbb652a6781 --- /dev/null +++ b/tools/data/video_retrieval/prepare_msrvtt.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/video_retrieval/msrvtt" +mkdir -p ${DATA_DIR} + + +if [ -f "msrvtt_data.zip" ]; then + echo "msrvtt_data.zip exists, skip downloading!" +else + echo "Downloading msrvtt_data.zip." + wget https://github.com/ArrowLuo/CLIP4Clip/releases/download/v0.0/msrvtt_data.zip +fi + +echo "Processing annotations started." +unzip -q msrvtt_data.zip -d ${DATA_DIR} +python prepare_msrvtt.py +echo "Processing annotations completed." + +if [ -f "MSRVTT.zip" ]; then + echo "MSRVTT.zip exists, skip downloading!" +else + echo "Downloading MSRVTT.zip." + wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip +fi + +echo "Processing videos started." +unzip -q MSRVTT.zip -d ${DATA_DIR} +mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/MSRVTT/videos/all" -name "video*.mp4" -exec mv {} "${DATA_DIR}/videos/" \; +echo "Processing videos completed." + +rm -rf "${DATA_DIR}/MSRVTT" +rm -rf "${DATA_DIR}/msrvtt_data" +rm msrvtt_data.zip +rm MSRVTT.zip +echo "The preparation of the msrvtt dataset has been successfully completed." diff --git a/tools/data/video_retrieval/prepare_msvd.py b/tools/data/video_retrieval/prepare_msvd.py new file mode 100644 index 0000000000000000000000000000000000000000..e53813e4644cad2eb5d279660ca8964c975b91dc --- /dev/null +++ b/tools/data/video_retrieval/prepare_msvd.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
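+# This script converts the MSVD raw captions (raw-captions.pkl) and the
+# official train/val/test split lists into train.json / val.json / test.json,
+# each a dict mapping a video file name to its caption list, e.g.
+# (illustrative sample only):
+#   {"xxx.avi": ["a man is riding a bike", "..."], ...}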
+import json +import os.path as osp +import pickle + +DATA_DIR = '../../../data/video_retrieval/msvd' +SUFFIX = '.avi' + +data_path = osp.join(DATA_DIR, 'msvd_data/raw-captions.pkl') +train_txt_path = osp.join(DATA_DIR, 'msvd_data/train_list.txt') +test_txt_path = osp.join(DATA_DIR, 'msvd_data/test_list.txt') +val_txt_path = osp.join(DATA_DIR, 'msvd_data/val_list.txt') +train_json_path = osp.join(DATA_DIR, 'train.json') +test_json_path = osp.join(DATA_DIR, 'test.json') +val_json_path = osp.join(DATA_DIR, 'val.json') + +with open(data_path, 'rb') as F: + data = pickle.load(F) + +video_dict = {} +for one_data in data: + caption = data[one_data] + if one_data not in video_dict: + video_dict[one_data] = [] + for cap in caption: + video_dict[one_data].append(' '.join(cap)) + +with open(train_txt_path, 'r') as f: + train_avi = f.readlines() + +train_avi_list = {} +for video in train_avi: + train_avi_list[video.strip() + SUFFIX] = video_dict[video.strip()] + +with open(train_json_path, 'w') as f: + json.dump(train_avi_list, f) + +with open(test_txt_path, 'r') as f: + test_avi = f.readlines() + +test_avi_list = {} +for video in test_avi: + test_avi_list[video.strip() + SUFFIX] = video_dict[video.strip()] +with open(test_json_path, 'w') as f: + json.dump(test_avi_list, f) + +with open(val_txt_path, 'r') as f: + val_avi = f.readlines() + +val_avi_list = {} +for video in val_avi: + val_avi_list[video.strip() + SUFFIX] = video_dict[video.strip()] + +with open(val_json_path, 'w') as f: + json.dump(val_avi_list, f) diff --git a/tools/data/video_retrieval/prepare_msvd.sh b/tools/data/video_retrieval/prepare_msvd.sh new file mode 100644 index 0000000000000000000000000000000000000000..365ac16fade8ea5dbf91560bbdd0d26f0a48ac8d --- /dev/null +++ b/tools/data/video_retrieval/prepare_msvd.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/video_retrieval/msvd" +mkdir -p ${DATA_DIR} + + +if [ -f "msvd_data.zip" ]; then + echo "msvd_data.zip exists, skip downloading!" +else + echo "Downloading msvd_data.zip." + wget https://github.com/ArrowLuo/CLIP4Clip/releases/download/v0.0/msvd_data.zip +fi + +echo "Processing annotations started." +unzip -q msvd_data.zip -d ${DATA_DIR} +python prepare_msvd.py +echo "Processing annotations completed." + +if [ -f "YouTubeClips.tar" ]; then + echo "YouTubeClips.tar exists, skip downloading!" +else + echo "Downloading YouTubeClips.tar." + wget https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar +fi + +echo "Processing videos started." +tar -xf YouTubeClips.tar -C ${DATA_DIR} +mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/YouTubeClips" -name "*.avi" -exec mv {} "${DATA_DIR}/videos/" \; +echo "Processing videos completed." + +rm -rf "${DATA_DIR}/YouTubeClips" +rm -rf "${DATA_DIR}/msvd_data" +rm msvd_data.zip +rm YouTubeClips.tar +echo "The preparation of the msvd dataset has been successfully completed." diff --git a/tools/deployment/export_onnx_gcn.py b/tools/deployment/export_onnx_gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..07c70e03ccbc96b6eb548929967db24fb78121ed --- /dev/null +++ b/tools/deployment/export_onnx_gcn.py @@ -0,0 +1,164 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This script serves the sole purpose of converting skeleton-based graph +# in MMAction2 to ONNX files. Please note that attempting to convert other +# models using this script may not yield successful results. 
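+# Example usage (the config and checkpoint paths below are placeholders, not
+# files shipped with this script):
+#   python tools/deployment/export_onnx_gcn.py CONFIG.py CHECKPOINT.pth \
+#       --num_frames 150 --num_person 2 --device cpu --output_file stgcn.onnx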
+import argparse + +import numpy as np +import onnxruntime +import torch +import torch.nn as nn +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import LabelData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--num_frames', type=int, default=150, help='number of input frames.') + parser.add_argument( + '--num_person', type=int, default=2, help='number of maximum person.') + parser.add_argument( + '--num_joints', + type=int, + default=0, + help='number of joints. If not given, will use default settings from' + 'the config file') + parser.add_argument( + '--device', type=str, default='cpu', help='CPU/CUDA device option') + parser.add_argument( + '--output_file', + type=str, + default='stgcn.onnx', + help='file name of the output onnx file') + args = parser.parse_args() + return args + + +class AvgPool2d(nn.Module): + + def forward(self, x): + return x.mean(dim=(-1, -2), keepdims=True) + + +class MaxPool2d(nn.Module): + + def forward(self, x): + x = x.max(dim=-1, keepdim=True)[0] + x = x.max(dim=-2, keepdim=True)[0] + return x + + +class GCNNet(nn.Module): + + def __init__(self, base_model): + super(GCNNet, self).__init__() + self.backbone = base_model.backbone + self.head = base_model.cls_head + + if hasattr(self.head, 'pool'): + pool = self.head.pool + if isinstance(pool, nn.AdaptiveAvgPool2d): + assert pool.output_size == 1 + self.head.pool = AvgPool2d() + elif isinstance(pool, nn.AdaptiveMaxPool2d): + assert pool.output_size == 1 + self.head.pool = MaxPool2d() + + def forward(self, input_tensor): + feat = self.backbone(input_tensor) + cls_score = self.head(feat) + return cls_score + + +def softmax(x): + x = np.exp(x - x.max()) + return x / x.sum() + + +def main(): + args = parse_args() + config = Config.fromfile(args.config) + init_default_scope(config.get('default_scope', 'mmaction')) + + if config.model.type != 'RecognizerGCN': + print( + 'This script serves the sole purpose of converting skeleton-based ' + 'graph in MMAction2 to ONNX files. 
Please note that attempting to ' + 'convert other models using this script may not yield successful ' + 'results.\n\n') + + base_model = MODELS.build(config.model) + load_checkpoint(base_model, args.checkpoint, map_location='cpu') + base_model.to(args.device) + + lookup = {'openpose': 18, 'nturgb+d': 25, 'coco': 17} + + num_joints = args.num_joints + num_person = args.num_person + num_frames = args.num_frames + if num_joints == 0: + layout = config.model.backbone.graph_cfg.layout + if layout not in lookup: + raise KeyError( + '`layout` not supported, please specify `num_joints`') + num_joints = lookup[layout] + + input_tensor = torch.randn(1, num_person, num_frames, num_joints, 3) + input_tensor = input_tensor.clamp(-3, 3).to(args.device) + + base_model.eval() + + data_sample = ActionDataSample() + data_sample.pred_scores = LabelData() + data_sample.pred_labels = LabelData() + base_output = base_model( + input_tensor.unsqueeze(0), data_samples=[data_sample], + mode='predict')[0] + base_output = base_output.pred_score.detach().cpu().numpy() + + model = GCNNet(base_model).to(args.device) + model.eval() + + torch.onnx.export( + model, (input_tensor), + args.output_file, + input_names=['input_tensor'], + output_names=['cls_score'], + export_params=True, + do_constant_folding=True, + verbose=False, + opset_version=12, + dynamic_axes={ + 'input_tensor': { + 0: 'batch_size', + 1: 'num_person', + 2: 'num_frames' + }, + 'cls_score': { + 0: 'batch_size' + } + }) + + print(f'Successfully export the onnx file to {args.output_file}') + + # Test exported file + session = onnxruntime.InferenceSession(args.output_file) + input_feed = {'input_tensor': input_tensor.cpu().data.numpy()} + outputs = session.run(['cls_score'], input_feed=input_feed) + output = softmax(outputs[0][0]) + + diff = abs(base_output - output).max() + if diff < 1e-5: + print('The output difference is smaller than 1e-5.') + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/export_onnx_posec3d.py b/tools/deployment/export_onnx_posec3d.py new file mode 100644 index 0000000000000000000000000000000000000000..c80c606e8036bfee55fb08417091d4080e7339aa --- /dev/null +++ b/tools/deployment/export_onnx_posec3d.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This script serves the sole purpose of converting PoseC3D skeleton models +# in MMAction2 to ONNX files. Please note that attempting to convert other +# models using this script may not yield successful results. +import argparse + +import numpy as np +import onnxruntime +import torch +import torch.nn as nn +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import LabelData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--num_frames', type=int, default=48, help='number of input frames.') + parser.add_argument( + '--image_size', type=int, default=64, help='size of the frame') + parser.add_argument( + '--num_joints', + type=int, + default=0, + help='number of joints. 
If not given, will use default settings from' + 'the config file') + parser.add_argument( + '--device', type=str, default='cpu', help='CPU/CUDA device option') + parser.add_argument( + '--output_file', + type=str, + default='posec3d.onnx', + help='file name of the output onnx file') + args = parser.parse_args() + return args + + +class AvgPool3d(nn.Module): + + def forward(self, x): + return x.mean(dim=(-1, -2, -3), keepdims=True) + + +class MaxPool3d(nn.Module): + + def forward(self, x): + x = x.max(dim=-1, keepdim=True)[0] + x = x.max(dim=-2, keepdim=True)[0] + x = x.max(dim=-3, keepdim=True)[0] + return x + + +class GCNNet(nn.Module): + + def __init__(self, base_model): + super(GCNNet, self).__init__() + self.backbone = base_model.backbone + self.head = base_model.cls_head + + if hasattr(self.head, 'pool'): + pool = self.head.pool + if isinstance(pool, nn.AdaptiveAvgPool3d): + assert pool.output_size == 1 + self.head.pool = AvgPool3d() + elif isinstance(pool, nn.AdaptiveMaxPool3d): + assert pool.output_size == 1 + self.head.pool = MaxPool3d() + + def forward(self, input_tensor): + feat = self.backbone(input_tensor) + cls_score = self.head(feat) + return cls_score + + +def softmax(x): + x = np.exp(x - x.max()) + return x / x.sum() + + +def main(): + args = parse_args() + config = Config.fromfile(args.config) + + if config.model.type != 'RecognizerGCN': + print('This script serves the sole purpose of converting PoseC3D ' + 'skeleton models in MMAction2 to ONNX files. Please note that ' + 'attempting to convert other models using this script may not ' + 'yield successful results.\n\n') + + init_default_scope(config.get('default_scope', 'mmaction')) + + base_model = MODELS.build(config.model) + load_checkpoint(base_model, args.checkpoint, map_location='cpu') + base_model.to(args.device) + + num_joints = args.num_joints + image_size = args.image_size + num_frames = args.num_frames + if num_joints == 0: + num_joints = config.model.backbone.in_channels + + input_tensor = torch.randn(1, num_joints, num_frames, image_size, + image_size) + input_tensor = input_tensor.clamp(-3, 3).to(args.device) + + base_model.eval() + + data_sample = ActionDataSample() + data_sample.pred_scores = LabelData() + data_sample.pred_labels = LabelData() + base_output = base_model( + input_tensor.unsqueeze(0), data_samples=[data_sample], + mode='predict')[0] + base_output = base_output.pred_score.detach().cpu().numpy() + + model = GCNNet(base_model).to(args.device) + model.eval() + + torch.onnx.export( + model, (input_tensor), + args.output_file, + input_names=['input_tensor'], + output_names=['cls_score'], + export_params=True, + do_constant_folding=True, + verbose=False, + opset_version=11, + dynamic_axes={ + 'input_tensor': { + 0: 'batch_size', + 2: 'num_frames' + }, + 'cls_score': { + 0: 'batch_size' + } + }) + + print(f'Successfully export the onnx file to {args.output_file}') + + # Test exported file + session = onnxruntime.InferenceSession(args.output_file) + input_feed = {'input_tensor': input_tensor.cpu().data.numpy()} + outputs = session.run(['cls_score'], input_feed=input_feed) + output = softmax(outputs[0][0]) + + diff = abs(base_output - output).max() + if diff < 1e-5: + print('The output difference is smaller than 1e-5.') + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/export_onnx_stdet.py b/tools/deployment/export_onnx_stdet.py new file mode 100644 index 0000000000000000000000000000000000000000..4f03a2ee5afdd39e990343d2ef2ab786ddaadd12 --- /dev/null +++ 
b/tools/deployment/export_onnx_stdet.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This script serves the sole purpose of converting spatial-temporal detection +# models supported in MMAction2 to ONNX files. Please note that attempting to +# convert other models using this script may not yield successful results. +import argparse + +import onnxruntime +import torch +import torch.nn as nn +from mmdet.structures.bbox import bbox2roi +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint + +from mmaction.registry import MODELS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--num_frames', type=int, default=8, help='number of input frames.') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[256, 455], + help='input image size') + parser.add_argument( + '--device', type=str, default='cpu', help='CPU/CUDA device option') + parser.add_argument( + '--output_file', + type=str, + default='stdet.onnx', + help='file name of the output onnx file') + args = parser.parse_args() + return args + + +class SpatialMaxPool3d(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + x = x.max(dim=-1, keepdim=True)[0] + return x.max(dim=-2, keepdim=True)[0] + + +class SpatialAvgPool(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + return x.mean(dim=(-1, -2), keepdims=True) + + +class TemporalMaxPool3d(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + return x.max(dim=-3, keepdim=True)[0] + + +class TemporalAvgPool3d(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + return x.mean(dim=-3, keepdim=True) + + +class GlobalPool2d(nn.Module): + + def __init__(self, pool_size, output_size, later_max=True): + super().__init__() + self.pool = nn.AvgPool2d(pool_size) + self.max = later_max + self.output_size = output_size + + def forward(self, x): + x = self.pool(x) + if self.max: + x = x.max(dim=-1, keepdim=True)[0] + x = x.max(dim=-2, keepdim=True)[0] + else: + x = x.mean(dim=(-1, -2), keepdims=True) + x = x.expand(-1, -1, self.output_size, self.output_size) + return x + + +class STDet(nn.Module): + + def __init__(self, base_model, input_tensor): + super(STDet, self).__init__() + self.backbone = base_model.backbone + self.bbox_roi_extractor = base_model.roi_head.bbox_roi_extractor + self.bbox_head = base_model.roi_head.bbox_head + + output_size = self.bbox_roi_extractor.global_pool.output_size + pool_size = min(input_tensor.shape[-2:]) // 16 // output_size + + if isinstance(self.bbox_head.temporal_pool, nn.AdaptiveAvgPool3d): + self.bbox_head.temporal_pool = TemporalAvgPool3d() + else: + self.bbox_head.temporal_pool = TemporalMaxPool3d() + if isinstance(self.bbox_head.spatial_pool, nn.AdaptiveAvgPool3d): + self.bbox_head.spatial_pool = SpatialAvgPool() + self.bbox_roi_extractor.global_pool = GlobalPool2d( + pool_size, output_size, later_max=False) + else: + self.bbox_head.spatial_pool = SpatialMaxPool3d() + self.bbox_roi_extractor.global_pool = GlobalPool2d( + pool_size, output_size, later_max=True) + + def forward(self, input_tensor, rois): + feat = self.backbone(input_tensor) + bbox_feats, _ = self.bbox_roi_extractor(feat, rois) + cls_score = self.bbox_head(bbox_feats) + return 
cls_score + + +def main(): + args = parse_args() + config = Config.fromfile(args.config) + + if config.model.type != 'FastRCNN': + print('This script serves the sole purpose of converting spatial ' + 'temporal detection models in MMAction2 to ONNX files. Please ' + 'note that attempting to convert other models using this script ' + 'may not yield successful results.\n\n') + + init_default_scope(config.get('default_scope', 'mmaction')) + + base_model = MODELS.build(config.model) + load_checkpoint(base_model, args.checkpoint, map_location='cpu') + base_model.to(args.device) + + if len(args.shape) == 1: + input_shape = (args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = tuple(args.shape) + else: + raise ValueError('invalid input shape') + + input_tensor = torch.randn(1, 3, args.num_frames, *input_shape) + input_tensor = input_tensor.clamp(-3, 3).to(args.device) + proposal = torch.Tensor([[22., 59., 67., 157.], [186., 73., 217., 159.], + [407., 95., 431., 168.]]) + + rois = bbox2roi([proposal]).to(args.device) + + model = STDet(base_model, input_tensor).to(args.device) + model.eval() + cls_score = model(input_tensor, rois) + print(f'Model output shape: {cls_score.shape}') + + torch.onnx.export( + model, (input_tensor, rois), + args.output_file, + input_names=['input_tensor', 'rois'], + output_names=['cls_score'], + export_params=True, + do_constant_folding=True, + verbose=False, + opset_version=11, + dynamic_axes={ + 'input_tensor': { + 0: 'batch_size', + 3: 'height', + 4: 'width' + }, + 'rois': { + 0: 'total_num_bbox_for_the_batch' + }, + 'cls_score': { + 0: 'total_num_bbox_for_the_batch' + } + }) + + print(f'Successfully export the onnx file to {args.output_file}') + + # Test exported file + session = onnxruntime.InferenceSession(args.output_file) + input_feed = { + 'input_tensor': input_tensor.cpu().data.numpy(), + 'rois': rois.cpu().data.numpy() + } + outputs = session.run(['cls_score'], input_feed=input_feed) + outputs = outputs[0] + diff = abs(cls_score.cpu().data.numpy() - outputs).max() + if diff < 1e-5: + print('The output difference is smaller than 1e-5.') + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/mmaction2torchserve.py b/tools/deployment/mmaction2torchserve.py new file mode 100644 index 0000000000000000000000000000000000000000..32ef31ffb2dad38270e19ca45ae6028155cfb8bd --- /dev/null +++ b/tools/deployment/mmaction2torchserve.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import shutil +from argparse import ArgumentParser, Namespace +from pathlib import Path +from tempfile import TemporaryDirectory + +from mmengine.config import Config +from mmengine.utils import mkdir_or_exist + +try: + from model_archiver.model_packaging import package_model + from model_archiver.model_packaging_utils import ModelExportUtils +except ImportError: + raise ImportError('`torch-model-archiver` is required.' + 'Try: pip install torch-model-archiver') + + +def mmaction2torchserve( + config_file: str, + checkpoint_file: str, + output_folder: str, + model_name: str, + label_file: str, + model_version: str = '1.0', + force: bool = False, +): + """Converts MMAction2 model (config + checkpoint) to TorchServe `.mar`. + + Args: + config_file (str): In MMAction2 config format. + checkpoint_file (str): In MMAction2 checkpoint format. + output_folder (str): Folder where `{model_name}.mar` will be created. + The file created will be in TorchServe archive format. + label_file (str): A txt file which contains the action category names. 
+ model_name (str | None): If not None, used for naming the + `{model_name}.mar` file that will be created under `output_folder`. + If None, `{Path(checkpoint_file).stem}` will be used. + model_version (str): Model's version. + force (bool): If True, if there is an existing `{model_name}.mar` file + under `output_folder` it will be overwritten. + """ + mkdir_or_exist(output_folder) + + config = Config.fromfile(config_file) + + with TemporaryDirectory() as tmpdir: + config.dump(f'{tmpdir}/config.py') + shutil.copy(label_file, f'{tmpdir}/label_map.txt') + + args = Namespace( + **{ + 'model_file': f'{tmpdir}/config.py', + 'serialized_file': checkpoint_file, + 'handler': f'{Path(__file__).parent}/mmaction_handler.py', + 'model_name': model_name or Path(checkpoint_file).stem, + 'version': model_version, + 'export_path': output_folder, + 'force': force, + 'requirements_file': None, + 'extra_files': f'{tmpdir}/label_map.txt', + 'runtime': 'python', + 'archive_format': 'default' + }) + manifest = ModelExportUtils.generate_manifest_json(args) + package_model(args, manifest) + + +def parse_args(): + parser = ArgumentParser( + description='Convert MMAction2 models to TorchServe `.mar` format.') + parser.add_argument('config', type=str, help='config file path') + parser.add_argument('checkpoint', type=str, help='checkpoint file path') + parser.add_argument( + '--output-folder', + type=str, + required=True, + help='Folder where `{model_name}.mar` will be created.') + parser.add_argument( + '--model-name', + type=str, + default=None, + help='If not None, used for naming the `{model_name}.mar`' + 'file that will be created under `output_folder`.' + 'If None, `{Path(checkpoint_file).stem}` will be used.') + parser.add_argument( + '--label-file', + type=str, + default=None, + help='A txt file which contains the action category names. ') + parser.add_argument( + '--model-version', + type=str, + default='1.0', + help='Number used for versioning.') + parser.add_argument( + '-f', + '--force', + action='store_true', + help='overwrite the existing `{model_name}.mar`') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + mmaction2torchserve(args.config, args.checkpoint, args.output_folder, + args.model_name, args.label_file, args.model_version, + args.force) diff --git a/tools/deployment/mmaction_handler.py b/tools/deployment/mmaction_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..6fdd29df02c1c75494b35648a1e7379a1a2a038a --- /dev/null +++ b/tools/deployment/mmaction_handler.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import base64 +import os +import os.path as osp +import warnings + +import decord +import numpy as np +import torch + +from mmaction.apis import inference_recognizer, init_recognizer # noqa: F401 + +try: + from ts.torch_handler.base_handler import BaseHandler +except ImportError: + raise ImportError('`ts` is required. Try: pip install ts.') + + +class MMActionHandler(BaseHandler): + + def initialize(self, context): + properties = context.system_properties + self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu' + self.device = torch.device(self.map_location + ':' + + str(properties.get('gpu_id')) if torch.cuda. 
+ is_available() else self.map_location) + self.manifest = context.manifest + + model_dir = properties.get('model_dir') + serialized_file = self.manifest['model']['serializedFile'] + checkpoint = os.path.join(model_dir, serialized_file) + self.config_file = os.path.join(model_dir, 'config.py') + + mapping_file_path = osp.join(model_dir, 'label_map.txt') + if not os.path.isfile(mapping_file_path): + warnings.warn('Missing the label_map.txt file. ' + 'Inference output will not include class name.') + self.mapping = None + else: + lines = open(mapping_file_path).readlines() + self.mapping = [x.strip() for x in lines] + + self.model = init_recognizer(self.config_file, checkpoint, self.device) + self.initialized = True + + def preprocess(self, data): + videos = [] + + for row in data: + video = row.get('data') or row.get('body') + if isinstance(video, str): + video = base64.b64decode(video) + # First save the bytes as a tmp file + with open('/tmp/tmp.mp4', 'wb') as fout: + fout.write(video) + + video = decord.VideoReader('/tmp/tmp.mp4') + frames = [x.asnumpy() for x in video] + videos.append(np.stack(frames)) + + return videos + + def inference(self, data, *args, **kwargs): + results = [inference_recognizer(self.model, item) for item in data] + return results + + def postprocess(self, data): + # Format output following the example ObjectDetectionHandler format + output = [] + for video_idx, video_result in enumerate(data): + output.append([]) + assert isinstance(video_result, list) + + output[video_idx] = { + self.mapping[x[0]] if self.mapping else x[0]: float(x[1]) + for x in video_result + } + + return output diff --git a/tools/deployment/publish_model.py b/tools/deployment/publish_model.py new file mode 100644 index 0000000000000000000000000000000000000000..2ce1a78e3a805250a3c36343245e1c2304fd9413 --- /dev/null +++ b/tools/deployment/publish_model.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import datetime +import os +import platform +import subprocess + +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + parser.add_argument('out_file', help='output checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove some unnecessary keys for smaller file size + unnecessary_keys = ['message_hub', 'optimizer', 'param_schedulers'] + for k in unnecessary_keys: + if k in checkpoint: + del checkpoint[k] + unnecessary_params = ['data_preprocessor.mean', 'data_preprocessor.std'] + for k in unnecessary_params: + if 'state_dict' in checkpoint and k in checkpoint['state_dict']: + del checkpoint['state_dict'][k] + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. 
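+    # For example (an illustrative sketch only; which meta keys are actually
+    # sensitive depends on how the checkpoint was produced):
+    #   for meta_key in ('env_info', 'config'):
+    #       checkpoint.get('meta', {}).pop(meta_key, None)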
+ torch.save(checkpoint, out_file) + if platform.system() == 'Windows': + sha = subprocess.check_output( + ['certutil', '-hashfile', out_file, 'SHA256']) + sha = str(sha).split('\\r\\n')[1] + else: + sha = subprocess.check_output(['sha256sum', out_file]).decode() + if out_file.endswith('.pth'): + out_file_name = out_file[:-4] + else: + out_file_name = out_file + + current_date = datetime.datetime.now().strftime('%Y%m%d') + final_file = out_file_name + f'_{current_date}-{sha[:8]}.pth' + os.rename(out_file, final_file) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.out_file) + + +if __name__ == '__main__': + main() diff --git a/tools/dist_test.sh b/tools/dist_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..958b6691a026a4c30050dd41ec4a76576e9450f4 --- /dev/null +++ b/tools/dist_test.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +set -x + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +# Arguments starting from the forth one are captured by ${@:4} +python -m torch.distributed.launch --nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS --master_port=$PORT $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} diff --git a/tools/dist_train.sh b/tools/dist_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..f2d9b609430b2dd49867b456bed50e83438323a9 --- /dev/null +++ b/tools/dist_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -x + +CONFIG=$1 +GPUS=$2 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS --master_port=$PORT $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} +# Any arguments from the third one are captured by ${@:3} diff --git a/tools/misc/bsn_proposal_generation.py b/tools/misc/bsn_proposal_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..652ad8f7eedfcf753dea26066badfd8dab7c93e9 --- /dev/null +++ b/tools/misc/bsn_proposal_generation.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +import mmengine +import numpy as np +import torch.multiprocessing as mp + +from mmaction.models.localizers.utils import (generate_bsp_feature, + generate_candidate_proposals) + + +def load_video_infos(ann_file): + """Load the video annotations. + + Args: + ann_file (str): A json file path of the annotation file. + + Returns: + list[dict]: A list containing annotations for videos. + """ + video_infos = [] + anno_database = mmengine.load(ann_file) + for video_name in anno_database: + video_info = anno_database[video_name] + video_info['video_name'] = video_name + video_infos.append(video_info) + return video_infos + + +def generate_proposals(ann_file, tem_results_dir, pgm_proposals_dir, + pgm_proposals_thread, **kwargs): + """Generate proposals using multi-process. + + Args: + ann_file (str): A json file path of the annotation file for + all videos to be processed. + tem_results_dir (str): Directory to read tem results + pgm_proposals_dir (str): Directory to save generated proposals. + pgm_proposals_thread (int): Total number of threads. 
+ kwargs (dict): Keyword arguments for "generate_candidate_proposals". + """ + video_infos = load_video_infos(ann_file) + num_videos = len(video_infos) + num_videos_per_thread = num_videos // pgm_proposals_thread + processes = [] + manager = mp.Manager() + result_dict = manager.dict() + kwargs['result_dict'] = result_dict + for tid in range(pgm_proposals_thread - 1): + tmp_video_list = range(tid * num_videos_per_thread, + (tid + 1) * num_videos_per_thread) + p = mp.Process( + target=generate_candidate_proposals, + args=( + tmp_video_list, + video_infos, + tem_results_dir, + ), + kwargs=kwargs) + p.start() + processes.append(p) + + tmp_video_list = range((pgm_proposals_thread - 1) * num_videos_per_thread, + num_videos) + p = mp.Process( + target=generate_candidate_proposals, + args=( + tmp_video_list, + video_infos, + tem_results_dir, + ), + kwargs=kwargs) + p.start() + processes.append(p) + + for p in processes: + p.join() + + # save results + os.makedirs(pgm_proposals_dir, exist_ok=True) + prog_bar = mmengine.ProgressBar(num_videos) + header = 'tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa' + for video_name in result_dict: + proposals = result_dict[video_name] + proposal_path = osp.join(pgm_proposals_dir, video_name + '.csv') + np.savetxt( + proposal_path, + proposals, + header=header, + delimiter=',', + comments='') + prog_bar.update() + + +def generate_features(ann_file, tem_results_dir, pgm_proposals_dir, + pgm_features_dir, pgm_features_thread, **kwargs): + """Generate proposals features using multi-process. + + Args: + ann_file (str): A json file path of the annotation file for + all videos to be processed. + tem_results_dir (str): Directory to read tem results. + pgm_proposals_dir (str): Directory to read generated proposals. + pgm_features_dir (str): Directory to save generated features. + pgm_features_thread (int): Total number of threads. + kwargs (dict): Keyword arguments for "generate_bsp_feature". 
+ """ + video_infos = load_video_infos(ann_file) + num_videos = len(video_infos) + num_videos_per_thread = num_videos // pgm_features_thread + processes = [] + manager = mp.Manager() + feature_return_dict = manager.dict() + kwargs['result_dict'] = feature_return_dict + for tid in range(pgm_features_thread - 1): + tmp_video_list = range(tid * num_videos_per_thread, + (tid + 1) * num_videos_per_thread) + p = mp.Process( + target=generate_bsp_feature, + args=( + tmp_video_list, + video_infos, + tem_results_dir, + pgm_proposals_dir, + ), + kwargs=kwargs) + p.start() + processes.append(p) + tmp_video_list = range((pgm_features_thread - 1) * num_videos_per_thread, + num_videos) + p = mp.Process( + target=generate_bsp_feature, + args=( + tmp_video_list, + video_infos, + tem_results_dir, + pgm_proposals_dir, + ), + kwargs=kwargs) + p.start() + processes.append(p) + + for p in processes: + p.join() + + # save results + os.makedirs(pgm_features_dir, exist_ok=True) + prog_bar = mmengine.ProgressBar(num_videos) + for video_name in feature_return_dict.keys(): + bsp_feature = feature_return_dict[video_name] + feature_path = osp.join(pgm_features_dir, video_name + '.npy') + np.save(feature_path, bsp_feature) + prog_bar.update() + + +def parse_args(): + parser = argparse.ArgumentParser(description='Proposal generation module') + parser.add_argument('config', help='test config file path') + parser.add_argument( + '--mode', + choices=['train', 'test'], + default='test', + help='train or test') + args = parser.parse_args() + return args + + +def main(): + print('Begin Proposal Generation Module') + args = parse_args() + cfg = mmengine.Config.fromfile(args.config) + tem_results_dir = cfg.tem_results_dir + pgm_proposals_dir = cfg.pgm_proposals_dir + pgm_features_dir = cfg.pgm_features_dir + if args.mode == 'test': + generate_proposals(cfg.ann_file_val, tem_results_dir, + pgm_proposals_dir, **cfg.pgm_proposals_cfg) + print('\nFinish proposal generation') + generate_features(cfg.ann_file_val, tem_results_dir, pgm_proposals_dir, + pgm_features_dir, **cfg.pgm_features_test_cfg) + print('\nFinish feature generation') + + elif args.mode == 'train': + generate_proposals(cfg.ann_file_train, tem_results_dir, + pgm_proposals_dir, **cfg.pgm_proposals_cfg) + print('\nFinish proposal generation') + generate_features(cfg.ann_file_train, tem_results_dir, + pgm_proposals_dir, pgm_features_dir, + **cfg.pgm_features_train_cfg) + print('\nFinish feature generation') + + print('Finish Proposal Generation Module') + + +if __name__ == '__main__': + main() diff --git a/tools/misc/clip_feature_extraction.py b/tools/misc/clip_feature_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..a8a0478f6fc5f6e1d9e17b00568c0eccb0125c93 --- /dev/null +++ b/tools/misc/clip_feature_extraction.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os +import os.path as osp + +from mmengine import dump, list_from_file, load +from mmengine.config import Config, DictAction +from mmengine.runner import Runner + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 feature extraction') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('output_prefix', type=str, help='output prefix') + parser.add_argument( + '--video-list', type=str, default=None, help='video file list') + parser.add_argument( + '--video-root', type=str, default=None, help='video root directory') + parser.add_argument( + '--spatial-type', + type=str, + default='avg', + choices=['avg', 'max', 'keep'], + help='Pooling type in spatial dimension') + parser.add_argument( + '--temporal-type', + type=str, + default='avg', + choices=['avg', 'max', 'keep'], + help='Pooling type in temporal dimension') + parser.add_argument( + '--long-video-mode', + action='store_true', + help='Perform long video inference to get a feature list from a video') + parser.add_argument( + '--clip-interval', + type=int, + default=None, + help='Clip interval for Clip interval of adjacent center of sampled ' + 'clips, used for long video inference') + parser.add_argument( + '--frame-interval', + type=int, + default=None, + help='Temporal interval of adjacent sampled frames, used for long ' + 'video long video inference') + parser.add_argument( + '--multi-view', + action='store_true', + help='Perform multi view inference') + parser.add_argument( + '--dump-score', + action='store_true', + help='Dump predict scores rather than features') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def merge_args(cfg, args): + """Merge CLI arguments to config.""" + test_pipeline = cfg.test_dataloader.dataset.pipeline + # -------------------- Feature Head -------------------- + if not args.dump_score: + backbone_type2name = dict( + ResNet3dSlowFast='slowfast', + MobileNetV2TSM='tsm', + ResNetTSM='tsm', + ) + + if cfg.model.type == 'RecognizerGCN': + backbone_name = 'gcn' + else: + backbone_name = backbone_type2name.get(cfg.model.backbone.type) + num_segments = None + if backbone_name == 'tsm': + for idx, transform in enumerate(test_pipeline): + if transform.type == 'UntrimmedSampleFrames': + clip_len = transform['clip_len'] + continue + elif transform.type == 'SampleFrames': + clip_len = transform['num_clips'] + num_segments = cfg.model.backbone.get('num_segments', 8) + assert num_segments == clip_len, \ + f'num_segments and clip length must same for TSM, but got ' \ + f'num_segments {num_segments} clip_len {clip_len}' + if cfg.model.test_cfg is not None: + max_testing_views = cfg.model.test_cfg.get( + 'max_testing_views', num_segments) + assert max_testing_views % num_segments == 0, \ + 'tsm needs to infer with batchsize of multiple ' \ + 'of num_segments.' + + spatial_type = None if args.spatial_type == 'keep' else \ + args.spatial_type + temporal_type = None if args.temporal_type == 'keep' else \ + args.temporal_type + feature_head = dict( + type='FeatureHead', + spatial_type=spatial_type, + temporal_type=temporal_type, + backbone_name=backbone_name, + num_segments=num_segments) + cfg.model.cls_head = feature_head + + # ---------------------- multiple view ---------------------- + if not args.multi_view: + # average features among multiple views + cfg.model.cls_head['average_clips'] = 'score' + if cfg.model.type == 'Recognizer3D': + for idx, transform in enumerate(test_pipeline): + if transform.type == 'SampleFrames': + test_pipeline[idx]['num_clips'] = 1 + for idx, transform in enumerate(test_pipeline): + if transform.type == 'SampleFrames': + test_pipeline[idx]['twice_sample'] = False + # if transform.type in ['ThreeCrop', 'TenCrop']: + if transform.type == 'TenCrop': + test_pipeline[idx].type = 'CenterCrop' + + # -------------------- pipeline settings -------------------- + # assign video list and video root + if args.video_list is not None: + cfg.test_dataloader.dataset.ann_file = args.video_list + if args.video_root is not None: + if cfg.test_dataloader.dataset.type == 'VideoDataset': + cfg.test_dataloader.dataset.data_prefix = dict( + video=args.video_root) + elif cfg.test_dataloader.dataset.type == 'RawframeDataset': + cfg.test_dataloader.dataset.data_prefix = dict(img=args.video_root) + args.video_list = cfg.test_dataloader.dataset.ann_file + args.video_root = cfg.test_dataloader.dataset.data_prefix + # use UntrimmedSampleFrames for long video inference + if args.long_video_mode: + # preserve features of multiple clips + cfg.model.cls_head['average_clips'] = None + cfg.test_dataloader.batch_size = 1 + is_recognizer2d = (cfg.model.type == 'Recognizer2D') + + frame_interval = args.frame_interval + for idx, transform in 
enumerate(test_pipeline): + if transform.type == 'UntrimmedSampleFrames': + clip_len = transform['clip_len'] + continue + # replace SampleFrame by UntrimmedSampleFrames + elif transform.type in ['SampleFrames', 'UniformSample']: + assert args.clip_interval is not None, \ + 'please specify clip interval for long video inference' + if is_recognizer2d: + # clip_len of UntrimmedSampleFrames is same as + # num_clips for 2D Recognizer. + clip_len = transform['num_clips'] + else: + clip_len = transform['clip_len'] + if frame_interval is None: + # take frame_interval of SampleFrames as default + frame_interval = transform.get('frame_interval') + assert frame_interval is not None, \ + 'please specify frame interval for long video ' \ + 'inference when use UniformSample or 2D Recognizer' + + sample_cfgs = dict( + type='UntrimmedSampleFrames', + clip_len=clip_len, + clip_interval=args.clip_interval, + frame_interval=frame_interval) + test_pipeline[idx] = sample_cfgs + continue + # flow input will stack all frames + if cfg.test_dataloader.dataset.get('modality') == 'Flow': + clip_len = 1 + + if is_recognizer2d: + from mmaction.models import ActionDataPreprocessor + from mmaction.registry import MODELS + + @MODELS.register_module() + class LongVideoDataPreprocessor(ActionDataPreprocessor): + """DataPreprocessor for 2D recognizer to infer on long video. + + Which would stack the num_clips to batch dimension, to preserve + feature of each clip (no average among clips) + """ + + def __init__(self, num_frames=8, **kwargs) -> None: + super().__init__(**kwargs) + self.num_frames = num_frames + + def preprocess(self, inputs, data_samples, training=False): + batch_inputs, data_samples = super().preprocess( + inputs, data_samples, training) + # [N*M, T, C, H, W] + nclip_batch_inputs = batch_inputs.view( + (-1, self.num_frames) + batch_inputs.shape[2:]) + # data_samples = data_samples * \ + # nclip_batch_inputs.shape[0] + return nclip_batch_inputs, data_samples + + preprocessor_cfg = cfg.model.data_preprocessor + preprocessor_cfg.type = 'LongVideoDataPreprocessor' + preprocessor_cfg['num_frames'] = clip_len + + # -------------------- Dump predictions -------------------- + args.dump = osp.join(args.output_prefix, 'total_feats.pkl') + dump_metric = dict(type='DumpResults', out_file_path=args.dump) + cfg.test_evaluator = [dump_metric] + cfg.work_dir = osp.join(args.output_prefix, 'work_dir') + + return cfg + + +def split_feats(args): + total_feats = load(args.dump) + if args.dump_score: + total_feats = [sample['pred_scores']['item'] for sample in total_feats] + + video_list = list_from_file(args.video_list) + video_list = [line.split(' ')[0] for line in video_list] + + for video_name, feature in zip(video_list, total_feats): + dump(feature, osp.join(args.output_prefix, video_name + '.pkl')) + os.remove(args.dump) + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + cfg = merge_args(cfg, args) + cfg.launcher = args.launcher + + cfg.load_from = args.checkpoint + + # build the runner from config + runner = Runner.from_cfg(cfg) + + # start testing + runner.test() + + split_feats(args) + + +if __name__ == '__main__': + main() diff --git a/tools/misc/dist_clip_feature_extraction.sh b/tools/misc/dist_clip_feature_extraction.sh new file mode 100644 index 0000000000000000000000000000000000000000..ebdf23cc090b15591a987b8161433381065978ef --- /dev/null +++ b/tools/misc/dist_clip_feature_extraction.sh @@ -0,0 
+1,12 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +PORT=${PORT:-29500} + +PYTHONPATH="$(dirname $0)/../..":$PYTHONPATH \ +# Arguments starting from the forth one are captured by ${@:4} +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/clip_feature_extraction.py $CONFIG $CHECKPOINT \ + --launcher pytorch ${@:4} diff --git a/tools/misc/flow_extraction.py b/tools/misc/flow_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..84866c74385dee2f88c99cb454158878d9623428 --- /dev/null +++ b/tools/misc/flow_extraction.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +import cv2 +import numpy as np + + +def flow_to_img(raw_flow, bound=20.): + """Convert flow to gray image. + + Args: + raw_flow (np.ndarray[float]): Estimated flow with the shape (w, h). + bound (float): Bound for the flow-to-image normalization. Default: 20. + + Returns: + np.ndarray[uint8]: The result list of np.ndarray[uint8], with shape + (w, h). + """ + flow = np.clip(raw_flow, -bound, bound) + flow += bound + flow *= (255 / float(2 * bound)) + flow = flow.astype(np.uint8) + return flow + + +def generate_flow(frames, method='tvl1'): + """Estimate flow with given frames. + + Args: + frames (list[np.ndarray[uint8]]): List of rgb frames, with shape + (w, h, 3). + method (str): Use which method to generate flow. Options are 'tvl1' + and 'farneback'. Default: 'tvl1'. + + Returns: + list[np.ndarray[float]]: The result list of np.ndarray[float], with + shape (w, h, 2). + """ + assert method in ['tvl1', 'farneback'] + gray_frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames] + + if method == 'tvl1': + tvl1 = cv2.optflow.DualTVL1OpticalFlow_create() + + def op(x, y): + return tvl1.calc(x, y, None) + elif method == 'farneback': + + def op(x, y): + return cv2.calcOpticalFlowFarneback(x, y, None, 0.5, 3, 15, 3, 5, + 1.2, 0) + + gray_st = gray_frames[:-1] + gray_ed = gray_frames[1:] + + flow = [op(x, y) for x, y in zip(gray_st, gray_ed)] + return flow + + +def extract_dense_flow(path, + dest, + bound=20., + save_rgb=False, + start_idx=0, + rgb_tmpl='img_{:05d}.jpg', + flow_tmpl='{}_{:05d}.jpg', + method='tvl1'): + """Extract dense flow given video or frames, save them as gray-scale + images. + + Args: + path (str): Location of the input video. + dest (str): The directory to store the extracted flow images. + bound (float): Bound for the flow-to-image normalization. Default: 20. + save_rgb (bool): Save extracted RGB frames. Default: False. + start_idx (int): The starting frame index if use frames as input, the + first image is path.format(start_idx). Default: 0. + rgb_tmpl (str): The template of RGB frame names, Default: + 'img_{:05d}.jpg'. + flow_tmpl (str): The template of Flow frame names, Default: + '{}_{:05d}.jpg'. + method (str): Use which method to generate flow. Options are 'tvl1' + and 'farneback'. Default: 'tvl1'. 
+ """ + + frames = [] + assert osp.exists(path) + video = cv2.VideoCapture(path) + flag, f = video.read() + while flag: + frames.append(f) + flag, f = video.read() + + flow = generate_flow(frames, method=method) + + flow_x = [flow_to_img(x[:, :, 0], bound) for x in flow] + flow_y = [flow_to_img(x[:, :, 1], bound) for x in flow] + + if not osp.exists(dest): + os.system('mkdir -p ' + dest) + flow_x_names = [ + osp.join(dest, flow_tmpl.format('x', ind + start_idx)) + for ind in range(len(flow_x)) + ] + flow_y_names = [ + osp.join(dest, flow_tmpl.format('y', ind + start_idx)) + for ind in range(len(flow_y)) + ] + + num_frames = len(flow) + for i in range(num_frames): + cv2.imwrite(flow_x_names[i], flow_x[i]) + cv2.imwrite(flow_y_names[i], flow_y[i]) + + if save_rgb: + img_names = [ + osp.join(dest, rgb_tmpl.format(ind + start_idx)) + for ind in range(len(frames)) + ] + for frame, name in zip(frames, img_names): + cv2.imwrite(name, frame) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract flow and RGB images') + parser.add_argument( + '--input', + help='videos for frame extraction, can be' + 'single video or a video list, the video list should be a txt file ' + 'and just consists of filenames without directories') + parser.add_argument( + '--prefix', + default='', + help='the prefix of input ' + 'videos, used when input is a video list') + parser.add_argument( + '--dest', + default='', + help='the destination to save ' + 'extracted frames') + parser.add_argument( + '--save-rgb', action='store_true', help='also save ' + 'rgb frames') + parser.add_argument( + '--rgb-tmpl', + default='img_{:05d}.jpg', + help='template filename of rgb frames') + parser.add_argument( + '--flow-tmpl', + default='{}_{:05d}.jpg', + help='template filename of flow frames') + parser.add_argument( + '--start-idx', + type=int, + default=1, + help='the start ' + 'index of extracted frames') + parser.add_argument( + '--method', + default='tvl1', + help='use which method to ' + 'generate flow') + parser.add_argument( + '--bound', type=float, default=20, help='maximum of ' + 'optical flow') + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + if args.input.endswith('.txt'): + lines = open(args.input).readlines() + lines = [x.strip() for x in lines] + videos = [osp.join(args.prefix, x) for x in lines] + dests = [osp.join(args.dest, x.split('.')[0]) for x in lines] + for video, dest in zip(videos, dests): + extract_dense_flow(video, dest, args.bound, args.save_rgb, + args.start_idx, args.rgb_tmpl, args.flow_tmpl, + args.method) + else: + extract_dense_flow(args.input, args.dest, args.bound, args.save_rgb, + args.start_idx, args.rgb_tmpl, args.flow_tmpl, + args.method) diff --git a/tools/slurm_test.sh b/tools/slurm_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..b8c515333af2fef573854fca82b739407ea3de85 --- /dev/null +++ b/tools/slurm_test.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +CHECKPOINT=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +PY_ARGS=${@:5} # Arguments starting from the fifth one are captured +SRUN_ARGS=${SRUN_ARGS:-""} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/test.py ${CONFIG} 
${CHECKPOINT} --launcher="slurm" ${PY_ARGS} diff --git a/tools/slurm_train.sh b/tools/slurm_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..cdcebfe09fa864020ace105cfdec54584e38c7ff --- /dev/null +++ b/tools/slurm_train.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +export MASTER_PORT=$((12000 + $RANDOM % 20000)) +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} +PY_ARGS=${@:4} # Any arguments from the forth one are captured by this + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py ${CONFIG} --launcher="slurm" ${PY_ARGS} diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..84f9d71f7611f73c869919cb0d06b207f64d0f05 --- /dev/null +++ b/tools/test.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.runner import Runner + +from mmaction.registry import RUNNERS + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--dump', + type=str, + help='dump predictions to a pickle file for offline evaluation') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--show-dir', + help='directory where the visualization images will be saved.') + parser.add_argument( + '--show', + action='store_true', + help='whether to display the prediction results in a window.') + parser.add_argument( + '--interval', + type=int, + default=1, + help='visualize per interval samples.') + parser.add_argument( + '--wait-time', + type=float, + default=2, + help='display time of every window. (second)') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def merge_args(cfg, args): + """Merge CLI arguments to config.""" + # -------------------- visualization -------------------- + if args.show or (args.show_dir is not None): + assert 'visualization' in cfg.default_hooks, \ + 'VisualizationHook is not set in the `default_hooks` field of ' \ + 'config. 
Please set `visualization=dict(type="VisualizationHook")`' + + cfg.default_hooks.visualization.enable = True + cfg.default_hooks.visualization.show = args.show + cfg.default_hooks.visualization.wait_time = args.wait_time + cfg.default_hooks.visualization.out_dir = args.show_dir + cfg.default_hooks.visualization.interval = args.interval + + # -------------------- Dump predictions -------------------- + if args.dump is not None: + assert args.dump.endswith(('.pkl', '.pickle')), \ + 'The dump file must be a pkl file.' + dump_metric = dict(type='DumpResults', out_file_path=args.dump) + if isinstance(cfg.test_evaluator, (list, tuple)): + cfg.test_evaluator = list(cfg.test_evaluator) + cfg.test_evaluator.append(dump_metric) + else: + cfg.test_evaluator = [cfg.test_evaluator, dump_metric] + + return cfg + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + cfg = merge_args(cfg, args) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start testing + runner.test() + + +if __name__ == '__main__': + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6d91268b8cf6d4352bafe695e9356907a2d063d4 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
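+# Example usage (the config path is a placeholder):
+#   python tools/train.py CONFIG.py --work-dir work_dirs/my_exp --seed 0
+# Resume from the latest checkpoint in the work directory:
+#   python tools/train.py CONFIG.py --resume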
+import argparse +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.runner import Runner + +from mmaction.registry import RUNNERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a action recognizer') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--amp', + action='store_true', + help='enable automatic-mixed-precision training') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + parser.add_argument( + '--auto-scale-lr', + action='store_true', + help='whether to auto scale the learning rate according to the ' + 'actual batch size and the original batch size.') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--diff-rank-seed', + action='store_true', + help='whether or not set different seeds for different ranks') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def merge_args(cfg, args): + """Merge CLI arguments to config.""" + if args.no_validate: + cfg.val_cfg = None + cfg.val_dataloader = None + cfg.val_evaluator = None + + cfg.launcher = args.launcher + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.get('type', 'OptimWrapper') + assert optim_wrapper in ['OptimWrapper', 'AmpOptimWrapper'], \ + '`--amp` is not supported custom optimizer wrapper type ' \ + f'`{optim_wrapper}.' 
+ cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.setdefault('loss_scale', 'dynamic') + + # resume training + if args.resume == 'auto': + cfg.resume = True + cfg.load_from = None + elif args.resume is not None: + cfg.resume = True + cfg.load_from = args.resume + + # enable auto scale learning rate + if args.auto_scale_lr: + cfg.auto_scale_lr.enable = True + + # set random seeds + if cfg.get('randomness', None) is None: + cfg.randomness = dict( + seed=args.seed, + diff_rank_seed=args.diff_rank_seed, + deterministic=args.deterministic) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + return cfg + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + + # merge cli arguments to config + cfg = merge_args(cfg, args) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/tools/visualizations/browse_dataset.py b/tools/visualizations/browse_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6ba694c583580e991ab8468a84df5a226f73384b --- /dev/null +++ b/tools/visualizations/browse_dataset.py @@ -0,0 +1,233 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import sys +import warnings +from copy import deepcopy + +import cv2 +import mmcv +import numpy as np +from mmengine.config import Config, DictAction +from mmengine.dataset import Compose +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar +from mmengine.visualization import Visualizer + +from mmaction.registry import DATASETS +from mmaction.visualization import ActionVisualizer +from mmaction.visualization.action_visualizer import _get_adaptive_scale + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + 'output_dir', default=None, type=str, help='output directory') + parser.add_argument('--label', default=None, type=str, help='label file') + parser.add_argument( + '--phase', + '-p', + default='train', + type=str, + choices=['train', 'test', 'val'], + help='phase of dataset to visualize, accept "train" "test" and "val".' + ' Defaults to "train".') + parser.add_argument( + '--show-number', + '-n', + type=int, + default=sys.maxsize, + help='number of images selected to visualize, must bigger than 0. if ' + 'the number is bigger than length of dataset, show all the images in ' + 'dataset; default "sys.maxsize", show all images in dataset') + parser.add_argument( + '--fps', + default=5, + type=int, + help='specify fps value of the output video when using rawframes to ' + 'generate file') + parser.add_argument( + '--mode', + '-m', + default='transformed', + type=str, + choices=['original', 'transformed', 'concat', 'pipeline'], + help='display mode; display original pictures or transformed pictures' + ' or comparison pictures. "original" means show images load from disk' + '; "transformed" means to show images after transformed; "concat" ' + 'means show images stitched by "original" and "output" images. ' + '"pipeline" means show all the intermediate images. 
' + 'Defaults to "transformed".') + parser.add_argument( + '--rescale-factor', + '-r', + type=float, + help='video rescale factor, which is useful if the output is too ' + 'large or too small.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def make_grid(videos, names, rescale_factor=None): + """Concat list of pictures into a single big picture, align height here.""" + vis = Visualizer() + + ori_shapes = [vid[0].shape[:2] for vid in videos] + if rescale_factor is not None: + videos = [[mmcv.imrescale(img, rescale_factor) for img in video] + for video in videos] + + max_height = int(max(vid[0].shape[0] for vid in videos) * 1.4) + min_width = min(vid[0].shape[1] for vid in videos) + horizontal_gap = min_width // 10 + img_scale = _get_adaptive_scale((max_height, min_width)) + + texts = [] + text_positions = [] + start_x = 0 + for i, vid in enumerate(videos): + for j, img in enumerate(vid): + pad_height = (max_height - img.shape[0]) // 2 + pad_width = horizontal_gap // 2 + # make border + videos[i][j] = cv2.copyMakeBorder( + img, + pad_height, + max_height - img.shape[0] - pad_height + + int(img_scale * 30 * 2), + pad_width, + pad_width, + cv2.BORDER_CONSTANT, + value=(255, 255, 255)) + + texts.append(f'{names[i]}\n{ori_shapes[i]}') + text_positions.append( + [start_x + img.shape[1] // 2 + pad_width, max_height]) + start_x += img.shape[1] + horizontal_gap + + out_frames = [] + for i in range(len(videos[0])): + imgs = [vid[i] for vid in videos] + display_img = np.concatenate(imgs, axis=1) + vis.set_image(display_img) + img_scale = _get_adaptive_scale(display_img.shape[:2]) + vis.draw_texts( + texts, + positions=np.array(text_positions), + font_sizes=img_scale * 7, + colors='black', + horizontal_alignments='center', + font_families='monospace') + out_frames.append(vis.get_image()) + return out_frames + + +class InspectCompose(Compose): + """Compose multiple transforms sequentially. + + And record "imgs" field of all results in one list. 
+ """ + + def __init__(self, transforms, intermediate_imgs): + super().__init__(transforms=transforms) + self.intermediate_imgs = intermediate_imgs + + def __call__(self, data): + + for idx, t in enumerate(self.transforms): + data = t(data) + if data is None: + return None + if 'imgs' in data: + name = t.__class__.__name__ + imgs = deepcopy(data['imgs']) + if name == 'FormatShape': + continue + if name == 'ThreeCrop': + n_crops = 3 + clip_len = len(imgs) // n_crops + crop_imgs = [ + imgs[idx * clip_len:(idx + 1) * clip_len] + for idx in range(n_crops) + ] + imgs = np.concatenate(crop_imgs, axis=1) + imgs = [img for img in imgs] + if name == 'TenCrop': + warnings.warn( + 'TenCrop is not supported, only show one crop') + self.intermediate_imgs.append({'name': name, 'imgs': imgs}) + return data + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + init_default_scope(cfg.get('default_scope', 'mmaction')) + + dataset_cfg = cfg.get(args.phase + '_dataloader').get('dataset') + dataset = DATASETS.build(dataset_cfg) + + intermediate_imgs = [] + dataset.pipeline = InspectCompose(dataset.pipeline.transforms, + intermediate_imgs) + + # init visualizer + vis_backends = [dict( + type='LocalVisBackend', + save_dir=args.output_dir, + )] + visualizer = ActionVisualizer( + vis_backends=vis_backends, save_dir='place_holder') + + if args.label: + labels = open(args.label).readlines() + labels = [x.strip() for x in labels] + visualizer.dataset_meta = dict(classes=labels) + + # init visualization video number + display_number = min(args.show_number, len(dataset)) + progress_bar = ProgressBar(display_number) + + for i, item in zip(range(display_number), dataset): + rescale_factor = args.rescale_factor + if args.mode == 'original': + video = intermediate_imgs[0]['imgs'] + elif args.mode == 'transformed': + video = intermediate_imgs[-1]['imgs'] + elif args.mode == 'concat': + ori_video = intermediate_imgs[0]['imgs'] + trans_video = intermediate_imgs[-1]['imgs'] + video = make_grid([ori_video, trans_video], + ['original', 'transformed'], rescale_factor) + rescale_factor = None + else: + video = make_grid([result['imgs'] for result in intermediate_imgs], + [result['name'] for result in intermediate_imgs], + rescale_factor) + rescale_factor = None + + intermediate_imgs.clear() + + data_sample = item['data_samples'].numpy() + + file_id = f'video_{i}' + video = [x[..., ::-1] for x in video] + visualizer.add_datasample( + file_id, video, data_sample, fps=args.fps, out_type='video') + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/visualizations/vis_cam.py b/tools/visualizations/vis_cam.py new file mode 100644 index 0000000000000000000000000000000000000000..a2ed917b09c198f18b317312d288bd5cdf2c60fa --- /dev/null +++ b/tools/visualizations/vis_cam.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
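As a usage reference for tools/visualizations/browse_dataset.py above: a hypothetical invocation that should write side-by-side original/transformed clips for a few training samples. It assumes the Kinetics-400 training annotation file and videos referenced by the config are present (the script builds the real dataset); the output directory is a placeholder, and the config is the TSN one added at the end of this diff.

# hypothetical dataset-browsing run; work_dirs/browse_kinetics is a placeholder output dir
python tools/visualizations/browse_dataset.py \
    tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
    work_dirs/browse_kinetics \
    --mode concat --show-number 5 --fps 5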
+import argparse +import os +import os.path as osp +from typing import Dict, List, Optional, Tuple + +import mmcv +import numpy as np +import torch.nn as nn +from mmengine import Config, DictAction +from mmengine.dataset import Compose, pseudo_collate + +from mmaction.apis import init_recognizer +from mmaction.utils import GradCAM + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 GradCAM Visualization') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('video', help='video file/url or rawframes directory') + parser.add_argument( + '--use-frames', + default=False, + action='store_true', + help='whether to use rawframes as input') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--target-layer-name', + type=str, + default='backbone/layer4/1/relu', + help='GradCAM target layer name') + parser.add_argument('--out-filename', default=None, help='output filename') + parser.add_argument('--fps', default=5, type=int) + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--target-resolution', + nargs=2, + default=None, + type=int, + help='Target resolution (w, h) for resizing the frames when using a ' + 'video as input. If either dimension is set to -1, the frames are ' + 'resized by keeping the existing aspect ratio') + parser.add_argument( + '--resize-algorithm', + default='bilinear', + help='resize algorithm applied to generate video & gif') + + args = parser.parse_args() + return args + + +def build_inputs(model: nn.Module, + video_path: str, + use_frames: bool = False) -> Dict: + """build inputs for GradCAM. + + Note that, building inputs for GradCAM is exactly the same as building + inputs for Recognizer test stage. Codes from `inference_recognizer`. + + Args: + model (nn.Module): Recognizer model. + video_path (str): video file/url or rawframes directory. + use_frames (bool): whether to use rawframes as input. + Defaults to False. + + Returns: + dict: Both GradCAM inputs and Recognizer test stage inputs, + including two keys, ``inputs`` and ``data_samples``. 
+ """ + if not (osp.exists(video_path) or video_path.startswith('http')): + raise RuntimeError(f"'{video_path}' is missing") + + if osp.isfile(video_path) and use_frames: + raise RuntimeError( + f"'{video_path}' is a video file, not a rawframe directory") + if osp.isdir(video_path) and not use_frames: + raise RuntimeError( + f"'{video_path}' is a rawframe directory, not a video file") + + cfg = model.cfg + + # build the data pipeline + test_pipeline = cfg.test_pipeline + test_pipeline = Compose(test_pipeline) + # prepare data + if use_frames: + filename_tmpl = cfg.test_dataloader.dataset.get( + 'filename_tmpl', 'img_{:05}.jpg') + start_index = cfg.test_dataloader.dataset.get('start_index', 1) + data = dict( + frame_dir=video_path, + total_frames=len(os.listdir(video_path)), + label=-1, + start_index=start_index, + filename_tmpl=filename_tmpl, + modality='RGB') + else: + start_index = cfg.test_dataloader.dataset.get('start_index', 0) + data = dict( + filename=video_path, + label=-1, + start_index=start_index, + modality='RGB') + data = test_pipeline(data) + data = pseudo_collate([data]) + + return data + + +def _resize_frames(frame_list: List[np.ndarray], + scale: Optional[Tuple[int]] = None, + keep_ratio: bool = True, + interpolation: str = 'bilinear') -> List[np.ndarray]: + """Resize frames according to given scale. + + Codes are modified from `mmaction/datasets/transforms/processing.py`, + `Resize` class. + + Args: + frame_list (list[np.ndarray]): Frames to be resized. + scale (tuple[int]): If keep_ratio is True, it serves as scaling + factor or maximum size: the image will be rescaled as large + as possible within the scale. Otherwise, it serves as (w, h) + of output size. + keep_ratio (bool): If set to True, Images will be resized without + changing the aspect ratio. Otherwise, it will resize images to a + given size. Defaults to True. + interpolation (str): Algorithm used for interpolation: + 'nearest' | 'bilinear'. Defaults to ``'bilinear'``. + + Returns: + list[np.ndarray]: Resized frames. 
+ """ + if scale is None or (scale[0] == -1 and scale[1] == -1): + return frame_list + scale = tuple(scale) + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + scale = (np.inf, max_long_edge) + + img_h, img_w, _ = frame_list[0].shape + + if keep_ratio: + new_w, new_h = mmcv.rescale_size((img_w, img_h), scale) + else: + new_w, new_h = scale + + frame_list = [ + mmcv.imresize(img, (new_w, new_h), interpolation=interpolation) + for img in frame_list + ] + + return frame_list + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + + # Build the recognizer from a config file and checkpoint file/url + model = init_recognizer(cfg, args.checkpoint, device=args.device) + + inputs = build_inputs(model, args.video, use_frames=args.use_frames) + gradcam = GradCAM(model, args.target_layer_name) + results = gradcam(inputs) + + if args.out_filename is not None: + try: + from moviepy.editor import ImageSequenceClip + except ImportError: + raise ImportError('Please install moviepy to enable output file.') + + # frames_batches shape [B, T, H, W, 3], in RGB order + frames_batches = (results[0] * 255.).numpy().astype(np.uint8) + frames = frames_batches.reshape(-1, *frames_batches.shape[-3:]) + + frame_list = list(frames) + frame_list = _resize_frames( + frame_list, + args.target_resolution, + interpolation=args.resize_algorithm) + + video_clips = ImageSequenceClip(frame_list, fps=args.fps) + out_type = osp.splitext(args.out_filename)[1][1:] + if out_type == 'gif': + video_clips.write_gif(args.out_filename) + else: + video_clips.write_videofile(args.out_filename, remove_temp=True) + + +if __name__ == '__main__': + main() diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..990f5c70704aaf8643ce8593689543140a0a89f5 --- /dev/null +++ b/tools/visualizations/vis_scheduler.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import os.path as osp +import re +from pathlib import Path +from unittest.mock import MagicMock + +import matplotlib.pyplot as plt +import rich +import torch.nn as nn +from mmengine.config import Config, DictAction +from mmengine.hooks import Hook +from mmengine.model import BaseModel +from mmengine.registry import init_default_scope +from mmengine.runner import Runner +from mmengine.visualization import Visualizer +from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn +from torch.utils.data import DataLoader + +from mmaction.utils import get_str_type + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Visualize a Dataset Pipeline') + parser.add_argument('config', help='config file path') + parser.add_argument( + '-p', + '--parameter', + type=str, + default='lr', + choices=['lr', 'momentum'], + help='The parameter to visualize its change curve, choose from' + '"lr" and "momentum". Defaults to "lr".') + parser.add_argument( + '-d', + '--dataset-size', + type=int, + help='The size of the dataset. 
If specify, `build_dataset` will ' + 'be skipped and use this size as the dataset size.') + parser.add_argument( + '-n', + '--ngpus', + type=int, + default=1, + help='The number of GPUs used in training.') + parser.add_argument( + '-s', + '--save-path', + type=Path, + help='The learning rate curve plot save path') + parser.add_argument( + '--log-level', + default='WARNING', + help='The log level of the handler and logger. Defaults to ' + 'WARNING.') + parser.add_argument('--title', type=str, help='title of figure') + parser.add_argument( + '--style', type=str, default='whitegrid', help='style of plt') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--window-size', + default='12*7', + help='Size of the window to display images, in format of "$W*$H".') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + if args.window_size != '': + assert re.match(r'\d+\*\d+', args.window_size), \ + "'window-size' must be in format 'W*H'." + + return args + + +class SimpleModel(BaseModel): + """simple model that do nothing in train_step.""" + + def __init__(self): + super(SimpleModel, self).__init__() + self.data_preprocessor = nn.Identity() + self.conv = nn.Conv2d(1, 1, 1) + + def forward(self, inputs, data_samples, mode='tensor'): + pass + + def train_step(self, data, optim_wrapper): + pass + + +class ParamRecordHook(Hook): + + def __init__(self, by_epoch): + super().__init__() + self.by_epoch = by_epoch + self.lr_list = [] + self.momentum_list = [] + self.task_id = 0 + self.progress = Progress(BarColumn(), MofNCompleteColumn(), + TextColumn('{task.description}')) + + def before_train(self, runner): + if self.by_epoch: + total = runner.train_loop.max_epochs + self.task_id = self.progress.add_task( + 'epochs', start=True, total=total) + else: + total = runner.train_loop.max_iters + self.task_id = self.progress.add_task( + 'iters', start=True, total=total) + self.progress.start() + + def after_train_epoch(self, runner): + if self.by_epoch: + self.progress.update(self.task_id, advance=1) + + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + if not self.by_epoch: + self.progress.update(self.task_id, advance=1) + self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) + self.momentum_list.append( + runner.optim_wrapper.get_momentum()['momentum'][0]) + + def after_train(self, runner): + self.progress.stop() + + +def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True): + """Plot learning rate vs iter graph.""" + try: + import seaborn as sns + sns.set_style(args.style) + except ImportError: + pass + + wind_w, wind_h = args.window_size.split('*') + wind_w, wind_h = int(wind_w), int(wind_h) + plt.figure(figsize=(wind_w, wind_h)) + + ax: plt.Axes = plt.subplot() + ax.plot(lr_list, linewidth=1) + + if by_epoch: + ax.xaxis.tick_top() + ax.set_xlabel('Iters') + ax.xaxis.set_label_position('top') + sec_ax = ax.secondary_xaxis( + 'bottom', + functions=(lambda x: x / iters_per_epoch, + lambda y: y * iters_per_epoch)) + sec_ax.set_xlabel('Epochs') + else: + plt.xlabel('Iters') + plt.ylabel(param_name) + + if 
args.title is None: + plt.title(f'{osp.basename(args.config)} {param_name} curve') + else: + plt.title(args.title) + + +def simulate_train(data_loader, cfg, by_epoch): + model = SimpleModel() + param_record_hook = ParamRecordHook(by_epoch=by_epoch) + default_hooks = dict( + param_scheduler=cfg.default_hooks['param_scheduler'], + runtime_info=None, + timer=None, + logger=None, + checkpoint=None, + sampler_seed=None, + param_record=param_record_hook) + + runner = Runner( + model=model, + work_dir=cfg.work_dir, + train_dataloader=data_loader, + train_cfg=cfg.train_cfg, + log_level=cfg.log_level, + optim_wrapper=cfg.optim_wrapper, + param_scheduler=cfg.param_scheduler, + default_scope=cfg.default_scope, + default_hooks=default_hooks, + auto_scale_lr=cfg.get('auto_scale_lr'), + visualizer=MagicMock(spec=Visualizer), + custom_hooks=cfg.get('custom_hooks', None)) + + runner.train() + + return param_record_hook.lr_list, param_record_hook.momentum_list + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + if cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.log_level = args.log_level + init_default_scope(cfg.get('default_scope', 'mmaction')) + + # make sure save_root exists + if args.save_path and not args.save_path.parent.exists(): + raise FileNotFoundError( + f'The save path is {args.save_path}, and directory ' + f"'{args.save_path.parent}' do not exist.") + + # init logger + print('Param_scheduler :') + rich.print_json(json.dumps(cfg.param_scheduler)) + + # prepare data loader + batch_size = cfg.train_dataloader.batch_size * args.ngpus + + if 'by_epoch' in cfg.train_cfg: + by_epoch = cfg.train_cfg.get('by_epoch') + elif 'type' in cfg.train_cfg: + by_epoch = get_str_type(cfg.train_cfg.get('by_epoch')) \ + == 'EpochBasedTrainLoop' + else: + raise ValueError('please set `train_cfg`.') + + if args.dataset_size is None and by_epoch: + from mmaction.registry import DATASETS + dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset)) + print(f'dataset is {dataset_size}') + else: + dataset_size = args.dataset_size or batch_size + + data_loader = DataLoader(range(dataset_size), batch_size) + assert len(data_loader) > 0, \ + 'Please decrease batchsize to make sure that ' \ + 'a epoch at least have one iteration!' 
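For reference, a hypothetical way to plot the schedule defined by a config with this script (tools/visualizations/vis_scheduler.py) without touching any data: passing --dataset-size skips building the real dataset, so the numbers below (an assumed 240000-sample training set spread over 8 GPUs) are illustrative only. The config is the TSN one added later in this diff.

# hypothetical schedule visualization; 240000 is an assumed dataset size, not a real count
python tools/visualizations/vis_scheduler.py \
    tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
    --dataset-size 240000 --ngpus 8 \
    --save-path lr_curve.png --not-show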
+ dataset_info = ( + f'\nDataset infos:' + f'\n - Dataset size: {dataset_size}' + f'\n - Batch size per GPU: {cfg.train_dataloader.batch_size}' + f'\n - Number of GPUs: {args.ngpus}' + f'\n - Total batch size: {batch_size}') + if by_epoch: + dataset_info += f'\n - Iterations per epoch: {len(data_loader)}' + rich.print(dataset_info + '\n') + + # simulation training process + lr_list, momentum_list = simulate_train(data_loader, cfg, by_epoch) + if args.parameter == 'lr': + param_list = lr_list + else: + param_list = momentum_list + + param_name = 'Learning Rate' if args.parameter == 'lr' else 'Momentum' + plot_curve(param_list, args, param_name, len(data_loader), by_epoch) + + if args.save_path: + plt.savefig(args.save_path) + print(f'\nThe {param_name} graph is saved at {args.save_path}') + + if not args.not_show: + plt.show() + + +if __name__ == '__main__': + main() diff --git a/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py b/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..df0b1af96081f71101e986c826b4d9b2d7d542b4 --- /dev/null +++ b/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py @@ -0,0 +1,237 @@ +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +auto_scale_lr = dict(base_batch_size=256, enable=False) +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +dataset_type = 'VideoDataset' +default_hooks = dict( + checkpoint=dict( + interval=3, max_keep_ckpts=3, save_best='auto', type='CheckpointHook'), + logger=dict(ignore_last=False, interval=20, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + runtime_info=dict(type='RuntimeInfoHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook'), + timer=dict(type='IterTimerHook')) +default_scope = 'mmaction' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +file_client_args = dict(io_backend='disk') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=20) +model = dict( + backbone=dict( + depth=50, + norm_eval=False, + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + type='ResNet'), + cls_head=dict( + average_clips='prob', + consensus=dict(dim=1, type='AvgConsensus'), + dropout_ratio=0.4, + in_channels=2048, + init_std=0.01, + num_classes=400, + spatial_type='avg', + type='TSNHead'), + data_preprocessor=dict( + format_shape='NCHW', + mean=[ + 123.675, + 116.28, + 103.53, + ], + std=[ + 58.395, + 57.12, + 57.375, + ], + type='ActionDataPreprocessor'), + test_cfg=None, + train_cfg=None, + type='Recognizer2D') +optim_wrapper = dict( + clip_grad=dict(max_norm=40, norm_type=2), + optimizer=dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0001)) +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + end=100, + gamma=0.1, + milestones=[ + 40, + 80, + ], + type='MultiStepLR'), +] +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + ann_file='data/kinetics400/kinetics400_val_list_videos.txt', + data_prefix=dict(video='data/kinetics400/videos_val'), + pipeline=[ + dict(io_backend='disk', type='DecordInit'), + dict( + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True, + type='SampleFrames'), + 
dict(type='DecordDecode'), + dict(scale=( + -1, + 256, + ), type='Resize'), + dict(crop_size=224, type='TenCrop'), + dict(input_format='NCHW', type='FormatShape'), + dict(type='PackActionInputs'), + ], + test_mode=True, + type='VideoDataset'), + num_workers=8, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict(type='AccMetric') +test_pipeline = [ + dict(io_backend='disk', type='DecordInit'), + dict( + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True, + type='SampleFrames'), + dict(type='DecordDecode'), + dict(scale=( + -1, + 256, + ), type='Resize'), + dict(crop_size=224, type='TenCrop'), + dict(input_format='NCHW', type='FormatShape'), + dict(type='PackActionInputs'), +] +train_cfg = dict( + max_epochs=100, type='EpochBasedTrainLoop', val_begin=1, val_interval=1) +train_dataloader = dict( + batch_size=32, + dataset=dict( + ann_file='data/kinetics400/kinetics400_train_list_videos.txt', + data_prefix=dict(video='data/kinetics400/videos_train'), + pipeline=[ + dict(io_backend='disk', type='DecordInit'), + dict( + clip_len=1, frame_interval=1, num_clips=8, + type='SampleFrames'), + dict(type='DecordDecode'), + dict(scale=( + -1, + 256, + ), type='Resize'), + dict( + input_size=224, + max_wh_scale_gap=1, + random_crop=False, + scales=( + 1, + 0.875, + 0.75, + 0.66, + ), + type='MultiScaleCrop'), + dict(keep_ratio=False, scale=( + 224, + 224, + ), type='Resize'), + dict(flip_ratio=0.5, type='Flip'), + dict(input_format='NCHW', type='FormatShape'), + dict(type='PackActionInputs'), + ], + type='VideoDataset'), + num_workers=8, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(io_backend='disk', type='DecordInit'), + dict(clip_len=1, frame_interval=1, num_clips=8, type='SampleFrames'), + dict(type='DecordDecode'), + dict(scale=( + -1, + 256, + ), type='Resize'), + dict( + input_size=224, + max_wh_scale_gap=1, + random_crop=False, + scales=( + 1, + 0.875, + 0.75, + 0.66, + ), + type='MultiScaleCrop'), + dict(keep_ratio=False, scale=( + 224, + 224, + ), type='Resize'), + dict(flip_ratio=0.5, type='Flip'), + dict(input_format='NCHW', type='FormatShape'), + dict(type='PackActionInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=32, + dataset=dict( + ann_file='data/kinetics400/kinetics400_val_list_videos.txt', + data_prefix=dict(video='data/kinetics400/videos_val'), + pipeline=[ + dict(io_backend='disk', type='DecordInit'), + dict( + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True, + type='SampleFrames'), + dict(type='DecordDecode'), + dict(scale=( + -1, + 256, + ), type='Resize'), + dict(crop_size=224, type='CenterCrop'), + dict(input_format='NCHW', type='FormatShape'), + dict(type='PackActionInputs'), + ], + test_mode=True, + type='VideoDataset'), + num_workers=8, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict(type='AccMetric') +val_pipeline = [ + dict(io_backend='disk', type='DecordInit'), + dict( + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True, + type='SampleFrames'), + dict(type='DecordDecode'), + dict(scale=( + -1, + 256, + ), type='Resize'), + dict(crop_size=224, type='CenterCrop'), + dict(input_format='NCHW', type='FormatShape'), + dict(type='PackActionInputs'), +] +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='ActionVisualizer', vis_backends=[ + dict(type='LocalVisBackend'), + ]) diff --git 
a/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth b/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth new file mode 100644 index 0000000000000000000000000000000000000000..7655e0930a9f8d79b633d7892f0c9ae1b3557f49 --- /dev/null +++ b/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2692d16c712e24994aaa3cfb48f957a521e053ffb81c474e2c0b3e579c888650 +size 97641409
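Finally, to tie the pieces of this diff together: a hypothetical GradCAM run that feeds the TSN config and the Git LFS checkpoint above to tools/visualizations/vis_cam.py. It assumes the LFS object has been pulled; the demo video path is a placeholder, writing --out-filename requires moviepy (as the script itself checks), and the default --device is cuda:0, so pass --device cpu on a machine without a GPU.

# hypothetical GradCAM visualization; demo.mp4 is a placeholder input video
python tools/visualizations/vis_cam.py \
    tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
    tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \
    demo.mp4 \
    --target-layer-name backbone/layer4/1/relu \
    --out-filename gradcam_tsn.mp4 --fps 5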