euijinrnd committed on
Commit
9de9fbf
·
verified ·
1 Parent(s): 5142365

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitignore +184 -0
  2. LICENSE +21 -0
  3. README.md +357 -0
  4. Untitled.ipynb +86 -0
  5. data/.gitignore +2 -0
  6. data/compute_dataset_stat.py +240 -0
  7. data/compute_dataset_stat_hdf5.py +100 -0
  8. data/episode_transform.py +406 -0
  9. data/filelock.py +24 -0
  10. data/hdf5_maniskill_dataset.py +243 -0
  11. data/hdf5_vla_dataset.py +533 -0
  12. data/preprocess.py +323 -0
  13. data/preprocess_scripts/__init__.py +73 -0
  14. data/preprocess_scripts/aloha_shoes_table.py +55 -0
  15. data/preprocess_scripts/austin_buds_dataset_converted_externally_to_rlds.py +82 -0
  16. data/preprocess_scripts/berkeley_autolab_ur5.py +95 -0
  17. data/preprocess_scripts/berkeley_cable_routing.py +73 -0
  18. data/preprocess_scripts/berkeley_gnm_sac_son.py +78 -0
  19. data/preprocess_scripts/berkeley_rpt_converted_externally_to_rlds.py +84 -0
  20. data/preprocess_scripts/calvin.py +176 -0
  21. data/preprocess_scripts/cmu_franka_exploration_dataset_converted_externally_to_rlds.py +75 -0
  22. data/preprocess_scripts/cmu_play_fusion.py +82 -0
  23. data/preprocess_scripts/cmu_stretch.py +84 -0
  24. data/preprocess_scripts/droid.py +78 -0
  25. data/preprocess_scripts/fractal20220817_data.py +92 -0
  26. data/preprocess_scripts/iamlab_cmu_pickup_insert_converted_externally_to_rlds.py +80 -0
  27. data/preprocess_scripts/libero_goal_no_noops.py +82 -0
  28. data/preprocess_scripts/libero_spatial_no_noops.py +82 -0
  29. data/preprocess_scripts/nyu_rot_dataset_converted_externally_to_rlds.py +82 -0
  30. data/preprocess_scripts/robo_net.py +71 -0
  31. data/preprocess_scripts/robomimic_lift_ph.py +97 -0
  32. data/preprocess_scripts/robomimic_square_ph.py +97 -0
  33. data/preprocess_scripts/roboset.py +367 -0
  34. data/preprocess_scripts/roboturk.py +77 -0
  35. data/preprocess_scripts/roboturk_real_objectsearch.py +217 -0
  36. data/preprocess_scripts/roboturk_real_towercreation.py +223 -0
  37. data/preprocess_scripts/stanford_hydra_dataset_converted_externally_to_rlds.py +94 -0
  38. data/preprocess_scripts/tokyo_u_lsmo_converted_externally_to_rlds.py +90 -0
  39. data/preprocess_scripts/utokyo_pr2_opening_fridge_converted_externally_to_rlds.py +92 -0
  40. data/preprocess_scripts/utokyo_xarm_bimanual_converted_externally_to_rlds.py +117 -0
  41. data/preprocess_scripts/viola.py +89 -0
  42. data/producer.py +280 -0
  43. data/utils.py +235 -0
  44. data/vla_dataset.py +147 -0
  45. encode_lang.py +60 -0
  46. finetune.sh +57 -0
  47. finetune_maniskill.sh +48 -0
  48. inference.sh +5 -0
  49. main.py +301 -0
  50. models/ema_model.py +89 -0
.gitignore ADDED
@@ -0,0 +1,184 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ # Some encoder paths
163
+ facebook/
164
+ openai/
165
+ google/
166
+
167
+ # Log
168
+ logs/
169
+
170
+ # Output
171
+ outs/
172
+
173
+ # Checkpoints
174
+ checkpoints/
175
+
176
+ # VSC
177
+ .vscode/
178
+
179
+ # Wandb
180
+ wandb/
181
+
182
+ # Distributed learning
183
+ hostfile.txt
184
+ .deepspeed_env
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 TSAIL group
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,357 @@
1
+ # RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation
2
+
3
+ ### 📝[Paper](https://arxiv.org/pdf/2410.07864) | 🌍[Project Page](https://rdt-robotics.github.io/rdt-robotics/) | 🤗[Model](https://huggingface.co/robotics-diffusion-transformer/rdt-1b) | 🛢️[Data](https://huggingface.co/datasets/robotics-diffusion-transformer/rdt-ft-data)
4
+
5
+ ![](./assets/head.png)
6
+
7
+ RDT-1B is a **1B**-parameter (*largest* to date) imitation learning **Diffusion Transformer** pre-trained on **1M+** (*largest* to date) multi-robot episodes. Given a language instruction and RGB images of up to three views, RDT can predict the next $64$ robot actions. RDT is inherently compatible with **almost all kinds of modern mobile manipulators**, from single-arm to dual-arm, joint to EEF, position to velocity, and even wheeled locomotion.
8
+
9
+ We have fine-tuned RDT on **6K+** (one of the *largest*) self-collected bimanual episodes and deployed it on the ALOHA **dual-arm** robot. It has achieved state-of-the-art performance in terms of dexterity, zero-shot generalizability, and few-shot learning. You can find demo videos on our [project page](https://rdt-robotics.github.io/rdt-robotics/).
10
+
11
+ This repo is an official PyTorch implementation of RDT, containing:
12
+
13
+ - 🛠️Model [implementation](models/rdt_runner.py) of RDT
14
+ - 🤗1M-step [checkpoint](https://huggingface.co/robotics-diffusion-transformer/rdt-1b) of RDT-1B pre-trained on multi-robot data
15
+ - 🤗500K-step [checkpoint](https://huggingface.co/robotics-diffusion-transformer/rdt-170m) of RDT-170M (RDT(small) in [ablation](https://arxiv.org/pdf/2410.07864))
16
+ - 📈Training and sampling [scripts](train/train.py) (with DeepSpeed)
17
+ - 🤖An [example](scripts/agilex_inference.py) of real-robot deployment
18
+ - 🕹️Simulation benchmark from [Maniskill](https://github.com/haosulab/ManiSkill) environment
19
+
20
+ The following guides include the [installation](#installation), [fine-tuning](#fine-tuning-on-your-own-dataset), and [deployment](#deployment-on-real-robots). Please refer to [pre-training](docs/pretrain.md) for a detailed list of pre-training datasets and a pre-training guide.
21
+
22
+ ## 📰 News
23
+ - [2024/12/17] 🔥 [Scripts](#simulation-benchmark) for evaluating RDT on the ManiSkill simulation benchmark are released!
24
+ - [2024/10/23] 🔥 The smaller **RDT-170M** model is released, a more VRAM-friendly option 🚀💻.
25
+
26
+ ## Installation
27
+
28
+ 1. Clone this repo and install prerequisites:
29
+
30
+ ```bash
31
+ # Clone this repo
32
+ git clone git@github.com:thu-ml/RoboticsDiffusionTransformer.git
33
+ cd RoboticsDiffusionTransformer
34
+
35
+ # Create a Conda environment
36
+ conda create -n rdt python=3.10.0
37
+ conda activate rdt
38
+
39
+ # Install pytorch
40
+ # Look up https://pytorch.org/get-started/previous-versions/ with your cuda version for a correct command
41
+ pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
42
+
43
+ # Install packaging
44
+ pip install packaging==24.0
45
+
46
+ # Install flash-attn
47
+ pip install flash-attn --no-build-isolation
48
+
49
+ # Install other prerequisites
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+ 2. Download off-the-shelf multi-modal encoders:
54
+
55
+ You can download the encoders from the following links:
56
+
57
+ - `t5-v1_1-xxl`: [link](https://huggingface.co/google/t5-v1_1-xxl/tree/main)🤗
58
+ - `siglip`: [link](https://huggingface.co/google/siglip-so400m-patch14-384)🤗
59
+
60
+ And link the encoders to the repo directory:
61
+
62
+ ```bash
63
+ # Under the root directory of this repo
64
+ mkdir -p google
65
+
66
+ # Link the downloaded encoders to this repo
67
+ ln -s /path/to/t5-v1_1-xxl google/t5-v1_1-xxl
68
+ ln -s /path/to/siglip-so400m-patch14-384 google/siglip-so400m-patch14-384
69
+ ```
70
+ 3. Fill the missing argument in [this file](configs/base.yaml#L22):
71
+
72
+ Note that this buffer will only be used during pre-training. See [this doc](docs/pretrain.md) for more details.
73
+ ```
74
+ # ...
75
+
76
+ dataset:
77
+ # ...
78
+ # ADD YOUR buf_path: the path to the buffer (at least 400GB)
79
+ buf_path: /path/to/buffer
80
+ # ...
81
+ ```
82
+
83
+ ## Fine-Tuning on Your Own Dataset
84
+
85
+ If your fine-tuning dataset is part of [Open X-Embodiment](https://robotics-transformer-x.github.io/) or of our pre-training dataset collection (see [this doc](docs/pretrain.md#download-and-prepare-datasets)), you can also fine-tune RDT through the pre-training pipeline; you only need to remove the redundant datasets from the parameters. See [this guide](docs/pretrain.md) on pre-training.
86
+
87
+ 1. Prepare your dataset:
88
+
89
+ You need to download your dataset to the disk and give it a name `my_cool_dataset`.
90
+
91
+ Then, you can link your dataset to the repo directory:
92
+
93
+ ```bash
94
+ # Under the root directory of this repo
95
+ cd data
96
+ mkdir -p datasets
97
+
98
+ # Link the downloaded dataset to this repo
99
+ ln -s /path/to/my_cool_dataset datasets/my_cool_dataset
100
+ ```
101
+
102
+ 2. Implement the dataset loader:
103
+
104
+ You need to:
105
+
106
+ 1. Register the configuration of `my_cool_dataset`:
107
+
108
+ Append the control frequency of `my_cool_dataset` in [this file](configs/dataset_control_freq.json). Write the name of `my_cool_dataset` in [this file](configs/finetune_datasets.json) and [this file](configs/finetune_sample_weights.json), where the value of the sampling weight doesn't matter since you only have one dataset. In these two files, we leave a placeholder of `agilex`; you can simply replace it with `my_cool_dataset`.
109
+
110
+ 2. Re-Implement the class of `HDF5VLADataset`:
111
+
112
+ You can find this class in [this file](data/hdf5_vla_dataset.py). In this file, we provide an example of loading the fine-tuning dataset used in our paper (see [this link](https://huggingface.co/datasets/robotics-diffusion-transformer/rdt-ft-data)).
113
+
114
+ To adapt it to your dataset, you need to: (a) modify `HDF5_DIR` (the directory of `my_cool_dataset`) and `DATASET_NAME` (which should be `"my_cool_dataset"`) in L21 and L22; and (b) implement the two functions `parse_hdf5_file()` and `parse_hdf5_file_state_only()`. Please take a look at the original file for detailed comments and examples.
115
+
116
+ Note 1: Despite its name, you don't necessarily need to use HDF5 to store your data. Just make sure that the class is correctly implemented.
117
+
118
+ Note 2: During implementation, you may need to fill your robot's action into the unified action vector (L180-194); see the sketch after these notes. Please refer to [this file](configs/state_vec.py) for an explanation of each element in the unified vector. We have reserved enough slots for each physical quantity. For example, we have reserved ten slots for joint angles; if your robot arm has six degrees of freedom, you only need to fill in the first six.
119
+
120
+ **IMPORTANT 1:** If your robot is single-arm, please fill its action into the *right-arm* portion of the unified action vector, aligning with our pre-training datasets.
121
+
122
+ **IMPORTANT 2:** We use [6D representation](https://arxiv.org/pdf/1812.07035) for EEF rotation. If your action space contains EEF rotation (angle or quaternion), please refer to [this file](docs/test_6drot.py) for conversion. We note that this mapping is not reversible. Different Euler angles may be equivalent and correspond to the same 6D representation.
123
+
124
+ **IMPORTANT 3:** No physical quantities (except the gripper width) are normalized during pre-training. This can preserve each physical quantity's meaning, thereby promoting generalization across robots. Therefore, we encourage you not to normalize any physical quantities but to choose appropriate units for them. Generally, we use the International System of Units, which ensures that most values fall within [-1,1]. As an exception, we perform min-max normalization on the gripper width to [0,1].
125
+
126
+ **IMPORTANT 4:** If you use RTX 4090 (or lower), the GPU memory may be too low to load the `t5-v1_1-xxl` encoder. Instead, we recommend you precompute the language embeddings (see [this file](scripts/encode_lang_batch.py) for an example script) and load them during training. In this way, you need to specify the path to the embeddings in the `HDF5VLADataset` (see L148) rather than the natural language.
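To make Note 2 and IMPORTANT 1-3 above concrete, here is a minimal sketch of filling a 6-DoF single-arm robot's state into the unified vector and converting an EEF rotation matrix to the 6D representation. It assumes `STATE_VEC_IDX_MAPPING` from `configs/state_vec.py`; the key names used here (e.g., `right_arm_joint_0_pos`, `right_gripper_open`) are illustrative, so check the actual keys in that file and the exact 6D convention in `docs/test_6drot.py`.

```python
import numpy as np
from configs.state_vec import STATE_VEC_IDX_MAPPING

# Dimension of the unified state/action vector (largest index + 1).
UNI_STATE_DIM = max(STATE_VEC_IDX_MAPPING.values()) + 1

def fill_in_state(joint_pos: np.ndarray, gripper_width: np.ndarray) -> np.ndarray:
    """joint_pos: (T, 6) joint angles in radians (kept unnormalized, per IMPORTANT 3);
    gripper_width: (T,) min-max normalized to [0, 1]."""
    T = joint_pos.shape[0]
    state = np.zeros((T, UNI_STATE_DIM), dtype=np.float32)
    # A single-arm robot goes into the *right-arm* slots (IMPORTANT 1).
    for j in range(joint_pos.shape[1]):
        state[:, STATE_VEC_IDX_MAPPING[f"right_arm_joint_{j}_pos"]] = joint_pos[:, j]
    state[:, STATE_VEC_IDX_MAPPING["right_gripper_open"]] = gripper_width
    return state

def rotation_matrix_to_6d(rot_mat: np.ndarray) -> np.ndarray:
    """6D rotation representation (IMPORTANT 2): the first two columns of the
    rotation matrix, column by column. Verify the ordering against docs/test_6drot.py."""
    return np.concatenate([rot_mat[..., :, 0], rot_mat[..., :, 1]], axis=-1)
```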
127
+
128
+ 3. Compute the dataset statistics information for `my_cool_dataset`:
129
+
130
+ ```bash
131
+ # Under the root directory of this repo
132
+ # Use -h to see the full usage
133
+ python -m data.compute_dataset_stat_hdf5
134
+ ```
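After the script finishes, `configs/dataset_stat.json` contains one entry per dataset with per-dimension `state_mean`, `state_std`, `state_min`, and `state_max` over the unified state vector (see `data/compute_dataset_stat_hdf5.py` further down in this diff). A quick sanity check, assuming the dataset name registered above:

```python
import json

with open("configs/dataset_stat.json") as f:
    stats = json.load(f)

entry = stats["my_cool_dataset"]
print(len(entry["state_mean"]))                       # dimension of the unified state vector
print(entry["state_min"][:6], entry["state_max"][:6])  # ranges of the first few dimensions
```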
135
+
136
+ 3. Start fine-tuning:
137
+
138
+ Configurations relevant to model architecture and data processing are in [this file](configs/base.yaml). Normally, you should not modify them; doing so may cause errors when loading the pre-trained checkpoint. Configurations relevant to training are passed through *command-line arguments*. Use `python main.py -h` to see their descriptions. We provide an example fine-tuning script in [this file](finetune.sh) (`finetune.sh`). You may need to modify some of the parameters in this file, such as `CUTLASS_PATH` and `WANDB_PROJECT`.
139
+
140
+ Use this to start fine-tuning:
141
+
142
+ ```bash
143
+ source finetune.sh
144
+ ```
145
+
146
+ with `finetune.sh` detailed as below:
147
+
148
+ ```bash
149
+ deepspeed --hostfile=hostfile.txt main.py \
150
+ --deepspeed="./configs/zero2.json" \ # If you want to use DeepSpeed, which is strongly recommended
151
+ --pretrained_model_name_or_path=<MODEL ID | DIRECTORY OF MODEL WEIGHTS | PATH TO MODEL CHECKPOINT> \
152
+ --pretrained_text_encoder_name_or_path=<MODEL ID | PATH TO MODEL DIRECTORY > \ # e.g., google/t5-v1_1-xxl
153
+ --pretrained_vision_encoder_name_or_path=<MODEL ID | PATH TO MODEL DIRECTORY> \ # e.g., google/siglip-so400m-patch14-384
154
+ --output_dir=<DIRECTORY to SAVE CHECKPOINTS> \ # e.g., checkpoints/rdt-1b-agilex
155
+ --train_batch_size=32 \
156
+ --sample_batch_size=64 \ # batch size for diffusion sampling in validation
157
+ --max_train_steps=200000 \
158
+ --checkpointing_period=1000 \
159
+ --sample_period=500 \ # sample period for validation
160
+ --checkpoints_total_limit=40 \
161
+ --lr_scheduler="constant" \
162
+ --learning_rate=1e-4 \
163
+ --mixed_precision="bf16" \ # If you want to use mixed precision, bf16 is recommended
164
+ --dataloader_num_workers=8 \
165
+ --image_aug \ # If you want to use image augmentation
166
+ --dataset_type="finetune" \
167
+ --state_noise_snr=40 \ # If you want to add noise to the state
168
+ --load_from_hdf5 \ # If you use HDF5 to store your data
169
+ --report_to=wandb
170
+ ```
171
+
172
+ **IMPORTANT**: If you have already chosen to precompute the language embeddings, please specify `--precomp_lang_embed` in the `finetune.sh`.
173
+
174
+ Note 1: `pretrained_model_name_or_path` can be one of:
175
+
176
+ - a string, the *model id* of a pre-trained model hosted inside a model repo on HuggingFace. Please fill with `"robotics-diffusion-transformer/rdt-1b"`, which is the officially-released [RDT-1B model](https://huggingface.co/robotics-diffusion-transformer/rdt-1b)🤗 at HuggingFace. (recommended)
177
+ - a string, the path to a *directory* containing the manually downloaded model weights from HuggingFace, e.g., `"/path/to/rdt-1b"`. You should first manually download the `rdt-1b` directory from this [link](https://huggingface.co/robotics-diffusion-transformer/rdt-1b)🤗.
178
+ - a string, the path to a *directory* containing model weights saved using [`~RDTRunner.save_pretrained`] method. This can be either:
179
+ - `"checkpoints/rdt-pretrain-1b/checkpoint-<STEP NUMBER>"`: This is the path to the checkpoint saved in the `<STEP NUMBE>` iteration during pre-training. Refer to [this file](docs/pretrain.md) for a tutorial on how to start your own pre-training.
180
+ - `"checkpoints/rdt-pretrain-1b"`: If the pre-training completes normally without any exception, you can specify this path to load the last checkpoint.
181
+ - a string, the path to model checkpoint (`*.pt`) saved by DeepSpeed, e.g., `"checkpoints/rdt-pretrain-1b/checkpoint-<STEP NUMBER>/pytorch_model/mp_rank_00_model_states.pt"` (verified)
182
+ - `None` if you want to randomly initialize the model using configuration at `config_path`.
183
+
184
+ Note 2: You can monitor the training process by observing `loss` (through a long window moving average) and `overall_avg_sample_mse` in [Wandb](https://wandb.ai/site) or [TensorBoard](https://www.tensorflow.org/tensorboard). We empirically found that the lower the `overall_avg_sample_mse`, the better the model performs. Usually, fine-tuning is over when this value converges.
185
+
186
+ Note 3: If the training oscillates, you can increase the batch size by adding more GPUs or setting a larger `--gradient_accumulation_steps`.
187
+
188
+ ## Deployment on Real-Robots
189
+
190
+ We have encapsulated the inference of the model into a class named `RoboticDiffusionTransformerModel` (see [this file](scripts/agilex_model.py#L38)). You can call this class's `step()` method for inference. However, you may need to re-implement some parts according to your specific robot. You should at least modify the `_format_joint_to_state()` (L164) and `_unformat_action_to_joint()` (L196) to convert between robot raw actions and unified action vectors that RDT accepts. You may also specify the control frequency of your robot (L49).
191
+
192
+ **IMPORTANT**: When you feed the images into `step()`, remember the order MUST be `[ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1}, ext_{t}, right_wrist_{t}, left_wrist_{t}]`.
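A minimal sketch (not the repo's deployment code) of keeping a two-frame history per camera and assembling the images in exactly this order; the variable names and the `step()` keyword arguments are assumptions, so check `scripts/agilex_model.py` and `scripts/agilex_inference.py` for the real interface:

```python
from collections import deque

# One two-frame buffer per camera: index 0 holds frame t-1, index 1 holds frame t.
ext_hist = deque(maxlen=2)          # exterior camera
right_wrist_hist = deque(maxlen=2)  # right-wrist camera
left_wrist_hist = deque(maxlen=2)   # left-wrist camera

def build_image_list():
    """Return images in the required order:
    [ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1}, ext_t, right_wrist_t, left_wrist_t]."""
    return [
        ext_hist[0], right_wrist_hist[0], left_wrist_hist[0],  # frames at t-1
        ext_hist[1], right_wrist_hist[1], left_wrist_hist[1],  # frames at t
    ]

# Inside the control loop (pseudo-usage; `policy` is a RoboticDiffusionTransformerModel):
# ext_hist.append(read_ext_camera()); right_wrist_hist.append(...); left_wrist_hist.append(...)
# actions = policy.step(proprio=state_vec, images=build_image_list(), text_embeds=lang_embed)
```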
193
+
194
+ We provide example hardware code in [this file](scripts/agilex_inference.py) for deployment on Mobile ALOHA, and the corresponding running script in [this file](inference.sh) (`inference.sh`), which is detailed below:
195
+
196
+ ```bash
197
+ python -m scripts.agilex_inference \
198
+ --use_actions_interpolation \
199
+ --pretrained_model_name_or_path=<PATH TO MODEL CHECKPOINT> \ # your finetuned checkpoint: e.g., checkpoints/rdt-finetune-1b/checkpoint-<STEP NUMBER>, checkpoints/rdt-finetune-1b/checkpoint-<STEP NUMBER>/pytorch_model/mp_rank_00_model_states.pt, the same before
200
+ --lang_embeddings_path=<PATH TO YOUR INSTRUCTION EMBEDDINGS> \ # e.g., outs/lang_embeddings/your_instr.pt
201
+ --ctrl_freq=25 # your control frequency
202
+ ```
203
+
204
+ **IMPORTANT**: If your on-board GPU memory is not enough to encode the language, please refer to [this file](scripts/encode_lang.py) for precomputation and specify the language embedding path in `inference.sh`. Detailed instructions are provided below:
205
+
206
+ 1. Set Required Parameters in `scripts/encode_lang.py`
207
+
208
+ ```python
209
+ # ...
210
+
211
+ GPU = 0
212
+ MODEL_PATH = "google/t5-v1_1-xxl"
213
+ CONFIG_PATH = "configs/base.yaml"
214
+ SAVE_DIR = "outs/" # output directory
215
+
216
+ # Modify this to your task name and instruction
217
+ TASK_NAME = "handover_pan"
218
+ INSTRUCTION = "Pick up the black marker on the right and put it into the packaging box on the left."
219
+
220
+ # Note: if your GPU VRAM is less than 24GB,
221
+ # it is recommended to enable offloading by specifying an offload directory.
222
+ OFFLOAD_DIR = None # Specify your offload directory here, ensuring the directory exists.
223
+
224
+ # ...
225
+ ```
226
+
227
+ 2. Run the script
228
+ ```
229
+ python -m scripts.encode_lang
230
+ ```
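The script saves the encoded instruction as a `.pt` file under `SAVE_DIR`. The on-disk structure is defined by `scripts/encode_lang.py`, so inspect it rather than assuming a fixed format; a hedged sketch (the path is only an example):

```python
import torch

emb = torch.load("outs/lang_embeddings/your_instr.pt", map_location="cpu")
if isinstance(emb, dict):
    # Look for the embedding tensor among the saved fields.
    for key, value in emb.items():
        print(key, getattr(value, "shape", type(value)))
else:
    # A plain tensor of shape (num_tokens, hidden_dim) from the T5-XXL encoder.
    print(emb.shape)
```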
231
+
232
+ Note: If you want to deploy on the Mobile ALOHA robot, don't forget to install the hardware prerequisites (see [this repo](https://github.com/MarkFzp/mobile-aloha)).
233
+
234
+ ## Simulation Benchmark
235
+
236
+ We comprehensively evaluate RDT against baseline methods using the ManiSkill simulation benchmark. Specifically, we focus on five benchmark tasks: `PegInsertionSide`, `PickCube`, `StackCube`, `PlugCharger`, and `PushCube`. Here's a brief overview of the evaluation setup:
237
+
238
+ **Evaluation Setup:**
239
+
240
+ 1. **Install ManiSkill:**
241
+ Within the [RDT environment](#installation), install ManiSkill as follows:
242
+ ```bash
243
+ conda activate rdt
244
+ pip install --upgrade mani_skill
245
+ ```
246
+
247
+ 2. **Configure Vulkan:**
248
+ Follow the [ManiSkill documentation](https://maniskill.readthedocs.io/en/latest/user_guide/getting_started/installation.html#vulkan) to properly set up Vulkan.
249
+
250
+ 3. **Obtain Model Weights:**
251
+ Download the fine-tuned model weights from [this Hugging Face repository](https://huggingface.co/robotics-diffusion-transformer/maniskill-model/tree/main/rdt). Download the precomputed language embeddings from [here](https://huggingface.co/robotics-diffusion-transformer/maniskill-model/tree/main/lang_embeds) to the root directory of this repo.
252
+
253
+ 4. **Run Evaluation Scripts:**
254
+ After completing the setup steps, execute the provided evaluation scripts to assess RDT on the selected tasks.
255
+
256
+ ```
257
+ conda activate rdt
258
+ python -m eval_sim.eval_rdt_maniskill \
259
+ --pretrained_path PATH_TO_PRETRAINED_MODEL
260
+ ```
261
+
262
+ ### Implementation Details
263
+
264
+ #### Data
265
+
266
+ Utilizing the [official ManiSkill repository](https://github.com/haosulab/ManiSkill), we generated 5,000 trajectories through motion planning. The initial action mode of these trajectories is absolute joint-position control; we subsequently converted them to delta end-effector pose control to align with the pre-training action spaces of OpenVLA and Octo. We strictly adhered to the official codebases of OpenVLA and Octo, modifying only the dataset-loading scripts, and fine-tuned both models on the delta end-effector pose data. For RDT and Diffusion-Policy, we use joint-position control data for training, which is also aligned with our pre-training stage.
267
+
268
+ #### Training
269
+ - OpenVLA is fine-tuned from the officially released pre-trained checkpoint with LoRA rank 32 until convergence.
270
+ - Octo is fine-tuned from the officially released pre-trained checkpoint for 1M iterations until convergence.
271
+ - Diffusion-Policy is trained from scratch for 1,000 epochs. We select the checkpoint at epoch 700, which has the lowest validation sample loss (1e-3).
272
+ - RDT is fine-tuned from our released pre-trained checkpoint for 300K iterations.
273
+
274
+ #### Results
275
+
276
+ Each method is evaluated over 250 trials (10 random seeds with 25 trials per seed). The quantitative results, including the mean and standard deviation of the success rate across the 10 random seeds, are presented below:
277
+
278
+
279
+ ||PegInsertionSide|PickCube|StackCube|PlugCharger|PushCube|Mean|
280
+ |---|---|---|---|---|---|---|
281
+ |RDT|**13.2±0.29%**|**77.2±0.48%**|74.0±0.30%|**1.2±0.07%**|**100±0.00%**|**53.6±0.52%**|
282
+ |OpenVLA|0.0±0.00%|8.0±0.00%|8.0±0.00%|0.0±0.00%|8.0±0.00%|4.8±0.00%|
283
+ |Octo|0.0±0.00%|0.0±0.00%|0.0±0.00%|0.0±0.00%|0.0±0.00%|0.0±0.00%|
284
+ |Diffusion-Policy|0.0±0.00%|40.0±0.00%|**80.0±0.00%**|0.0±0.00%|88.0±0.00%|30.2±0.00%|
285
+
286
+ #### Finetune RDT with Maniskill Data
287
+
288
+ To fine-tune RDT with Maniskill data, first download the Maniskill data from [here](https://huggingface.co/robotics-diffusion-transformer/maniskill-model) and extract it to `data/datasets/rdt-ft-data`. Then copy the code in `data/hdf5_vla_dataset.py` to `data/hdf5_maniskill_dataset.py` and run the following script:
289
+
290
+ ```
291
+ bash finetune_maniskill.sh
292
+ ```
293
+
294
+ #### Reproducing Baseline Results
295
+
296
+ Download and extract the fine-tuned model weights from [here](https://huggingface.co/robotics-diffusion-transformer/maniskill-model) to `eval_sim/`.
297
+
298
+ - OpenVLA: Clone [OpenVLA repo](https://github.com/openvla/openvla) in `./eval_sim/` and install its environment & ManiSkill. Then run the following script:
299
+ ```
300
+ python -m eval_sim.eval_openvla --pretrained_path PATH_TO_PRETRAINED_MODEL
301
+ ```
302
+ - Octo: Clone the [Octo repo](https://github.com/octo-models/octo.git) in `./eval_sim/` and install its environment & ManiSkill. Then run the following script:
303
+ ```
304
+ python -m eval_sim.eval_octo --pretrained_path PATH_TO_PRETRAINED_MODEL
305
+ ```
306
+ - Diffusion-Policy: Clone our simplified [Diffusion-Policy repo](https://github.com/LBG21/RDT-Eval-Diffusion-Policy) in `./eval_sim/` and run:
307
+ ```
308
+ python -m eval_sim.eval_dp --pretrained_path PATH_TO_PRETRAINED_MODEL
309
+ ```
310
+
311
+ ## FAQ
312
+
313
+ ### 1. How can I fine-tune RDTs with limited VRAM?
314
+
315
+ - **Use a Smaller Model**: Opt for the [RDT-170M model](https://huggingface.co/robotics-diffusion-transformer/rdt-170m), which requires less VRAM.
316
+
317
+ - **Select a Memory-Efficient ZeRO Stage**: Choose a more memory-efficient ZeRO stage based on your needs:
318
+ - **ZeRO-3 with Offload** > **ZeRO-3** > **ZeRO-2 with Offload** > **ZeRO-2** > **ZeRO-1**
319
+ - By default, we use [ZeRO-2](https://github.com/thu-ml/RoboticsDiffusionTransformer/blob/c68398ed526733faca4eec52cc1a7d15a9f8fea7/finetune.sh#L29) for a balance between speed and memory efficiency. Find more details on ZeRO stages [here](https://huggingface.co/docs/transformers/main/deepspeed#select-a-zero-stage) and [here](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training).
320
+
321
+ - **Enable 8-bit Adam Optimization**: Activate 8-bit Adam by setting [`use_8bit_adam=True`](https://github.com/thu-ml/RoboticsDiffusionTransformer/blob/c68398ed526733faca4eec52cc1a7d15a9f8fea7/main.py#L195) for reduced memory usage during training.
322
+
323
+ - **Apply 4-bit or 8-bit Quantization**: Quantizing model weights can significantly reduce VRAM requirements.
324
+
325
+ - **Use [XFormers](https://github.com/facebookresearch/xformers)**: This library provides optimized transformers with efficient memory usage.
326
+
327
+ - **Enable Gradient Checkpointing**: Implement `gradient_checkpointing` manually to save memory during backpropagation. See [here](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html) for instructions. Once you have successfully implemented this feature, we welcome you to submit a PR👏.
328
+ - **Gradient Accumulation**: Set a larger `--gradient_accumulation_steps=<num_steps>`. This will accumulate the gradients of `<num_steps>` batches for backpropagation. Equivalently, this will increase the batch size by `<num_steps>` times, at the cost of `<num_steps>` times the running time.
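For example (illustrative numbers, assuming `--train_batch_size` is the per-GPU batch size): with 8 GPUs, `--train_batch_size=32`, and `--gradient_accumulation_steps=4`, the effective batch size is 8 × 32 × 4 = 1024, and each optimizer step takes roughly 4× as long.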
329
+
330
+ ### 2. How many steps are recommended for fine-tuning RDT?
331
+
332
+ Regardless of the batch size you select, it is recommended to train for at least 150K steps to achieve optimal results.
333
+
334
+ ### 3. What to do if t5-xxL is too large to store in GPU memory?
335
+
336
+ 1. Do not load T5-XXL in your GPU memory when training. Pre-compute language embeddings in advance.
337
+ 2. Set `OFFLOAD_DIR` to enable CPU offloading in `scripts/encode_lang_batch.py` and `scripts/encode_lang.py`.
338
+ 3. Use a smaller version of T5, such as t5-base, instead of t5-xxl.
339
+
340
+ ## Citation
341
+
342
+ If you find our work helpful, please cite us:
343
+
344
+ ```bibtex
345
+ @article{liu2024rdt,
346
+ title={RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation},
347
+ author={Liu, Songming and Wu, Lingxuan and Li, Bangguo and Tan, Hengkai and Chen, Huayu and Wang, Zhengyi and Xu, Ke and Su, Hang and Zhu, Jun},
348
+ journal={arXiv preprint arXiv:2410.07864},
349
+ year={2024}
350
+ }
351
+ ```
352
+
353
+ Thank you!
354
+
355
+ ## License
356
+
357
+ All the code, model weights, and data are licensed under [MIT license](./LICENSE).
Untitled.ipynb ADDED
@@ -0,0 +1,86 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "id": "71e6c6b4-1e9b-4abb-b36e-80f7fa919c93",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import h5py\n",
11
+ "import os\n",
12
+ "import fnmatch"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 5,
18
+ "id": "6bc2e70e-970c-4874-8ce2-5950acbf74d6",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "dataset_name = 'singlevla_benchmark'\n",
23
+ "HDF5_DIR = f\"/home/shared/{dataset_name}/\"\n",
24
+ "DATASET_NAME = dataset_name\n",
25
+ "\n",
26
+ "file_paths = []\n",
27
+ "for root, _, files in os.walk(HDF5_DIR):\n",
28
+ " for filename in fnmatch.filter(files, '*.hdf5'):\n",
29
+ " file_path = os.path.join(root, filename)\n",
30
+ " file_paths.append(file_path)"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 7,
36
+ "id": "a43097df-bd96-4300-9473-748ce19406c4",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "f = h5py.File(file_paths[0], 'r')"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 9,
46
+ "id": "3bb5763f-4156-4dba-ac19-66361837afbd",
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "data": {
51
+ "text/plain": [
52
+ "<KeysViewHDF5 ['ee_pos', 'joint_pos', 'leftview_image', 'rightview_image']>"
53
+ ]
54
+ },
55
+ "execution_count": 9,
56
+ "metadata": {},
57
+ "output_type": "execute_result"
58
+ }
59
+ ],
60
+ "source": [
61
+ "f['observation'].keys()"
62
+ ]
63
+ }
64
+ ],
65
+ "metadata": {
66
+ "kernelspec": {
67
+ "display_name": "Python 3 (ipykernel)",
68
+ "language": "python",
69
+ "name": "python3"
70
+ },
71
+ "language_info": {
72
+ "codemirror_mode": {
73
+ "name": "ipython",
74
+ "version": 3
75
+ },
76
+ "file_extension": ".py",
77
+ "mimetype": "text/x-python",
78
+ "name": "python",
79
+ "nbconvert_exporter": "python",
80
+ "pygments_lexer": "ipython3",
81
+ "version": "3.10.15"
82
+ }
83
+ },
84
+ "nbformat": 4,
85
+ "nbformat_minor": 5
86
+ }
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ # Ignore data files
2
+ datasets
data/compute_dataset_stat.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ This file will compute the min, max, mean, and standard deviation of each dataset
3
+ in `pretrain_datasets.json` or `finetune_datasets.json`.
4
+ """
5
+
6
+ import json
7
+ import argparse
8
+ import os
9
+ # from multiprocessing import Pool, Manager
10
+
11
+ import tensorflow as tf
12
+ import numpy as np
13
+ from tqdm import tqdm
14
+
15
+ from data.vla_dataset import VLADataset
16
+ from data.hdf5_vla_dataset import HDF5VLADataset
17
+ from data.preprocess import generate_json_state
18
+
19
+
20
+ # Process each dataset to get the statistics
21
+ @tf.autograph.experimental.do_not_convert
22
+ def process_dataset(name_dataset_pair):
23
+ # print(f"PID {os.getpid()} processing {name_dataset_pair[0]}")
24
+ dataset_iter = name_dataset_pair[1]
25
+
26
+ MAX_EPISODES = 100000
27
+ EPS = 1e-8
28
+ # For debugging
29
+ # MAX_EPISODES = 10
30
+ episode_cnt = 0
31
+ state_sum = 0
32
+ state_sum_sq = 0
33
+ z_state_sum = 0
34
+ z_state_sum_sq = 0
35
+ state_cnt = 0
36
+ nz_state_cnt = None
37
+ state_max = None
38
+ state_min = None
39
+ for episode in dataset_iter:
40
+ episode_cnt += 1
41
+ if episode_cnt % 1000 == 0:
42
+ print(f"Processing episodes {episode_cnt}/{MAX_EPISODES}")
43
+ if episode_cnt > MAX_EPISODES:
44
+ break
45
+ episode_dict = episode['episode_dict']
46
+ dataset_name = episode['dataset_name']
47
+
48
+ res_tup = generate_json_state(
49
+ episode_dict, dataset_name
50
+ )
51
+ states = res_tup[1]
52
+
53
+ # Convert to numpy
54
+ states = states.numpy()
55
+
56
+ # Zero the values that are close to zero
57
+ z_states = states.copy()
58
+ z_states[np.abs(states) <= EPS] = 0
59
+ # Compute the non-zero count
60
+ if nz_state_cnt is None:
61
+ nz_state_cnt = np.zeros(states.shape[1])
62
+ nz_state_cnt += np.sum(np.abs(states) > EPS, axis=0)
63
+
64
+ # Update statistics
65
+ state_sum += np.sum(states, axis=0)
66
+ state_sum_sq += np.sum(states**2, axis=0)
67
+ z_state_sum += np.sum(z_states, axis=0)
68
+ z_state_sum_sq += np.sum(z_states**2, axis=0)
69
+ state_cnt += states.shape[0]
70
+ if state_max is None:
71
+ state_max = np.max(states, axis=0)
72
+ state_min = np.min(states, axis=0)
73
+ else:
74
+ state_max = np.maximum(state_max, np.max(states, axis=0))
75
+ state_min = np.minimum(state_min, np.min(states, axis=0))
76
+
77
+ # Add one to avoid division by zero
78
+ nz_state_cnt = np.maximum(nz_state_cnt, np.ones_like(nz_state_cnt))
79
+
80
+ result = {
81
+ "dataset_name": name_dataset_pair[0],
82
+ "state_mean": (state_sum / state_cnt).tolist(),
83
+ "state_std": np.sqrt(
84
+ np.maximum(
85
+ (z_state_sum_sq / nz_state_cnt) - (z_state_sum / state_cnt)**2 * (state_cnt / nz_state_cnt),
86
+ np.zeros_like(state_sum_sq)
87
+ )
88
+ ).tolist(),
89
+ "state_min": state_min.tolist(),
90
+ "state_max": state_max.tolist(),
91
+ }
92
+
93
+ return result
94
+
95
+
96
+ def process_hdf5_dataset(vla_dataset):
97
+ EPS = 1e-8
98
+ episode_cnt = 0
99
+ state_sum = 0
100
+ state_sum_sq = 0
101
+ z_state_sum = 0
102
+ z_state_sum_sq = 0
103
+ state_cnt = 0
104
+ nz_state_cnt = None
105
+ state_max = None
106
+ state_min = None
107
+ for i in tqdm(range(len(vla_dataset))):
108
+ episode = vla_dataset.get_item(i, state_only=True)
109
+ episode_cnt += 1
110
+
111
+ states = episode['state']
112
+
113
+ # Zero the values that are close to zero
114
+ z_states = states.copy()
115
+ z_states[np.abs(states) <= EPS] = 0
116
+ # Compute the non-zero count
117
+ if nz_state_cnt is None:
118
+ nz_state_cnt = np.zeros(states.shape[1])
119
+ nz_state_cnt += np.sum(np.abs(states) > EPS, axis=0)
120
+
121
+ # Update statistics
122
+ state_sum += np.sum(states, axis=0)
123
+ state_sum_sq += np.sum(states**2, axis=0)
124
+ z_state_sum += np.sum(z_states, axis=0)
125
+ z_state_sum_sq += np.sum(z_states**2, axis=0)
126
+ state_cnt += states.shape[0]
127
+ if state_max is None:
128
+ state_max = np.max(states, axis=0)
129
+ state_min = np.min(states, axis=0)
130
+ else:
131
+ state_max = np.maximum(state_max, np.max(states, axis=0))
132
+ state_min = np.minimum(state_min, np.min(states, axis=0))
133
+
134
+ # Add one to avoid division by zero
135
+ nz_state_cnt = np.maximum(nz_state_cnt, np.ones_like(nz_state_cnt))
136
+
137
+ result = {
138
+ "dataset_name": vla_dataset.get_dataset_name(),
139
+ "state_mean": (state_sum / state_cnt).tolist(),
140
+ "state_std": np.sqrt(
141
+ np.maximum(
142
+ (z_state_sum_sq / nz_state_cnt) - (z_state_sum / state_cnt)**2 * (state_cnt / nz_state_cnt),
143
+ np.zeros_like(state_sum_sq)
144
+ )
145
+ ).tolist(),
146
+ "state_min": state_min.tolist(),
147
+ "state_max": state_max.tolist(),
148
+ }
149
+
150
+ return result
151
+
152
+
153
+ if __name__ == "__main__":
154
+ parser = argparse.ArgumentParser()
155
+ # Multiprocessing currently with bugs
156
+ # parser.add_argument('--n_workers', type=int, default=1,
157
+ # help="Number of parallel workers.")
158
+ parser.add_argument('--dataset_type', type=str,
159
+ default="pretrain",
160
+ help="Whether to load the pretrain dataset or finetune dataset.")
161
+ parser.add_argument('--save_path', type=str,
162
+ default="configs/dataset_stat.json",
163
+ help="JSON file path to save the dataset statistics.")
164
+ parser.add_argument('--skip_exist', action='store_true',
165
+ help="Whether to skip the existing dataset statistics.")
166
+ parser.add_argument('--hdf5_dataset', action='store_true',
167
+ help="Whether to load the dataset from the HDF5 files.")
168
+ args = parser.parse_args()
169
+
170
+ if args.hdf5_dataset:
171
+ vla_dataset = HDF5VLADataset()
172
+ dataset_name = vla_dataset.get_dataset_name()
173
+
174
+ try:
175
+ with open(args.save_path, 'r') as f:
176
+ results = json.load(f)
177
+ except FileNotFoundError:
178
+ results = {}
179
+ if args.skip_exist and dataset_name in results:
180
+ print(f"Skipping existed {dataset_name} dataset statistics")
181
+ else:
182
+ print(f"Processing {dataset_name} dataset")
183
+ result = process_hdf5_dataset(vla_dataset)
184
+ results[result["dataset_name"]] = result
185
+ with open(args.save_path, 'w') as f:
186
+ json.dump(results, f, indent=4)
187
+ print("All datasets have been processed.")
188
+ os._exit(0)
189
+
190
+ vla_dataset = VLADataset(
191
+ seed=0, dataset_type=args.dataset_type, repeat=False)
192
+ name_dataset_pairs = vla_dataset.name2dataset.items()
193
+ # num_workers = args.n_workers
194
+
195
+ for name_dataset_pair in tqdm(name_dataset_pairs):
196
+ try:
197
+ with open(args.save_path, 'r') as f:
198
+ results = json.load(f)
199
+ except FileNotFoundError:
200
+ results = {}
201
+
202
+ if args.skip_exist and name_dataset_pair[0] in results:
203
+ print(f"Skipping existed {name_dataset_pair[0]} dataset statistics")
204
+ continue
205
+ print(f"Processing {name_dataset_pair[0]} dataset")
206
+
207
+ result = process_dataset(name_dataset_pair)
208
+
209
+ results[result["dataset_name"]] = result
210
+
211
+ # Save the results in the json file after each dataset (for resume)
212
+ with open(args.save_path, 'w') as f:
213
+ json.dump(results, f, indent=4)
214
+
215
+ print("All datasets have been processed.")
216
+
217
+ # with Manager() as manager:
218
+ # # Create shared dictionary and lock through the manager, accessible by all processes
219
+ # progress = manager.dict(processed=0, results={})
220
+ # progress_lock = manager.Lock()
221
+
222
+ # # Callback function to update progress
223
+ # def update_progress(result):
224
+ # with progress_lock:
225
+ # progress['processed'] += 1
226
+ # print(f"{result['dataset_name']} - {progress['processed']}/{len(name_dataset_pairs)} datasets have been processed")
227
+ # # Append the result to the shared dictionary
228
+ # progress['results'][result["dataset_name"]] = result
229
+
230
+ # with Pool(num_workers) as p:
231
+ # for name_dataset_pair in name_dataset_pairs:
232
+ # p.apply_async(process_dataset, args=(name_dataset_pair,), callback=update_progress)
233
+
234
+ # # Close the pool and wait for the work to finish
235
+ # p.close()
236
+ # p.join()
237
+
238
+ # # Save the results in the json file
239
+ # with open(args.save_path, 'w') as f:
240
+ # json.dump(progress['results'], f, indent=4)
data/compute_dataset_stat_hdf5.py ADDED
@@ -0,0 +1,100 @@
1
+ """
2
+ This file will compute the min, max, mean, and standard deviation of the states
3
+ in an HDF5 dataset loaded through `data/hdf5_vla_dataset.py`.
4
+ """
5
+
6
+ import json
7
+ import argparse
8
+
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+
12
+ # from data.hdf5_vla_dataset import TabletopHDF5VLADataset as HDF5VLADataset
13
+ from data.hdf5_vla_dataset import AnubisHDF5VLADataset as HDF5VLADataset
14
+
15
+
16
+ def process_hdf5_dataset(vla_dataset):
17
+ EPS = 1e-8
18
+ episode_cnt = 0
19
+ state_sum = 0
20
+ state_sum_sq = 0
21
+ z_state_sum = 0
22
+ z_state_sum_sq = 0
23
+ state_cnt = 0
24
+ nz_state_cnt = None
25
+ state_max = None
26
+ state_min = None
27
+ for i in tqdm(range(len(vla_dataset))):
28
+ # print(i)
29
+ episode = vla_dataset.get_item(i, state_only=True)
30
+ episode_cnt += 1
31
+
32
+ states = episode['state']
33
+
34
+ # Zero the values that are close to zero
35
+ z_states = states.copy()
36
+ z_states[np.abs(states) <= EPS] = 0
37
+ # Compute the non-zero count
38
+ if nz_state_cnt is None:
39
+ nz_state_cnt = np.zeros(states.shape[1])
40
+ nz_state_cnt += np.sum(np.abs(states) > EPS, axis=0)
41
+
42
+ # Update statistics
43
+ state_sum += np.sum(states, axis=0)
44
+ state_sum_sq += np.sum(states**2, axis=0)
45
+ z_state_sum += np.sum(z_states, axis=0)
46
+ z_state_sum_sq += np.sum(z_states**2, axis=0)
47
+ state_cnt += states.shape[0]
48
+ if state_max is None:
49
+ state_max = np.max(states, axis=0)
50
+ state_min = np.min(states, axis=0)
51
+ else:
52
+ state_max = np.maximum(state_max, np.max(states, axis=0))
53
+ state_min = np.minimum(state_min, np.min(states, axis=0))
54
+
55
+ # Add one to avoid division by zero
56
+ nz_state_cnt = np.maximum(nz_state_cnt, np.ones_like(nz_state_cnt))
57
+
58
+ result = {
59
+ "dataset_name": vla_dataset.get_dataset_name(),
60
+ "state_mean": (state_sum / state_cnt).tolist(),
61
+ "state_std": np.sqrt(
62
+ np.maximum(
63
+ (z_state_sum_sq / nz_state_cnt) - (z_state_sum / state_cnt)**2 * (state_cnt / nz_state_cnt),
64
+ np.zeros_like(state_sum_sq)
65
+ )
66
+ ).tolist(),
67
+ "state_min": state_min.tolist(),
68
+ "state_max": state_max.tolist(),
69
+ }
70
+
71
+ return result
72
+
73
+
74
+ if __name__ == "__main__":
75
+ parser = argparse.ArgumentParser()
76
+ parser.add_argument('--save_path', type=str,
77
+ default="configs/dataset_stat.json",
78
+ help="JSON file path to save the dataset statistics.")
79
+ parser.add_argument('--skip_exist', action='store_true',
80
+ help="Whether to skip the existing dataset statistics.")
81
+ parser.add_argument('--dataset', type=str)
82
+ args = parser.parse_args()
83
+
84
+ vla_dataset = HDF5VLADataset(args.dataset)
85
+ dataset_name = vla_dataset.get_dataset_name()
86
+
87
+ try:
88
+ with open(args.save_path, 'r') as f:
89
+ results = json.load(f)
90
+ except FileNotFoundError:
91
+ results = {}
92
+ if args.skip_exist and dataset_name in results:
93
+ print(f"Skipping existed {dataset_name} dataset statistics")
94
+ else:
95
+ print(f"Processing {dataset_name} dataset")
96
+ result = process_hdf5_dataset(vla_dataset)
97
+ results[result["dataset_name"]] = result
98
+ with open(args.save_path, 'w') as f:
99
+ json.dump(results, f, indent=4)
100
+ print("All datasets have been processed.")
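Example usage, based on the argument parser above: `python -m data.compute_dataset_stat_hdf5 --dataset <dataset_name> --save_path configs/dataset_stat.json --skip_exist`; the `--dataset` value is passed directly to the `HDF5VLADataset` constructor.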
data/episode_transform.py ADDED
@@ -0,0 +1,406 @@
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ import yaml
4
+
5
+ from data.preprocess import generate_json_state
6
+ from configs.state_vec import STATE_VEC_IDX_MAPPING
7
+
8
+
9
+ # Read the config
10
+ with open('configs/base.yaml', 'r') as file:
11
+ config = yaml.safe_load(file)
12
+ # Load some constants from the config
13
+ IMG_HISTORY_SIZE = config['common']['img_history_size']
14
+ if IMG_HISTORY_SIZE < 1:
15
+ raise ValueError("Config `img_history_size` must be at least 1.")
16
+ ACTION_CHUNK_SIZE = config['common']['action_chunk_size']
17
+ if ACTION_CHUNK_SIZE < 1:
18
+ raise ValueError("Config `action_chunk_size` must be at least 1.")
19
+
20
+
21
+ @tf.function
22
+ def process_episode(epsd: dict, dataset_name: str,
23
+ image_keys: list, image_mask: list) -> dict:
24
+ """
25
+ Process an episode to extract the frames and the json content.
26
+ """
27
+ # Frames of each camera
28
+ # Ugly code due to tf's poor compatibility
29
+ frames_0 = tf.TensorArray(dtype=tf.uint8, size=0, dynamic_size=True)
30
+ frames_1 = tf.TensorArray(dtype=tf.uint8, size=0, dynamic_size=True)
31
+ frames_2 = tf.TensorArray(dtype=tf.uint8, size=0, dynamic_size=True)
32
+ frames_3 = tf.TensorArray(dtype=tf.uint8, size=0, dynamic_size=True)
33
+ # Traverse the episode to collect...
34
+ for step in iter(epsd['steps']):
35
+ # Parse the image
36
+ frames_0 = frames_0.write(frames_0.size(),
37
+ tf.cond(
38
+ tf.equal(image_mask[0], 1),
39
+ lambda: step['observation'][image_keys[0]],
40
+ lambda: tf.zeros([0, 0, 0], dtype=tf.uint8)
41
+ ))
42
+ # Very ugly code due to tf's poor compatibility
43
+ frames_1 = frames_1.write(frames_1.size(),
44
+ tf.cond(
45
+ tf.equal(image_mask[1], 1),
46
+ lambda: step['observation'][image_keys[1]],
47
+ lambda: tf.zeros([0, 0, 0], dtype=tf.uint8)
48
+ ))
49
+ # print(image_mask)
50
+ frames_2 = frames_2.write(frames_2.size(),
51
+ tf.cond(
52
+ tf.equal(image_mask[2], 1),
53
+ lambda: step['observation'][image_keys[2]],
54
+ lambda: tf.zeros([0, 0, 0], dtype=tf.uint8)
55
+ ))
56
+ frames_3 = frames_3.write(frames_3.size(),
57
+ tf.cond(
58
+ tf.equal(image_mask[3], 1),
59
+ lambda: step['observation'][image_keys[3]],
60
+ lambda: tf.zeros([0, 0, 0], dtype=tf.uint8)
61
+ ))
62
+
63
+
64
+ # Calculate the past_frames_0 for each step
65
+ # Each step has a window of previous frames with size IMG_HISTORY_SIZE
66
+ # Use the first state to pad the frames
67
+ # past_frames_0 will have shape (num_steps, IMG_HISTORY_SIZE, height, width, channels)
68
+ frames_0 = frames_0.stack()
69
+ first_frame = tf.expand_dims(frames_0[0], axis=0)
70
+ first_frame = tf.repeat(first_frame, IMG_HISTORY_SIZE-1, axis=0)
71
+ padded_frames_0 = tf.concat([first_frame, frames_0], axis=0)
72
+ indices = tf.range(IMG_HISTORY_SIZE, tf.shape(frames_0)[0] + IMG_HISTORY_SIZE)
73
+ past_frames_0 = tf.map_fn(
74
+ lambda i: padded_frames_0[i - IMG_HISTORY_SIZE:i],
75
+ indices,
76
+ dtype=tf.uint8
77
+ )
78
+ frames_0_time_mask = tf.ones([tf.shape(frames_0)[0]], dtype=tf.bool)
79
+ padded_frames_0_time_mask = tf.pad(frames_0_time_mask, [[IMG_HISTORY_SIZE-1, 0]], "CONSTANT", constant_values=False)
80
+ past_frames_0_time_mask = tf.map_fn(
81
+ lambda i: padded_frames_0_time_mask[i - IMG_HISTORY_SIZE:i],
82
+ indices,
83
+ dtype=tf.bool
84
+ )
85
+
86
+ # For past_frames_1
87
+ frames_1 = frames_1.stack()
88
+ first_frame = tf.expand_dims(frames_1[0], axis=0)
89
+ first_frame = tf.repeat(first_frame, IMG_HISTORY_SIZE-1, axis=0)
90
+ padded_frames_1 = tf.concat([first_frame, frames_1], axis=0)
91
+ indices = tf.range(IMG_HISTORY_SIZE, tf.shape(frames_1)[0] + IMG_HISTORY_SIZE)
92
+ past_frames_1 = tf.map_fn(
93
+ lambda i: padded_frames_1[i - IMG_HISTORY_SIZE:i],
94
+ indices,
95
+ dtype=tf.uint8
96
+ )
97
+ frames_1_time_mask = tf.ones([tf.shape(frames_1)[0]], dtype=tf.bool)
98
+ padded_frames_1_time_mask = tf.pad(frames_1_time_mask, [[IMG_HISTORY_SIZE-1, 0]], "CONSTANT", constant_values=False)
99
+ past_frames_1_time_mask = tf.map_fn(
100
+ lambda i: padded_frames_1_time_mask[i - IMG_HISTORY_SIZE:i],
101
+ indices,
102
+ dtype=tf.bool
103
+ )
104
+
105
+ # For past_frames_2
106
+ frames_2 = frames_2.stack()
107
+ first_frame = tf.expand_dims(frames_2[0], axis=0)
108
+ first_frame = tf.repeat(first_frame, IMG_HISTORY_SIZE-1, axis=0)
109
+ padded_frames_2 = tf.concat([first_frame, frames_2], axis=0)
110
+ indices = tf.range(IMG_HISTORY_SIZE, tf.shape(frames_2)[0] + IMG_HISTORY_SIZE)
111
+ past_frames_2 = tf.map_fn(
112
+ lambda i: padded_frames_2[i - IMG_HISTORY_SIZE:i],
113
+ indices,
114
+ dtype=tf.uint8
115
+ )
116
+ frames_2_time_mask = tf.ones([tf.shape(frames_2)[0]], dtype=tf.bool)
117
+ padded_frames_2_time_mask = tf.pad(frames_2_time_mask, [[IMG_HISTORY_SIZE-1, 0]], "CONSTANT", constant_values=False)
118
+ past_frames_2_time_mask = tf.map_fn(
119
+ lambda i: padded_frames_2_time_mask[i - IMG_HISTORY_SIZE:i],
120
+ indices,
121
+ dtype=tf.bool
122
+ )
123
+
124
+ # For past_frames_3
125
+ frames_3 = frames_3.stack()
126
+ first_frame = tf.expand_dims(frames_3[0], axis=0)
127
+ first_frame = tf.repeat(first_frame, IMG_HISTORY_SIZE-1, axis=0)
128
+ padded_frames_3 = tf.concat([first_frame, frames_3], axis=0)
129
+ indices = tf.range(IMG_HISTORY_SIZE, tf.shape(frames_3)[0] + IMG_HISTORY_SIZE)
130
+ past_frames_3 = tf.map_fn(
131
+ lambda i: padded_frames_3[i - IMG_HISTORY_SIZE:i],
132
+ indices,
133
+ dtype=tf.uint8
134
+ )
135
+ frames_3_time_mask = tf.ones([tf.shape(frames_3)[0]], dtype=tf.bool)
136
+ padded_frames_3_time_mask = tf.pad(frames_3_time_mask, [[IMG_HISTORY_SIZE-1, 0]], "CONSTANT", constant_values=False)
137
+ past_frames_3_time_mask = tf.map_fn(
138
+ lambda i: padded_frames_3_time_mask[i - IMG_HISTORY_SIZE:i],
139
+ indices,
140
+ dtype=tf.bool
141
+ )
142
+
143
+ # Create the ids for each step
144
+ step_id = tf.range(0, tf.shape(frames_0)[0])
145
+
146
+ return {
147
+ 'dataset_name': dataset_name,
148
+ 'episode_dict': epsd,
149
+ 'step_id': step_id,
150
+ 'past_frames_0': past_frames_0,
151
+ 'past_frames_0_time_mask': past_frames_0_time_mask,
152
+ 'past_frames_1': past_frames_1,
153
+ 'past_frames_1_time_mask': past_frames_1_time_mask,
154
+ 'past_frames_2': past_frames_2,
155
+ 'past_frames_2_time_mask': past_frames_2_time_mask,
156
+ 'past_frames_3': past_frames_3,
157
+ 'past_frames_3_time_mask': past_frames_3_time_mask,
158
+ }
159
+
160
+
161
+ @tf.function
162
+ def bgr_to_rgb(epsd: dict):
163
+ """
164
+ Convert BGR images to RGB images.
165
+ """
166
+ past_frames_0 = epsd['past_frames_0']
167
+ past_frames_0 = tf.cond(
168
+ tf.equal(tf.shape(past_frames_0)[-1], 3),
169
+ lambda: tf.stack([
170
+ past_frames_0[..., 2],
171
+ past_frames_0[..., 1],
172
+ past_frames_0[..., 0]
173
+ ], axis=-1),
174
+ lambda: past_frames_0
175
+ )
176
+
177
+ past_frames_1 = epsd['past_frames_1']
178
+ past_frames_1 = tf.cond(
179
+ tf.equal(tf.shape(past_frames_1)[-1], 3),
180
+ lambda: tf.stack([
181
+ past_frames_1[..., 2],
182
+ past_frames_1[..., 1],
183
+ past_frames_1[..., 0]
184
+ ], axis=-1),
185
+ lambda: past_frames_1
186
+ )
187
+
188
+ past_frames_2 = epsd['past_frames_2']
189
+ past_frames_2 = tf.cond(
190
+ tf.equal(tf.shape(past_frames_2)[-1], 3),
191
+ lambda: tf.stack([
192
+ past_frames_2[..., 2],
193
+ past_frames_2[..., 1],
194
+ past_frames_2[..., 0]
195
+ ], axis=-1),
196
+ lambda: past_frames_2
197
+ )
198
+
199
+ past_frames_3 = epsd['past_frames_3']
200
+ past_frames_3 = tf.cond(
201
+ tf.equal(tf.shape(past_frames_3)[-1], 3),
202
+ lambda: tf.stack([
203
+ past_frames_3[..., 2],
204
+ past_frames_3[..., 1],
205
+ past_frames_3[..., 0]
206
+ ], axis=-1),
207
+ lambda: past_frames_3
208
+ )
209
+
210
+ return {
211
+ 'dataset_name': epsd['dataset_name'],
212
+ 'episode_dict': epsd['episode_dict'],
213
+ 'step_id': epsd['step_id'],
214
+ 'past_frames_0': past_frames_0,
215
+ 'past_frames_0_time_mask': epsd['past_frames_0_time_mask'],
216
+ 'past_frames_1': past_frames_1,
217
+ 'past_frames_1_time_mask': epsd['past_frames_1_time_mask'],
218
+ 'past_frames_2': past_frames_2,
219
+ 'past_frames_2_time_mask': epsd['past_frames_2_time_mask'],
220
+ 'past_frames_3': past_frames_3,
221
+ 'past_frames_3_time_mask': epsd['past_frames_3_time_mask'],
222
+ }
223
+
224
+
225
+ def flatten_episode(episode: dict) -> tf.data.Dataset:
226
+ """
227
+ Flatten the episode to a list of steps.
228
+ """
229
+ episode_dict = episode['episode_dict']
230
+ dataset_name = episode['dataset_name']
231
+
232
+ json_content, states, masks = generate_json_state(
233
+ episode_dict, dataset_name
234
+ )
235
+
236
+ # Calculate the past_states for each step
237
+ # Each step has a window of previous states with size ACTION_CHUNK_SIZE
238
+ # Use the first state to pad the states
239
+ # past_states will have shape (num_steps, ACTION_CHUNK_SIZE, state_dim)
240
+ first_state = tf.expand_dims(states[0], axis=0)
241
+ first_state = tf.repeat(first_state, ACTION_CHUNK_SIZE-1, axis=0)
242
+ padded_states = tf.concat([first_state, states], axis=0)
243
+ indices = tf.range(ACTION_CHUNK_SIZE, tf.shape(states)[0] + ACTION_CHUNK_SIZE)
244
+ past_states = tf.map_fn(
245
+ lambda i: padded_states[i - ACTION_CHUNK_SIZE:i],
246
+ indices,
247
+ dtype=tf.float32
248
+ )
249
+ states_time_mask = tf.ones([tf.shape(states)[0]], dtype=tf.bool)
250
+ padded_states_time_mask = tf.pad(states_time_mask, [[ACTION_CHUNK_SIZE-1, 0]], "CONSTANT", constant_values=False)
251
+ past_states_time_mask = tf.map_fn(
252
+ lambda i: padded_states_time_mask[i - ACTION_CHUNK_SIZE:i],
253
+ indices,
254
+ dtype=tf.bool
255
+ )
256
+
257
+ # Calculate the future_states for each step
258
+ # Each step has a window of future states with size ACTION_CHUNK_SIZE
259
+ # Use the last state to pad the states
260
+ # future_states will have shape (num_steps, ACTION_CHUNK_SIZE, state_dim)
261
+ last_state = tf.expand_dims(states[-1], axis=0)
262
+ last_state = tf.repeat(last_state, ACTION_CHUNK_SIZE, axis=0)
263
+ padded_states = tf.concat([states, last_state], axis=0)
264
+ indices = tf.range(1, tf.shape(states)[0] + 1)
265
+ future_states = tf.map_fn(
266
+ lambda i: padded_states[i:i + ACTION_CHUNK_SIZE],
267
+ indices,
268
+ dtype=tf.float32
269
+ )
270
+ states_time_mask = tf.ones([tf.shape(states)[0]], dtype=tf.bool)
271
+ padded_states_time_mask = tf.pad(states_time_mask, [[0, ACTION_CHUNK_SIZE]], "CONSTANT", constant_values=False)
272
+ future_states_time_mask = tf.map_fn(
273
+ lambda i: padded_states_time_mask[i:i + ACTION_CHUNK_SIZE],
274
+ indices,
275
+ dtype=tf.bool
276
+ )
277
+
278
+ # Calculate the mean and std for state
279
+ state_std = tf.math.reduce_std(states, axis=0, keepdims=True)
280
+ state_std = tf.repeat(state_std, tf.shape(states)[0], axis=0)
281
+ state_mean = tf.math.reduce_mean(states, axis=0, keepdims=True)
282
+ state_mean = tf.repeat(state_mean, tf.shape(states)[0], axis=0)
283
+
284
+ state_norm = tf.math.reduce_mean(
285
+ tf.math.square(states), axis=0, keepdims=True)
286
+ state_norm = tf.math.sqrt(state_norm)
287
+ state_norm = tf.repeat(state_norm, tf.shape(states)[0], axis=0)
288
+
289
+ # Create a list of steps
290
+ step_data = []
291
+ for i in range(tf.shape(states)[0]):
292
+ step_data.append({
293
+ 'step_id': episode['step_id'][i],
294
+ 'json_content': json_content,
295
+ 'state_chunk': past_states[i],
296
+ 'state_chunk_time_mask': past_states_time_mask[i],
297
+ 'action_chunk': future_states[i],
298
+ 'action_chunk_time_mask': future_states_time_mask[i],
299
+ 'state_vec_mask': masks[i],
300
+ 'past_frames_0': episode['past_frames_0'][i],
301
+ 'past_frames_0_time_mask': episode['past_frames_0_time_mask'][i],
302
+ 'past_frames_1': episode['past_frames_1'][i],
303
+ 'past_frames_1_time_mask': episode['past_frames_1_time_mask'][i],
304
+ 'past_frames_2': episode['past_frames_2'][i],
305
+ 'past_frames_2_time_mask': episode['past_frames_2_time_mask'][i],
306
+ 'past_frames_3': episode['past_frames_3'][i],
307
+ 'past_frames_3_time_mask': episode['past_frames_3_time_mask'][i],
308
+ 'state_std': state_std[i],
309
+ 'state_mean': state_mean[i],
310
+ 'state_norm': state_norm[i],
311
+ })
312
+
313
+ return step_data
314
+
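For intuition, here is a small self-contained sketch of the windowing used above for past_states (toy values; CHUNK stands in for ACTION_CHUNK_SIZE): the sequence is front-padded with its first element, and tf.map_fn then slices one fixed-size window per step.

import tensorflow as tf

CHUNK = 3                                                # stands in for ACTION_CHUNK_SIZE
states = tf.constant([[0.], [1.], [2.], [3.]])           # (T=4, D=1)

first = tf.repeat(states[:1], CHUNK - 1, axis=0)         # pad the front with state[0]
padded = tf.concat([first, states], axis=0)              # (T + CHUNK - 1, D)
indices = tf.range(CHUNK, tf.shape(states)[0] + CHUNK)
past = tf.map_fn(lambda i: padded[i - CHUNK:i], indices, dtype=tf.float32)
# past has shape (4, 3, 1):
#   past[0] == [[0.], [0.], [0.]]   (fully padded at the start)
#   past[3] == [[1.], [2.], [3.]]   (the last three real states)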
315
+
316
+ def flatten_episode_agilex(episode: dict) -> list:
317
+ """
318
+ Flatten the episode to a list of steps.
319
+ """
320
+ episode_dict = episode['episode_dict']
321
+ dataset_name = episode['dataset_name']
322
+
323
+ json_content, states, masks, acts = generate_json_state(
324
+ episode_dict, dataset_name
325
+ )
326
+
327
+ # Calculate the past_states for each step
328
+ # Each step has a window of previous states with size ACTION_CHUNK_SIZE
329
+ # Use the first state to pad the states
330
+ # past_states will have shape (num_steps, ACTION_CHUNK_SIZE, state_dim)
331
+ first_state = tf.expand_dims(states[0], axis=0)
332
+ first_state = tf.repeat(first_state, ACTION_CHUNK_SIZE-1, axis=0)
333
+ padded_states = tf.concat([first_state, states], axis=0)
334
+ indices = tf.range(ACTION_CHUNK_SIZE, tf.shape(states)[0] + ACTION_CHUNK_SIZE)
335
+ past_states = tf.map_fn(
336
+ lambda i: padded_states[i - ACTION_CHUNK_SIZE:i],
337
+ indices,
338
+ dtype=tf.float32
339
+ )
340
+ states_time_mask = tf.ones([tf.shape(states)[0]], dtype=tf.bool)
341
+ padded_states_time_mask = tf.pad(states_time_mask, [[ACTION_CHUNK_SIZE-1, 0]], "CONSTANT", constant_values=False)
342
+ past_states_time_mask = tf.map_fn(
343
+ lambda i: padded_states_time_mask[i - ACTION_CHUNK_SIZE:i],
344
+ indices,
345
+ dtype=tf.bool
346
+ )
347
+
348
+ # NOTE: here the "future states" are actually the actions
349
+ # Calculate the future_states for each step
350
+ # Each step has a window of future states with size ACTION_CHUNK_SIZE
351
+ # Use the last action to pad the states
352
+ # future_states will have shape (num_steps, ACTION_CHUNK_SIZE, state_dim)
353
+ last_act = tf.expand_dims(acts[-1], axis=0)
354
+ last_act = tf.repeat(last_act, ACTION_CHUNK_SIZE, axis=0)
355
+ padded_states = tf.concat([acts, last_act], axis=0)
356
+ # indices = tf.range(1, tf.shape(states)[0] + 1)
357
+ indices = tf.range(0, tf.shape(acts)[0]) # NOTE: the action at time t corresponds to the state at time t+1, so the window starts at t itself
358
+ future_states = tf.map_fn(
359
+ lambda i: padded_states[i:i + ACTION_CHUNK_SIZE],
360
+ indices,
361
+ dtype=tf.float32
362
+ )
363
+ states_time_mask = tf.ones([tf.shape(acts)[0]], dtype=tf.bool)
364
+ padded_states_time_mask = tf.pad(states_time_mask, [[0, ACTION_CHUNK_SIZE]], "CONSTANT", constant_values=False)
365
+ future_states_time_mask = tf.map_fn(
366
+ lambda i: padded_states_time_mask[i:i + ACTION_CHUNK_SIZE],
367
+ indices,
368
+ dtype=tf.bool
369
+ )
370
+
371
+ # Calculate the std and mean for state
372
+ state_std = tf.math.reduce_std(states, axis=0, keepdims=True)
373
+ state_std = tf.repeat(state_std, tf.shape(states)[0], axis=0)
374
+ state_mean = tf.math.reduce_mean(states, axis=0, keepdims=True)
375
+ state_mean = tf.repeat(state_mean, tf.shape(states)[0], axis=0)
376
+
377
+ state_norm = tf.math.reduce_mean(
378
+ tf.math.square(acts), axis=0, keepdims=True)
379
+ state_norm = tf.math.sqrt(state_norm)
380
+ state_norm = tf.repeat(state_norm, tf.shape(states)[0], axis=0)
381
+
382
+ # Create a list of steps
383
+ step_data = []
384
+ for i in range(tf.shape(states)[0]):
385
+ step_data.append({
386
+ 'step_id': episode['step_id'][i],
387
+ 'json_content': json_content,
388
+ 'state_chunk': past_states[i],
389
+ 'state_chunk_time_mask': past_states_time_mask[i],
390
+ 'action_chunk': future_states[i],
391
+ 'action_chunk_time_mask': future_states_time_mask[i],
392
+ 'state_vec_mask': masks[i],
393
+ 'past_frames_0': episode['past_frames_0'][i],
394
+ 'past_frames_0_time_mask': episode['past_frames_0_time_mask'][i],
395
+ 'past_frames_1': episode['past_frames_1'][i],
396
+ 'past_frames_1_time_mask': episode['past_frames_1_time_mask'][i],
397
+ 'past_frames_2': episode['past_frames_2'][i],
398
+ 'past_frames_2_time_mask': episode['past_frames_2_time_mask'][i],
399
+ 'past_frames_3': episode['past_frames_3'][i],
400
+ 'past_frames_3_time_mask': episode['past_frames_3_time_mask'][i],
401
+ 'state_std': state_std[i],
402
+ 'state_mean': state_mean[i],
403
+ 'state_norm': state_norm[i],
404
+ })
405
+
406
+ return step_data
data/filelock.py ADDED
@@ -0,0 +1,24 @@
1
+ import fcntl
2
+
3
+
4
+ class FileLock:
5
+ """
6
+ A file lock class.
7
+ """
8
+ def __init__(self, filename):
9
+ self.filename = filename
10
+ self.handle = None
11
+
12
+ def acquire_read_lock(self):
13
+ self.handle = open(self.filename + '.lock', 'r')
14
+ fcntl.flock(self.handle, fcntl.LOCK_SH | fcntl.LOCK_NB)
15
+
16
+ def acquire_write_lock(self):
17
+ self.handle = open(self.filename + '.lock', 'w')
18
+ fcntl.flock(self.handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
19
+
20
+ def release_lock(self):
21
+ if self.handle is not None:
22
+ fcntl.flock(self.handle, fcntl.LOCK_UN)
23
+ self.handle.close()
24
+ self.handle = None
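A hypothetical usage sketch (not part of the file). Note that LOCK_NB makes flock non-blocking, so acquire_read_lock/acquire_write_lock raise BlockingIOError if another process already holds the lock; callers may want to catch that and retry.

lock = FileLock("data/some_shared_file")   # hypothetical path; creates data/some_shared_file.lock
try:
    lock.acquire_write_lock()              # exclusive, non-blocking
    # ... write to data/some_shared_file ...
finally:
    lock.release_lock()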
data/hdf5_maniskill_dataset.py ADDED
@@ -0,0 +1,243 @@
1
+ import os
2
+ import h5py
3
+ import yaml
4
+ import numpy as np
5
+ # Assuming STATE_VEC_IDX_MAPPING is a dictionary mapping state variable names to indices
6
+ from configs.state_vec import STATE_VEC_IDX_MAPPING
7
+ import glob
8
+ from scipy.interpolate import interp1d
9
+ from PIL import Image
10
+
11
+
12
+ def interpolate_action_sequence(action_sequence, target_size):
13
+ """
14
+ Extend the action sequece to `target_size` by linear interpolation.
15
+
16
+ Args:
17
+ action_sequence (np.ndarray): original action sequence, shape (N, D).
18
+ target_size (int): target sequence length.
19
+
20
+ Returns:
21
+ extended_sequence (np.ndarray): extended action sequence, shape (target_size, D).
22
+ """
23
+ N, D = action_sequence.shape
24
+ indices_old = np.arange(N)
25
+ indices_new = np.linspace(0, N - 1, target_size)
26
+
27
+ interp_func = interp1d(indices_old, action_sequence,
28
+ kind='linear', axis=0, assume_sorted=True)
29
+ action_sequence_new = interp_func(indices_new)
30
+
31
+ return action_sequence_new
32
+
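A quick worked example of the interpolation (illustrative values only):

import numpy as np

seq = np.array([[0.0,  0.0],
                [1.0, 10.0]])                 # (N=2, D=2)
out = interpolate_action_sequence(seq, 5)     # (5, 2)
# out[:, 0] -> [0.0, 0.25, 0.5, 0.75, 1.0]
# out[:, 1] -> [0.0, 2.5,  5.0, 7.5, 10.0]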
33
+
34
+ class HDF5VLADataset:
35
+ """
36
+ This class is used to sample episodes from the embodiment dataset
37
+ stored in HDF5 files.
38
+ """
39
+ def __init__(self):
40
+ # The name of your dataset
41
+ self.DATASET_NAME = "agilex"
42
+
43
+ self.data_dir = "data/datasets/rdt-ft-data/demo_1k"
44
+ self.tasks = os.listdir(self.data_dir)
45
+ # Multiple tasks
46
+ self.tasks = ['PickCube-v1', 'StackCube-v1', 'PlugCharger-v1', 'PushCube-v1', 'PegInsertionSide-v1']
47
+ # Load configuration from YAML file
48
+ with open('configs/base.yaml', 'r') as file:
49
+ config = yaml.safe_load(file)
50
+ self.CHUNK_SIZE = config['common']['action_chunk_size']
51
+ self.IMG_HISTORY_SIZE = config['common']['img_history_size']
52
+ self.STATE_DIM = config['common']['state_dim']
53
+
54
+ self.num_episode_per_task = 1000
55
+ self.img = []
56
+ self.state = []
57
+ self.action = []
58
+
59
+ # open the hdf5 files in memory to speed up the data loading
60
+ for task in self.tasks:
61
+ file_path = glob.glob(os.path.join(self.data_dir, task, 'motionplanning', '*.h5'))[0]
62
+ with h5py.File(file_path, "r") as f:
63
+ trajs = f.keys() # traj_0, traj_1,
64
+ # sort by the traj number
65
+ trajs = sorted(trajs, key=lambda x: int(x.split('_')[-1]))
66
+ for traj in trajs:
67
+ # images = f[traj]['obs']['sensor_data']['base_camera']['rgb'][:]
68
+ states = f[traj]['obs']['agent']['qpos'][:]
69
+ actions = f[traj]['actions'][:]
70
+
71
+ self.state.append(states)
72
+ self.action.append(actions)
73
+ # self.img.append(images)
74
+
75
+ self.state_min = np.concatenate(self.state).min(axis=0)
76
+ self.state_max = np.concatenate(self.state).max(axis=0)
77
+ self.action_min = np.concatenate(self.action).min(axis=0)
78
+ self.action_max = np.concatenate(self.action).max(axis=0)
79
+ self.action_std = np.concatenate(self.action).std(axis=0)
80
+ self.action_mean = np.concatenate(self.action).mean(axis=0)
81
+
82
+ self.task2lang = {
83
+ "PegInsertionSide-v1": "Pick up a orange-white peg and insert the orange end into the box with a hole in it.",
84
+ "PickCube-v1": "Grasp a red cube and move it to a target goal position.",
85
+ "StackCube-v1": "Pick up a red cube and stack it on top of a green cube and let go of the cube without it falling.",
86
+ "PlugCharger-v1": "Pick up one of the misplaced shapes on the board/kit and insert it into the correct empty slot.",
87
+ "PushCube-v1": "Push and move a cube to a goal region in front of it."
88
+ }
89
+
90
+ def __len__(self):
91
+ # Each task contributes num_episode_per_task episodes
92
+ return len(self.tasks) * self.num_episode_per_task
93
+
94
+ def get_dataset_name(self):
95
+ return self.DATASET_NAME
96
+
97
+ def get_item(self, index=None):
98
+ """
99
+ Get a training sample at a random timestep.
100
+
101
+ Args:
102
+ index (int, optional): The index of the episode.
103
+ If not provided, a random episode will be selected.
104
107
+
108
+ Returns:
109
+ sample (dict): A dictionary containing the training sample.
110
+ """
111
+ while True:
112
+ if index is None:
113
+ index = np.random.randint(0, self.__len__())
114
+ valid, sample = self.parse_hdf5_file(index)
115
+ if valid:
116
+ return sample
117
+ else:
118
+ index = np.random.randint(0, self.__len__())
119
+
120
+ def parse_hdf5_file(self, index):
121
+ """
122
+ Parse an HDF5 file to generate a training sample at a random timestep.
123
+
124
+ Args:
125
+ index (int): The index of the episode.
126
+
127
+ Returns:
128
+ valid (bool): Whether the episode is valid.
129
+ dict: A dictionary containing the training sample.
130
+ """
131
+ num_steps = len(self.action[index])
132
+ step_index = np.random.randint(0, num_steps)
133
+ task_index = index // self.num_episode_per_task
134
+ language = self.task2lang[self.tasks[task_index]]
135
+ task_inner_index = index % self.num_episode_per_task
136
+ # Skip these episodes since they are invalid in the eef version of the dataset.
137
+ if self.tasks[task_index] == 'PegInsertionSide-v1' and task_inner_index > 400:
138
+ return False, None
139
+ proc_index = task_inner_index // 100
140
+ episode_index = task_inner_index % 100
141
+ # images0 = self.img[index]
142
+ # normalize to -1, 1
143
+ states = (self.state[index] - self.state_min) / (self.state_max - self.state_min) * 2 - 1
144
+ states = states[:, :-1] # drop the last state dim, which duplicates the second-to-last (mirrored gripper finger)
145
+ actions = (self.action[index] - self.action_min) / (self.action_max - self.action_min) * 2 - 1
146
+
147
+ # Get image history
148
+ start_img_idx = max(0, step_index - self.IMG_HISTORY_SIZE + 1)
149
+ end_img_idx = step_index + 1
150
+ img_history = []
151
+ for i in range(start_img_idx, end_img_idx):
152
+ img_path = os.path.join(self.data_dir, self.tasks[task_index], 'motionplanning', f'{proc_index}', f'{episode_index}', f"{i + 1}.png")
153
+ img = np.array(Image.open(img_path))
154
+ img_history.append(img)
155
+ img_history = np.array(img_history)
156
+ # img_history = images0[start_img_idx:end_img_idx]
157
+ img_valid_len = img_history.shape[0]
158
+
159
+ # Pad images if necessary
160
+ if img_valid_len < self.IMG_HISTORY_SIZE:
161
+ padding = np.tile(img_history[0:1], (self.IMG_HISTORY_SIZE - img_valid_len, 1, 1, 1))
162
+ img_history = np.concatenate([padding, img_history], axis=0)
163
+
164
+ img_history_mask = np.array(
165
+ [False] * (self.IMG_HISTORY_SIZE - img_valid_len) + [True] * img_valid_len
166
+ )
167
+
168
+ # Compute state statistics
169
+ state_std = np.std(states, axis=0)
170
+ state_mean = np.mean(states, axis=0)
171
+ state_norm = np.sqrt(np.mean(states ** 2, axis=0))
172
+
173
+ # Get state and action at the specified timestep
174
+ state = states[step_index: step_index + 1]
175
+ runtime_chunksize = self.CHUNK_SIZE // 4
176
+ action_sequence = actions[step_index: step_index + runtime_chunksize]
177
+ # If the action chunk is too short, pad it with the last action, then linearly interpolate it up to CHUNK_SIZE
178
+
179
+ # Pad action sequence if necessary
180
+ if action_sequence.shape[0] < runtime_chunksize:
181
+ padding = np.tile(action_sequence[-1:], (runtime_chunksize - action_sequence.shape[0], 1))
182
+ action_sequence = np.concatenate([action_sequence, padding], axis=0)
183
+
184
+ action_sequence = interpolate_action_sequence(action_sequence, self.CHUNK_SIZE)
185
+
186
+ # Fill state and action into unified vectors
187
+ def fill_in_state(values):
188
+ UNI_STATE_INDICES = [
189
+ STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(7)
190
+ ] + [
191
+ STATE_VEC_IDX_MAPPING[f"right_gripper_open"]
192
+ ]
193
+ uni_vec = np.zeros(values.shape[:-1] + (self.STATE_DIM,))
194
+ uni_vec[..., UNI_STATE_INDICES] = values
195
+ return uni_vec
196
+
197
+ state_indicator = fill_in_state(np.ones_like(state_std))
198
+ state = fill_in_state(state)
199
+ state_std = fill_in_state(state_std)
200
+ state_mean = fill_in_state(state_mean)
201
+ state_norm = fill_in_state(state_norm)
202
+ action_sequence = fill_in_state(action_sequence)
203
+
204
+ # Assemble the meta information
205
+ meta = {
206
+ "dataset_name": self.DATASET_NAME,
207
+ "#steps": num_steps,
208
+ "step_id": step_index,
209
+ "instruction": language
210
+ }
211
+
212
+ # Return the resulting sample
213
+ return True, {
214
+ "meta": meta,
215
+ "state": state,
216
+ "state_std": state_std,
217
+ "state_mean": state_mean,
218
+ "state_norm": state_norm,
219
+ "actions": action_sequence,
220
+ "state_indicator": state_indicator,
221
+ "cam_high": img_history, # Assuming images0 are high-level camera images
222
+ "cam_high_mask": img_history_mask,
223
+ "cam_left_wrist": np.zeros((self.IMG_HISTORY_SIZE, 0, 0, 0)),
224
+ "cam_left_wrist_mask": np.zeros(self.IMG_HISTORY_SIZE, dtype=bool),
225
+ "cam_right_wrist": np.zeros((self.IMG_HISTORY_SIZE, 0, 0, 0)),
226
+ "cam_right_wrist_mask": np.zeros(self.IMG_HISTORY_SIZE, dtype=bool),
227
+ }
228
+
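The states and actions above are min-max normalized to [-1, 1], and the min/max statistics are dumped in the __main__ block below. A small sketch of the mapping and its inverse, which an inference script would need to recover raw actions; denormalize is a hypothetical helper, not part of this file.

import numpy as np

def denormalize(x_norm, x_min, x_max):
    # Inverse of x_norm = (x - x_min) / (x_max - x_min) * 2 - 1
    return (x_norm + 1) / 2 * (x_max - x_min) + x_min

x = np.array([0.5])
x_min, x_max = np.array([0.0]), np.array([2.0])
x_norm = (x - x_min) / (x_max - x_min) * 2 - 1      # -> [-0.5]
assert np.allclose(denormalize(x_norm, x_min, x_max), x)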
229
+
230
+ if __name__ == "__main__":
231
+ from PIL import Image
232
+
233
+ ds = HDF5VLADataset()
234
+
235
+ json_data = {
236
+ 'state_min': ds.state_min.tolist(),
237
+ 'state_max': ds.state_max.tolist(),
238
+ 'action_min': ds.action_min.tolist(),
239
+ 'action_max': ds.action_max.tolist(),
240
+ }
241
+ print(json_data)
242
+
243
+
data/hdf5_vla_dataset.py ADDED
@@ -0,0 +1,533 @@
1
+ import os
2
+ import fnmatch
3
+ import json
4
+
5
+ import h5py
6
+ import yaml
7
+ import cv2
8
+ import numpy as np
9
+
10
+ from configs.state_vec import STATE_VEC_IDX_MAPPING
11
+ TABLETOP_6D_INDICES_NAMES = [
12
+ 'left_eef_pos_x','left_eef_pos_y','left_eef_pos_z','left_eef_angle_0','left_eef_angle_1','left_eef_angle_2','left_eef_angle_3','left_eef_angle_4','left_eef_angle_5','left_gripper_open','right_eef_pos_x','right_eef_pos_y','right_eef_pos_z','right_eef_angle_0','right_eef_angle_1','right_eef_angle_2','right_eef_angle_3','right_eef_angle_4','right_eef_angle_5','right_gripper_open']
13
+ TABLETOP_6D_INDICES = [STATE_VEC_IDX_MAPPING[n] for n in TABLETOP_6D_INDICES_NAMES]
14
+
15
+ class TabletopHDF5VLADataset:
16
+ """
17
+ This class is used to sample episodes from the embodiment dataset
18
+ stored in HDF5.
19
+ """
20
+ def __init__(self, task_name) -> None:
21
+ # [Modify] The path to the HDF5 dataset directory
22
+ # Each HDF5 file contains one episode
23
+ dataset_name = task_name
24
+ HDF5_DIR = f"/data5/jellyho/tabletop/{dataset_name}/"
25
+ self.DATASET_NAME = dataset_name
26
+
27
+ self.file_paths = []
28
+ for root, _, files in os.walk(HDF5_DIR):
29
+ for filename in fnmatch.filter(files, '*.hdf5'):
30
+ file_path = os.path.join(root, filename)
31
+ self.file_paths.append(file_path)
32
+
33
+ # Load the config
34
+ with open('configs/base.yaml', 'r') as file:
35
+ config = yaml.safe_load(file)
36
+ self.CHUNK_SIZE = config['common']['action_chunk_size']
37
+ self.IMG_HISORY_SIZE = config['common']['img_history_size']
38
+ self.STATE_DIM = config['common']['state_dim']
39
+
40
+ # Get each episode's len
41
+ episode_lens = []
42
+ for file_path in self.file_paths:
43
+ valid, res = self.parse_hdf5_file_state_only(file_path)
44
+ _len = res['state'].shape[0] if valid else 0
45
+ episode_lens.append(_len)
46
+ self.episode_sample_weights = np.array(episode_lens) / np.sum(episode_lens)
47
+
48
+ def __len__(self):
49
+ return len(self.file_paths)
50
+
51
+ def get_dataset_name(self):
52
+ return self.DATASET_NAME
53
+
54
+ def get_item(self, index: int=None, state_only=False):
55
+ """Get a training sample at a random timestep.
56
+
57
+ Args:
58
+ index (int, optional): the index of the episode.
59
+ If not provided, a random episode will be selected.
60
+ state_only (bool, optional): Whether to return only the state.
61
+ In this way, the sample will contain a complete trajectory rather
62
+ than a single timestep. Defaults to False.
63
+
64
+ Returns:
65
+ sample (dict): a dictionary containing the training sample.
66
+ """
67
+ while True:
68
+ if index is None:
69
+ file_path = np.random.choice(self.file_paths, p=self.episode_sample_weights)
70
+ else:
71
+ file_path = self.file_paths[index]
72
+ valid, sample = self.parse_hdf5_file(file_path) \
73
+ if not state_only else self.parse_hdf5_file_state_only(file_path)
74
+ if valid:
75
+ return sample
76
+ else:
77
+ index = np.random.randint(0, len(self.file_paths))
78
+
79
+ def parse_hdf5_file(self, file_path):
80
+ """[Modify] Parse a hdf5 file to generate a training sample at
81
+ a random timestep.
82
+
83
+ Args:
84
+ file_path (str): the path to the hdf5 file
85
+
86
+ Returns:
87
+ valid (bool): whether the episode is valid, which is useful for filtering.
88
+ If False, this episode will be dropped.
89
+ dict: a dictionary containing the training sample,
90
+ {
91
+ "meta": {
92
+ "dataset_name": str, # the name of your dataset.
93
+ "#steps": int, # the number of steps in the episode,
94
+ # also the total timesteps.
95
+ "instruction": str # the language instruction for this episode.
96
+ },
97
+ "step_id": int, # the index of the sampled step,
98
+ # also the timestep t.
99
+ "state": ndarray, # state[t], (1, STATE_DIM).
100
+ "state_std": ndarray, # std(state[:]), (STATE_DIM,).
101
+ "state_mean": ndarray, # mean(state[:]), (STATE_DIM,).
102
+ "state_norm": ndarray, # norm(state[:]), (STATE_DIM,).
103
+ "actions": ndarray, # action[t:t+CHUNK_SIZE], (CHUNK_SIZE, STATE_DIM).
104
+ "state_indicator", ndarray, # indicates the validness of each dim, (STATE_DIM,).
105
+ "cam_high": ndarray, # external camera image, (IMG_HISORY_SIZE, H, W, 3)
106
+ # or (IMG_HISORY_SIZE, 0, 0, 0) if unavailable.
107
+ "cam_high_mask": ndarray, # indicates the validness of each timestep, (IMG_HISORY_SIZE,) boolean array.
108
+ # For the first IMAGE_HISTORY_SIZE-1 timesteps, the mask should be False.
109
+ "cam_left_wrist": ndarray, # left wrist camera image, (IMG_HISORY_SIZE, H, W, 3).
110
+ # or (IMG_HISORY_SIZE, 0, 0, 0) if unavailable.
111
+ "cam_left_wrist_mask": ndarray,
112
+ "cam_right_wrist": ndarray, # right wrist camera image, (IMG_HISORY_SIZE, H, W, 3).
113
+ # or (IMG_HISORY_SIZE, 0, 0, 0) if unavailable.
114
+ # If there is only one wrist camera, use the right wrist.
115
+ "cam_right_wrist_mask": ndarray
116
+ } or None if the episode is invalid.
117
+ """
118
+ with h5py.File(file_path, 'r') as f:
119
+ states = f['observations']['states']['ee_6d_pos'][:]
120
+ actions = f['actions']['ee_6d_pos'][:]
121
+ num_steps = states.shape[0]
122
+ # [Optional] We drop too-short episode
123
+ if num_steps < 20:
124
+ return False, None
125
+
126
+ # We randomly sample a timestep
127
+ step_id = np.random.randint(0, num_steps)
128
+
129
+ # You can also use precomputed language embeddings (recommended)
130
+ if self.DATASET_NAME == 'aloha_box_into_pot_easy':
131
+ instruction = f['observations']['states']['language_instruction'][0].decode('utf-8')
132
+ else:
133
+ instruction = f"lang_embed/{self.DATASET_NAME}.pt"
134
+
135
+ # Assemble the meta
136
+ meta = {
137
+ "dataset_name": self.DATASET_NAME,
138
+ "#steps": num_steps,
139
+ "step_id": step_id,
140
+ "instruction": instruction
141
+ }
142
+
143
+ # Rescale gripper to [0, 1]
144
+ states = states / np.array(
145
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
146
+ )
147
+ actions = actions[step_id:step_id+self.CHUNK_SIZE] / np.array(
148
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
149
+ )
150
+
151
+ # Parse the state and action
152
+ state = states[step_id:step_id+1]
153
+ state_std = np.std(states, axis=0)
154
+ state_mean = np.mean(states, axis=0)
155
+ state_norm = np.sqrt(np.mean(states**2, axis=0))
156
+
157
+ if actions.shape[0] < self.CHUNK_SIZE:
158
+ # Pad the actions using the last action
159
+ actions = np.concatenate([
160
+ actions,
161
+ np.tile(actions[-1:], (self.CHUNK_SIZE-actions.shape[0], 1))
162
+ ], axis=0)
163
+
164
+ # Fill the state/action into the unified vector
165
+ def fill_in_state(values):
166
+ uni_vec = np.zeros(values.shape[:-1] + (self.STATE_DIM,))
167
+ uni_vec[..., TABLETOP_6D_INDICES] = values
168
+ return uni_vec
169
+ state = fill_in_state(state)
170
+ state_indicator = fill_in_state(np.ones_like(state_std))
171
+ state_std = fill_in_state(state_std)
172
+ state_mean = fill_in_state(state_mean)
173
+ state_norm = fill_in_state(state_norm)
174
+ # If action's format is different from state's,
175
+ # you may implement fill_in_action()
176
+ actions = fill_in_state(actions)
177
+
178
+ # Parse the images
179
+ def parse_img(key):
180
+ imgs = []
181
+ for i in range(max(step_id-self.IMG_HISORY_SIZE+1, 0), step_id+1):
182
+ img = f['observations']['images'][key][i]
183
+ # imgs.append(cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR))
184
+ imgs.append(img)
185
+ # print(imgs)
186
+ imgs = np.stack(imgs)
187
+ if imgs.shape[0] < self.IMG_HISORY_SIZE:
188
+ # Pad the images using the first image
189
+ imgs = np.concatenate([
190
+ np.tile(imgs[:1], (self.IMG_HISORY_SIZE-imgs.shape[0], 1, 1, 1)),
191
+ imgs
192
+ ], axis=0)
193
+ return imgs
194
+ # `cam_high` is the external camera image
195
+ cam_high = parse_img('back')
196
+ # For step_id = first_idx - 1, the valid_len should be one
197
+ valid_len = min(step_id + 1, self.IMG_HISORY_SIZE)
198
+ cam_high_mask = np.array(
199
+ [False] * (self.IMG_HISORY_SIZE - valid_len) + [True] * valid_len
200
+ )
201
+ cam_left_wrist = parse_img('wrist_left')
202
+ cam_left_wrist_mask = cam_high_mask.copy()
203
+ cam_right_wrist = parse_img('wrist_right')
204
+ cam_right_wrist_mask = cam_high_mask.copy()
205
+
206
+ # print(cam_left_wrist is not None, cam_right_wrist is not None, cam_high is not None)
207
+
208
+ # Return the resulting sample
209
+ # For unavailable images, return zero-shape arrays, i.e., (IMG_HISORY_SIZE, 0, 0, 0)
210
+ # E.g., return np.zeros((self.IMG_HISORY_SIZE, 0, 0, 0)) for the key "cam_left_wrist",
211
+ # if the left-wrist camera is unavailable on your robot
212
+ return True, {
213
+ "meta": meta,
214
+ "state": state,
215
+ "state_std": state_std,
216
+ "state_mean": state_mean,
217
+ "state_norm": state_norm,
218
+ "actions": actions,
219
+ "state_indicator": state_indicator,
220
+ "cam_high": cam_high,
221
+ "cam_high_mask": cam_high_mask,
222
+ "cam_left_wrist": cam_left_wrist,
223
+ "cam_left_wrist_mask": cam_left_wrist_mask,
224
+ "cam_right_wrist": cam_right_wrist,
225
+ "cam_right_wrist_mask": cam_right_wrist_mask
226
+ }
227
+
228
+ def parse_hdf5_file_state_only(self, file_path):
229
+ """[Modify] Parse a hdf5 file to generate a state trajectory.
230
+
231
+ Args:
232
+ file_path (str): the path to the hdf5 file
233
+
234
+ Returns:
235
+ valid (bool): whether the episode is valid, which is useful for filtering.
236
+ If False, this episode will be dropped.
237
+ dict: a dictionary containing the training sample,
238
+ {
239
+ "state": ndarray, # state[:], (T, STATE_DIM).
240
+ "action": ndarray, # action[:], (T, STATE_DIM).
241
+ } or None if the episode is invalid.
242
+ """
243
+ with h5py.File(file_path, 'r') as f:
244
+ states = f['observations']['states']['ee_6d_pos'][:]
245
+ actions = f['actions']['ee_6d_pos'][:]
246
+ num_steps = states.shape[0]
247
+
248
+ step_id = np.random.randint(0, num_steps)
249
+
250
+ # Rescale gripper to [0, 1]
251
+ states = states / np.array(
252
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
253
+ )
254
+ actions = actions[step_id:step_id+self.CHUNK_SIZE] / np.array(
255
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
256
+ )
257
+
258
+ # Fill the state/action into the unified vector
259
+ def fill_in_state(values):
260
+ uni_vec = np.zeros(values.shape[:-1] + (self.STATE_DIM,))
261
+ uni_vec[..., TABLETOP_6D_INDICES] = values
262
+ return uni_vec
263
+ state = fill_in_state(states)
264
+ action = fill_in_state(actions)
265
+
266
+ # Return the resulting sample
267
+ return True, {
268
+ "state": state,
269
+ "action": action
270
+ }
271
+
272
+ class AnubisHDF5VLADataset:
273
+ """
274
+ This class is used to sample episodes from the embodiment dataset
275
+ stored in HDF5.
276
+ """
277
+ def __init__(self, task_name) -> None:
278
+ # [Modify] The path to the HDF5 dataset directory
279
+ # Each HDF5 file contains one episode
280
+ dataset_name = task_name
281
+ HDF5_DIR = f"/data5/jellyho/anubis_hdf5/{dataset_name}/"
282
+ self.DATASET_NAME = dataset_name
283
+
284
+ self.file_paths = []
285
+ for root, _, files in os.walk(HDF5_DIR):
286
+ for filename in fnmatch.filter(files, '*.hdf5'):
287
+ file_path = os.path.join(root, filename)
288
+ self.file_paths.append(file_path)
289
+
290
+ # Load the config
291
+ with open('configs/base.yaml', 'r') as file:
292
+ config = yaml.safe_load(file)
293
+ self.CHUNK_SIZE = config['common']['action_chunk_size']
294
+ self.IMG_HISORY_SIZE = config['common']['img_history_size']
295
+ self.STATE_DIM = config['common']['state_dim']
296
+
297
+ # Get each episode's len
298
+ episode_lens = []
299
+ for file_path in self.file_paths:
300
+ valid, res = self.parse_hdf5_file_state_only(file_path)
301
+ _len = res['state'].shape[0] if valid else 0
302
+ episode_lens.append(_len)
303
+ self.episode_sample_weights = np.array(episode_lens) / np.sum(episode_lens)
304
+
305
+ def __len__(self):
306
+ return len(self.file_paths)
307
+
308
+ def get_dataset_name(self):
309
+ return self.DATASET_NAME
310
+
311
+ def get_item(self, index: int=None, state_only=False):
312
+ """Get a training sample at a random timestep.
313
+
314
+ Args:
315
+ index (int, optional): the index of the episode.
316
+ If not provided, a random episode will be selected.
317
+ state_only (bool, optional): Whether to return only the state.
318
+ In this way, the sample will contain a complete trajectory rather
319
+ than a single timestep. Defaults to False.
320
+
321
+ Returns:
322
+ sample (dict): a dictionary containing the training sample.
323
+ """
324
+ while True:
325
+ if index is None:
326
+ file_path = np.random.choice(self.file_paths, p=self.episode_sample_weights)
327
+ else:
328
+ file_path = self.file_paths[index]
329
+ valid, sample = self.parse_hdf5_file(file_path) \
330
+ if not state_only else self.parse_hdf5_file_state_only(file_path)
331
+ if valid:
332
+ return sample
333
+ else:
334
+ index = np.random.randint(0, len(self.file_paths))
335
+
336
+ def parse_hdf5_file(self, file_path):
337
+ """[Modify] Parse a hdf5 file to generate a training sample at
338
+ a random timestep.
339
+
340
+ Args:
341
+ file_path (str): the path to the hdf5 file
342
+
343
+ Returns:
344
+ valid (bool): whether the episode is valid, which is useful for filtering.
345
+ If False, this episode will be dropped.
346
+ dict: a dictionary containing the training sample,
347
+ {
348
+ "meta": {
349
+ "dataset_name": str, # the name of your dataset.
350
+ "#steps": int, # the number of steps in the episode,
351
+ # also the total timesteps.
352
+ "instruction": str # the language instruction for this episode.
353
+ },
354
+ "step_id": int, # the index of the sampled step,
355
+ # also the timestep t.
356
+ "state": ndarray, # state[t], (1, STATE_DIM).
357
+ "state_std": ndarray, # std(state[:]), (STATE_DIM,).
358
+ "state_mean": ndarray, # mean(state[:]), (STATE_DIM,).
359
+ "state_norm": ndarray, # norm(state[:]), (STATE_DIM,).
360
+ "actions": ndarray, # action[t:t+CHUNK_SIZE], (CHUNK_SIZE, STATE_DIM).
361
+ "state_indicator", ndarray, # indicates the validness of each dim, (STATE_DIM,).
362
+ "cam_high": ndarray, # external camera image, (IMG_HISORY_SIZE, H, W, 3)
363
+ # or (IMG_HISORY_SIZE, 0, 0, 0) if unavailable.
364
+ "cam_high_mask": ndarray, # indicates the validness of each timestep, (IMG_HISORY_SIZE,) boolean array.
365
+ # For the first IMAGE_HISTORY_SIZE-1 timesteps, the mask should be False.
366
+ "cam_left_wrist": ndarray, # left wrist camera image, (IMG_HISORY_SIZE, H, W, 3).
367
+ # or (IMG_HISORY_SIZE, 0, 0, 0) if unavailable.
368
+ "cam_left_wrist_mask": ndarray,
369
+ "cam_right_wrist": ndarray, # right wrist camera image, (IMG_HISORY_SIZE, H, W, 3).
370
+ # or (IMG_HISORY_SIZE, 0, 0, 0) if unavailable.
371
+ # If there is only one wrist camera, use the right wrist.
372
+ "cam_right_wrist_mask": ndarray
373
+ } or None if the episode is invalid.
374
+ """
375
+ with h5py.File(file_path, 'r') as f:
376
+ states = f['observation']['eef_pose'][:]
377
+ actions = f['action']['eef_pose'][:]
378
+ num_steps = states.shape[0]
379
+ # [Optional] We drop too-short episode
380
+ if num_steps < 20:
381
+ return False, None
382
+
383
+ # We randomly sample a timestep
384
+ step_id = np.random.randint(0, num_steps)
385
+
386
+ # You can also use precomputed language embeddings (recommended)
387
+ if self.DATASET_NAME == 'aloha_box_into_pot_easy':
388
+ instruction = f['observations']['states']['language_instruction'][0].decode('utf-8')
389
+ else:
390
+ instruction = f"lang_embed/{self.DATASET_NAME}.pt"
391
+
392
+ # Assemble the meta
393
+ meta = {
394
+ "dataset_name": self.DATASET_NAME,
395
+ "#steps": num_steps,
396
+ "step_id": step_id,
397
+ "instruction": instruction
398
+ }
399
+
400
+ # Rescale gripper to [0, 1]
401
+ states = states / np.array(
402
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
403
+ )
404
+ actions = actions[step_id:step_id+self.CHUNK_SIZE] / np.array(
405
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
406
+ )
407
+
408
+ # Parse the state and action
409
+ state = states[step_id:step_id+1]
410
+ state_std = np.std(states, axis=0)
411
+ state_mean = np.mean(states, axis=0)
412
+ state_norm = np.sqrt(np.mean(states**2, axis=0))
413
+
414
+ if actions.shape[0] < self.CHUNK_SIZE:
415
+ # Pad the actions using the last action
416
+ actions = np.concatenate([
417
+ actions,
418
+ np.tile(actions[-1:], (self.CHUNK_SIZE-actions.shape[0], 1))
419
+ ], axis=0)
420
+
421
+ # Fill the state/action into the unified vector
422
+ def fill_in_state(values):
423
+ uni_vec = np.zeros(values.shape[:-1] + (self.STATE_DIM,))
424
+ uni_vec[..., TABLETOP_6D_INDICES] = values
425
+ return uni_vec
426
+ state = fill_in_state(state)
427
+ state_indicator = fill_in_state(np.ones_like(state_std))
428
+ state_std = fill_in_state(state_std)
429
+ state_mean = fill_in_state(state_mean)
430
+ state_norm = fill_in_state(state_norm)
431
+ # If action's format is different from state's,
432
+ # you may implement fill_in_action()
433
+ actions = fill_in_state(actions)
434
+
435
+ # Parse the images
436
+ def parse_img(key):
437
+ imgs = []
438
+ for i in range(max(step_id-self.IMG_HISORY_SIZE+1, 0), step_id+1):
439
+ img = f['observation'][key][i]
440
+ # imgs.append(cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR))
441
+ imgs.append(img)
442
+ # print(imgs)
443
+ imgs = np.stack(imgs)
444
+ if imgs.shape[0] < self.IMG_HISORY_SIZE:
445
+ # Pad the images using the first image
446
+ imgs = np.concatenate([
447
+ np.tile(imgs[:1], (self.IMG_HISORY_SIZE-imgs.shape[0], 1, 1, 1)),
448
+ imgs
449
+ ], axis=0)
450
+ return imgs
451
+ # `cam_high` is the external camera image
452
+ cam_high = parse_img('agentview_image')
453
+ # For step_id = first_idx - 1, the valid_len should be one
454
+ valid_len = min(step_id + 1, self.IMG_HISORY_SIZE)
455
+ cam_high_mask = np.array(
456
+ [False] * (self.IMG_HISORY_SIZE - valid_len) + [True] * valid_len
457
+ )
458
+ cam_left_wrist = parse_img('wrist_left_image')
459
+ cam_left_wrist_mask = cam_high_mask.copy()
460
+ cam_right_wrist = parse_img('wrist_right_image')
461
+ cam_right_wrist_mask = cam_high_mask.copy()
462
+
463
+ # print(cam_left_wrist is not None, cam_right_wrist is not None, cam_high is not None)
464
+
465
+ # Return the resulting sample
466
+ # For unavailable images, return zero-shape arrays, i.e., (IMG_HISORY_SIZE, 0, 0, 0)
467
+ # E.g., return np.zeros((self.IMG_HISORY_SIZE, 0, 0, 0)) for the key "cam_left_wrist",
468
+ # if the left-wrist camera is unavailable on your robot
469
+ return True, {
470
+ "meta": meta,
471
+ "state": state,
472
+ "state_std": state_std,
473
+ "state_mean": state_mean,
474
+ "state_norm": state_norm,
475
+ "actions": actions,
476
+ "state_indicator": state_indicator,
477
+ "cam_high": cam_high,
478
+ "cam_high_mask": cam_high_mask,
479
+ "cam_left_wrist": cam_left_wrist,
480
+ "cam_left_wrist_mask": cam_left_wrist_mask,
481
+ "cam_right_wrist": cam_right_wrist,
482
+ "cam_right_wrist_mask": cam_right_wrist_mask
483
+ }
484
+
485
+ def parse_hdf5_file_state_only(self, file_path):
486
+ """[Modify] Parse a hdf5 file to generate a state trajectory.
487
+
488
+ Args:
489
+ file_path (str): the path to the hdf5 file
490
+
491
+ Returns:
492
+ valid (bool): whether the episode is valid, which is useful for filtering.
493
+ If False, this episode will be dropped.
494
+ dict: a dictionary containing the training sample,
495
+ {
496
+ "state": ndarray, # state[:], (T, STATE_DIM).
497
+ "action": ndarray, # action[:], (T, STATE_DIM).
498
+ } or None if the episode is invalid.
499
+ """
500
+ with h5py.File(file_path, 'r') as f:
501
+ states = f['observation']['eef_pose'][:]
502
+ actions = f['action']['eef_pose'][:]
503
+ num_steps = states.shape[0]
504
+
505
+ step_id = np.random.randint(0, num_steps)
506
+
507
+ # Rescale gripper to [0, 1]
508
+ states = states / np.array(
509
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
510
+ )
511
+ actions = actions[step_id:step_id+self.CHUNK_SIZE] / np.array(
512
+ [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
513
+ )
514
+
515
+ # Fill the state/action into the unified vector
516
+ def fill_in_state(values):
517
+ uni_vec = np.zeros(values.shape[:-1] + (self.STATE_DIM,))
518
+ uni_vec[..., TABLETOP_6D_INDICES] = values
519
+ return uni_vec
520
+ state = fill_in_state(states)
521
+ action = fill_in_state(actions)
522
+
523
+ # Return the resulting sample
524
+ return True, {
525
+ "state": state,
526
+ "action": action
527
+ }
528
+
529
+ if __name__ == "__main__":
530
+ ds = TabletopHDF5VLADataset('aloha_box_into_pot_easy') # task_name is required; this task name, referenced above, is used here as an example
531
+ for i in range(len(ds)):
532
+ print(f"Processing episode {i}/{len(ds)}...")
533
+ ds.get_item(i)
data/preprocess.py ADDED
@@ -0,0 +1,323 @@
1
+ import json
2
+
3
+ import tensorflow as tf
4
+ import yaml
5
+
6
+ from data.preprocess_scripts import *
7
+ from configs.state_vec import STATE_VEC_IDX_MAPPING, STATE_VEC_LEN
8
+ from data.utils import capitalize_and_period
9
+
10
+ # The dataset without state
11
+ DATASET_NAMES_NO_STATE = [
12
+ 'nyu_door_opening_surprising_effectiveness',
13
+ "usc_cloth_sim_converted_externally_to_rlds",
14
+ 'cmu_franka_exploration_dataset_converted_externally_to_rlds',
15
+ 'imperialcollege_sawyer_wrist_cam'
16
+ ]
17
+
18
+ # Read the image keys of each dataset
19
+ with open('configs/dataset_img_keys.json', 'r') as file:
20
+ IMAGE_KEYS = json.load(file)
21
+ # Read the config
22
+ with open('configs/base.yaml', 'r') as file:
23
+ config = yaml.safe_load(file)
24
+
25
+
26
+ def assemble_state_vec(arm_concat: tf.Tensor, arm_format: str,
27
+ base_concat=None, base_format=None) -> tf.Tensor:
28
+ """
29
+ Assemble the state/action vector from the arm and base.
30
+ """
31
+ state_vec = tf.zeros(STATE_VEC_LEN, dtype=tf.float32)
32
+ mask_vec = tf.zeros(STATE_VEC_LEN, dtype=tf.float32)
33
+
34
+ # Assemble the arm state
35
+ arm_concat = tf.cast(arm_concat, tf.float32)
36
+ arm_format = arm_format.split(',')
37
+ # Use the scatter_nd to avoid the duplicate indices
38
+ state_vec = tf.tensor_scatter_nd_update(
39
+ state_vec,
40
+ [[STATE_VEC_IDX_MAPPING[name]] for name in arm_format],
41
+ arm_concat
42
+ )
43
+ mask_vec = tf.tensor_scatter_nd_update(
44
+ mask_vec,
45
+ [[STATE_VEC_IDX_MAPPING[name]] for name in arm_format],
46
+ tf.ones(len(arm_format), dtype=tf.float32)
47
+ )
48
+
49
+ # Assemble the base state if exists
50
+ if base_concat is not None:
51
+ base_concat = tf.cast(base_concat, tf.float32)
52
+ base_format = base_format.split(',')
53
+ state_vec = tf.tensor_scatter_nd_update(
54
+ state_vec,
55
+ [[STATE_VEC_IDX_MAPPING[name]] for name in base_format],
56
+ base_concat
57
+ )
58
+ mask_vec = tf.tensor_scatter_nd_update(
59
+ mask_vec,
60
+ [[STATE_VEC_IDX_MAPPING[name]] for name in base_format],
61
+ tf.ones(len(base_format), dtype=tf.float32)
62
+ )
63
+ return state_vec, mask_vec
64
+
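A toy sketch of the scatter used above, with a made-up 4-dimensional vector and a made-up name-to-index mapping standing in for STATE_VEC_IDX_MAPPING:

import tensorflow as tf

idx_map = {'x': 0, 'y': 1, 'gripper_open': 3}     # hypothetical mapping
names = 'x,y,gripper_open'.split(',')
vec = tf.tensor_scatter_nd_update(
    tf.zeros(4, dtype=tf.float32),
    [[idx_map[n]] for n in names],
    tf.constant([0.1, 0.2, 1.0])
)
# vec == [0.1, 0.2, 0.0, 1.0]; the mask is built the same way with ones as updates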
65
+
66
+ @tf.autograph.experimental.do_not_convert
67
+ def _generate_json_state_agilex(episode: dict, dataset_name: str):
68
+ """
69
+ Generate the json dict and state for a given episode.
70
+ """
71
+ # Load some constants from the config
72
+ IMG_HISTORY_SIZE = config['common']['img_history_size']
73
+ if IMG_HISTORY_SIZE < 1:
74
+ raise ValueError("Config `img_history_size` must be at least 1.")
75
+ ACTION_CHUNK_SIZE = config['common']['action_chunk_size']
76
+ if ACTION_CHUNK_SIZE < 1:
77
+ raise ValueError("Config `action_chunk_size` must be at least 1.")
78
+
79
+ # Initialize the episode_metadata
80
+ episode_metadata = {
81
+ 'dataset_name': dataset_name,
82
+ '#steps': 0,
83
+ 'instruction': None
84
+ }
85
+
86
+ # Check whether this episode has an 'END'
87
+ base_act = None
88
+ last_base_act = None
89
+ episode_states = []
90
+ episode_acts = []
91
+ episode_masks = []
92
+ has_base = None
93
+ for step_id, step in enumerate(iter(episode['steps'])):
94
+ # Parse the action
95
+ action = step['action']
96
+ if has_base is None:
97
+ has_base = 'base_concat' in action
98
+ if has_base:
99
+ base_act = action['base_concat']
100
+
101
+ # Parse the state
102
+ state = step['observation']
103
+
104
+ arm_format = state['format'].numpy().decode('utf-8')
105
+ base_format = None
106
+ if has_base:
107
+ act_format = action['format'].numpy().decode('utf-8')
108
+ base_format_idx = act_format.find('base')
109
+ base_format = act_format[base_format_idx:]
110
+
111
+ arm_state = state['arm_concat']
112
+ base_state = None
113
+ if has_base:
114
+ if last_base_act is None:
115
+ base_state = base_act * 0
116
+ else:
117
+ base_state = last_base_act
118
+ last_base_act = base_act
119
+
120
+ # Assemble the state vector
121
+ state_vec, mask_vec = assemble_state_vec(
122
+ arm_state, arm_format, base_state, base_format)
123
+
124
+
125
+ act_vec, mask_vec = assemble_state_vec(
126
+ action['arm_concat'], arm_format, base_state, base_format
127
+ )
128
+
129
+ episode_states.append(state_vec)
130
+ episode_masks.append(mask_vec)
131
+ episode_acts.append(act_vec)
132
+
133
+ # Parse the task instruction
134
+ instr = step['observation']['natural_language_instruction']
135
+ instr = instr.numpy().decode('utf-8')
136
+ instr = capitalize_and_period(instr)
137
+
138
+ # Write to the episode_metadata
139
+ if episode_metadata['instruction'] is None:
140
+ episode_metadata['instruction'] = instr
141
+
142
+ episode_metadata['#steps'] = step_id
143
+
144
+ episode_states = tf.stack(episode_states)
145
+ episode_masks = tf.stack(episode_masks)
146
+ episode_acts = tf.stack(episode_acts)
147
+
148
+ return episode_metadata, episode_states, episode_masks, episode_acts
149
+
150
+
151
+ @tf.autograph.experimental.do_not_convert
152
+ def _generate_json_state(episode: dict, dataset_name: str):
153
+ """
154
+ Generate the json dict and state for a given episode.
155
+ """
156
+ # Load some constants from the config
157
+ IMG_HISTORY_SIZE = config['common']['img_history_size']
158
+ if IMG_HISTORY_SIZE < 1:
159
+ raise ValueError("Config `img_history_size` must be at least 1.")
160
+ ACTION_CHUNK_SIZE = config['common']['action_chunk_size']
161
+ if ACTION_CHUNK_SIZE < 1:
162
+ raise ValueError("Config `action_chunk_size` must be at least 1.")
163
+
164
+ # Initialize the episode_metadata
165
+ episode_metadata = {
166
+ 'dataset_name': dataset_name,
167
+ '#steps': 0,
168
+ 'instruction': None
169
+ }
170
+
171
+ # Check whether this episode has an 'END'
172
+ base_act = None
173
+ last_base_act = None
174
+ episode_states = []
175
+ episode_masks = []
176
+ has_base = None
177
+ for step_id, step in enumerate(iter(episode['steps'])):
178
+ # Parse the action
179
+ action = step['action']
180
+ if has_base is None:
181
+ has_base = 'base_concat' in action
182
+ if has_base:
183
+ base_act = action['base_concat']
184
+
185
+ # Parse the state
186
+ state = step['observation']
187
+
188
+ arm_format = state['format'].numpy().decode('utf-8')
189
+ base_format = None
190
+ if has_base:
191
+ act_format = action['format'].numpy().decode('utf-8')
192
+ base_format_idx = act_format.find('base')
193
+ base_format = act_format[base_format_idx:]
194
+
195
+ arm_state = state['arm_concat']
196
+ base_state = None
197
+ if has_base:
198
+ if last_base_act is None:
199
+ base_state = base_act * 0
200
+ else:
201
+ base_state = last_base_act
202
+ last_base_act = base_act
203
+
204
+ # Assemble the state vector
205
+ state_vec, mask_vec = assemble_state_vec(
206
+ arm_state, arm_format, base_state, base_format)
207
+
208
+ episode_states.append(state_vec)
209
+ episode_masks.append(mask_vec)
210
+
211
+ # Parse the task instruction
212
+ instr = step['observation']['natural_language_instruction']
213
+ instr = instr.numpy().decode('utf-8')
214
+ instr = capitalize_and_period(instr)
215
+
216
+ # Write to the episode_metadata
217
+ if episode_metadata['instruction'] is None:
218
+ episode_metadata['instruction'] = instr
219
+
220
+ episode_metadata['#steps'] = step_id
221
+ episode_states = tf.stack(episode_states)
222
+ episode_masks = tf.stack(episode_masks)
223
+
224
+ return episode_metadata, episode_states, episode_masks
225
+
226
+
227
+ @tf.autograph.experimental.do_not_convert
228
+ def _generate_json_state_nostate_ds(episode: dict, dataset_name: str):
229
+ """
230
+ Generate the json dict and state for an episode in the dataset without state.
231
+ If there is no state, we use the last action as the current state.
232
+ """
233
+ # Load some constants from the config
234
+ IMG_HISTORY_SIZE = config['common']['img_history_size']
235
+ if IMG_HISTORY_SIZE < 1:
236
+ raise ValueError("Config `img_history_size` must be at least 1.")
237
+ ACTION_CHUNK_SIZE = config['common']['action_chunk_size']
238
+ if ACTION_CHUNK_SIZE < 1:
239
+ raise ValueError("Config `action_chunk_size` must be at least 1.")
240
+
241
+ # Initialize the episode_metadata
242
+ episode_metadata = {
243
+ 'dataset_name': dataset_name,
244
+ '#steps': 0,
245
+ 'instruction': None
246
+ }
247
+
248
+ last_base_act = None
249
+ last_arm_act = None
250
+ episode_states = []
251
+ episode_masks = []
252
+ has_base = None
253
+ for step_id, step in enumerate(iter(episode['steps'])):
254
+ # Parse the action
255
+ action = step['action']
256
+ if has_base is None:
257
+ has_base = 'base_concat' in action
258
+ if has_base:
259
+ base_act = action['base_concat']
260
+ if last_base_act is None:
261
+ last_base_act = base_act * 0 # Initialize
262
+
263
+ # Parse the arm action
264
+ arm_act = action['arm_concat']
265
+ if last_arm_act is None:
266
+ last_arm_act = arm_act * 0 # Initialize
267
+
268
+ # Parse the act format
269
+ # Action format as the state format
270
+ act_format = action['format'].numpy().decode('utf-8')
271
+
272
+ # Assemble the state vector
273
+ if has_base:
274
+ last_act_concat = tf.concat([last_arm_act, last_base_act], axis=0)
275
+ else:
276
+ last_act_concat = last_arm_act
277
+ state_vec, mask_vec = assemble_state_vec(
278
+ last_act_concat, act_format)
279
+
280
+ episode_states.append(state_vec)
281
+ episode_masks.append(mask_vec)
282
+
283
+ # Parse the task instruction
284
+ instr = step['observation']['natural_language_instruction']
285
+ instr = instr.numpy().decode('utf-8')
286
+ instr = capitalize_and_period(instr)
287
+
288
+ # Write to the episode_metadata
289
+ if episode_metadata['instruction'] is None:
290
+ episode_metadata['instruction'] = instr
291
+
292
+ # Update the last_arm_act and last_base_act
293
+ last_arm_act = arm_act
294
+ if has_base:
295
+ last_base_act = base_act
296
+
297
+ episode_metadata['#steps'] = step_id
298
+ episode_states = tf.stack(episode_states)
299
+ episode_masks = tf.stack(episode_masks)
300
+
301
+ return episode_metadata, episode_states, episode_masks
302
+
303
+
304
+ @tf.autograph.experimental.do_not_convert
305
+ def generate_json_state(episode: dict, dataset_name: str):
306
+ """
307
+ Generate the json dict and state for an episode.
308
+ """
309
+ if isinstance(dataset_name, tf.Tensor):
310
+ dataset_name = dataset_name.numpy().decode('utf-8')
311
+
312
+ # Process each step in the episode
313
+ episode['steps'] = episode['steps'].map(
314
+ globals()[dataset_name].process_step,
315
+ )
316
+
317
+ if dataset_name == "agilex":
318
+ return _generate_json_state_agilex(episode, dataset_name)
319
+
320
+ if dataset_name in DATASET_NAMES_NO_STATE:
321
+ return _generate_json_state_nostate_ds(episode, dataset_name)
322
+
323
+ return _generate_json_state(episode, dataset_name)
data/preprocess_scripts/__init__.py ADDED
@@ -0,0 +1,73 @@
1
+ from . import fractal20220817_data
2
+ from . import bridge
3
+ from . import jaco_play
4
+ from . import nyu_door_opening_surprising_effectiveness
5
+ from . import taco_play
6
+ from . import berkeley_cable_routing
7
+ from . import roboturk
8
+ from . import viola
9
+ from . import berkeley_autolab_ur5
10
+ from . import toto
11
+ from . import columbia_cairlab_pusht_real
12
+ from . import stanford_kuka_multimodal_dataset_converted_externally_to_rlds
13
+ from . import nyu_rot_dataset_converted_externally_to_rlds
14
+ from . import austin_buds_dataset_converted_externally_to_rlds
15
+ from . import nyu_franka_play_dataset_converted_externally_to_rlds
16
+ from . import cmu_franka_exploration_dataset_converted_externally_to_rlds
17
+ from . import kuka
18
+ from . import utokyo_xarm_bimanual_converted_externally_to_rlds
19
+ from . import maniskill_dataset_converted_externally_to_rlds
20
+ from . import stanford_hydra_dataset_converted_externally_to_rlds
21
+ from . import ucsd_kitchen_dataset_converted_externally_to_rlds
22
+ from . import ucsd_pick_and_place_dataset_converted_externally_to_rlds
23
+ from . import austin_sailor_dataset_converted_externally_to_rlds
24
+ from . import austin_sirius_dataset_converted_externally_to_rlds
25
+ from . import bc_z
26
+ from . import usc_cloth_sim_converted_externally_to_rlds
27
+ from . import utokyo_pr2_opening_fridge_converted_externally_to_rlds
28
+ from . import utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds
29
+ from . import utokyo_xarm_pick_and_place_converted_externally_to_rlds
30
+ from . import berkeley_mvp_converted_externally_to_rlds
31
+ from . import berkeley_rpt_converted_externally_to_rlds
32
+ from . import kaist_nonprehensile_converted_externally_to_rlds
33
+ from . import stanford_mask_vit_converted_externally_to_rlds
34
+ from . import tokyo_u_lsmo_converted_externally_to_rlds
35
+ from . import dlr_sara_pour_converted_externally_to_rlds
36
+ from . import dlr_sara_grid_clamp_converted_externally_to_rlds
37
+ from . import dlr_edan_shared_control_converted_externally_to_rlds
38
+ from . import asu_table_top_converted_externally_to_rlds
39
+ from . import stanford_robocook_converted_externally_to_rlds
40
+ from . import roboturk_real_laundrylayout
41
+ from . import roboturk_real_towercreation
42
+ from . import roboturk_real_objectsearch
43
+ from . import robomimic_lift_ph
44
+ from . import robomimic_can_ph
45
+ from . import robomimic_square_ph
46
+ from . import robomimic_transport_ph
47
+ from . import robomimic_tool_hang_ph
48
+ from . import eth_agent_affordances
49
+ from . import imperialcollege_sawyer_wrist_cam
50
+ from . import iamlab_cmu_pickup_insert_converted_externally_to_rlds
51
+ from . import uiuc_d3field
52
+ from . import utaustin_mutex
53
+ from . import berkeley_fanuc_manipulation
54
+ from . import cmu_play_fusion
55
+ from . import cmu_stretch
56
+ from . import berkeley_gnm_recon
57
+ from . import berkeley_gnm_cory_hall
58
+ from . import berkeley_gnm_sac_son
59
+ from . import language_table
60
+ from . import furniture_bench_dataset_converted_externally_to_rlds
61
+ from . import robo_net
62
+ from . import bridgev2
63
+ from . import aloha_mobile
64
+ from . import aloha_static
65
+ from . import droid
66
+ from . import fmb
67
+ from . import dobbe
68
+ from . import qut_dexterous_manpulation
69
+ from . import roboset
70
+ from . import agilex
71
+ from . import rh20t
72
+ from . import calvin
73
+ from . import aloha_dish_drainer, aloha_handover_box, aloha_shoes_table, aloha_lift_box, aloha_box_into_pot
data/preprocess_scripts/aloha_shoes_table.py ADDED
@@ -0,0 +1,55 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_rotation_matrix, rotation_matrix_to_ortho6d
4
+
5
+
6
+ def process_step(step: dict) -> dict:
7
+ """
8
+ Unify the action format and clean the task instruction.
9
+
10
+ DO NOT use python list, use tf.TensorArray instead.
11
+ """
12
+ # Convert raw action to our action
13
+ action_dict = step['action']
14
+ # Concatenate the action
15
+ step['action'] = {}
16
+ action = step['action']
17
+ action['arm_concat'] = action_dict['ee_6d_pos']
18
+
19
+ # Write the action format
20
+ action['format'] = tf.constant(
21
+ "left_eef_pos_x,left_eef_pos_y,left_eef_pos_z,left_eef_angle_0,left_eef_angle_1,left_eef_angle_2,left_eef_angle_3,left_eef_angle_4,left_eef_angle_5,left_gripper_open,right_eef_pos_x,right_eef_pos_y,right_eef_pos_z,right_eef_angle_0,right_eef_angle_1,right_eef_angle_2,right_eef_angle_3,right_eef_angle_4,right_eef_angle_5,right_gripper_open"
22
+ )
23
+
24
+ # Convert raw state to our state
25
+ # Robot state
26
+ state_dict = step['observation']['state']
27
+ state = {}
28
+ state['arm_concat'] = state_dict
29
+
30
+ # Write the state format
31
+ state['format'] = tf.constant(
32
+ "left_eef_pos_x,left_eef_pos_y,left_eef_pos_z,left_eef_angle_0,left_eef_angle_1,left_eef_angle_2,left_eef_angle_3,left_eef_angle_4,left_eef_angle_5,left_gripper_open,right_eef_pos_x,right_eef_pos_y,right_eef_pos_z,right_eef_angle_0,right_eef_angle_1,right_eef_angle_2,right_eef_angle_3,right_eef_angle_4,right_eef_angle_5,right_gripper_open"
33
+ )
34
+ # Clean the task instruction
35
+ # Define the replacements (old, new) as a dictionary
36
+ replacements = {
37
+ '_': ' ',
38
+ '1f': ' ',
39
+ '4f': ' ',
40
+ '-': ' ',
41
+ '50': ' ',
42
+ '55': ' ',
43
+ '56': ' ',
44
+
45
+ }
46
+ instr = step['language_instruction']
47
+ # instr = clean_task_instruction(instr, replacements)
48
+ step['observation'] = state
49
+ step['observation']['natural_language_instruction'] = instr
50
+
51
+ return step
52
+
53
+
54
+ if __name__ == "__main__":
55
+ pass
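The `format` string used throughout these preprocess scripts is a per-element schema for the concatenated vector: one comma-separated name per scalar in `arm_concat`. Below is a minimal sketch (illustration only, not repository code; the zero vector is a placeholder) of how the two line up for the 20-dim bimanual state above.

import numpy as np

fmt = ("left_eef_pos_x,left_eef_pos_y,left_eef_pos_z,"
       "left_eef_angle_0,left_eef_angle_1,left_eef_angle_2,left_eef_angle_3,"
       "left_eef_angle_4,left_eef_angle_5,left_gripper_open,"
       "right_eef_pos_x,right_eef_pos_y,right_eef_pos_z,"
       "right_eef_angle_0,right_eef_angle_1,right_eef_angle_2,right_eef_angle_3,"
       "right_eef_angle_4,right_eef_angle_5,right_gripper_open")
names = fmt.split(',')                               # 20 names: 3 pos + 6 ortho6d + 1 gripper per arm
arm_concat = np.zeros(len(names), dtype=np.float32)  # placeholder bimanual state vector
named = dict(zip(names, arm_concat))                 # name -> scalar lookup
assert len(names) == arm_concat.shape[0]             # schema and vector must stay in sync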
data/preprocess_scripts/austin_buds_dataset_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,82 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, rotation_matrix_to_ortho6d
4
+
5
+ def process_step(step: dict) -> dict:
6
+ """
7
+ Unify the action format and clean the task instruction.
8
+
9
+ DO NOT use python list, use tf.TensorArray instead.
10
+ """
11
+ # Convert raw action to our action
12
+
13
+ origin_action = step['action']
14
+ step['action']={}
15
+ action=step['action']
16
+ action['terminate'] = step['is_terminal']
17
+
18
+ eef_delta_pos = origin_action[:3]
19
+ eef_ang=origin_action[3:6]
20
+ eef_ang = euler_to_quaternion(eef_ang)
21
+ # gripper_open: -1-open, 1-closed
22
+ grip_open=tf.where(tf.equal(origin_action[6:],tf.constant(-1.0)),tf.constant(1.0),tf.constant(0.0))
23
+
24
+ # No base found
25
+
26
+ # Concatenate the action
27
+ action['arm_concat'] = tf.concat([eef_delta_pos,eef_ang,grip_open],axis=0)
28
+
29
+ # Write the action format
30
+ action['format'] = tf.constant(
31
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
32
+
33
+ # Convert raw state to our state
34
+ state = step['observation']
35
+ # Concatenate the state
36
+ eef_mat = tf.transpose(tf.reshape(state['state'][8:], (4, 4)))
37
+ eef_pos = eef_mat[:3, 3]
38
+ rotation_matrix = eef_mat[:3, :3]
39
+ eef_ang = rotation_matrix_to_ortho6d(rotation_matrix)
40
+ joint_pos = state['state'][:7]
41
+ grip_open = state['state'][7:8] * 12.5 # rescale to [0, 1]
42
+ state['arm_concat'] = tf.concat([joint_pos,grip_open,eef_pos,eef_ang],axis=0)
43
+
44
+ # Write the state format
45
+ state['format'] = tf.constant(
46
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_joint_0_pos,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
47
+
48
+ # Clean the task instruction
49
+ # Define the replacements (old, new) as a dictionary
50
+ replacements = {
51
+ '_': ' ',
52
+ '1f': ' ',
53
+ '4f': ' ',
54
+ '-': ' ',
55
+ '50': ' ',
56
+ '55': ' ',
57
+ '56': ' ',
58
+
59
+ }
60
+ instr = step['language_instruction']
61
+ instr = clean_task_instruction(instr, replacements)
62
+ step['observation']['natural_language_instruction'] = instr
63
+
64
+ return step
65
+
66
+
67
+ if __name__ == "__main__":
68
+ import tensorflow_datasets as tfds
69
+ from data.utils import dataset_to_path
70
+
71
+ DATASET_DIR = 'data/datasets/openx_embod'
72
+ DATASET_NAME = 'austin_buds_dataset_converted_externally_to_rlds'
73
+ # Load the dataset
74
+ dataset = tfds.builder_from_directory(
75
+ builder_dir=dataset_to_path(
76
+ DATASET_NAME, DATASET_DIR))
77
+ dataset = dataset.as_dataset(split='all')
78
+
79
+ # Inspect the dataset
80
+ for episode in dataset:
81
+ for step in episode['steps']:
82
+ print(step)
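The `tf.transpose(tf.reshape(state['state'][8:], (4, 4)))` above implies that the trailing 16 state values encode a homogeneous end-effector transform stored in column-major order. A small self-contained check of that unpacking (illustration only; the transform below is made up, not dataset data):

import numpy as np

T = np.eye(4, dtype=np.float32)
T[:3, 3] = [0.4, 0.0, 0.2]                  # hypothetical eef position
flat = T.flatten(order='F')                 # column-major storage assumed by the transpose

eef_mat = flat.reshape(4, 4).T              # same effect as tf.transpose(tf.reshape(...))
assert np.allclose(eef_mat[:3, 3], [0.4, 0.0, 0.2])   # translation recovered
assert np.allclose(eef_mat[:3, :3], np.eye(3))        # rotation block recovered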
data/preprocess_scripts/berkeley_autolab_ur5.py ADDED
@@ -0,0 +1,95 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, \
4
+ quaternion_to_rotation_matrix, rotation_matrix_to_ortho6d
5
+
6
+
7
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
8
+ """
9
+ Convert terminate action to a boolean, where True means terminate.
10
+ """
11
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
12
+
13
+
14
+ def process_step(step: dict) -> dict:
15
+ """
16
+ Unify the action format and clean the task instruction.
17
+
18
+ DO NOT use python list, use tf.TensorArray instead.
19
+ """
20
+ # Convert raw action to our action
21
+ action = step['action']
22
+ action['terminate'] = terminate_act_to_bool(action['terminate_episode'])
23
+ eef_delta_pos = action['world_vector']
24
+ eef_ang = action['rotation_delta']
25
+ eef_ang = euler_to_quaternion(eef_ang)
26
+
27
+ # Ignore action['gripper_open']: 1 if close gripper, -1 if open gripper, 0 if no change.
28
+
29
+ # No base found
30
+
31
+ # Concatenate the action
32
+ arm_action = tf.concat([eef_delta_pos, eef_ang], axis=0)
33
+ action['arm_concat'] = arm_action
34
+
35
+ # Write the action format
36
+ action['format'] = tf.constant(
37
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w")
38
+
39
+ # Convert raw state to our state
40
+ state = step['observation']
41
+ # state['robot_state']:[joint0, joint1, joint2, joint3, joint4, joint5, x,y,z, qx,qy,qz,qw, gripper_is_closed, action_blocked]
42
+ robot_state = state['robot_state']
43
+ joint_pos=robot_state[:6]
44
+ eef_pos = robot_state[6:9]
45
+ eef_quat = robot_state[9:13]
46
+ eef_ang = quaternion_to_rotation_matrix(eef_quat)
47
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
48
+ # gripper_is_closed is binary: 0 = fully open; 1 = fully closed
49
+ grip_closed = robot_state[13:14]
50
+ grip_open= 1-grip_closed
51
+ # action_blocked is binary: 0 = not blocked; 1 = blocked
52
+ # action_blocked = robot_state[14:15]
53
+
54
+ # Concatenate the state
55
+ state['arm_concat'] = tf.concat([joint_pos, grip_open,eef_pos,eef_ang], axis=0)
56
+
57
+ # Write the state format
58
+ state['format'] = tf.constant(
59
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,gripper_open,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
60
+
61
+ # Clean the task instruction
62
+ # Define the replacements (old, new) as a dictionary
63
+ replacements = {
64
+ '_': ' ',
65
+ '1f': ' ',
66
+ '4f': ' ',
67
+ '-': ' ',
68
+ '50': ' ',
69
+ '55': ' ',
70
+ '56': ' ',
71
+
72
+ }
73
+ instr = step['observation']['natural_language_instruction']
74
+ instr = clean_task_instruction(instr, replacements)
75
+ step['observation']['natural_language_instruction'] = instr
76
+
77
+ return step
78
+
79
+
80
+ if __name__ == "__main__":
81
+ import tensorflow_datasets as tfds
82
+ from data.utils import dataset_to_path
83
+
84
+ DATASET_DIR = 'data/datasets/openx_embod'
85
+ DATASET_NAME = 'berkeley_autolab_ur5'
86
+ # Load the dataset
87
+ dataset = tfds.builder_from_directory(
88
+ builder_dir=dataset_to_path(
89
+ DATASET_NAME, DATASET_DIR))
90
+ dataset = dataset.as_dataset(split='all')
91
+
92
+ # Inspect the dataset
93
+ for episode in dataset:
94
+ for step in episode['steps']:
95
+ print(step)
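The state conversion above chains `quaternion_to_rotation_matrix` and `rotation_matrix_to_ortho6d` from `data/utils.py`, which is not shown in this part of the diff. The NumPy sketch below is a rough stand-in for what those helpers are expected to do; the (x, y, z, w) quaternion order and the first-two-columns 6D encoding are assumptions, not a copy of the repository implementation.

import numpy as np

def quaternion_to_rotation_matrix(q):
    x, y, z, w = q / np.linalg.norm(q)
    return np.array([
        [1 - 2*(y*y + z*z), 2*(x*y - z*w),     2*(x*z + y*w)],
        [2*(x*y + z*w),     1 - 2*(x*x + z*z), 2*(y*z - x*w)],
        [2*(x*z - y*w),     2*(y*z + x*w),     1 - 2*(x*x + y*y)],
    ])

def rotation_matrix_to_ortho6d(R):
    # Continuous 6D rotation representation (Zhou et al., 2019):
    # keep the first two columns; the third is recoverable by a cross product.
    return R[:, :2].T.flatten()

R = quaternion_to_rotation_matrix(np.array([0.0, 0.0, 0.0, 1.0]))  # identity rotation
print(rotation_matrix_to_ortho6d(R))  # -> [1, 0, 0, 0, 1, 0]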
data/preprocess_scripts/berkeley_cable_routing.py ADDED
@@ -0,0 +1,73 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, quaternion_to_rotation_matrix, rotation_matrix_to_ortho6d
4
+
5
+
6
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
7
+ """
8
+ Convert terminate action to a boolean, where True means terminate.
9
+ """
10
+ return tf.equal(terminate_act, tf.constant(1.0, dtype=tf.float32))
11
+
12
+
13
+ def process_step(step: dict) -> dict:
14
+ """
15
+ Unify the action format and clean the task instruction.
16
+
17
+ DO NOT use python list, use tf.TensorArray instead.
18
+ """
19
+ # Convert raw action to our action
20
+ action = step['action']
21
+ action['terminate'] = terminate_act_to_bool(action['terminate_episode'])
22
+
23
+ eef_delta_pos = action['world_vector']
24
+ eef_ang=action['rotation_delta']
25
+
26
+ # No gripper_open found
27
+ # No base found
28
+
29
+ # Concatenate the action
30
+ arm_action=tf.concat([eef_delta_pos,eef_ang],axis=0)
31
+ action['arm_concat']=arm_action
32
+ #base_action = tf.concat([base_pos, base_ang], axis=0)
33
+ #action['base_concat'] = base_action
34
+
35
+ # Write the action format
36
+ action['format']=tf.constant("eef_vel_x,eef_vel_y,eef_vel_z,eef_angular_vel_roll,eef_angular_vel_pitch,eef_angular_vel_yaw")
37
+
38
+ # Convert raw state to our state
39
+ state = step['observation']
40
+ eef_pos = state['robot_state'][:3]
41
+ eef_ang = quaternion_to_rotation_matrix(state['robot_state'][3:])
42
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
43
+
44
+ # Concatenate the state
45
+ state['arm_concat']=tf.concat([eef_pos,eef_ang],axis=0)
46
+
47
+ # Write the state format
48
+ state['format'] = tf.constant(
49
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
50
+
51
+ # Define the task instruction
52
+ step['observation']['natural_language_instruction'] = tf.constant(
53
+ "Route cable through the tight-fitting clip mounted on the table.")
54
+
55
+ return step
56
+
57
+
58
+ if __name__ == "__main__":
59
+ import tensorflow_datasets as tfds
60
+ from data.utils import dataset_to_path
61
+
62
+ DATASET_DIR = 'data/datasets/openx_embod/'
63
+ DATASET_NAME = 'berkeley_cable_routing'
64
+ # Load the dataset
65
+ dataset = tfds.builder_from_directory(
66
+ builder_dir=dataset_to_path(
67
+ DATASET_NAME, DATASET_DIR))
68
+ dataset = dataset.as_dataset(split='all')
69
+
70
+ # Inspect the dataset
71
+ for episode in dataset:
72
+ for step in episode['steps']:
73
+ print(step)
data/preprocess_scripts/berkeley_gnm_sac_son.py ADDED
@@ -0,0 +1,78 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, euler_to_rotation_matrix, \
4
+ rotation_matrix_to_ortho6d
5
+
6
+ def process_step(step: dict) -> dict:
7
+ """
8
+ Unify the action format and clean the task instruction.
9
+
10
+ DO NOT use python list, use tf.TensorArray instead.
11
+ """
12
+ # Convert raw action to our action
13
+
14
+ origin_action = step['action']
15
+ step['action']={}
16
+ action=step['action']
17
+ action['terminate'] = step['is_terminal']
18
+
19
+ eef_pos=tf.cast(origin_action, dtype=tf.float32)
20
+ eef_ang=tf.cast(step['action_angle'][2:3], dtype=tf.float32)
21
+ eef_ang = euler_to_quaternion(tf.stack([0,0,eef_ang[0]], axis=0))
22
+ # No base found
23
+
24
+ # Concatenate the action
25
+ action['arm_concat'] = tf.concat([eef_pos,eef_ang],axis=0)
26
+
27
+ # Write the action format
28
+ action['format'] = tf.constant(
29
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w")
30
+
31
+ # Convert raw state to our state
32
+ state = step['observation']
33
+ # Concatenate the state
34
+ eef_pos=tf.cast(state['position'],dtype=tf.float32)
35
+ eef_ang=tf.cast(state['yaw'],dtype=tf.float32)
36
+ eef_ang = euler_to_rotation_matrix(tf.stack([0,0,eef_ang[0]],axis=0))
37
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
38
+ state['arm_concat'] = tf.concat([eef_pos/100,eef_ang],axis=0)
39
+ # Write the state format
40
+ state['format'] = tf.constant(
41
+ "eef_pos_x,eef_pos_y,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
42
+
43
+ # Clean the task instruction
44
+ # Define the replacements (old, new) as a dictionary
45
+ replacements = {
46
+ '_': ' ',
47
+ '1f': ' ',
48
+ '4f': ' ',
49
+ '-': ' ',
50
+ '50': ' ',
51
+ '55': ' ',
52
+ '56': ' ',
53
+
54
+ }
55
+ instr = step['language_instruction']
56
+ instr = clean_task_instruction(instr, replacements)
57
+ step['observation']['natural_language_instruction'] = instr
58
+
59
+ return step
60
+
61
+
62
+ if __name__ == "__main__":
63
+ import tensorflow_datasets as tfds
64
+ from data.utils import dataset_to_path
65
+
66
+ DATASET_DIR = 'data/datasets/openx_embod'
67
+ DATASET_NAME = 'berkeley_gnm_sac_son'
68
+ # Load the dataset
69
+ dataset = tfds.builder_from_directory(
70
+ builder_dir=dataset_to_path(
71
+ DATASET_NAME, DATASET_DIR))
72
+ dataset = dataset.as_dataset(split='all')
73
+
74
+ # Inspect the dataset
75
+ for episode in dataset:
76
+ for step in episode['steps']:
77
+ print(step['action'][6:7])
78
+
data/preprocess_scripts/berkeley_rpt_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,84 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+
5
+
6
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
7
+ """
8
+ Convert terminate action to a boolean, where True means terminate.
9
+ """
10
+ return tf.reduce_all(tf.equal(terminate_act, tf.constant([1, 0, 0], dtype=tf.int32)))
11
+
12
+
13
+ def process_step(step: dict) -> dict:
14
+ """
15
+ Unify the action format and clean the task instruction.
16
+
17
+ DO NOT use python list, use tf.TensorArray instead.
18
+ """
19
+ # Convert raw action to our action
20
+ action = step['action']
21
+ # Robot action, consists of [7 delta joint pos,1x gripper binary state].
22
+ delta_joint_pos = action[:7]
23
+ grip_open = tf.expand_dims(1 - action[7], axis=0)
24
+
25
+ # Concatenate the action
26
+ # action['arm_concat'] = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
27
+ step['action'] = {}
28
+ action = step['action']
29
+ action['arm_concat'] = tf.concat([delta_joint_pos, grip_open], axis=0)
30
+ action['terminate'] = step['is_terminal']
31
+
32
+ # Write the action format
33
+ action['format'] = tf.constant(
34
+ "arm_joint_0_delta_pos,arm_joint_1_delta_pos,arm_joint_2_delta_pos,arm_joint_3_delta_pos,arm_joint_4_delta_pos,arm_joint_5_delta_pos,arm_joint_6_delta_pos,gripper_open")
35
+
36
+ # Convert raw state to our state
37
+ state = step['observation']
38
+ # xArm joint positions (7 DoF).
39
+ arm_joint_pos = state['joint_pos']
40
+ # Binary gripper state (1 - closed, 0 - open)
41
+ grip_open = tf.expand_dims(1 - tf.cast(state['gripper'],dtype=tf.float32), axis=0)
42
+
43
+ # Concatenate the state
44
+ state['arm_concat'] = tf.concat([arm_joint_pos, grip_open], axis=0)
45
+
46
+ # Write the state format
47
+ state['format'] = tf.constant(
48
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_open")
49
+
50
+ # Clean the task instruction
51
+ # Define the replacements (old, new) as a dictionary
52
+ replacements = {
53
+ '_': ' ',
54
+ '1f': ' ',
55
+ '4f': ' ',
56
+ '-': ' ',
57
+ '50': ' ',
58
+ '55': ' ',
59
+ '56': ' ',
60
+
61
+ }
62
+ instr = step['language_instruction']
63
+ instr = clean_task_instruction(instr, replacements)
64
+ step['observation']['natural_language_instruction'] = instr
65
+
66
+ return step
67
+
68
+
69
+ if __name__ == "__main__":
70
+ import tensorflow_datasets as tfds
71
+ from data.utils import dataset_to_path
72
+
73
+ DATASET_DIR = 'data/datasets/openx_embod'
74
+ DATASET_NAME = 'berkeley_rpt_converted_externally_to_rlds'
75
+ # Load the dataset
76
+ dataset = tfds.builder_from_directory(
77
+ builder_dir=dataset_to_path(
78
+ DATASET_NAME, DATASET_DIR))
79
+ dataset = dataset.as_dataset(split='all')
80
+
81
+ # Inspect the dataset
82
+ for episode in dataset:
83
+ for step in episode['steps']:
84
+ print(step)
data/preprocess_scripts/calvin.py ADDED
@@ -0,0 +1,176 @@
1
+ import tensorflow as tf
2
+ from data.utils import clean_task_instruction, euler_to_rotation_matrix, rotation_matrix_to_ortho6d
3
+
4
+ import os
5
+ import fnmatch
6
+ import random
7
+
8
+
9
+ def _parse_function(proto):
10
+ keys_to_features = {
11
+ 'action': tf.io.FixedLenFeature([], tf.string),
12
+ 'robot_obs': tf.io.FixedLenFeature([], tf.string),
13
+ 'rgb_static': tf.io.FixedLenFeature([], tf.string),
14
+ 'rgb_gripper': tf.io.FixedLenFeature([], tf.string),
15
+ 'terminate_episode': tf.io.FixedLenFeature([], tf.int64),
16
+ 'instruction': tf.io.FixedLenFeature([], tf.string),
17
+ }
18
+
19
+ parsed_features = tf.io.parse_single_example(proto, keys_to_features)
20
+
21
+ action = tf.io.parse_tensor(parsed_features['action'], out_type=tf.float64)
22
+ robot_obs = tf.io.parse_tensor(parsed_features['robot_obs'], out_type=tf.float64)
23
+ rgb_static = tf.io.parse_tensor(parsed_features['rgb_static'], out_type=tf.uint8)
24
+ rgb_gripper = tf.io.parse_tensor(parsed_features['rgb_gripper'], out_type=tf.uint8)
25
+ instruction = parsed_features['instruction']
26
+ terminate_episode = tf.cast(parsed_features['terminate_episode'], tf.int64)
27
+
28
+ action = tf.reshape(action, [7])
29
+ action = tf.cast(action, tf.float32)
30
+ robot_obs = tf.reshape(robot_obs, [15])
31
+ robot_obs = tf.cast(robot_obs, tf.float32)
32
+ rgb_static = tf.reshape(rgb_static, [200, 200, 3])
33
+ rgb_gripper = tf.reshape(rgb_gripper, [84, 84, 3])
34
+ # RGB to BGR
35
+ # rgb_static = rgb_static[:, :, ::-1]
36
+ # rgb_gripper = rgb_gripper[:, :, ::-1]
37
+
38
+ return {
39
+ 'action': action,
40
+ 'observation':{
41
+ 'robot_obs': robot_obs,
42
+ 'rgb_static': rgb_static,
43
+ 'rgb_gripper': rgb_gripper,
44
+ },
45
+ 'instruction': instruction,
46
+ 'terminate_episode': terminate_episode
47
+ }
48
+
49
+
50
+ def dataset_generator_from_tfrecords(seed):
51
+ tfrecord_path = './data/datasets/calvin/tfrecords/'
52
+ filepaths = []
53
+ for root, dirs, files in os.walk(tfrecord_path):
54
+ for filename in fnmatch.filter(files, '*.tfrecord'):
55
+ filepath = os.path.join(root, filename)
56
+ filepaths.append(filepath)
57
+
58
+ random.seed(seed)
59
+ random.shuffle(filepaths)
60
+ for filepath in filepaths:
61
+ raw_dataset = tf.data.TFRecordDataset(filepath)
62
+ dataset = raw_dataset.map(_parse_function)
63
+ yield {
64
+ 'steps': dataset
65
+ }
66
+
67
+
68
+ def load_dataset(seed):
69
+ dataset = tf.data.Dataset.from_generator(
70
+ lambda: dataset_generator_from_tfrecords(seed),
71
+ output_signature={
72
+ 'steps': tf.data.DatasetSpec(
73
+ element_spec={
74
+ 'action': tf.TensorSpec(shape=(7,), dtype=tf.float32),
75
+ 'observation':{
76
+ 'robot_obs': tf.TensorSpec(shape=(15,), dtype=tf.float32),
77
+ 'rgb_static': tf.TensorSpec(shape=(200,200,3), dtype=tf.uint8),
78
+ 'rgb_gripper': tf.TensorSpec(shape=(84,84,3), dtype=tf.uint8),
79
+ },
80
+ 'instruction': tf.TensorSpec(shape=(), dtype=tf.string),
81
+ 'terminate_episode': tf.TensorSpec(shape=(), dtype=tf.int64),
82
+ }
83
+ )
84
+ }
85
+ )
86
+
87
+ return dataset
88
+
89
+
90
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
91
+ """
92
+ Convert terminate action to a boolean, where True means terminate.
93
+ """
94
+ return tf.where(
95
+ tf.equal(terminate_act, tf.constant(0, dtype=tf.int64)),
96
+ tf.constant(False),tf.constant(True))
97
+
98
+
99
+ def process_step(step: dict) -> dict:
100
+ """
101
+ Unify the action format and clean the task instruction.
102
+
103
+ DO NOT use python list, use tf.TensorArray instead.
104
+ """
105
+ # Convert raw action to our action
106
+ old_action = step['action']
107
+ step['action'] = {}
108
+ action = step['action']
109
+ step['action']['terminate'] = terminate_act_to_bool(step['terminate_episode'])
110
+ # ['actions']
111
+ # (dtype=np.float32, shape=(7,))
112
+ # tcp position (3): x,y,z in absolute world coordinates
113
+ # tcp orientation (3): euler angles x,y,z in absolute world coordinates
114
+ # gripper_action (1): binary (close = -1, open = 1)
115
+ eef_pos = old_action[:3]
116
+ eef_ang = euler_to_rotation_matrix(old_action[3:6])
117
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
118
+ gripper_open = (old_action[6] + 1) / 2
119
+ gripper_open = tf.expand_dims(gripper_open, axis=0)
120
+
121
+ # # No base found
122
+ arm_action = tf.concat([eef_pos, eef_ang, gripper_open], axis=0)
123
+ action['arm_concat'] = arm_action
124
+ # # Write the action format
125
+ action['format'] = tf.constant(
126
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5,gripper_open")
127
+
128
+ state = step['observation']
129
+ # ['robot_obs']
130
+ # (dtype=np.float32, shape=(15,))
131
+ # tcp position (3): x,y,z in world coordinates
132
+ # tcp orientation (3): euler angles x,y,z in world coordinates
133
+ # gripper opening width (1): in meter
134
+ # arm_joint_states (7): in rad
135
+ # gripper_action (1): binary (close = -1, open = 1)
136
+ eef_pos = state['robot_obs'][:3]
137
+ eef_ang = euler_to_rotation_matrix(state['robot_obs'][3:6])
138
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
139
+ gripper_open = (state['robot_obs'][14] + 1) / 2
140
+ gripper_open = tf.expand_dims(gripper_open, axis=0)
141
+ qpos = state['robot_obs'][7:14]
142
+
143
+ state['arm_concat'] = tf.concat([qpos,gripper_open,eef_pos,eef_ang], axis=0)
144
+ # # Write the state format
145
+ state['format'] = tf.constant(
146
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_open,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
147
+
148
+ # Clean the task instruction
149
+ # Define the replacements (old, new) as a dictionary
150
+ replacements = {
151
+ '_': ' ',
152
+ '1f': ' ',
153
+ '4f': ' ',
154
+ '-': ' ',
155
+ '50': ' ',
156
+ '55': ' ',
157
+ '56': ' ',
158
+
159
+ }
160
+ instr = step['instruction']
161
+ instr= clean_task_instruction(instr, replacements)
162
+ step['observation']['natural_language_instruction'] = instr
163
+
164
+ return step
165
+
166
+
167
+ if __name__ == "__main__":
168
+ import tensorflow_datasets as tfds
169
+ from data.utils import dataset_to_path
170
+
171
+ # Load the dataset
172
+ dataset = load_dataset(1717055919)
173
+ for data in dataset.take(1):
174
+ for step in data['steps']:
175
+ step = process_step(step)
176
+ print(step['observation']['natural_language_instruction'])
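`_parse_function` expects every feature to be a tensor serialized with `tf.io.serialize_tensor`. The writer sketch below produces one synthetic step with matching keys, dtypes, and shapes; the output path, zero-filled tensors, and instruction text are made up for illustration and are not how the real tfrecords under data/datasets/calvin/tfrecords/ were generated.

import numpy as np
import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

with tf.io.TFRecordWriter('example_episode.tfrecord') as writer:
    example = tf.train.Example(features=tf.train.Features(feature={
        'action': _bytes_feature(tf.io.serialize_tensor(
            tf.constant(np.zeros(7), dtype=tf.float64)).numpy()),
        'robot_obs': _bytes_feature(tf.io.serialize_tensor(
            tf.constant(np.zeros(15), dtype=tf.float64)).numpy()),
        'rgb_static': _bytes_feature(tf.io.serialize_tensor(
            tf.zeros([200, 200, 3], dtype=tf.uint8)).numpy()),
        'rgb_gripper': _bytes_feature(tf.io.serialize_tensor(
            tf.zeros([84, 84, 3], dtype=tf.uint8)).numpy()),
        'terminate_episode': _int64_feature(0),
        'instruction': _bytes_feature(b'push the sliding door to the left'),
    }))
    writer.write(example.SerializeToString())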
data/preprocess_scripts/cmu_franka_exploration_dataset_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,75 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, quaternion_to_euler,euler_to_quaternion
4
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
5
+ """
6
+ Convert terminate action to a boolean, where True means terminate.
7
+ """
8
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
9
+
10
+ def process_step(step: dict) -> dict:
11
+ """
12
+ Unify the action format and clean the task instruction.
13
+
14
+ DO NOT use python list, use tf.TensorArray instead.
15
+ """
16
+ # Convert raw action to our action
17
+
18
+ origin_action = step['action']
19
+ step['action']={}
20
+ action=step['action']
21
+ action['terminate']=terminate_act_to_bool(origin_action[7])
22
+
23
+ # gripper_open: 1-open, 0-closed
24
+
25
+ eef_pos=origin_action[:3]
26
+ eef_ang=origin_action[3:6]
27
+ eef_ang = euler_to_quaternion(eef_ang)
28
+ grip_open=origin_action[6:7]
29
+ # No base found
30
+
31
+ # Concatenate the action
32
+ action['arm_concat'] = tf.concat([eef_pos,eef_ang,grip_open],axis=0)
33
+
34
+ # Write the action format
35
+ action['format'] = tf.constant(
36
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
37
+
38
+ # No state found
39
+
40
+ # Clean the task instruction
41
+ # Define the replacements (old, new) as a dictionary
42
+ replacements = {
43
+ '_': ' ',
44
+ '1f': ' ',
45
+ '4f': ' ',
46
+ '-': ' ',
47
+ '50': ' ',
48
+ '55': ' ',
49
+ '56': ' ',
50
+
51
+ }
52
+ instr = step['language_instruction']
53
+ instr = clean_task_instruction(instr, replacements)
54
+ step['observation']['natural_language_instruction'] = instr
55
+
56
+ return step
57
+
58
+
59
+ if __name__ == "__main__":
60
+ import tensorflow_datasets as tfds
61
+ from data.utils import dataset_to_path
62
+
63
+ DATASET_DIR = 'data/datasets/openx_embod'
64
+ DATASET_NAME = 'cmu_franka_exploration_dataset_converted_externally_to_rlds'
65
+ # Load the dataset
66
+ dataset = tfds.builder_from_directory(
67
+ builder_dir=dataset_to_path(
68
+ DATASET_NAME, DATASET_DIR))
69
+ dataset = dataset.as_dataset(split='all')
70
+
71
+ # Inspect the dataset
72
+ for episode in dataset:
73
+ for step in episode['steps']:
74
+ print(step['action'][6:7])
75
+
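Several of these scripts (this one, austin_buds, cmu_stretch, nyu_rot, robo_net) call `euler_to_quaternion` from `data/utils.py`, which is outside this part of the diff. The sketch below shows the standard roll-pitch-yaw to (x, y, z, w) conversion it presumably performs; the angle order and output convention are assumptions, not the repository code.

import numpy as np

def euler_to_quaternion(euler_xyz):
    roll, pitch, yaw = euler_xyz
    cr, sr = np.cos(roll / 2), np.sin(roll / 2)
    cp, sp = np.cos(pitch / 2), np.sin(pitch / 2)
    cy, sy = np.cos(yaw / 2), np.sin(yaw / 2)
    x = sr * cp * cy - cr * sp * sy
    y = cr * sp * cy + sr * cp * sy
    z = cr * cp * sy - sr * sp * cy
    w = cr * cp * cy + sr * sp * sy
    return np.array([x, y, z, w])

print(euler_to_quaternion([0.0, 0.0, 0.0]))  # -> [0, 0, 0, 1], the identity rotation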
data/preprocess_scripts/cmu_play_fusion.py ADDED
@@ -0,0 +1,82 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
5
+ """
6
+ Convert terminate action to a boolean, where True means terminate.
7
+ """
8
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
9
+
10
+ def process_step(step: dict) -> dict:
11
+ """
12
+ Unify the action format and clean the task instruction.
13
+
14
+ DO NOT use python list, use tf.TensorArray instead.
15
+ """
16
+ # Convert raw action to our action
17
+
18
+ origin_action = step['action']
19
+ step['action']={}
20
+ action=step['action']
21
+ action['terminate']=terminate_act_to_bool(origin_action[8])
22
+
23
+
24
+ eef_pos=origin_action[:3]
25
+ # eef_ang=quaternion_to_euler(origin_action[3:7])
26
+ eef_ang = origin_action[3:7]
27
+ grip_open=origin_action[7:8]
28
+ # No base found
29
+
30
+ # Concatenate the action
31
+ action['arm_concat'] = tf.concat([eef_pos,eef_ang,grip_open],axis=0)
32
+
33
+ # Write the action format
34
+ action['format'] = tf.constant(
35
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
36
+
37
+ # Convert raw state to our state
38
+ state = step['observation']
39
+ # Concatenate the state
40
+ arm_joint_ang=state['state'][:7]
41
+ grip_open=state['state'][7:8] * 11.765 # rescale to [0, 1]
42
+ state['arm_concat'] = tf.concat([arm_joint_ang,grip_open],axis=0)
43
+ # Write the state format
44
+ state['format'] = tf.constant(
45
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_joint_0_pos")
46
+
47
+ # Clean the task instruction
48
+ # Define the replacements (old, new) as a dictionary
49
+ replacements = {
50
+ '_': ' ',
51
+ '1f': ' ',
52
+ '4f': ' ',
53
+ '-': ' ',
54
+ '50': ' ',
55
+ '55': ' ',
56
+ '56': ' ',
57
+
58
+ }
59
+ instr = step['language_instruction']
60
+ instr = clean_task_instruction(instr, replacements)
61
+ step['observation']['natural_language_instruction'] = instr
62
+
63
+ return step
64
+
65
+
66
+ if __name__ == "__main__":
67
+ import tensorflow_datasets as tfds
68
+ from data.utils import dataset_to_path
69
+
70
+ DATASET_DIR = 'data/datasets/openx_embod'
71
+ DATASET_NAME = 'cmu_play_fusion'
72
+ # Load the dataset
73
+ dataset = tfds.builder_from_directory(
74
+ builder_dir=dataset_to_path(
75
+ DATASET_NAME, DATASET_DIR))
76
+ dataset = dataset.as_dataset(split='all')
77
+
78
+ # Inspect the dataset
79
+ for episode in dataset:
80
+ for step in episode['steps']:
81
+ print(step['action'][6:7])
82
+
data/preprocess_scripts/cmu_stretch.py ADDED
@@ -0,0 +1,84 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, quaternion_to_euler,euler_to_quaternion
4
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
5
+ """
6
+ Convert terminate action to a boolean, where True means terminate.
7
+ """
8
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
9
+
10
+ def process_step(step: dict) -> dict:
11
+ """
12
+ Unify the action format and clean the task instruction.
13
+
14
+ DO NOT use python list, use tf.TensorArray instead.
15
+ """
16
+ # Convert raw action to our action
17
+
18
+ origin_action = step['action']
19
+ step['action']={}
20
+ action=step['action']
21
+ action['terminate']=terminate_act_to_bool(origin_action[7])
22
+
23
+
24
+ eef_pos=origin_action[:3]
25
+ eef_ang=origin_action[3:6]
26
+ eef_ang = euler_to_quaternion(eef_ang)
27
+ grip_open=origin_action[6:7]
28
+ # No base found
29
+
30
+ # Concatenate the action
31
+ action['arm_concat'] = tf.concat([eef_pos,eef_ang,grip_open],axis=0)
32
+
33
+ # Write the action format
34
+ action['format'] = tf.constant(
35
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
36
+
37
+ # Convert raw state to our state
38
+ state = step['observation']
39
+ # Concatenate the state
40
+ eef_pos_x = state['state'][0:1]
41
+ eef_pos_z = state['state'][2:3]
42
+ grip_open = state['state'][3:4]
43
+ state['arm_concat'] = tf.concat(
44
+ [eef_pos_x, eef_pos_z, grip_open], axis=0)
45
+ # Write the state format
46
+ state['format'] = tf.constant(
47
+ "eef_pos_x,eef_pos_z,gripper_open")
48
+
49
+ # Clean the task instruction
50
+ # Define the replacements (old, new) as a dictionary
51
+ replacements = {
52
+ '_': ' ',
53
+ '1f': ' ',
54
+ '4f': ' ',
55
+ '-': ' ',
56
+ '50': ' ',
57
+ '55': ' ',
58
+ '56': ' ',
59
+
60
+ }
61
+ instr = step['language_instruction']
62
+ instr = clean_task_instruction(instr, replacements)
63
+ step['observation']['natural_language_instruction'] = instr
64
+
65
+ return step
66
+
67
+
68
+ if __name__ == "__main__":
69
+ import tensorflow_datasets as tfds
70
+ from data.utils import dataset_to_path
71
+
72
+ DATASET_DIR = 'data/datasets/openx_embod'
73
+ DATASET_NAME = 'cmu_stretch'
74
+ # Load the dataset
75
+ dataset = tfds.builder_from_directory(
76
+ builder_dir=dataset_to_path(
77
+ DATASET_NAME, DATASET_DIR))
78
+ dataset = dataset.as_dataset(split='all')
79
+
80
+ # Inspect the dataset
81
+ for episode in dataset:
82
+ for step in episode['steps']:
83
+ print(step['action'][6:7])
84
+
data/preprocess_scripts/droid.py ADDED
@@ -0,0 +1,78 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_rotation_matrix, rotation_matrix_to_ortho6d
4
+
5
+
6
+ def process_step(step: dict) -> dict:
7
+ """
8
+ Unify the action format and clean the task instruction.
9
+
10
+ DO NOT use python list, use tf.TensorArray instead.
11
+ """
12
+ # Convert raw action to our action
13
+ action_dict = step['action_dict']
14
+
15
+ # Robot action
16
+ eef_pos = action_dict['cartesian_position'][:3]
17
+ eef_ang = action_dict['cartesian_position'][3:6]
18
+ eef_ang = euler_to_rotation_matrix(eef_ang)
19
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
20
+ eef_pos_vel = action_dict['cartesian_velocity'][:3]
21
+ eef_ang_vel = action_dict['cartesian_velocity'][3:6]
22
+ joint_pos = action_dict['joint_position']
23
+ joint_vel = action_dict['joint_velocity']
24
+ grip_pos = action_dict['gripper_position']
25
+ grip_vel = action_dict['gripper_velocity']
26
+
27
+ # Concatenate the action
28
+ step['action'] = {}
29
+ action = step['action']
30
+
31
+ arm_action = tf.concat([eef_pos, eef_ang, eef_pos_vel, eef_ang_vel, joint_pos, joint_vel, grip_pos, grip_vel], axis=0)
32
+ action['arm_concat'] = arm_action
33
+ action['terminate'] = step['is_terminal']
34
+
35
+ # Write the action format
36
+ action['format'] = tf.constant(
37
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5,eef_vel_x,eef_vel_y,eef_vel_z,eef_angular_vel_roll,eef_angular_vel_pitch,eef_angular_vel_yaw,arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,gripper_joint_0_pos,gripper_joint_0_vel")
38
+
39
+ # Convert raw state to our state
40
+ # Robot state
41
+ state = step['observation']
42
+ eef_pos = state['cartesian_position'][:3]
43
+ eef_ang = state['cartesian_position'][3:6]
44
+ eef_ang = euler_to_rotation_matrix(eef_ang)
45
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
46
+ joint_pos = state['joint_position']
47
+ grip_pos = 1 - state['gripper_position']
48
+
49
+ # Concatenate the state
50
+ state['arm_concat'] = tf.concat([
51
+ joint_pos,grip_pos,eef_pos,eef_ang], axis=0)
52
+
53
+
54
+ # Write the state format
55
+ state['format'] = tf.constant(
56
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_joint_0_pos,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
57
+
58
+ # Clean the task instruction
59
+ # Define the replacements (old, new) as a dictionary
60
+ replacements = {
61
+ '_': ' ',
62
+ '1f': ' ',
63
+ '4f': ' ',
64
+ '-': ' ',
65
+ '50': ' ',
66
+ '55': ' ',
67
+ '56': ' ',
68
+
69
+ }
70
+ instr = step['language_instruction']
71
+ instr = clean_task_instruction(instr, replacements)
72
+ step['observation']['natural_language_instruction'] = instr
73
+
74
+ return step
75
+
76
+
77
+ if __name__ == "__main__":
78
+ pass
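The DROID script converts Euler angles with `euler_to_rotation_matrix` before the 6D encoding. Below is a rough sketch under the assumption of roll-pitch-yaw angles composed as R = Rz @ Ry @ Rx; the authoritative version lives in data/utils.py and may use a different convention.

import numpy as np

def euler_to_rotation_matrix(euler_xyz):
    roll, pitch, yaw = euler_xyz
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(roll), -np.sin(roll)],
                   [0, np.sin(roll),  np.cos(roll)]])
    Ry = np.array([[ np.cos(pitch), 0, np.sin(pitch)],
                   [0, 1, 0],
                   [-np.sin(pitch), 0, np.cos(pitch)]])
    Rz = np.array([[np.cos(yaw), -np.sin(yaw), 0],
                   [np.sin(yaw),  np.cos(yaw), 0],
                   [0, 0, 1]])
    return Rz @ Ry @ Rx

R = euler_to_rotation_matrix([0.0, 0.0, np.pi / 2])  # 90 degree yaw
print(np.round(R, 3))                                # x-axis maps to y-axis, as expected
# The 6D encoding then keeps the first two columns of R, as in the earlier sketch.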
data/preprocess_scripts/fractal20220817_data.py ADDED
@@ -0,0 +1,92 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, quaternion_to_rotation_matrix,\
4
+ rotation_matrix_to_ortho6d
5
+
6
+
7
+
8
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
9
+ """
10
+ Convert terminate action to a boolean, where True means terminate.
11
+ """
12
+ return tf.reduce_all(tf.equal(terminate_act, tf.constant([1, 0, 0], dtype=tf.int32)))
13
+
14
+
15
+ def process_step(step: dict) -> dict:
16
+ """
17
+ Unify the action format and clean the task instruction.
18
+
19
+ DO NOT use python list, use tf.TensorArray instead.
20
+ """
21
+ # Convert raw action to our action
22
+ action = step['action']
23
+ action['terminate'] = terminate_act_to_bool(action['terminate_episode'])
24
+
25
+ eef_delta_pos = action['world_vector']
26
+ eef_ang = action['rotation_delta']
27
+ eef_ang = euler_to_quaternion(eef_ang)
28
+ grip_open = 1 - (action['gripper_closedness_action'] + 1) / 2
29
+ # Multiplied by 3 Hz to get units m/s and rad/s
30
+ base_delta_pos = action['base_displacement_vector'] * 3
31
+ base_delta_ang = action['base_displacement_vertical_rotation'] * 3
32
+
33
+ # Concatenate the action
34
+ arm_action = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
35
+ action['arm_concat'] = arm_action
36
+ base_action = tf.concat([base_delta_pos, base_delta_ang], axis=0)
37
+ action['base_concat'] = base_action
38
+
39
+ # Write the action format
40
+ action['format'] = tf.constant(
41
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open,base_vel_x,base_vel_y,base_angular_vel")
42
+
43
+ # Convert raw state to our state
44
+ state = step['observation']
45
+ eef_pos = state['base_pose_tool_reached'][:3]
46
+ # eef_ang = quaternion_to_euler(state['base_pose_tool_reached'][3:])
47
+ eef_ang = quaternion_to_rotation_matrix(state['base_pose_tool_reached'][3:])
48
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
49
+ grip_open = 1 - state['gripper_closed']
50
+
51
+ # Concatenate the state
52
+ state['arm_concat'] = tf.concat([eef_pos, eef_ang, grip_open], axis=0)
53
+
54
+ # Write the state format
55
+ state['format'] = tf.constant(
56
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5,gripper_open")
57
+
58
+ # Clean the task instruction
59
+ # Define the replacements (old, new) as a dictionary
60
+ replacements = {
61
+ '_': ' ',
62
+ '1f': ' ',
63
+ '4f': ' ',
64
+ '-': ' ',
65
+ '50': ' ',
66
+ '55': ' ',
67
+ '56': ' ',
68
+
69
+ }
70
+ instr = step['observation']['natural_language_instruction']
71
+ instr = clean_task_instruction(instr, replacements)
72
+ step['observation']['natural_language_instruction'] = instr
73
+
74
+ return step
75
+
76
+
77
+ if __name__ == "__main__":
78
+ import tensorflow_datasets as tfds
79
+ from data.utils import dataset_to_path
80
+
81
+ DATASET_DIR = 'data/datasets/openx_embod'
82
+ DATASET_NAME = 'fractal20220817_data'
83
+ # Load the dataset
84
+ dataset = tfds.builder_from_directory(
85
+ builder_dir=dataset_to_path(
86
+ DATASET_NAME, DATASET_DIR))
87
+ dataset = dataset.as_dataset(split='all')
88
+
89
+ # Inspect the dataset
90
+ for episode in dataset:
91
+ for step in episode['steps']:
92
+ print(step)
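Two rescalings above are easy to sanity-check numerically: `gripper_closedness_action` in [-1, 1] is mapped to an open fraction in [0, 1], and per-step base displacements recorded at 3 Hz are turned into approximate velocities. A tiny worked example (illustration only, made-up numbers):

# gripper_closedness_action: -1 commands an open gripper, +1 a closed one.
for closedness in (-1.0, 0.0, 1.0):
    grip_open = 1 - (closedness + 1) / 2
    print(closedness, '->', grip_open)   # -1 -> 1.0 (open), 0 -> 0.5, 1 -> 0.0 (closed)

# Base displacements are per-step values recorded at 3 Hz, so multiplying by 3
# converts them to approximate velocities in m/s and rad/s.
step_displacement_m = 0.05               # hypothetical 5 cm forward motion in one step
print(step_displacement_m * 3)           # 0.15 m/s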
data/preprocess_scripts/iamlab_cmu_pickup_insert_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,80 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction,quaternion_to_euler
4
+
5
+ def process_step(step: dict) -> dict:
6
+ """
7
+ Unify the action format and clean the task instruction.
8
+
9
+ DO NOT use python list, use tf.TensorArray instead.
10
+ """
11
+ # Convert raw action to our action
12
+
13
+ origin_action = step['action']
14
+ step['action']={}
15
+ action=step['action']
16
+
17
+ eef_delta_pos = origin_action[:3]
18
+ # delta ZYX euler angles
19
+ # eef_ang=quaternion_to_euler(origin_action[3:7])
20
+ eef_ang = origin_action[3:7]
21
+ grip_open=origin_action[7:8]
22
+
23
+ # No base found
24
+
25
+ # Concatenate the action
26
+ action['arm_concat'] = tf.concat([eef_delta_pos,eef_ang,grip_open],axis=0)
27
+
28
+ # Write the action format
29
+ action['format'] = tf.constant(
30
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
31
+
32
+ # Convert raw state to our state
33
+ state = step['observation']
34
+ # Concatenate the state
35
+ # 7x robot joint angles, 1x gripper status, 6x joint torques, 6x end-effector force
36
+ arm_joint_ang=state['state'][:7]
37
+
38
+ grip_open=state['state'][7:8]
39
+
40
+ state['arm_concat'] = tf.concat([arm_joint_ang,grip_open],axis=0)
41
+
42
+ # Write the state format
43
+ state['format'] = tf.constant(
44
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_open")
45
+
46
+ # Clean the task instruction
47
+ # Define the replacements (old, new) as a dictionary
48
+ replacements = {
49
+ '_': ' ',
50
+ '1f': ' ',
51
+ '4f': ' ',
52
+ '-': ' ',
53
+ '50': ' ',
54
+ '55': ' ',
55
+ '56': ' ',
56
+
57
+ }
58
+ instr = step['language_instruction']
59
+ instr = clean_task_instruction(instr, replacements)
60
+ step['observation']['natural_language_instruction'] = instr
61
+
62
+ return step
63
+
64
+
65
+ if __name__ == "__main__":
66
+ import tensorflow_datasets as tfds
67
+ from data.utils import dataset_to_path
68
+
69
+ DATASET_DIR = 'data/datasets/openx_embod'
70
+ DATASET_NAME = 'iamlab_cmu_pickup_insert_converted_externally_to_rlds'
71
+ # Load the dataset
72
+ dataset = tfds.builder_from_directory(
73
+ builder_dir=dataset_to_path(
74
+ DATASET_NAME, DATASET_DIR))
75
+ dataset = dataset.as_dataset(split='all')
76
+
77
+ # Inspect the dataset
78
+ for episode in dataset:
79
+ for step in episode['steps']:
80
+ print(step)
data/preprocess_scripts/libero_goal_no_noops.py ADDED
@@ -0,0 +1,82 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_rotation_matrix, rotation_matrix_to_ortho6d
4
+
5
+
6
+ def process_step(step: dict) -> dict:
7
+ """
8
+ Unify the action format and clean the task instruction.
9
+
10
+ DO NOT use python list, use tf.TensorArray instead.
11
+ """
12
+ # Convert raw action to our action
13
+ action_dict = step['action']
14
+
15
+ # Robot action
16
+ # eef_pos = action_dict['ee_pos'][:3]
17
+ # eef_ang = action_dict['ee_pos'][3:6]
18
+ # eef_ang = euler_to_rotation_matrix(eef_ang)
19
+ # eef_ang = rotation_matrix_to_ortho6d(eef_ang)
20
+ eef_pos_vel = action_dict[:3]
21
+ eef_ang_vel = action_dict[3:6]
22
+ # joint_pos = action_dict['joint_pos'][:-1]
23
+ # joint_vel = action_dict['delta_joint'][:-1]
24
+ grip_pos = 1 - tf.clip_by_value(action_dict[-1:], 0, 1)
25
+
26
+ # grip_vel = action_dict['gripper_velocity']
27
+
28
+ # Concatenate the action
29
+ step['action'] = {}
30
+ action = step['action']
31
+
32
+ arm_action = tf.concat([eef_pos_vel, eef_ang_vel, grip_pos], axis=0)
33
+ action['arm_concat'] = arm_action
34
+ # action['terminate'] = step['is_terminal']
35
+
36
+ # Write the action format
37
+ action['format'] = tf.constant(
38
+ "eef_vel_x,eef_vel_y,eef_vel_z,eef_angular_vel_roll,eef_angular_vel_pitch,eef_angular_vel_yaw,gripper_joint_0_pos")
39
+
40
+ # Convert raw state to our state
41
+ # Robot state
42
+ state = step['observation']
43
+ # print(state.keys())
44
+ # image = step['observation']['image']
45
+ eef_pos = state['state'][:3]
46
+ eef_ang = state['state'][3:6]
47
+ eef_ang = euler_to_rotation_matrix(eef_ang)
48
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
49
+ # joint_pos = state['joint_pos'][:-1]
50
+ grip_pos = state['state'][-2:]
51
+
52
+ # Concatenate the state
53
+ state['arm_concat'] = tf.concat([
54
+ grip_pos,eef_pos,eef_ang], axis=0)
55
+
56
+
57
+ # Write the state format
58
+ state['format'] = tf.constant(
59
+ "gripper_joint_0_pos,gripper_joint_1_pos,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
60
+
61
+ # Clean the task instruction
62
+ # Define the replacements (old, new) as a dictionary
63
+ replacements = {
64
+ '_': ' ',
65
+ '1f': ' ',
66
+ '4f': ' ',
67
+ '-': ' ',
68
+ '50': ' ',
69
+ '55': ' ',
70
+ '56': ' ',
71
+
72
+ }
73
+ instr = step['language_instruction']
74
+ # instr = clean_task_instruction(instr, replacements)
75
+ step['observation'] = state
76
+ step['observation']['natural_language_instruction'] = instr
77
+
78
+ return step
79
+
80
+
81
+ if __name__ == "__main__":
82
+ pass
data/preprocess_scripts/libero_spatial_no_noops.py ADDED
@@ -0,0 +1,82 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_rotation_matrix, rotation_matrix_to_ortho6d
4
+
5
+
6
+ def process_step(step: dict) -> dict:
7
+ """
8
+ Unify the action format and clean the task instruction.
9
+
10
+ DO NOT use python list, use tf.TensorArray instead.
11
+ """
12
+ # Convert raw action to our action
13
+ action_dict = step['action']
14
+
15
+ # Robot action
16
+ # eef_pos = action_dict['ee_pos'][:3]
17
+ # eef_ang = action_dict['ee_pos'][3:6]
18
+ # eef_ang = euler_to_rotation_matrix(eef_ang)
19
+ # eef_ang = rotation_matrix_to_ortho6d(eef_ang)
20
+ eef_pos_vel = action_dict[:3]
21
+ eef_ang_vel = action_dict[3:6]
22
+ # joint_pos = action_dict['joint_pos'][:-1]
23
+ # joint_vel = action_dict['delta_joint'][:-1]
24
+ grip_pos = 1 - tf.clip_by_value(action_dict[-1:], 0, 1)
25
+
26
+ # grip_vel = action_dict['gripper_velocity']
27
+
28
+ # Concatenate the action
29
+ step['action'] = {}
30
+ action = step['action']
31
+
32
+ arm_action = tf.concat([eef_pos_vel, eef_ang_vel, grip_pos], axis=0)
33
+ action['arm_concat'] = arm_action
34
+ # action['terminate'] = step['is_terminal']
35
+
36
+ # Write the action format
37
+ action['format'] = tf.constant(
38
+ "eef_vel_x,eef_vel_y,eef_vel_z,eef_angular_vel_roll,eef_angular_vel_pitch,eef_angular_vel_yaw,gripper_joint_0_pos")
39
+
40
+ # Convert raw state to our state
41
+ # Robot state
42
+ state = step['observation']
43
+ # print(state.keys())
44
+ # image = step['observation']['image']
45
+ eef_pos = state['state'][:3]
46
+ eef_ang = state['state'][3:6]
47
+ eef_ang = euler_to_rotation_matrix(eef_ang)
48
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
49
+ # joint_pos = state['joint_pos'][:-1]
50
+ grip_pos = state['state'][-2:]
51
+
52
+ # Concatenate the state
53
+ state['arm_concat'] = tf.concat([
54
+ grip_pos,eef_pos,eef_ang], axis=0)
55
+
56
+
57
+ # Write the state format
58
+ state['format'] = tf.constant(
59
+ "gripper_joint_0_pos,gripper_joint_1_pos,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
60
+
61
+ # Clean the task instruction
62
+ # Define the replacements (old, new) as a dictionary
63
+ replacements = {
64
+ '_': ' ',
65
+ '1f': ' ',
66
+ '4f': ' ',
67
+ '-': ' ',
68
+ '50': ' ',
69
+ '55': ' ',
70
+ '56': ' ',
71
+
72
+ }
73
+ instr = step['language_instruction']
74
+ # instr = clean_task_instruction(instr, replacements)
75
+ step['observation'] = state
76
+ step['observation']['natural_language_instruction'] = instr
77
+
78
+ return step
79
+
80
+
81
+ if __name__ == "__main__":
82
+ pass
data/preprocess_scripts/nyu_rot_dataset_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,82 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, euler_to_rotation_matrix, \
4
+ rotation_matrix_to_ortho6d
5
+
6
+ def process_step(step: dict) -> dict:
7
+ """
8
+ Unify the action format and clean the task instruction.
9
+
10
+ DO NOT use python list, use tf.TensorArray instead.
11
+ """
12
+ # Convert raw action to our action
13
+
14
+ origin_action = step['action']
15
+ step['action']={}
16
+ action=step['action']
17
+ action['terminate'] = step['is_terminal']
18
+
19
+ eef_delta_pos = origin_action[:3]
20
+ eef_ang=origin_action[3:6]
21
+ eef_ang = euler_to_quaternion(eef_ang)
22
+ # gripper_open: 0-open, 1-closed
23
+ grip_open=tf.where(tf.equal(origin_action[6:],tf.constant(0.0)),tf.constant(1.0),tf.constant(0.0))
24
+
25
+ # No base found
26
+
27
+ # Concatenate the action
28
+ action['arm_concat'] = tf.concat([eef_delta_pos,eef_ang,grip_open],axis=0)
29
+
30
+ # Write the action format
31
+ action['format'] = tf.constant(
32
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
33
+
34
+ # Convert raw state to our state
35
+ state = step['observation']
36
+ eef_pos=state['state'][:3]
37
+ eef_ang=state['state'][3:6]
38
+ eef_ang = euler_to_rotation_matrix(eef_ang)
39
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
40
+ grip_open=1-state['state'][6:7]
41
+ # Concatenate the state
42
+ state['arm_concat'] = tf.concat([eef_pos,eef_ang,grip_open],axis=0)
43
+
44
+ # Write the state format
45
+ state['format'] = tf.constant(
46
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5,gripper_open")
47
+
48
+ # Clean the task instruction
49
+ # Define the replacements (old, new) as a dictionary
50
+ replacements = {
51
+ '_': ' ',
52
+ '1f': ' ',
53
+ '4f': ' ',
54
+ '-': ' ',
55
+ '50': ' ',
56
+ '55': ' ',
57
+ '56': ' ',
58
+
59
+ }
60
+ instr = step['language_instruction']
61
+ instr = clean_task_instruction(instr, replacements)
62
+ step['observation']['natural_language_instruction'] = instr
63
+
64
+ return step
65
+
66
+
67
+ if __name__ == "__main__":
68
+ import tensorflow_datasets as tfds
69
+ from data.utils import dataset_to_path
70
+
71
+ DATASET_DIR = 'data/datasets/openx_embod'
72
+ DATASET_NAME = 'nyu_rot_dataset_converted_externally_to_rlds'
73
+ # Load the dataset
74
+ dataset = tfds.builder_from_directory(
75
+ builder_dir=dataset_to_path(
76
+ DATASET_NAME, DATASET_DIR))
77
+ dataset = dataset.as_dataset(split='all')
78
+
79
+ # Inspect the dataset
80
+ for episode in dataset:
81
+ for step in episode['steps']:
82
+ print(step)
data/preprocess_scripts/robo_net.py ADDED
@@ -0,0 +1,71 @@
1
+ import tensorflow as tf
2
+ import numpy as np
3
+
4
+ from data.utils import clean_task_instruction, euler_to_quaternion, euler_to_rotation_matrix, \
5
+ rotation_matrix_to_ortho6d
6
+
7
+
8
+ def process_step(step: dict) -> dict:
9
+ """
10
+ Unify the action format and clean the task instruction.
11
+
12
+ DO NOT use python list, use tf.TensorArray instead.
13
+ """
14
+ # Convert raw action to our action
15
+ action = step['action']
16
+ eef_delta_pos = action[:3]
17
+ eef_delta_angle_yaw = action[3:4]
18
+ eef_ang = tf.stack([0.0, 0.0, eef_delta_angle_yaw[0]], axis=0)
19
+ eef_ang = euler_to_quaternion(eef_ang)
20
+ eef_gripper_open = (1 - action[4:5]) / 2
21
+
22
+ step['action'] = {}
23
+ action = step['action']
24
+ action['terminate'] = step['is_terminal']
25
+
26
+ # No base found
27
+
28
+ # Concatenate the action
29
+ arm_action = tf.concat([eef_delta_pos, eef_ang, eef_gripper_open], axis=0)
30
+ action['arm_concat'] = arm_action
31
+
32
+ # Write the action format
33
+ action['format'] = tf.constant(
34
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
35
+
36
+ # Convert raw state to our state
37
+ state = step['observation']
38
+ eef_pos = state['state'][:3]
39
+ eef_ang_yaw = state['state'][3:4]
40
+ eef_ang = tf.stack([0.0, 0.0, eef_ang_yaw[0]], axis=0)
41
+ eef_ang = euler_to_rotation_matrix(eef_ang)
42
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
43
+ grip_joint_pos = state['state'][4:5]
44
+ # If abs(grip_joint_pos) > 3.15, then convert it to the radian
45
+ grip_joint_pos = tf.cond(tf.greater(tf.abs(grip_joint_pos), 3.15),
46
+ lambda: grip_joint_pos / 180 * np.pi,
47
+ lambda: grip_joint_pos)
48
+ # Concatenate the state
49
+ state['arm_concat'] = tf.concat([eef_pos,eef_ang,grip_joint_pos],axis=0)
50
+
51
+ # Write the state format
52
+ state['format'] = tf.constant(
53
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5,gripper_open")
54
+
55
+ # Clean the task instruction
56
+ # Define the replacements (old, new) as a dictionary
57
+ replacements = {
58
+ '_': ' ',
59
+ '1f': ' ',
60
+ '4f': ' ',
61
+ '-': ' ',
62
+ '50': ' ',
63
+ '55': ' ',
64
+ '56': ' ',
65
+
66
+ }
67
+ instr = step['language_instruction']
68
+ instr = clean_task_instruction(instr, replacements)
69
+ step['observation']['natural_language_instruction'] = instr
70
+
71
+ return step
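The `tf.cond` guard above treats any gripper joint value with magnitude beyond roughly pi (3.15) as degrees and converts it to radians, otherwise it keeps the value as-is. A tiny plain-Python illustration of the same heuristic (not repository code):

import numpy as np

def normalize_grip_joint(value):
    # Values larger than ~pi cannot be radians for a gripper joint, so assume degrees.
    return value / 180 * np.pi if abs(value) > 3.15 else value

print(normalize_grip_joint(45.0))   # 45 degrees -> ~0.785 rad
print(normalize_grip_joint(0.8))    # already radians, unchanged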
data/preprocess_scripts/robomimic_lift_ph.py ADDED
@@ -0,0 +1,97 @@
1
+ import tensorflow as tf
2
+ import tensorflow_datasets as tfds
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+
5
+
6
+ def load_dataset():
7
+ builder = tfds.builder('robomimic_ph/lift_ph_image')
8
+ builder.download_and_prepare()
9
+ ds = builder.as_dataset(split='train', shuffle_files=True)
10
+ return ds
11
+
12
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
13
+ """
14
+ Convert terminate action to a boolean, where True means terminate.
15
+ """
16
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
17
+
18
+ def process_step(step: dict) -> dict:
19
+ """
20
+ Unify the action format and clean the task instruction.
21
+
22
+ DO NOT use python list, use tf.TensorArray instead.
23
+ """
24
+ # format refers to https://www.tensorflow.org/datasets/catalog/robomimic_mg
25
+ # Convert raw action to our action
26
+ eef = step['action']
27
+ step['action'] = {}
28
+ action = step['action']
29
+ action['terminate'] = step['is_terminal']
30
+
31
+ eef_delta_pos = eef[:3]
32
+ eef_ang = quaternion_to_euler(eef[3:])
33
+
34
+ # No base found
35
+
36
+ # Concatenate the action
37
+ arm_action = tf.concat([eef_delta_pos, eef_ang], axis=0)
38
+ action['arm_concat'] = arm_action
39
+
40
+ # Write the action format
41
+ action['format'] = tf.constant(
42
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_roll,eef_delta_angle_pitch,eef_delta_angle_yaw")
43
+
44
+ # Convert raw state to our state
45
+ state = step['observation']
46
+ arm_joint_pos = state['robot0_joint_pos']
47
+ arm_joint_vel = state['robot0_joint_vel']
48
+ gripper_pos = state['robot0_gripper_qpos']
49
+ gripper_vel = state['robot0_gripper_qvel']
50
+ eef_pos = state['robot0_eef_pos']
51
+ eef_ang = quaternion_to_euler(state['robot0_eef_quat'])
52
+
53
+ state['arm_concat'] = tf.concat([arm_joint_pos, arm_joint_vel, gripper_pos,gripper_vel,eef_pos,eef_ang], axis=0)
54
+ # convert to tf32
55
+ state['arm_concat'] = tf.cast(state['arm_concat'], tf.float32)
56
+ # Write the state format
57
+ state['format'] = tf.constant(
58
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,gripper_joint_0_pos,gripper_joint_1_pos,gripper_joint_0_vel,gripper_joint_1_vel,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_roll,eef_angle_pitch,eef_angle_yaw")
59
+
60
+ # Clean the task instruction
61
+ # Define the replacements (old, new) as a dictionary
62
+ replacements = {
63
+ '_': ' ',
64
+ '1f': ' ',
65
+ '4f': ' ',
66
+ '-': ' ',
67
+ '50': ' ',
68
+ '55': ' ',
69
+ '56': ' ',
70
+
71
+ }
72
+ # manual added by lbg
73
+ instr = "lift the object on the table"
74
+ instr = clean_task_instruction(instr, replacements)
75
+ step['observation']['natural_language_instruction'] = instr
76
+
77
+ return step
78
+
79
+
80
+ if __name__ == "__main__":
81
+ import tensorflow_datasets as tfds
82
+ from data.utils import dataset_to_path
83
+
84
+ DATASET_DIR = 'data/datasets/openx_embod'
85
+ DATASET_NAME = 'roboturk'
86
+ # Load the dataset
87
+ dataset = tfds.builder_from_directory(
88
+ builder_dir=dataset_to_path(
89
+ DATASET_NAME, DATASET_DIR))
90
+ dataset = dataset.as_dataset(split='all').take(1)
91
+
92
+ # Inspect the dataset
93
+ ze=tf.constant(0.0)
94
+ for episode in dataset:
95
+ for step in episode['steps']:
96
+ print(step)
97
+ break
data/preprocess_scripts/robomimic_square_ph.py ADDED
@@ -0,0 +1,97 @@
1
+ import tensorflow as tf
2
+ import tensorflow_datasets as tfds
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+
5
+
6
+ def load_dataset():
7
+ builder = tfds.builder('robomimic_ph/square_ph_image')
8
+ builder.download_and_prepare()
9
+ ds = builder.as_dataset(split='train', shuffle_files=True)
10
+ return ds
11
+
12
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
13
+ """
14
+ Convert terminate action to a boolean, where True means terminate.
15
+ """
16
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
17
+
18
+ def process_step(step: dict) -> dict:
19
+ """
20
+ Unify the action format and clean the task instruction.
21
+
22
+ DO NOT use python list, use tf.TensorArray instead.
23
+ """
24
+ # format refers to https://www.tensorflow.org/datasets/catalog/robomimic_mg
25
+ # Convert raw action to our action
26
+ eef = step['action']
27
+ step['action'] = {}
28
+ action = step['action']
29
+ action['terminate'] = step['is_terminal']
30
+
31
+ eef_delta_pos = eef[:3]
32
+ eef_ang = quaternion_to_euler(eef[3:])
33
+
34
+ # No base found
35
+
36
+ # Concatenate the action
37
+ arm_action = tf.concat([eef_delta_pos, eef_ang], axis=0)
38
+ action['arm_concat'] = arm_action
39
+
40
+ # Write the action format
41
+ action['format'] = tf.constant(
42
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_roll,eef_delta_angle_pitch,eef_delta_angle_yaw")
43
+
44
+ # Convert raw state to our state
45
+ state = step['observation']
46
+ arm_joint_pos = state['robot0_joint_pos']
47
+ arm_joint_vel = state['robot0_joint_vel']
48
+ gripper_pos = state['robot0_gripper_qpos']
49
+ gripper_vel = state['robot0_gripper_qvel']
50
+ eef_pos = state['robot0_eef_pos']
51
+ eef_ang = quaternion_to_euler(state['robot0_eef_quat'])
52
+
53
+ state['arm_concat'] = tf.concat([arm_joint_pos, arm_joint_vel, gripper_pos,gripper_vel,eef_pos,eef_ang], axis=0)
54
+ # convert to tf32
55
+ state['arm_concat'] = tf.cast(state['arm_concat'], tf.float32)
56
+ # Write the state format
57
+ state['format'] = tf.constant(
58
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,gripper_joint_0_pos,gripper_joint_1_pos,gripper_joint_0_vel,gripper_joint_1_vel,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_roll,eef_angle_pitch,eef_angle_yaw")
59
+
60
+ # Clean the task instruction
61
+ # Define the replacements (old, new) as a dictionary
62
+ replacements = {
63
+ '_': ' ',
64
+ '1f': ' ',
65
+ '4f': ' ',
66
+ '-': ' ',
67
+ '50': ' ',
68
+ '55': ' ',
69
+ '56': ' ',
70
+
71
+ }
72
+ # manual added by lbg
73
+ instr = "move the square across the cube"
74
+ instr = clean_task_instruction(instr, replacements)
75
+ step['observation']['natural_language_instruction'] = instr
76
+
77
+ return step
78
+
79
+
80
+ if __name__ == "__main__":
81
+ import tensorflow_datasets as tfds
82
+ from data.utils import dataset_to_path
83
+
84
+ DATASET_DIR = 'data/datasets/openx_embod'
85
+ DATASET_NAME = 'roboturk'
86
+ # Load the dataset
87
+ dataset = tfds.builder_from_directory(
88
+ builder_dir=dataset_to_path(
89
+ DATASET_NAME, DATASET_DIR))
90
+ dataset = dataset.as_dataset(split='all').take(1)
91
+
92
+ # Inspect the dataset
93
+ ze=tf.constant(0.0)
94
+ for episode in dataset:
95
+ for step in episode['steps']:
96
+ print(step)
97
+ break
data/preprocess_scripts/roboset.py ADDED
@@ -0,0 +1,367 @@
1
+ import tensorflow as tf
2
+ import tensorflow_datasets as tfds
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+ import tensorflow as tf
5
+ import h5py
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import os
9
+ import imageio
10
+ import concurrent.futures
11
+ import fnmatch
12
+ import cv2
13
+ import random
14
+ path2json = {
15
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaPlanarPushReal_v2d/set_17_plannar_push_eval": [
16
+ "Push the green object to the red line."
17
+ ],
18
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinReorientRealRP03_v2d_set16/set_16_bin_reorient_3": [
19
+ "Pick up the bottle and and stand it upright."
20
+ ],
21
+ "mnt/raid5/data/jaydv/robohive_base/demonstrations/orange_block/set_7_orange_block": [
22
+ "Pick up the orange block."
23
+ ],
24
+ "mnt/raid5/data/jaydv/robohive_base/demonstrations/wooden_block": [
25
+ "Pick up the wooden block."
26
+ ],
27
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinReorientReal_v2d-orig/set_15_bin_reorient_eval_2": [
28
+ "Pick up the bottle and and stand it upright."
29
+ ],
30
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinPushReal_v2d_set14/set_14_bin_push": [
31
+ "Push the object to the red line."
32
+ ],
33
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinPickRealRP05_v2d/set_11_bottle_pick": [
34
+ "Pick up the bottle."
35
+ ],
36
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinPickRealRP03_v2d/set_10_bin_pick_2004": [
37
+ "Pick up the wooden block."
38
+ ],
39
+ "mnt/raid5/data/jaydv/bin_pick_data_30029/set_2_softtoys": [
40
+ "Pick up one toy in the basket."
41
+ ],
42
+ "home/jaydv/Documents/RoboSet/pick_banana_from_toaster_place_on_table_data": [
43
+ "Pick banana from toaster and place on table."
44
+ ],
45
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinPickReal_v2d/set_12_bin_pick_eval": [
46
+ "Pick up the block."
47
+ ],
48
+ "mnt/raid5/data/roboset/v0.3/flap_open_toaster_oven_data": [
49
+ "Flap open toaster."
50
+ ],
51
+ "mnt/raid5/data/jaydv/robohive_base/episodes/franka-FrankaBinReorientReal_v2d_set13/set_13_bin_reorient": [
52
+ "Pick up the bottle and stand it upright."
53
+ ],
54
+ "home/jaydv/Documents/RoboSet/drag_mug_from_right_to_left_data": [
55
+ "Drag mug right to left."
56
+ ],
57
+ "home/jaydv/Documents/RoboSet/drag_strainer_backward_data": [
58
+ "Drag strainer backwards."
59
+ ],
60
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_4/baking_prep_slide_close_drawer_scene_4": [
61
+ "Slide and close the drawer."
62
+ ],
63
+ "mnt/raid5/data/roboset/v0.4/heat_soup/scene_4/baking_slide_in_bowl_scene_4": [
64
+ "Place the bowl into the container."
65
+ ],
66
+ "set_5_bottle_cube_14": [
67
+ "Pick up the bottle."
68
+ ],
69
+ "home/jaydv/Documents/RoboSet/drag_mug_forward_data": [
70
+ "Drag mug forwards."
71
+ ],
72
+ "mnt/raid5/data/roboset/v0.4/clean_kitchen/scene_3/clean_kitchen_pick_towel_scene_3": [
73
+ "Pick up the towel from the oven."
74
+ ],
75
+ "mnt/raid5/data/roboset/v0.4/heat_soup/scene_2/baking_pick_bowl_scene_2": [
76
+ "Pick up the bowl."
77
+ ],
78
+ "mnt/raid5/data/roboset/v0.4/heat_soup/scene_2/baking_slide_in_bowl_scene_2": [
79
+ "Place the bowl into the oven."
80
+ ],
81
+ "mnt/raid5/data/roboset/v0.4/heat_soup/scene_2/baking_close_oven_scene_2": [
82
+ "Flap and close the oven."
83
+ ],
84
+ "home/jaydv/Documents/RoboSet/pick_banana_place_in_strainer_data": [
85
+ "Pick banana and place it in strainer."
86
+ ],
87
+ "mnt/raid5/data/roboset/v0.3/pick_banana_place_in_mug_data": [
88
+ "Pick banana and place it in mug."
89
+ ],
90
+ "mnt/raid5/data/roboset/v0.4/make_tea/scene_2/make_tea_pick_tea_scene_2": [
91
+ "Pick up the tea from the container."
92
+ ],
93
+ "home/jaydv/Documents/RoboSet/drag_strainer_forward_data": [
94
+ "Drag strainer forwards."
95
+ ],
96
+ "mnt/raid5/data/roboset/v0.3/pick_ketchup_from_strainer_place_on_table_data": [
97
+ "Pick ketchup from strainer and place it on the table."
98
+ ],
99
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_4/baking_prep_place_butter_scene_4": [
100
+ "Place the butter on the cutting board."
101
+ ],
102
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_1/baking_prep_slide_open_drawer_scene_1": [
103
+ "Slide and open the drawer."
104
+ ],
105
+ "mnt/raid5/data/roboset/v0.3/drag_strainer_right_to_left_data": [
106
+ "Drag strainer right to left."
107
+ ],
108
+ "home/jaydv/Documents/RoboSet/pick_banana_from_plate_place_on_table_data": [
109
+ "Pick banana from plate and place on table."
110
+ ],
111
+ "mnt/raid5/data/roboset/v0.3/pick_ketchup_from_plate_place_on_table_data": [
112
+ "Pick ketchup from plate and place it on table."
113
+ ],
114
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_1/baking_prep_slide_close_drawer_scene_1": [
115
+ "Slide and close the drawer."
116
+ ],
117
+ "mnt/raid5/data/roboset/v0.3/drag_strainer_left_to_right_data": [
118
+ "Drag strainer left to right."
119
+ ],
120
+ "pick_ketchup_place_on_toaster_data": [
121
+ "Pick ketchup from table and place on toaster."
122
+ ],
123
+ "mnt/raid5/data/roboset/v0.4/make_tea/scene_2/make_tea_place_tea_scene_2": [
124
+ "Place the tea into the cup."
125
+ ],
126
+ "mnt/raid5/data/roboset/v0.3/pick_ketchup_place_in_strainer_data": [
127
+ "Pick ketchup from the table and place it in strainer."
128
+ ],
129
+ "home/jaydv/Documents/RoboSet/pick_ketchup_place_on_plate_data": [
130
+ "Pick ketchup from table and place on plate."
131
+ ],
132
+ "home/jaydv/Documents/RoboSet/drag_mug_backward_data": [
133
+ "Drag mug backwards."
134
+ ],
135
+ "set_1_blocks_897": [
136
+ "Pick up one block in the basket."
137
+ ],
138
+ "mnt/raid5/data/roboset/v0.4/make_tea/scene_2/make_tea_place_lid_scene_2": [
139
+ "Place lid on the cutting board."
140
+ ],
141
+ "mnt/raid5/data/roboset/v0.4/heat_soup/scene_4/baking_pick_bowl_scene_4": [
142
+ "Pick up the bowl."
143
+ ],
144
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_4/baking_prep_pick_butter_scene_4": [
145
+ "Pick up the butter from the drawer."
146
+ ],
147
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_1/baking_prep_pick_butter_scene_1": [
148
+ "Pick up the butter from the drawer."
149
+ ],
150
+ "home/jaydv/Documents/RoboSet/flap_close_toaster_oven_data": [
151
+ "Flap close toaster."
152
+ ],
153
+ "home/jaydv/Documents/RoboSet/drag_mug_from_left_to_right_data": [
154
+ "Drag mug left to right."
155
+ ],
156
+ "set_6_planar_push_120": [
157
+ "Push the object from left to right."
158
+ ],
159
+ "mnt/raid5/data/roboset/v0.4/clean_kitchen/scene_3/clean_kitchen_slide_close_drawer_scene_3": [
160
+ "Slide and close the drawer."
161
+ ],
162
+ "set_4_med_block_7": [
163
+ "Pick up the wooden block."
164
+ ],
165
+ "mnt/raid5/data/roboset/v0.3/pick_banana_place_on_toaster_data": [
166
+ "Pick banana from table and place on toaster."
167
+ ],
168
+ "mnt/raid5/data/roboset/v0.3/pick_ketchup_from_toaster_place_on_table_data": [
169
+ "Pick ketchup from toaster and place it on table."
170
+ ],
171
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_1/baking_prep_place_butter_scene_1": [
172
+ "Place the butter on the cutting board."
173
+ ],
174
+ "mnt/raid5/data/roboset/v0.4/baking_prep/scene_4/baking_prep_slide_open_drawer_scene_4": [
175
+ "Slide and open the drawer."
176
+ ],
177
+ "home/jaydv/Documents/RoboSet/pick_banana_place_on_plate_data": [
178
+ "Pick banana from table and place on plate."
179
+ ],
180
+ "set_8_pick_bottle_10": [
181
+ "Pick up the bottle."
182
+ ],
183
+ "home/jaydv/Documents/RoboSet/pick_ketchup_place_in_toaster_data": [
184
+ "Pick ketchup from the table and place in toaster."
185
+ ]
186
+ }
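+ # path2json maps a recording-path prefix (as stored in the RoboSet metadata) to the language instruction assigned to every episode under that prefix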
187
+
188
+ image_shape = (240, 424, 3)
189
+ Dmanus = ['']
190
+ def stash_image_into_observation(step):
191
+ step['observation'] = {'cam_high': [], 'cam_left_wrist': [], 'cam_right_wrist':[]}
192
+ step['observation']['cam_high'] = step['cam_high']
193
+ step['observation']['cam_left_wrist'] = step['cam_left_wrist']
194
+ step['observation']['cam_right_wrist'] = step['cam_right_wrist']
195
+ return step
196
+
197
+ def _parse_function(proto,instruction):
198
+ # Update the keys_to_features dictionary to match the new TFRecord format
199
+ keys_to_features = {
200
+ 'action': tf.io.FixedLenFeature([], tf.string),
201
+ 'action_gripper': tf.io.FixedLenFeature([], tf.string),
202
+ 'qpos': tf.io.FixedLenFeature([], tf.string),
203
+ 'qvel': tf.io.FixedLenFeature([], tf.string),
204
+ 'qpos_gripper': tf.io.FixedLenFeature([], tf.string),
205
+ 'qvel_gripper': tf.io.FixedLenFeature([], tf.string),
206
+ 'rgb_left': tf.io.FixedLenFeature([], tf.string),
207
+ 'rgb_right': tf.io.FixedLenFeature([], tf.string),
208
+ 'rgb_top': tf.io.FixedLenFeature([], tf.string),
209
+ 'terminate_episode': tf.io.FixedLenFeature([], tf.int64)
210
+ }
211
+
212
+ # Parse the incoming features according to the dictionary
213
+ parsed_features = tf.io.parse_single_example(proto, keys_to_features)
214
+
215
+ # Deserialize and reshape tensors as necessary
216
+ action = tf.io.parse_tensor(parsed_features['action'], out_type=tf.float16)
217
+ action_gripper = tf.io.parse_tensor(parsed_features['action_gripper'], out_type=tf.float16)
218
+ qpos = tf.io.parse_tensor(parsed_features['qpos'], out_type=tf.float16)
219
+ qvel = tf.io.parse_tensor(parsed_features['qvel'], out_type=tf.float16)
220
+ qpos_gripper = tf.io.parse_tensor(parsed_features['qpos_gripper'], out_type=tf.float16)
221
+ qvel_gripper = tf.io.parse_tensor(parsed_features['qvel_gripper'], out_type=tf.float16)
222
+ rgb_left = tf.io.parse_tensor(parsed_features['rgb_left'], out_type=tf.uint8)
223
+ rgb_right = tf.io.parse_tensor(parsed_features['rgb_right'], out_type=tf.uint8)
224
+ rgb_top = tf.io.parse_tensor(parsed_features['rgb_top'], out_type=tf.uint8)
225
+ terminate_episode = tf.cast(parsed_features['terminate_episode'], tf.int64)
226
+
227
+ # Reshape or modify other fields as needed to fit the model input
228
+ rgb_left = tf.reshape(rgb_left, image_shape)
229
+ rgb_right = tf.reshape(rgb_right, image_shape)
230
+ rgb_top = tf.reshape(rgb_top, image_shape)
231
+
232
+ return {
233
+ "action": action,
234
+ "action_gripper": action_gripper,
235
+ "qpos": qpos,
236
+ "qvel": qvel,
237
+ "qpos_gripper": qpos_gripper,
238
+ "qvel_gripper": qvel_gripper,
239
+ "observation": {
240
+ "rgb_left": rgb_left,
241
+ "rgb_right": rgb_right,
242
+ "rgb_top": rgb_top
243
+ },
244
+ "terminate_episode": terminate_episode,
245
+ "instruction": instruction
246
+ }
247
+
248
+
249
+ def dataset_generator_from_tfrecords(seed):
250
+ tfrecord_path = './data/datasets/roboset/tfrecords/'
251
+ failure = [f'set_{i}' for i in range(10, 18)]
252
+ filepaths = []
253
+ for root, dirs, files in os.walk(tfrecord_path):
254
+ # skip datasets with failure
255
+ fail = False
256
+ for f in failure:
257
+ if f in root:
258
+ fail = True
259
+ break
260
+ if fail:
261
+ continue
262
+
263
+ for filename in fnmatch.filter(files, '*.tfrecord'):
264
+ filepath = os.path.join(root, filename)
265
+ filepaths.append(filepath)
266
+
267
+ random.seed(seed)
268
+ random.shuffle(filepaths)
269
+ for filepath in filepaths:
270
+ for path in path2json:
271
+ if path in filepath:
272
+ instruction = path2json[path]
273
+ raw_dataset = tf.data.TFRecordDataset(filepath)
274
+ dataset = raw_dataset.map(lambda x: _parse_function(x,instruction))
275
+ yield {
276
+ 'steps': dataset
277
+ }
278
+
279
+ def load_dataset(seed):
280
+ dataset = tf.data.Dataset.from_generator(
281
+ lambda: dataset_generator_from_tfrecords(seed),
282
+ output_signature={
283
+ 'steps': tf.data.DatasetSpec(
284
+ element_spec={
285
+ 'action': tf.TensorSpec(shape=(None), dtype=tf.float16),
286
+ 'action_gripper': tf.TensorSpec(shape=(None), dtype=tf.float16),
287
+ 'qpos': tf.TensorSpec(shape=(None), dtype=tf.float16),
288
+ 'qvel': tf.TensorSpec(shape=(None), dtype=tf.float16),
289
+ 'qpos_gripper': tf.TensorSpec(shape=(None), dtype=tf.float16),
290
+ 'qvel_gripper': tf.TensorSpec(shape=(None), dtype=tf.float16),
291
+ 'observation': {
292
+ 'rgb_left': tf.TensorSpec(shape=image_shape, dtype=tf.uint8),
293
+ 'rgb_right': tf.TensorSpec(shape=image_shape, dtype=tf.uint8),
294
+ 'rgb_top': tf.TensorSpec(shape=image_shape, dtype=tf.uint8),
295
+ },
296
+ 'terminate_episode': tf.TensorSpec(shape=(), dtype=tf.int64),
297
+ 'instruction': tf.TensorSpec(shape=(None), dtype=tf.string)
298
+ }
299
+ )
300
+ }
301
+ )
302
+
303
+ return dataset
304
+
305
+
306
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
307
+ """
308
+ Convert terminate action to a boolean, where True means terminate.
309
+ """
310
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float16)),tf.constant(False),tf.constant(True))
311
+
312
+
313
+ def process_step(step: dict) -> dict:
314
+ """
315
+ Unify the action format and clean the task instruction.
316
+
317
+ DO NOT use python list, use tf.TensorArray instead.
318
+ """
319
+ # Convert raw action to our action
320
+ step['action'] = {}
321
+ step['action']['terminate'] = step['terminate_episode']
322
+ # undetermined action
323
+
324
+ state = step['observation']
325
+ qpos = tf.cast(step['qpos'], tf.float32)
326
+ # qvel = tf.cast(step['qvel'], tf.float32)
327
+ gripper_pos = tf.expand_dims(tf.cast(step['qpos_gripper'], tf.float32), axis=0)
328
+ # delete due to all zeros
329
+ # gripper_vel = tf.expand_dims(tf.cast(step['qvel_gripper'], tf.float32), axis=0)
330
+
331
+ # state['arm_concat'] = tf.concat([qpos, qvel, gripper_pos, gripper_vel], axis=0)
332
+ state['arm_concat'] = tf.concat([qpos, gripper_pos], axis=0)
333
+ # state['format'] = tf.constant(
334
+ # "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,gripper_joint_0_pos,gripper_joint_0_vel"
335
+ # )
336
+ state['format'] = tf.constant(
337
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_joint_0_pos"
338
+ )
339
+ # Clean the task instruction
340
+ # Define the replacements (old, new) as a dictionary
341
+ replacements = {
342
+ '_': ' ',
343
+ '1f': ' ',
344
+ '4f': ' ',
345
+ '-': ' ',
346
+ '50': ' ',
347
+ '55': ' ',
348
+ '56': ' ',
349
+
350
+ }
351
+ instr = step['instruction'][0]
352
+ instr = clean_task_instruction(instr, replacements)
353
+ step['observation']['natural_language_instruction'] = instr
354
+
355
+ return step
356
+
357
+
358
+ if __name__ == "__main__":
359
+ import tensorflow_datasets as tfds
360
+ from data.utils import dataset_to_path
361
+
362
+ dataset = load_dataset(seed=0)  # load_dataset requires a shuffle seed
363
+ for step in dataset.take(100):
364
+ for data in step['steps']:
365
+ data = process_step(data)
366
+ print(data)
367
+ break
data/preprocess_scripts/roboturk.py ADDED
@@ -0,0 +1,77 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, quaternion_to_euler, euler_to_quaternion
4
+
5
+
6
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
7
+ """
8
+ Convert terminate action to a boolean, where True means terminate.
9
+ """
10
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
11
+
12
+
13
+ def process_step(step: dict) -> dict:
14
+ """
15
+ Unify the action format and clean the task instruction.
16
+
17
+ DO NOT use python list, use tf.TensorArray instead.
18
+ """
19
+ # Convert raw action to our action
20
+ action = step['action']
21
+ action['terminate'] = terminate_act_to_bool(action['terminate_episode'])
22
+
23
+ eef_delta_pos = action['world_vector']
24
+ eef_ang = action['rotation_delta']
25
+ eef_ang = euler_to_quaternion(eef_ang)
26
+
27
+ grip_open = tf.where(action['gripper_closedness_action']<0,tf.constant(1.0),tf.constant(0.0))
28
+
29
+ # No base found
30
+
31
+ # Concatenate the action
32
+ arm_action = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
33
+ action['arm_concat'] = arm_action
34
+
35
+ # Write the action format
36
+ action['format'] = tf.constant(
37
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
38
+
39
+ # No state found
40
+
41
+ # Clean the task instruction
42
+ # Define the replacements (old, new) as a dictionary
43
+ replacements = {
44
+ '_': ' ',
45
+ '1f': ' ',
46
+ '4f': ' ',
47
+ '-': ' ',
48
+ '50': ' ',
49
+ '55': ' ',
50
+ '56': ' ',
51
+
52
+ }
53
+ instr = step['observation']['natural_language_instruction']
54
+ instr = clean_task_instruction(instr, replacements)
55
+ step['observation']['natural_language_instruction'] = instr
56
+
57
+ return step
58
+
59
+
60
+ if __name__ == "__main__":
61
+ import tensorflow_datasets as tfds
62
+ from data.utils import dataset_to_path
63
+
64
+ DATASET_DIR = 'data/datasets/openx_embod'
65
+ DATASET_NAME = 'roboturk'
66
+ # Load the dataset
67
+ dataset = tfds.builder_from_directory(
68
+ builder_dir=dataset_to_path(
69
+ DATASET_NAME, DATASET_DIR))
70
+ dataset = dataset.as_dataset(split='all').take(1)
71
+
72
+ # Inspect the dataset
73
+ ze=tf.constant(0.0)
74
+ for episode in dataset:
75
+ for step in episode['steps']:
76
+ print(step)
77
+ break
data/preprocess_scripts/roboturk_real_objectsearch.py ADDED
@@ -0,0 +1,217 @@
1
+ import tensorflow as tf
2
+ import tensorflow_datasets as tfds
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+ import tensorflow as tf
5
+ import h5py
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import os
9
+ import imageio
10
+ import concurrent.futures
11
+
12
+ def get_frames(file_path):
13
+ if not os.path.exists(file_path) or not os.path.isfile(file_path) or not file_path.endswith('.mp4'):
14
+ return []
15
+ frames = []
16
+ with imageio.get_reader(file_path, 'ffmpeg') as reader:
17
+ for frame in reader:
18
+ frame = np.array(frame, dtype=np.uint8)
19
+ frames.append(frame)
20
+ return frames
21
+
22
+ def parallel_get_frames(paths):
23
+ with concurrent.futures.ThreadPoolExecutor() as executor:
24
+ future_to_path = {executor.submit(get_frames, path): path for path in paths}
25
+ return [future.result() for future in concurrent.futures.as_completed(future_to_path)]
26
+
27
+ def count_total_samples(filename):
28
+ total_samples = 0
29
+ with h5py.File(filename, 'r') as f:
30
+ data = f['data']
31
+ for user_key in data.keys():
32
+ user = data[user_key]
33
+ for demo_key in user.keys():
34
+ total_samples += 1
35
+ return total_samples
36
+
37
+ def dataset_generator(filename, total_samples):
38
+ with h5py.File(filename, 'r') as f:
39
+ data = f['data']
40
+ for user_key in data.keys():
41
+ user = data[user_key]
42
+ for demo_key in user.keys():
43
+ demo = user[demo_key]
44
+ robot_observation = demo['robot_observation']
45
+ user_control = demo['user_control']
46
+
47
+ eef_poses = robot_observation['eef_poses']
48
+ joint_states_arm = robot_observation['joint_states_arm']
49
+ joint_states_gripper = robot_observation['joint_states_gripper']
50
+ user_control_data = user_control['user_control']
51
+
52
+ attrs = dict(demo.attrs)
53
+ top_depth_video_file = attrs['top_depth_video_file']
54
+ top_rgb_video_file = attrs['top_rgb_video_file']
55
+ front_rgb_video_file = attrs['front_rgb_video_file']
56
+
57
+ video_root_path = './data/datasets/roboturk/'
58
+ top_depth_frames = get_frames(os.path.join(video_root_path, top_depth_video_file))
59
+ top_rgb_frames = get_frames(os.path.join(video_root_path, top_rgb_video_file))
60
+ front_rgb_frames = get_frames(os.path.join(video_root_path, front_rgb_video_file))
61
+
62
+ if len(top_rgb_frames) == 0 or len(front_rgb_frames) == 0:
63
+ continue
64
+
65
+ steps = []
66
+ for i in range(len(eef_poses)):
67
+ task_demo_id = f"SawyerTowerCreation_{demo_key}_{i}"
68
+ step = {
69
+ 'task_demo_id': task_demo_id,
70
+ 'eef_poses': eef_poses[i],
71
+ 'joint_states_arm': joint_states_arm[i],
72
+ 'joint_states_gripper': joint_states_gripper[i],
73
+ 'user_control': user_control_data[i] if user_control_data.shape[0] > 0 else np.zeros(22),
74
+ 'observation':{
75
+ 'top_depth_frame': top_depth_frames[i] if i < len(top_depth_frames) else np.zeros((0,0, 3), dtype=np.uint8),
76
+ 'top_rgb_frame': top_rgb_frames[i] if i < len(top_rgb_frames) else np.zeros((0, 0, 3), dtype=np.uint8),
77
+ 'front_rgb_frame': front_rgb_frames[i] if i < len(front_rgb_frames) else np.zeros((0, 0, 3), dtype=np.uint8),
78
+ },
79
+ 'terminate_episode': i == len(eef_poses) - 1
80
+ }
81
+ steps.append(step)
82
+
83
+
84
+ steps_dataset = tf.data.Dataset.from_generator(
85
+ lambda: iter(steps),
86
+ output_signature={
87
+ 'task_demo_id': tf.TensorSpec(shape=(), dtype=tf.string),
88
+ 'eef_poses': tf.TensorSpec(shape=(7,), dtype=tf.float32),
89
+ 'joint_states_arm': tf.TensorSpec(shape=(27,), dtype=tf.float32),
90
+ 'joint_states_gripper': tf.TensorSpec(shape=(3,), dtype=tf.float32),
91
+ 'user_control': tf.TensorSpec(shape=(22,), dtype=tf.float32),
92
+ 'observation':{
93
+ 'top_depth_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
94
+ 'top_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
95
+ 'front_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
96
+ },
97
+ 'terminate_episode': tf.TensorSpec(shape=(), dtype=tf.bool),
98
+ }
99
+ )
100
+
101
+ yield {'steps': steps_dataset}
102
+
103
+ def load_dataset():
104
+ filename = './data/datasets/roboturk/SawyerObjectSearch_aligned_dataset.hdf5'
105
+ total_samples = count_total_samples(filename)
106
+ dataset = tf.data.Dataset.from_generator(
107
+ lambda: dataset_generator(filename, total_samples),
108
+ output_signature={
109
+ 'steps': tf.data.DatasetSpec(
110
+ element_spec={
111
+ 'task_demo_id': tf.TensorSpec(shape=(), dtype=tf.string),
112
+ 'eef_poses': tf.TensorSpec(shape=(7,), dtype=tf.float32),
113
+ 'joint_states_arm': tf.TensorSpec(shape=(27,), dtype=tf.float32),
114
+ 'joint_states_gripper': tf.TensorSpec(shape=(3,), dtype=tf.float32),
115
+ 'user_control': tf.TensorSpec(shape=(22,), dtype=tf.float32),
116
+ 'observation':{
117
+ 'top_depth_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
118
+ 'top_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
119
+ 'front_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
120
+ },
121
+ 'terminate_episode': tf.TensorSpec(shape=(), dtype = tf.bool),
122
+ }
123
+ )
124
+ }
125
+ )
126
+ return dataset
127
+
128
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
129
+ """
130
+ Convert terminate action to a boolean, where True means terminate.
131
+ """
132
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
133
+
134
+
135
+ def process_step(step: dict) -> dict:
136
+ """
137
+ Unify the action format and clean the task instruction.
138
+
139
+ DO NOT use python list, use tf.TensorArray instead.
140
+ """
141
+ # Convert raw action to our action
142
+ step['action'] = {}
143
+ action = step['action']
144
+ action['terminate'] = step['terminate_episode']
145
+
146
+ eef_delta_pos = step['eef_poses'][:3]
147
+ eef_ang = step['eef_poses'][3:]
148
+
149
+ # No base found
150
+ # Concatenate the action
151
+ arm_action = tf.concat([eef_delta_pos, eef_ang], axis=0)
152
+ action['arm_concat'] = arm_action
153
+
154
+ # Write the action format
155
+ action['format'] = tf.constant(
156
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w")
157
+
158
+ # No state found
159
+ state = step['observation']
160
+ # joint_states_arm: dataset of (num_timestamps, 27) shape where each of the 9 joints is represented by the JointState message
161
+ # (the nine joints are in order by their ROSBAG names: ['head_pan', 'right_j0', 'right_j1', 'right_j2', 'right_j3', 'right_j4', 'right_j5', 'right_j6', 'torso_t0']. For the most part, head_pan and torso should be zeros)
162
+ # [0] the position of the first joint (rad or m)
163
+ # [1] the velocity of the first joint (rad/s or m/s)
164
+ # [2] the effort that is applied in the first joint
165
+ # [3] the position of the second joint...
166
+ joint_states_arm = step['joint_states_arm']
167
+ joint_pos = joint_states_arm[3:24:3]
168
+ joint_vel = joint_states_arm[4:25:3]
169
+ # joint_states_gripper: dataset of (num_timestamps, 3) shape
170
+ # [0] the position of the gripper (rad or m)
171
+ # [1] the velocity of the gripper (rad/s or m/s)
172
+ # [2] the effort that is applied in the gripper
173
+ joint_states_gripper = step['joint_states_gripper']
174
+ gripper_pos = joint_states_gripper[:1]
175
+ # remove gripper_vel due to they are all zeros
176
+ # gripper_vel = joint_states_gripper[1:2]
177
+ # Concatenate the state
178
+ # state['arm_concat'] = tf.concat([joint_pos,joint_vel,gripper_pos,gripper_vel], axis=0)
179
+ state['arm_concat'] = tf.concat([joint_pos,joint_vel,gripper_pos], axis=0)
180
+ # Write the state format
181
+ state['format'] = tf.constant(
182
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,gripper_joint_0_pos")
183
+
184
+
185
+ # Clean the task instruction
186
+ # Define the replacements (old, new) as a dictionary
187
+ replacements = {
188
+ '_': ' ',
189
+ '1f': ' ',
190
+ '4f': ' ',
191
+ '-': ' ',
192
+ '50': ' ',
193
+ '55': ' ',
194
+ '56': ' ',
195
+
196
+ }
197
+ # instruction inferred from the SawyerObjectSearch dataset name (the original string was copied from the tower-creation script)
198
+ instr = b'search for the object'
199
+ instr = clean_task_instruction(instr, replacements)
200
+ step['observation']['natural_language_instruction'] = instr
201
+
202
+ return step
203
+
204
+
205
+ if __name__ == "__main__":
206
+ import tensorflow_datasets as tfds
207
+ from data.utils import dataset_to_path
208
+
209
+ DATASET_DIR = '/cephfs-thu/gsm_data/openx_embod'
210
+ DATASET_NAME = 'roboturk_real_objectsearch'
211
+ # Load the dataset
212
+ dataset = load_dataset()
213
+
214
+ # save_dir = os.path.join(DATASET_DIR, DATASET_NAME)
215
+ # if not os.path.exists(save_dir):
216
+ # os.makedirs(save_dir)
217
+ # tf.data.experimental.save(dataset, save_dir)
data/preprocess_scripts/roboturk_real_towercreation.py ADDED
@@ -0,0 +1,223 @@
1
+ import tensorflow as tf
2
+ import tensorflow_datasets as tfds
3
+ from data.utils import clean_task_instruction, quaternion_to_euler
4
+ import tensorflow as tf
5
+ import h5py
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import os
9
+ import imageio
10
+ import concurrent.futures
11
+
12
+ def get_frames(file_path):
13
+ if not os.path.exists(file_path) or not os.path.isfile(file_path) or not file_path.endswith('.mp4'):
14
+ return []
15
+ frames = []
16
+ with imageio.get_reader(file_path, 'ffmpeg') as reader:
17
+ for frame in reader:
18
+ frame = np.array(frame, dtype=np.uint8)
19
+ frames.append(frame)
20
+ return frames
21
+
22
+ def parallel_get_frames(paths):
23
+ with concurrent.futures.ThreadPoolExecutor() as executor:
24
+ future_to_path = {executor.submit(get_frames, path): path for path in paths}
25
+ return [future.result() for future in concurrent.futures.as_completed(future_to_path)]
26
+
27
+ def count_total_samples(filename):
28
+ total_samples = 0
29
+ with h5py.File(filename, 'r') as f:
30
+ data = f['data']
31
+ for user_key in data.keys():
32
+ user = data[user_key]
33
+ for demo_key in user.keys():
34
+ total_samples += 1
35
+ return total_samples
36
+
37
+ def dataset_generator(filename, total_samples):
38
+ with h5py.File(filename, 'r') as f:
39
+ data = f['data']
40
+ for user_key in data.keys():
41
+ user = data[user_key]
42
+ for demo_key in user.keys():
43
+ demo = user[demo_key]
44
+ robot_observation = demo['robot_observation']
45
+ user_control = demo['user_control']
46
+
47
+ eef_poses = robot_observation['eef_poses']
48
+ joint_states_arm = robot_observation['joint_states_arm']
49
+ joint_states_gripper = robot_observation['joint_states_gripper']
50
+ user_control_data = user_control['user_control']
51
+
52
+ attrs = dict(demo.attrs)
53
+ top_depth_video_file = attrs['top_depth_video_file']
54
+ top_rgb_video_file = attrs['top_rgb_video_file']
55
+ front_rgb_video_file = attrs['front_rgb_video_file']
56
+
57
+ video_root_path = './data/datasets/roboturk/'
58
+ top_depth_frames = get_frames(os.path.join(video_root_path, top_depth_video_file))
59
+ top_rgb_frames = get_frames(os.path.join(video_root_path, top_rgb_video_file))
60
+ front_rgb_frames = get_frames(os.path.join(video_root_path, front_rgb_video_file))
61
+
62
+ if len(top_rgb_frames) == 0 or len(front_rgb_frames) == 0:
63
+ continue
64
+ # video_root_path = '/cephfs-thu/gsm_data/robotruck'
65
+ # video_paths = [
66
+ # os.path.join(video_root_path, attrs['top_depth_video_file']),
67
+ # os.path.join(video_root_path, attrs['top_rgb_video_file']),
68
+ # os.path.join(video_root_path, attrs['front_rgb_video_file'])
69
+ # ]
70
+ # top_depth_frames, top_rgb_frames, front_rgb_frames = parallel_get_frames(video_paths)
71
+
72
+ steps = []
73
+ for i in range(len(eef_poses)):
74
+ task_demo_id = f"SawyerTowerCreation_{demo_key}_{i}"
75
+ step = {
76
+ 'task_demo_id': task_demo_id,
77
+ 'eef_poses': eef_poses[i],
78
+ 'joint_states_arm': joint_states_arm[i],
79
+ 'joint_states_gripper': joint_states_gripper[i],
80
+ 'user_control': user_control_data[i] if user_control_data.shape[0] > 0 else np.zeros(22),
81
+ 'observation':{
82
+ 'top_depth_frame': top_depth_frames[i] if i < len(top_depth_frames) else np.zeros((0,0, 3), dtype=np.uint8),
83
+ 'top_rgb_frame': top_rgb_frames[i] if i < len(top_rgb_frames) else np.zeros((0, 0, 3), dtype=np.uint8),
84
+ 'front_rgb_frame': front_rgb_frames[i] if i < len(front_rgb_frames) else np.zeros((0, 0, 3), dtype=np.uint8),
85
+ },
86
+ 'terminate_episode': i == len(eef_poses) - 1
87
+ }
88
+ steps.append(step)
89
+
90
+
91
+ steps_dataset = tf.data.Dataset.from_generator(
92
+ lambda: iter(steps),
93
+ output_signature={
94
+ 'task_demo_id': tf.TensorSpec(shape=(), dtype=tf.string),
95
+ 'eef_poses': tf.TensorSpec(shape=(7,), dtype=tf.float32),
96
+ 'joint_states_arm': tf.TensorSpec(shape=(27,), dtype=tf.float32),
97
+ 'joint_states_gripper': tf.TensorSpec(shape=(3,), dtype=tf.float32),
98
+ 'user_control': tf.TensorSpec(shape=(22,), dtype=tf.float32),
99
+ 'observation':{
100
+ 'top_depth_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
101
+ 'top_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
102
+ 'front_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
103
+ },
104
+ 'terminate_episode': tf.TensorSpec(shape=(), dtype=tf.bool),
105
+ }
106
+ )
107
+
108
+ yield {'steps': steps_dataset}
109
+
110
+ def load_dataset():
111
+ filename = './data/datasets/roboturk/SawyerTowerCreation_aligned_dataset.hdf5'
112
+ total_samples = count_total_samples(filename)
113
+ dataset = tf.data.Dataset.from_generator(
114
+ lambda: dataset_generator(filename, total_samples),
115
+ output_signature={
116
+ 'steps': tf.data.DatasetSpec(
117
+ element_spec={
118
+ 'task_demo_id': tf.TensorSpec(shape=(), dtype=tf.string),
119
+ 'eef_poses': tf.TensorSpec(shape=(7,), dtype=tf.float32),
120
+ 'joint_states_arm': tf.TensorSpec(shape=(27,), dtype=tf.float32),
121
+ 'joint_states_gripper': tf.TensorSpec(shape=(3,), dtype=tf.float32),
122
+ 'user_control': tf.TensorSpec(shape=(22,), dtype=tf.float32),
123
+ 'observation':{
124
+ 'top_depth_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
125
+ 'top_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
126
+ 'front_rgb_frame': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
127
+ },
128
+ 'terminate_episode': tf.TensorSpec(shape=(), dtype = tf.bool),
129
+ }
130
+ )
131
+ }
132
+ )
133
+ return dataset
134
+
135
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
136
+ """
137
+ Convert terminate action to a boolean, where True means terminate.
138
+ """
139
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
140
+
141
+
142
+ def process_step(step: dict) -> dict:
143
+ """
144
+ Unify the action format and clean the task instruction.
145
+
146
+ DO NOT use python list, use tf.TensorArray instead.
147
+ """
148
+ # Convert raw action to our action
149
+ step['action'] = {}
150
+ action = step['action']
151
+ action['terminate'] = step['terminate_episode']
152
+
153
+ eef_delta_pos = step['eef_poses'][:3]
154
+ eef_ang = quaternion_to_euler(step['eef_poses'][3:])
155
+
156
+ # No base found
157
+ # Concatenate the action
158
+ arm_action = tf.concat([eef_delta_pos, eef_ang], axis=0)
159
+ action['arm_concat'] = arm_action
160
+
161
+ # Write the action format
162
+ action['format'] = tf.constant(
163
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_roll,eef_delta_angle_pitch,eef_delta_angle_yaw")
164
+
165
+ # No state found
166
+ state = step['observation']
167
+ # joint_states_arm: dataset of (num_timestamps, 27) shape where each of the 9 joints is represented by the JointState message
168
+ # (the nine joints are in order by their ROSBAG names: ['head_pan', 'right_j0', 'right_j1', 'right_j2', 'right_j3', 'right_j4', 'right_j5', 'right_j6', 'torso_t0']. For the most part, head_pan and torso should be zeros)
169
+ # [0] the position of the first joint (rad or m)
170
+ # [1] the velocity of the first joint (rad/s or m/s)
171
+ # [2] the effort that is applied in the first joint
172
+ # [3] the position of the second joint...
173
+ joint_states_arm = step['joint_states_arm']
174
+ joint_pos = joint_states_arm[3:24:3]
175
+ joint_vel = joint_states_arm[4:25:3]
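+ # strided slices select right_j0..right_j6 (positions at indices 3..21 step 3, velocities at 4..22 step 3), dropping head_pan and torso_t0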
176
+ # joint_states_gripper: dataset of (num_timestamps, 3) shape
177
+ # [0] the position of the gripper (rad or m)
178
+ # [1] the velocity of the gripper (rad/s or m/s)
179
+ # [2] the effort that is applied in the gripper
180
+ joint_states_gripper = step['joint_states_gripper']
181
+ gripper_pos = joint_states_gripper[:1]
182
+ # remove gripper_vel due to they are all zeros
183
+ # gripper_vel = joint_states_gripper[1:2]
184
+ # Concatenate the state
185
+ # state['arm_concat'] = tf.concat([joint_pos,joint_vel,gripper_pos,gripper_vel], axis=0)
186
+ state['arm_concat'] = tf.concat([joint_pos,joint_vel,gripper_pos], axis=0)
187
+ # Write the state format
188
+ state['format'] = tf.constant(
189
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,gripper_joint_0_pos")
190
+
191
+ # Clean the task instruction
192
+ # Define the replacements (old, new) as a dictionary
193
+ replacements = {
194
+ '_': ' ',
195
+ '1f': ' ',
196
+ '4f': ' ',
197
+ '-': ' ',
198
+ '50': ' ',
199
+ '55': ' ',
200
+ '56': ' ',
201
+
202
+ }
203
+ # copied from openxembod
204
+ instr = b'create tower'
205
+ instr = clean_task_instruction(instr, replacements)
206
+ step['observation']['natural_language_instruction'] = instr
207
+
208
+ return step
209
+
210
+
211
+ if __name__ == "__main__":
212
+ import tensorflow_datasets as tfds
213
+ from data.utils import dataset_to_path
214
+
215
+ DATASET_DIR = '/cephfs-thu/gsm_data/openx_embod'
216
+ DATASET_NAME = 'roboturk_real_towercreation'
217
+ # Load the dataset
218
+ dataset = load_dataset()
219
+
220
+ # save_dir = os.path.join(DATASET_DIR, DATASET_NAME)
221
+ # if not os.path.exists(save_dir):
222
+ # os.makedirs(save_dir)
223
+ # tf.data.experimental.save(dataset, save_dir)
data/preprocess_scripts/stanford_hydra_dataset_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,94 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, euler_to_rotation_matrix,\
4
+ rotation_matrix_to_ortho6d
5
+
6
+
7
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
8
+ """
9
+ Convert terminate action to a boolean, where True means terminate.
10
+ """
11
+ return tf.reduce_all(tf.equal(terminate_act, tf.constant([1, 0, 0], dtype=tf.int32)))
12
+
13
+
14
+ def process_step(step: dict) -> dict:
15
+ """
16
+ Unify the action format and clean the task instruction.
17
+
18
+ DO NOT use python list, use tf.TensorArray instead.
19
+ """
20
+ # Convert raw action to our action
21
+ action = step['action']
22
+ eef_delta_pos = action[:3]
23
+ eef_ang = action[3:6]
24
+ eef_ang = euler_to_quaternion(eef_ang)
25
+ grip_open = tf.expand_dims(1 - action[6], axis=0)
26
+
27
+ # Concatenate the action
28
+ # action['arm_concat'] = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
29
+ step['action'] = {}
30
+ action = step['action']
31
+ action['arm_concat'] = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
32
+ action['terminate'] = step['is_terminal']
33
+
34
+ # Write the action format
35
+ action['format'] = tf.constant(
36
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
37
+
38
+ # Convert raw state to our state
39
+ state = step['observation']
40
+ state_vec = state['state']
41
+ # Robot state, consists of [3x EEF position,4x EEF orientation in quaternion,3x EEF orientation in euler angle,7x robot joint angles, 7x robot joint velocities,3x gripper state.
42
+ arm_joint_pos = state_vec[10:17]
43
+ arm_joint_vel = state_vec[17:24]
44
+ eef_pos = state_vec[:3]
45
+ eef_ang = state_vec[7:10]
46
+ eef_ang = euler_to_rotation_matrix(eef_ang)
47
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
48
+ # Rescale gripper width to [0, 1]
49
+ grip_joint_pos = tf.concat([
50
+ state_vec[24:25] * 12.324, state_vec[25:27]
51
+ ], axis=0)
52
+
53
+ # Concatenate the state
54
+ state['arm_concat'] = tf.concat([arm_joint_pos, grip_joint_pos, arm_joint_vel, eef_pos, eef_ang], axis=0)
55
+
56
+ # Write the state format
57
+ state['format'] = tf.constant(
58
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_joint_0_pos,gripper_joint_1_pos,gripper_joint_2_pos,arm_joint_0_vel,arm_joint_1_vel,arm_joint_2_vel,arm_joint_3_vel,arm_joint_4_vel,arm_joint_5_vel,arm_joint_6_vel,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
59
+
60
+ # Clean the task instruction
61
+ # Define the replacements (old, new) as a dictionary
62
+ replacements = {
63
+ '_': ' ',
64
+ '1f': ' ',
65
+ '4f': ' ',
66
+ '-': ' ',
67
+ '50': ' ',
68
+ '55': ' ',
69
+ '56': ' ',
70
+
71
+ }
72
+ instr = step['language_instruction']
73
+ instr = clean_task_instruction(instr, replacements)
74
+ step['observation']['natural_language_instruction'] = instr
75
+
76
+ return step
77
+
78
+
79
+ if __name__ == "__main__":
80
+ import tensorflow_datasets as tfds
81
+ from data.utils import dataset_to_path
82
+
83
+ DATASET_DIR = 'data/datasets/openx_embod'
84
+ DATASET_NAME = 'fractal20220817_data'
85
+ # Load the dataset
86
+ dataset = tfds.builder_from_directory(
87
+ builder_dir=dataset_to_path(
88
+ DATASET_NAME, DATASET_DIR))
89
+ dataset = dataset.as_dataset(split='all')
90
+
91
+ # Inspect the dataset
92
+ for episode in dataset:
93
+ for step in episode['steps']:
94
+ print(step)
data/preprocess_scripts/tokyo_u_lsmo_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,90 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, euler_to_rotation_matrix, \
4
+ rotation_matrix_to_ortho6d
5
+
6
+
7
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
8
+ """
9
+ Convert terminate action to a boolean, where True means terminate.
10
+ """
11
+ return tf.reduce_all(tf.equal(terminate_act, tf.constant([1, 0, 0], dtype=tf.int32)))
12
+
13
+
14
+ def process_step(step: dict) -> dict:
15
+ """
16
+ Unify the action format and clean the task instruction.
17
+
18
+ DO NOT use python list, use tf.TensorArray instead.
19
+ """
20
+ # Convert raw action to our action
21
+ action = step['action']
22
+ # Robot action, consists of [3x endeffector position, 3x euler angles,1x gripper action].
23
+ eef_delta_pos = action[:3]
24
+ eef_ang = action[3:6]
25
+ eef_ang = euler_to_quaternion(eef_ang)
26
+ grip_open = tf.expand_dims(1 - action[6], axis=0)
27
+
28
+ # Concatenate the action
29
+ step['action'] = {}
30
+ action = step['action']
31
+ action['arm_concat'] = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
32
+ action['terminate'] = step['is_terminal']
33
+
34
+ # Write the action format
35
+ action['format'] = tf.constant(
36
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
37
+
38
+ # Convert raw state to our state
39
+ state = step['observation']
40
+ state_vec = state['state']
41
+ # Robot state, consists of [3x endeffector position, 3x euler angles,6x robot joint angles, 1x gripper position].
42
+ eef_pos = state_vec[:3]
43
+ eef_ang = state_vec[3:6]
44
+ eef_ang = euler_to_rotation_matrix(eef_ang)
45
+ eef_ang = rotation_matrix_to_ortho6d(eef_ang)
46
+ arm_joint_ang = state_vec[6:12]
47
+ grip_joint_pos = 1 - state_vec[12:13]
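+ # invert so that 1.0 corresponds to an open gripper, matching the gripper_open action convention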
48
+
49
+ # Concatenate the state
50
+ state['arm_concat'] = tf.concat([arm_joint_ang, grip_joint_pos, eef_pos, eef_ang], axis=0)
51
+
52
+ # Write the state format
53
+ state['format'] = tf.constant(
54
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,gripper_joint_0_pos,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
55
+
56
+ # Clean the task instruction
57
+ # Define the replacements (old, new) as a dictionary
58
+ replacements = {
59
+ '_': ' ',
60
+ '1f': ' ',
61
+ '4f': ' ',
62
+ '-': ' ',
63
+ '50': ' ',
64
+ '55': ' ',
65
+ '56': ' ',
66
+
67
+ }
68
+ instr = step['language_instruction']
69
+ instr = clean_task_instruction(instr, replacements)
70
+ step['observation']['natural_language_instruction'] = instr
71
+
72
+ return step
73
+
74
+
75
+ if __name__ == "__main__":
76
+ import tensorflow_datasets as tfds
77
+ from data.utils import dataset_to_path
78
+
79
+ DATASET_DIR = 'data/datasets/openx_embod'
80
+ DATASET_NAME = 'tokyo_u_lsmo_converted_externally_to_rlds'
81
+ # Load the dataset
82
+ dataset = tfds.builder_from_directory(
83
+ builder_dir=dataset_to_path(
84
+ DATASET_NAME, DATASET_DIR))
85
+ dataset = dataset.as_dataset(split='all')
86
+
87
+ # Inspect the dataset
88
+ for episode in dataset:
89
+ for step in episode['steps']:
90
+ print(step)
data/preprocess_scripts/utokyo_pr2_opening_fridge_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,92 @@
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, euler_to_rotation_matrix, \
4
+ rotation_matrix_to_ortho6d
5
+
6
+
7
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
8
+ """
9
+ Convert terminate action to a boolean, where True means terminate.
10
+ """
11
+ return tf.reduce_all(tf.equal(terminate_act, tf.constant([1, 0, 0], dtype=tf.float32)))
12
+
13
+
14
+ def process_step(step: dict) -> dict:
15
+ """
16
+ Unify the action format and clean the task instruction.
17
+
18
+ DO NOT use python list, use tf.TensorArray instead.
19
+ """
20
+
21
+ # Convert raw action to our action
22
+ action = step['action']
23
+ # Robot action, consists of [3x end effector pos, 3x robot rpy angles, 1x gripper open/close command, 1x terminal action].
24
+ eef_delta_pos = action[:3]/1000 # change from mm to m
25
+ eef_ang = action[3:6]
26
+ eef_ang = euler_to_quaternion(eef_ang)
27
+ grip_open = tf.expand_dims(1 - action[6], axis=0)
28
+
29
+ # Concatenate the action
30
+ step['action'] = {}
31
+ action = step['action']
32
+ action['arm_concat'] = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
33
+
34
+ action['terminate'] = step['is_terminal']
35
+ # Write the action format
36
+ action['format'] = tf.constant(
37
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
38
+
39
+
40
+ # Convert raw state to our state
41
+ state = step['observation']['state']
42
+ # Robot state, consists of [3x end effector pos, 3x robot rpy angles, 1x gripper position].
43
+ gripper_pos = state[:3]/1000 # change from mm to m
44
+ gripper_ang = state[3:6]
45
+ gripper_ang = euler_to_rotation_matrix(gripper_ang)
46
+ gripper_ang = rotation_matrix_to_ortho6d(gripper_ang)
47
+ gripper_open = state[6:7]/1000 * 11.54 # rescale to [0, 1]
48
+
49
+
50
+ # Concatenate the state
51
+ state = step['observation']
52
+ state['arm_concat'] = tf.concat([gripper_pos, gripper_ang, gripper_open], axis=0)
53
+
54
+ # Write the state format
55
+ state['format'] = tf.constant(
56
+ "eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5,gripper_joint_0_pos")
57
+
58
+ # Clean the task instruction
59
+ # Define the replacements (old, new) as a dictionary
60
+ replacements = {
61
+ '_': ' ',
62
+ '1f': ' ',
63
+ '4f': ' ',
64
+ '-': ' ',
65
+ '50': ' ',
66
+ '55': ' ',
67
+ '56': ' ',
68
+
69
+ }
70
+ instr = step['language_instruction']
71
+ instr = clean_task_instruction(instr, replacements)
72
+ step['observation']['natural_language_instruction'] = instr
73
+
74
+ return step
75
+
76
+
77
+ if __name__ == "__main__":
78
+ import tensorflow_datasets as tfds
79
+ from data.utils import dataset_to_path
80
+
81
+ DATASET_DIR = 'data/datasets/openx_embod'
82
+ DATASET_NAME = 'fractal20220817_data'
83
+ # Load the dataset
84
+ dataset = tfds.builder_from_directory(
85
+ builder_dir=dataset_to_path(
86
+ DATASET_NAME, DATASET_DIR))
87
+ dataset = dataset.as_dataset(split='all')
88
+
89
+ # Inspect the dataset
90
+ for episode in dataset:
91
+ for step in episode['steps']:
92
+ print(step)
data/preprocess_scripts/utokyo_xarm_bimanual_converted_externally_to_rlds.py ADDED
@@ -0,0 +1,117 @@
1
+ import json
2
+ import tensorflow as tf
3
+
4
+ from data.utils import clean_task_instruction, euler_to_rotation_matrix, rotation_matrix_to_ortho6d
5
+
6
+
7
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
8
+ """
9
+ Convert terminate action to a boolean, where True means terminate.
10
+ """
11
+ return tf.reduce_all(tf.equal(terminate_act, tf.constant([1, 0, 0], dtype=tf.int32)))
12
+
13
+
14
+ def process_step(step: dict) -> dict:
15
+ """
16
+ Unify the action format and clean the task instruction.
17
+
18
+ DO NOT use python list, use tf.TensorArray instead.
19
+ """
20
+ # Convert raw action to our action
21
+ action = step['action']
22
+
23
+ # TODO
24
+ # action : Tensor (14,) [3x EEF position (L), 3x EEF orientation yaw/pitch/roll (L), 1x gripper open/close position (L), 3x EEF position (R), 3x EEF orientation yaw/pitch/roll (R), 1x gripper open/close position (R)].
25
+
26
+ eef_pos_left = action[0:3]
27
+ eef_angle_left = tf.gather(action[3:6], [2, 1, 0])
28
+ eef_angle_left = euler_to_rotation_matrix(eef_angle_left)
29
+ eef_angle_left = rotation_matrix_to_ortho6d(eef_angle_left)
30
+ gripper_open_left = 1 - action[6:7]
31
+ eef_pos_right = action[7:10]
32
+ eef_angle_right = tf.gather(action[10:13], [2, 1, 0])
33
+ eef_angle_right = euler_to_rotation_matrix(eef_angle_right)
34
+ eef_angle_right = rotation_matrix_to_ortho6d(eef_angle_right)
35
+ gripper_open_right = 1 - action[13:14]
36
+
37
+ # Concatenate the action
38
+ step['action'] = {}
39
+ action = step['action']
40
+
41
+ # Concatenate the action
42
+ arm_action = tf.concat([eef_pos_left,eef_angle_left,gripper_open_left,eef_pos_right,eef_angle_right,gripper_open_right], axis=0)
43
+ action['arm_concat'] = arm_action
44
+ action['terminate'] = step['is_terminal']
45
+
46
+ # print("action len:", len(action['arm_concat']) + len(action['base_concat']))
47
+
48
+ action['format'] = tf.constant(
49
+ "left_eef_pos_x,left_eef_pos_y,left_eef_pos_z,left_eef_angle_0,left_eef_angle_1,left_eef_angle_2,left_eef_angle_3,left_eef_angle_4,left_eef_angle_5,left_gripper_open,right_eef_pos_x,right_eef_pos_y,right_eef_pos_z,right_eef_angle_0,right_eef_angle_1,right_eef_angle_2,right_eef_angle_3,right_eef_angle_4,right_eef_angle_5,right_gripper_open")
50
+
51
+ # action good for kuka same as example
52
+
53
+ # Convert raw state to our state
54
+ action = step['observation']['action_l']
55
+ # [3x EEF position, 3x EEF orientation yaw/pitch/roll, 1x gripper open/close position].
56
+ eef_pos_left = action[0:3]
57
+ eef_angle_left = tf.gather(action[3:6], [2, 1, 0])
58
+ eef_angle_left = euler_to_rotation_matrix(eef_angle_left)
59
+ eef_angle_left = rotation_matrix_to_ortho6d(eef_angle_left)
60
+ gripper_open_left = 1 - action[6:7]
61
+
62
+ action = step['observation']['action_r']
63
+ eef_pos_right = action[0:3]
64
+ eef_angle_right = tf.gather(action[3:6], [2, 1, 0])
65
+ eef_angle_right = euler_to_rotation_matrix(eef_angle_right)
66
+ eef_angle_right = rotation_matrix_to_ortho6d(eef_angle_right)
67
+ gripper_open_right = 1 - action[6:7]
68
+
69
+ # Write the state format TODO how to link 12 joint pos to 7 joint pos ??
70
+ state = step['observation']
71
+ # Concatenate the state
72
+ state['arm_concat'] = tf.concat([eef_pos_left,eef_angle_left,gripper_open_left,eef_pos_right,eef_angle_right,gripper_open_right], axis=0)
73
+ state['format'] = tf.constant(
74
+ "left_eef_pos_x,left_eef_pos_y,left_eef_pos_z,left_eef_angle_0,left_eef_angle_1,left_eef_angle_2,left_eef_angle_3,left_eef_angle_4,left_eef_angle_5,left_gripper_open,right_eef_pos_x,right_eef_pos_y,right_eef_pos_z,right_eef_angle_0,right_eef_angle_1,right_eef_angle_2,right_eef_angle_3,right_eef_angle_4,right_eef_angle_5,right_gripper_open")
75
+
76
+ # Clean the task instruction
77
+ # Define the replacements (old, new) as a dictionary
78
+ replacements = {
79
+ '_': ' ',
80
+ '1f': ' ',
81
+ '4f': ' ',
82
+ '-': ' ',
83
+ '50': ' ',
84
+ '55': ' ',
85
+ '56': ' ',
86
+
87
+ }
88
+ instr = step['language_instruction']
89
+ instr = clean_task_instruction(instr, replacements)
90
+ step['observation']['natural_language_instruction'] = instr
91
+
92
+ return step
93
+
94
+
95
+ if __name__ == "__main__":
96
+ import tensorflow_datasets as tfds
97
+ from data.utils import dataset_to_path
98
+
99
+ DATASET_DIR = 'data/datasets/openx_embod'
100
+ DATASET_NAME = 'utokyo_xarm_bimanual_converted_externally_to_rlds'
101
+ # Load the dataset
102
+ dataset = tfds.builder_from_directory(
103
+ builder_dir=dataset_to_path(
104
+ DATASET_NAME, DATASET_DIR))
105
+ dataset = dataset.as_dataset(split='all')
106
+
107
+ # with open('example.txt', 'w') as file:
108
+ # Inspect the dataset
109
+
110
+ episode_num = len(dataset)
111
+ print(f"episode_num: {episode_num}")
112
+ for episode in dataset.take(1):
113
+ # print("episode")
114
+ # print(list(episode.keys()))
115
+ for step in episode['steps']:
116
+ process_step(step)
117
+ break
data/preprocess_scripts/viola.py ADDED
1
+ import tensorflow as tf
2
+
3
+ from data.utils import clean_task_instruction, euler_to_quaternion, rotation_matrix_to_ortho6d
4
+
5
+
6
+ def terminate_act_to_bool(terminate_act: tf.Tensor) -> tf.Tensor:
7
+ """
8
+ Convert terminate action to a boolean, where True means terminate.
9
+ """
10
+ return tf.where(tf.equal(terminate_act, tf.constant(0.0, dtype=tf.float32)),tf.constant(False),tf.constant(True))
11
+
12
+
13
+ def process_step(step: dict) -> dict:
14
+ """
15
+ Unify the action format and clean the task instruction.
16
+
17
+ DO NOT use python list, use tf.TensorArray instead.
18
+ """
19
+ # Convert raw action to our action
20
+ action = step['action']
21
+ action['terminate'] = terminate_act_to_bool(action['terminate_episode'])
22
+
23
+ eef_delta_pos = action['world_vector']
24
+ eef_ang = action['rotation_delta']
25
+ eef_ang = euler_to_quaternion(eef_ang)
26
+ grip_open = tf.reshape(tf.where(action['gripper_closedness_action']<0,tf.constant(1.0),tf.constant(0.0)),(1,))
27
+
28
+ # No base found
29
+
30
+ # Concatenate the action
31
+ arm_action = tf.concat([eef_delta_pos, eef_ang, grip_open], axis=0)
32
+ action['arm_concat'] = arm_action
33
+
34
+ # Write the action format
35
+ action['format'] = tf.constant(
36
+ "eef_delta_pos_x,eef_delta_pos_y,eef_delta_pos_z,eef_delta_angle_x,eef_delta_angle_y,eef_delta_angle_z,eef_delta_angle_w,gripper_open")
37
+
38
+ # Convert raw state to our state
39
+ state = step['observation']
40
+ joint_pos=state['joint_states']
41
+ grip_open=state['gripper_states'] * 12.905 # rescale to [0, 1]
42
+ state_ee=state['ee_states']
43
+ transform_matrix = tf.transpose(tf.reshape(state_ee, (4, 4)))
44
+ eef_pos = transform_matrix[:3, 3]
45
+ rotation_matrix = transform_matrix[:3, :3]
46
+ eef_ang = rotation_matrix_to_ortho6d(rotation_matrix)
47
+
48
+ # Concatenate the state
49
+ state['arm_concat'] = tf.concat([joint_pos,grip_open,eef_pos,eef_ang],axis=0)
50
+
51
+ # Write the state format
52
+ state['format'] = tf.constant(
53
+ "arm_joint_0_pos,arm_joint_1_pos,arm_joint_2_pos,arm_joint_3_pos,arm_joint_4_pos,arm_joint_5_pos,arm_joint_6_pos,gripper_open,eef_pos_x,eef_pos_y,eef_pos_z,eef_angle_0,eef_angle_1,eef_angle_2,eef_angle_3,eef_angle_4,eef_angle_5")
54
+
55
+ # Clean the task instruction
56
+ # Define the replacements (old, new) as a dictionary
57
+ replacements = {
58
+ '_': ' ',
59
+ '1f': ' ',
60
+ '4f': ' ',
61
+ '-': ' ',
62
+ '50': ' ',
63
+ '55': ' ',
64
+ '56': ' ',
65
+
66
+ }
67
+ instr = step['observation']['natural_language_instruction']
68
+ instr = clean_task_instruction(instr, replacements)
69
+ step['observation']['natural_language_instruction'] = instr
70
+
71
+ return step
72
+
73
+
74
+ if __name__ == "__main__":
75
+ import tensorflow_datasets as tfds
76
+ from data.utils import dataset_to_path
77
+
78
+ DATASET_DIR = 'data/datasets/openx_embod'
79
+ DATASET_NAME = 'viola'
80
+ # Load the dataset
81
+ dataset = tfds.builder_from_directory(
82
+ builder_dir=dataset_to_path(
83
+ DATASET_NAME, DATASET_DIR))
84
+ dataset = dataset.as_dataset(split='all')
85
+
86
+ # Inspect the dataset
87
+ for episode in dataset:
88
+ for step in episode['steps']:
89
+ print(step)
data/producer.py ADDED
@@ -0,0 +1,280 @@
1
+ import time
2
+ import json
3
+ import os
4
+ import time
5
+ import argparse
6
+ import sys
7
+ import signal
8
+ import random
9
+ from multiprocessing import Process
10
+
11
+ import numpy as np
12
+ import tensorflow as tf
13
+ import yaml
14
+
15
+ from data.vla_dataset import VLADataset
16
+ from data.filelock import FileLock
17
+
18
+
19
+ # Producer does not need GPU
20
+ tf.config.set_visible_devices([], 'GPU')
21
+
22
+ # Read the config
23
+ with open('configs/base.yaml', 'r') as file:
24
+ config = yaml.safe_load(file)
25
+ # Load some constants from the config
26
+ BUF_PATH = config['dataset']['buf_path']
27
+ BUF_NUM_CHUNKS = config['dataset']['buf_num_chunks']
28
+ if BUF_NUM_CHUNKS < 1:
29
+ raise ValueError("Config `buf_num_chunks` must be at least 1.")
30
+ BUF_CHUNK_SIZE = config['dataset']['buf_chunk_size']
31
+ if BUF_CHUNK_SIZE < 1:
32
+ raise ValueError("Config `buf_chunk_size` must be at least 1.")
33
+
34
+
35
+ def get_dirty_item(chunk_dir):
36
+ """
37
+ Get indexes of dirty items in a chunk.
38
+ """
39
+ dirty_bit = read_dirty_bit(chunk_dir)
40
+ return np.where(dirty_bit)[0].tolist()
41
+
42
+
43
+ def get_clean_item(chunk_dir):
44
+ """
45
+ Get indexes of clean items in a chunk.
46
+ """
47
+ dirty_bit = read_dirty_bit(chunk_dir)
48
+ return np.where(1 - dirty_bit)[0].tolist()
49
+
50
+
51
+ def save_dirty_bit(chunk_dir, dirty_bit):
52
+ """
53
+ Save the dirty bit to the chunk directory.
54
+ """
55
+ time_stmp = time.time()
56
+ while time.time() - time_stmp < 10.0:
57
+ try:
58
+ file_path = os.path.join(chunk_dir, "dirty_bit")
59
+ lock = FileLock(file_path)
60
+ lock.acquire_write_lock()
61
+ with open(file_path, 'wb') as file:
62
+ file.write(dirty_bit.tobytes())
63
+ lock.release_lock()
64
+ return
65
+ except KeyboardInterrupt:
66
+ lock.release_lock()
67
+ raise KeyboardInterrupt
68
+ except BaseException:
69
+ lock.release_lock()
70
+ continue
71
+ # raise RuntimeError("Failed to save dirty bit.")
72
+ print("Failed to save dirty bit.")
73
+
74
+
75
+ def read_dirty_bit(chunk_dir):
76
+ """
77
+ Read the dirty bit from the chunk directory.
78
+ """
79
+ # If error occurs, retry
80
+ time_stmp = time.time()
81
+ while time.time() - time_stmp < 10.0:
82
+ try:
83
+ file_path = os.path.join(chunk_dir, "dirty_bit")
84
+ lock = FileLock(file_path)
85
+ lock.acquire_read_lock()
86
+ with open(file_path, 'rb') as file:
87
+ dirty_bit = np.frombuffer(file.read(), dtype=np.uint8).copy()
88
+ lock.release_lock()
89
+ assert len(dirty_bit) == BUF_CHUNK_SIZE
90
+ return dirty_bit
91
+ except KeyboardInterrupt:
92
+ lock.release_lock()
93
+ raise KeyboardInterrupt
94
+ except BaseException:
95
+ lock.release_lock()
96
+ continue
97
+ # If failed to read the dirty bit, return all ones for robustness
98
+ return np.ones(BUF_CHUNK_SIZE, dtype=np.uint8)
99
+
100
+
101
+ def save_sample(step_dict, chunk_dir, chunk_item_idx):
102
+ """
103
+ Save a sample to the chunk directory.
104
+ """
105
+ # Save the json content
106
+ time_stmp = time.time()
107
+ while time.time() - time_stmp < 10.0:
108
+ try:
109
+ locks = []
110
+ json_content = step_dict['json_content']
111
+ file_path = os.path.join(chunk_dir, f"json_content_{chunk_item_idx}.json")
112
+ lock = FileLock(file_path)
113
+ locks.append(lock)
114
+ lock.acquire_write_lock()
115
+ with open(file_path, 'w') as file:
116
+ json.dump(json_content, file, indent=4)
117
+ lock.release_lock()
118
+ # Save all other tensors in a npz
119
+ file_path = os.path.join(chunk_dir, f"sample_{chunk_item_idx}.npz")
120
+ lock = FileLock(file_path)
121
+ locks.append(lock)
122
+ lock.acquire_write_lock()
123
+ with open(file_path, 'wb') as file:
124
+ np.savez(
125
+ file,
126
+ step_id=step_dict['step_id'].numpy(),
127
+ state_chunk=step_dict['state_chunk'].numpy(),
128
+ state_chunk_time_mask=step_dict['state_chunk_time_mask'].numpy(),
129
+ action_chunk=step_dict['action_chunk'].numpy(),
130
+ action_chunk_time_mask=step_dict['action_chunk_time_mask'].numpy(),
131
+ state_vec_mask=step_dict['state_vec_mask'].numpy(),
132
+ past_frames_0=step_dict['past_frames_0'].numpy(),
133
+ past_frames_0_time_mask=step_dict['past_frames_0_time_mask'].numpy(),
134
+ past_frames_1=step_dict['past_frames_1'].numpy(),
135
+ past_frames_1_time_mask=step_dict['past_frames_1_time_mask'].numpy(),
136
+ past_frames_2=step_dict['past_frames_2'].numpy(),
137
+ past_frames_2_time_mask=step_dict['past_frames_2_time_mask'].numpy(),
138
+ past_frames_3=step_dict['past_frames_3'].numpy(),
139
+ past_frames_3_time_mask=step_dict['past_frames_3_time_mask'].numpy(),
140
+ state_std=step_dict['state_std'].numpy(),
141
+ state_mean=step_dict['state_mean'].numpy(),
142
+ state_norm=step_dict['state_norm'].numpy(),
143
+ )
144
+ lock.release_lock()
145
+ return
146
+ except KeyboardInterrupt:
147
+ for lock in locks:
148
+ lock.release_lock()
149
+ raise KeyboardInterrupt
150
+ except BaseException:
151
+ for lock in locks:
152
+ lock.release_lock()
153
+ continue
154
+ # raise RuntimeError("Failed to save sample.")
155
+ print("Failed to save sample.")
156
+
157
+
158
+ def run_producer(seed, num_workers, worker_id, fill_up, clean_dirty, dataset_type):
159
+ """
160
+ Run the producer.
161
+ The producer will first fill up the buffer with samples.
162
+ Then it will keep replacing dirty samples
163
+ (i.e., samples that have been read by the consumer)
164
+ with new samples.
165
+ """
166
+ vla_dataset = VLADataset(seed=seed, dataset_type=dataset_type)
167
+ chunk_start_idx = worker_id * BUF_NUM_CHUNKS // num_workers
168
+ chunk_end_idx = (worker_id + 1) * BUF_NUM_CHUNKS // num_workers
169
+ if fill_up:
170
+ print(f"Worker {worker_id}: Start filling up the buffer...")
171
+ elif clean_dirty:
172
+ # Only refresh the dirty bits
173
+ print(f"Worker {worker_id}: Start refreshing the dirty bits...")
174
+ for chunk_idx in range(chunk_start_idx, chunk_end_idx):
175
+ chunk_dir = os.path.join(BUF_PATH, f"chunk_{chunk_idx}")
176
+ dirty_bit = np.zeros(BUF_CHUNK_SIZE, dtype=np.uint8)
177
+ save_dirty_bit(chunk_dir, dirty_bit)
178
+ print(f"Worker {worker_id}: Refreshed the dirty bits.")
179
+
180
+ fill_chunk_idx = chunk_start_idx
181
+ fill_chunk_item_idx = 0
182
+ dirty_chunk_idx = chunk_start_idx
183
+ dirty_chunk_item_idxs = []
184
+ time_stmp = time.time()
185
+ for episode_steps in vla_dataset:
186
+ for step in episode_steps:
187
+ if fill_up and fill_chunk_idx < chunk_end_idx:
188
+ # Fill up the buffer
189
+ chunk_dir = os.path.join(BUF_PATH, f"chunk_{fill_chunk_idx}")
190
+ if fill_chunk_item_idx == 0:
191
+ # Create a new chunk
192
+ os.makedirs(chunk_dir, exist_ok=True)
193
+ # Write the dirty bit of size BUF_CHUNK_SIZE
194
+ dirty_bit = np.zeros(BUF_CHUNK_SIZE, dtype=np.uint8)
195
+ save_dirty_bit(chunk_dir, dirty_bit)
196
+
197
+ # Save the sample
198
+ save_sample(step, chunk_dir, fill_chunk_item_idx)
199
+
200
+ # print(f"Filled up chunk {fill_chunk_item_idx+1}/{BUF_CHUNK_SIZE} {fill_chunk_idx+1}/{BUF_NUM_CHUNKS}")
201
+ local_fill_chunk_idx = fill_chunk_idx - chunk_start_idx
202
+ local_num_chunks = chunk_end_idx - chunk_start_idx
203
+ if (local_fill_chunk_idx % 10 == 0 or local_fill_chunk_idx == local_num_chunks - 1) and fill_chunk_item_idx == 0:
204
+ print(f"Worker {worker_id}: Filled up chunk {local_fill_chunk_idx+1}/{local_num_chunks}")
205
+ fill_chunk_item_idx += 1
206
+ if fill_chunk_item_idx == BUF_CHUNK_SIZE:
207
+ fill_chunk_idx += 1
208
+ fill_chunk_item_idx = 0
209
+ if fill_chunk_idx == BUF_NUM_CHUNKS:
210
+ print(f"Worker {worker_id}: Buffer filled up. Start replacing dirty samples...")
211
+
212
+ else:
213
+ # Search for the dirty chunk to replace
214
+ while len(dirty_chunk_item_idxs) == 0:
215
+ dirty_chunk_dir = os.path.join(BUF_PATH, f"chunk_{dirty_chunk_idx}")
216
+ dirty_chunk_item_idxs = get_dirty_item(dirty_chunk_dir)
217
+ # Print the dirty ratio
218
+ if time.time() - time_stmp > 2.0:
219
+ dirty_ratio = len(dirty_chunk_item_idxs) / BUF_CHUNK_SIZE
220
+ print(f"Worker {worker_id}: Dirty Ratio for Chunk {dirty_chunk_idx}: {dirty_ratio:.2f}")
221
+ time_stmp = time.time()
222
+
223
+ if len(dirty_chunk_item_idxs) > 0:
224
+ # Lock the chunk
225
+ dirty_bit = np.ones(BUF_CHUNK_SIZE, dtype=np.uint8)
226
+ save_dirty_bit(dirty_chunk_dir, dirty_bit)
227
+
228
+ # Iterate over the chunks
229
+ dirty_chunk_idx += 1
230
+ if dirty_chunk_idx == chunk_end_idx:
231
+ dirty_chunk_idx = chunk_start_idx
232
+
233
+ # Replace the dirty item
234
+ dirty_item_idx = dirty_chunk_item_idxs.pop()
235
+ chunk_dir = dirty_chunk_dir  # write into the chunk whose dirty bits were just locked
236
+ # Save the sample
237
+ save_sample(step, chunk_dir, dirty_item_idx)
238
+
239
+ # If we have replaced all dirty items in the chunk
240
+ if len(dirty_chunk_item_idxs) == 0:
241
+ # Unlock the chunk
242
+ dirty_bit = np.zeros(BUF_CHUNK_SIZE, dtype=np.uint8)
243
+ save_dirty_bit(dirty_chunk_dir, dirty_bit)
244
+ print(f"Worker {worker_id}: Replaced dirty chunk {dirty_chunk_idx}.")
245
+
246
+
247
+ if __name__ == '__main__':
248
+ # Args: n_workers, fill_up
249
+ parser = argparse.ArgumentParser()
250
+ parser.add_argument('--n_workers', type=int, default=2, help="Number of parallel workers. It should be less than or equal to the number of chunks.")
251
+ parser.add_argument('--fill_up', action='store_true', help="Whether to fill up the buffer before replacing dirty samples.")
252
+ parser.add_argument('--clean_dirty', action='store_true', help="Whether to clean the dirty bits before replacing dirty samples. This option is ignored when `fill_up` is set.")
253
+ parser.add_argument('--seed', type=int, default=None, help="Random seed. If not set, the seed will be randomly generated.")
254
+ parser.add_argument('--dataset_type', type=str,
255
+ default="pretrain",
256
+ help="Whether to load the pretrain dataset or finetune dataset.")
257
+
258
+ # Run the producer
259
+ args = parser.parse_args()
260
+ if args.seed is not None:
261
+ print(f"Base seed: {args.seed}")
262
+ random.seed(args.seed)
263
+
264
+ processes = []
265
+ process_seeds = [random.randint(0, 2**32) for _ in range(args.n_workers)]
266
+ print(f"Process seeds: {process_seeds}")
267
+ def signal_handler(sig, frame):
268
+ print("Ctrl+C received. Terminating child processes...")
269
+ for p in processes:
270
+ p.terminate()
271
+ sys.exit(0)
272
+ signal.signal(signal.SIGINT, signal_handler)
273
+ for worker_id in range(args.n_workers):
274
+ p = Process(target=run_producer, args=(
275
+ process_seeds[worker_id], args.n_workers, worker_id, args.fill_up, args.clean_dirty, args.dataset_type))
276
+ p.start()
277
+ processes.append(p)
278
+
279
+ for p in processes:
280
+ p.join()
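
For reference, a consumer-side read of one buffered item can be sketched with plain NumPy and JSON, mirroring the file layout written by `save_sample` above. This is only an illustrative sketch: the chunk path and item index are placeholders, and a real consumer should also go through `FileLock` as the producer does.

```python
import json
import os

import numpy as np

chunk_dir = "buffer/chunk_0"   # placeholder for a chunk directory under BUF_PATH (configs/base.yaml)
item_idx = 0                   # placeholder item index within the chunk

# Language/meta fields are stored as JSON next to the tensors
with open(os.path.join(chunk_dir, f"json_content_{item_idx}.json"), "r") as f:
    json_content = json.load(f)

# All tensors of the sample live in one .npz, with the keys written by save_sample
sample = np.load(os.path.join(chunk_dir, f"sample_{item_idx}.npz"))
print(sample["action_chunk"].shape, sample["state_chunk"].shape)

# Mark the item as dirty (consumed) so the producer will refill this slot
dirty_bit = np.fromfile(os.path.join(chunk_dir, "dirty_bit"), dtype=np.uint8)
dirty_bit[item_idx] = 1
dirty_bit.tofile(os.path.join(chunk_dir, "dirty_bit"))
```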
data/utils.py ADDED
@@ -0,0 +1,235 @@
1
+ import tensorflow as tf
2
+ import tensorflow_graphics.geometry.transformation.euler as tf_euler
3
+ import tensorflow_graphics.geometry.transformation.quaternion as tf_quat
4
+ import tensorflow_graphics.geometry.transformation.rotation_matrix_3d as tf_rotmat
5
+
6
+
7
+ def dataset_to_path(dataset_name: str, dir_name: str) -> str:
8
+ """
9
+ Return the path to the dataset.
10
+ """
11
+ if dataset_name == 'robo_net' or \
12
+ dataset_name == 'cmu_playing_with_food' or \
13
+ dataset_name == 'droid':
14
+ version = '1.0.0'
15
+ elif dataset_name == 'language_table' or \
16
+ dataset_name == 'fmb' or \
17
+ dataset_name == 'dobbe':
18
+ version = '0.0.1'
19
+ elif dataset_name == 'nyu_door_opening_surprising_effectiveness':
20
+ version = ''
21
+ elif dataset_name == 'cmu_play_fusion':
22
+ version=''
23
+ elif dataset_name=='berkeley_gnm_recon':
24
+ version=''
25
+ elif dataset_name=='vla_benchmark_ee':
26
+ version = '1.0.0'
27
+ else:
28
+ version = '1.0.0'
29
+ return f'{dir_name}/{dataset_name}/{version}'
30
+
31
+
32
+ def clean_task_instruction(
33
+ task_instruction: tf.Tensor, replacements: dict) -> tf.Tensor:
34
+ """
35
+ Clean up the natural language task instruction.
36
+ """
37
+ # Create a function that applies all replacements
38
+ def apply_replacements(tensor):
39
+ for old, new in replacements.items():
40
+ tensor = tf.strings.regex_replace(tensor, old, new)
41
+ return tensor
42
+ # Apply the replacements and strip leading and trailing spaces
43
+ cleaned_task_instruction = apply_replacements(task_instruction)
44
+ cleaned_task_instruction = tf.strings.strip(cleaned_task_instruction)
45
+ return cleaned_task_instruction
46
+
47
+
48
+ def quaternion_to_euler(quaternion: tf.Tensor) -> tf.Tensor:
49
+ """
50
+ Convert a quaternion (x, y, z, w) to Euler angles (roll, pitch, yaw).
51
+ The (roll, pitch, yaw) corresponds to `Rotation.as_euler("xyz")` convention.
52
+ """
53
+ # Normalize the quaternion
54
+ quaternion = tf.nn.l2_normalize(quaternion, axis=-1)
55
+ return tf_euler.from_quaternion(quaternion)
56
+
57
+
58
+ def euler_to_quaternion(euler: tf.Tensor) -> tf.Tensor:
59
+ """
60
+ Convert Euler angles (roll, pitch, yaw) to a quaternion (x, y, z, w).
61
+ The (roll, pitch, yaw) corresponds to `Rotation.as_euler("xyz")` convention.
62
+ """
63
+ quaternion = tf_quat.from_euler(euler)
64
+ return tf.nn.l2_normalize(quaternion, axis=-1)
65
+
66
+
67
+ def rotation_matrix_to_euler(matrix: tf.Tensor) -> tf.Tensor:
68
+ """
69
+ Convert a 3x3 rotation matrix to Euler angles (roll, pitch, yaw).
70
+ The (roll, pitch, yaw) corresponds to `Rotation.as_euler("xyz")` convention.
71
+ """
72
+ return tf_euler.from_rotation_matrix(matrix)
73
+
74
+
75
+ def rotation_matrix_to_quaternion(matrix: tf.Tensor) -> tf.Tensor:
76
+ """
77
+ Convert a 3x3 rotation matrix to a quaternion (x, y, z, w).
78
+ """
79
+ quaternion = tf_quat.from_rotation_matrix(matrix)
80
+ return tf.nn.l2_normalize(quaternion, axis=-1)
81
+
82
+
83
+ def euler_to_rotation_matrix(euler: tf.Tensor) -> tf.Tensor:
84
+ """
85
+ Convert Euler angles (roll, pitch, yaw) to a 3x3 rotation matrix.
86
+ The (roll, pitch, yaw) corresponds to `Rotation.as_euler("xyz")` convention.
87
+ """
88
+ return tf_rotmat.from_euler(euler)
89
+
90
+
91
+ def quaternion_to_rotation_matrix(quaternion: tf.Tensor) -> tf.Tensor:
92
+ """
93
+ Convert a quaternion (x, y, z, w) to a 3x3 rotation matrix.
94
+ """
95
+ # Normalize the quaternion
96
+ quaternion = tf.nn.l2_normalize(quaternion, axis=-1)
97
+ return tf_rotmat.from_quaternion(quaternion)
98
+
99
+
100
+ def quaternion_to_rotation_matrix_wo_static_check(quaternion: tf.Tensor) -> tf.Tensor:
101
+ """
102
+ Convert a quaternion (x, y, z, w) to a 3x3 rotation matrix.
103
+ This function is used to make tensorflow happy.
104
+ """
105
+ # Normalize the quaternion
106
+ quaternion = tf.nn.l2_normalize(quaternion, axis=-1)
107
+
108
+ x = quaternion[..., 0]
109
+ y = quaternion[..., 1]
110
+ z = quaternion[..., 2]
111
+ w = quaternion[..., 3]
112
+
113
+ tx = 2.0 * x
114
+ ty = 2.0 * y
115
+ tz = 2.0 * z
116
+ twx = tx * w
117
+ twy = ty * w
118
+ twz = tz * w
119
+ txx = tx * x
120
+ txy = ty * x
121
+ txz = tz * x
122
+ tyy = ty * y
123
+ tyz = tz * y
124
+ tzz = tz * z
125
+ matrix = tf.stack((1.0 - (tyy + tzz), txy - twz, txz + twy,
126
+ txy + twz, 1.0 - (txx + tzz), tyz - twx,
127
+ txz - twy, tyz + twx, 1.0 - (txx + tyy)),
128
+ axis=-1) # pyformat: disable
129
+ output_shape = tf.concat((tf.shape(input=quaternion)[:-1], (3, 3)), axis=-1)
130
+ return tf.reshape(matrix, shape=output_shape)
131
+
132
+
133
+ """
134
+ Below is a continuous 6D rotation representation adapted from
135
+ On the Continuity of Rotation Representations in Neural Networks
136
+ https://arxiv.org/pdf/1812.07035.pdf
137
+ https://github.com/papagina/RotationContinuity/blob/master/sanity_test/code/tools.py
138
+ """
139
+ def rotation_matrix_to_ortho6d(matrix: tf.Tensor) -> tf.Tensor:
140
+ """
141
+ The ortho6d represents the first two column vectors a1 and a2 of the
142
+ rotation matrix: [ | , |, | ]
143
+ [ a1, a2, a3]
144
+ [ | , |, | ]
145
+ Input: (A1, ..., An, 3, 3)
146
+ Output: (A1, ..., An, 6)
147
+ """
148
+ ortho6d = matrix[..., :, :2]
149
+ # Transpose the last two dimension
150
+ perm = list(range(len(ortho6d.shape)))
151
+ perm[-2], perm[-1] = perm[-1], perm[-2]
152
+ ortho6d = tf.transpose(ortho6d, perm)
153
+ # Flatten the last two dimension
154
+ ortho6d = tf.reshape(ortho6d, ortho6d.shape[:-2] + [6])
155
+ return ortho6d
156
+
157
+
158
+ def rotation_matrix_to_ortho6d_1d(matrix: tf.Tensor) -> tf.Tensor:
159
+ """
160
+ The ortho6d represents the first two column vectors a1 and a2 of the
161
+ rotation matrix: [ | , |, | ]
162
+ [ a1, a2, a3]
163
+ [ | , |, | ]
164
+ Input: (3, 3)
165
+ Output: (6,)
166
+ This function is used to make tensorflow happy.
167
+ """
168
+ ortho6d = matrix[:, :2]
169
+ # Transpose the last two dimension
170
+ ortho6d = tf.transpose(ortho6d)
171
+ # Flatten the last two dimension
172
+ ortho6d = tf.reshape(ortho6d, [6])
173
+ return ortho6d
174
+
175
+
176
+ def normalize_vector(v):
177
+ """
178
+ v: (..., N)
179
+ """
180
+ v_mag = tf.sqrt(tf.reduce_sum(tf.square(v), axis=-1, keepdims=True))
181
+ v_mag = tf.maximum(v_mag, 1e-8)
182
+ v_normalized = v / v_mag
183
+
184
+ return v_normalized
185
+
186
+
187
+ def cross_product(u, v):
188
+ """
189
+ u: (..., 3)
190
+ v: (..., 3)
191
+ u x v: (..., 3)
192
+ """
193
+ i = u[..., 1] * v[..., 2] - u[..., 2] * v[..., 1]
194
+ j = u[..., 2] * v[..., 0] - u[..., 0] * v[..., 2]
195
+ k = u[..., 0] * v[..., 1] - u[..., 1] * v[..., 0]
196
+ out = tf.stack([i, j, k], axis=-1)
197
+ return out
198
+
199
+
200
+ def ortho6d_to_rotation_matrix(ortho6d: tf.Tensor) -> tf.Tensor:
201
+ """
202
+ The ortho6d represents the first two column vectors a1 and a2 of the
203
+ rotation matrix: [ | , |, | ]
204
+ [ a1, a2, a3]
205
+ [ | , |, | ]
206
+ Input: (A1, ..., An, 6)
207
+ Output: (A1, ..., An, 3, 3)
208
+ """
209
+ x_raw = ortho6d[..., 0:3]
210
+ y_raw = ortho6d[..., 3:6]
211
+
212
+ x = normalize_vector(x_raw)
213
+ z = cross_product(x, y_raw)
214
+ z = normalize_vector(z)
215
+ y = cross_product(z, x)
216
+
217
+ # Stack x, y, z to form the matrix
218
+ matrix = tf.stack([x, y, z], axis=-1)
219
+ return matrix
220
+
221
+
222
+ def capitalize_and_period(instr: str) -> str:
223
+ """
224
+ Capitalize the first letter of a string and add a period to the end if it's not there.
225
+ """
226
+ if len(instr) > 0:
227
+ # if the first letter is not capital, make it so
228
+ if not instr[0].isupper():
229
+ # Uppercase the first character
230
+ instr = instr[0].upper() + instr[1:]
231
+ # add period to the end if it's not there
232
+ if instr[-1] != '.':
233
+ # Append a period
234
+ instr = instr + '.'
235
+ return instr
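
The rotation helpers above are designed to compose. A minimal round-trip sanity check, assuming `data/utils.py` is importable from the repository root and TensorFlow Graphics is installed, might look like this:

```python
import numpy as np
import tensorflow as tf

from data.utils import (euler_to_rotation_matrix, ortho6d_to_rotation_matrix,
                        rotation_matrix_to_ortho6d)

# A batch of two (roll, pitch, yaw) triples in radians
euler = tf.constant([[0.1, -0.3, 0.5], [1.2, 0.0, -0.7]], dtype=tf.float32)

rot = euler_to_rotation_matrix(euler)           # shape (2, 3, 3)
ortho6d = rotation_matrix_to_ortho6d(rot)       # shape (2, 6): the first two columns, flattened
rot_back = ortho6d_to_rotation_matrix(ortho6d)  # shape (2, 3, 3)

# For valid rotation matrices, the round trip recovers the input up to numerical error
print(np.max(np.abs(rot.numpy() - rot_back.numpy())))
```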
data/vla_dataset.py ADDED
@@ -0,0 +1,147 @@
1
+ import json
2
+ import random
3
+
4
+ import numpy as np
5
+ import tensorflow as tf
6
+ import tensorflow_datasets as tfds
7
+ import yaml
8
+
9
+ from data.episode_transform import process_episode, flatten_episode, \
10
+ flatten_episode_agilex, bgr_to_rgb
11
+ from data.utils import dataset_to_path
12
+ from data.preprocess_scripts import *
13
+
14
+ # Producer does not need GPU
15
+ tf.config.set_visible_devices([], 'GPU')
16
+
17
+ OPENX_EMBOD_DIR = 'data/datasets/openx_embod'
18
+
19
+ DATASET_NAMES_NOOPENX = [
20
+ "aloha_mobile",
21
+ "aloha_static",
22
+ "roboset",
23
+ "agilex",
24
+ "rh20t",
25
+ 'calvin',
26
+ "bridgev2"
27
+ ]
28
+
29
+ # Read the config
30
+ with open('configs/base.yaml', 'r') as file:
31
+ config = yaml.safe_load(file)
32
+ # Load some constants from the config
33
+ EPSD_LEN_THRESH_LOW = config['dataset']['epsd_len_thresh_low']
34
+ EPSD_LEN_THRESH_HIGH = config['dataset']['epsd_len_thresh_high']
35
+ # Read the image keys of each dataset
36
+ with open('configs/dataset_img_keys.json', 'r') as file:
37
+ IMAGE_KEYS = json.load(file)
38
+
39
+
40
+ class VLADataset:
41
+ """
42
+ This class is used to sample episodes from the embodiment datasets.
43
+ """
44
+ def __init__(self, seed, dataset_type, repeat=True):
45
+ '''
46
+ seed: the random seed
47
+ dataset_type: 'pretrain' or 'finetune', which dataset to load
48
+ repeat: whether to repeat to infinite length
49
+ '''
50
+ dataset_names_cfg = 'configs/pretrain_datasets.json' \
51
+ if dataset_type == "pretrain" else 'configs/finetune_datasets.json'
52
+ with open(dataset_names_cfg, 'r') as file:
53
+ DATASET_NAMES = json.load(file)
54
+ self.dataset_names = DATASET_NAMES
55
+ sample_weights_cfg = 'configs/pretrain_sample_weights.json' \
56
+ if dataset_type == "pretrain" else 'configs/finetune_sample_weights.json'
57
+ # Load the sample weights
58
+ with open(sample_weights_cfg, 'r') as file:
59
+ SAMPLE_WEIGHTS = json.load(file)
60
+ self.openx_dir = OPENX_EMBOD_DIR
61
+ self.epsd_len_thresh_low = EPSD_LEN_THRESH_LOW
62
+ self.epsd_len_thresh_high = EPSD_LEN_THRESH_HIGH
63
+ self.repeat = repeat
64
+
65
+ # Set the random seed
66
+ tf.random.set_seed(seed)
67
+ np.random.seed(seed)
68
+
69
+ # Weights of each dataset in the collection to sample from
70
+ sample_weights = []
71
+
72
+ self.name2dataset = {}
73
+ for dataset_name in self.dataset_names:
74
+ if dataset_name in DATASET_NAMES_NOOPENX:
75
+ dataset = globals()[dataset_name].load_dataset(seed)
76
+ else:
77
+ dataset_path = dataset_to_path(dataset_name, self.openx_dir)
78
+ dataset = tfds.builder_from_directory(builder_dir=dataset_path)
79
+ dataset = dataset.as_dataset(split='all', shuffle_files=True)
80
+
81
+ # You can add filter for other datasets
82
+ if dataset_name == 'kuka':
83
+ dataset = dataset.filter(
84
+ lambda x: x['success'])
85
+ elif dataset_name == 'bc_z':
86
+ dataset = dataset.filter(
87
+ lambda x: tf.math.greater(
88
+ next(iter(x['steps']))['observation']['episode_success'], 0.5))
89
+ elif dataset_name == 'ucsd_pick_and_place_dataset_converted_externally_to_rlds':
90
+ dataset = dataset.filter(
91
+ lambda x: x['episode_metadata']['success'])
92
+ elif dataset_name == 'utokyo_xarm_bimanual_converted_externally_to_rlds':
93
+ # Only preserve the meaningful episodes
94
+ dataset = dataset.filter(
95
+ lambda x: tf.math.equal(
96
+ next(iter(x['steps']))['language_instruction'],
97
+ tf.constant('Unfold a wrinkled towel.')))
98
+
99
+ # Note: use cache() will cause the unexpected crash
100
+ # dataset = dataset.map().cache().shuffle().repeat()
101
+ print(dataset_name)
102
+ dataset = dataset\
103
+ .map(
104
+ lambda x: process_episode(x, dataset_name,
105
+ IMAGE_KEYS[dataset_name]['image_keys'],
106
+ IMAGE_KEYS[dataset_name]['image_mask'])
107
+ )
108
+
109
+ # Change BGR to RGB if needed
110
+ if dataset_name == 'fmb':
111
+ dataset = dataset.map(bgr_to_rgb)
112
+
113
+ if self.repeat:
114
+ dataset = dataset.repeat()
115
+ self.name2dataset[dataset_name] = iter(dataset)
116
+ print(SAMPLE_WEIGHTS)
117
+ sample_weights.append(SAMPLE_WEIGHTS[dataset_name])
118
+ # Normalize the sample weights
119
+ sample_weights = np.array(sample_weights)
120
+ self.sample_weights = sample_weights / np.sum(sample_weights)
121
+
122
+ def __iter__(self):
123
+ '''
124
+ Sample episodes indefinitely, yielding one episode's flattened steps at a time.
125
+ '''
126
+ while True:
127
+ dataset_name = np.random.choice(self.dataset_names, p=self.sample_weights)
128
+ episode = next(self.name2dataset[dataset_name])
129
+ if dataset_name == "agilex":
130
+ episode_steps = flatten_episode_agilex(episode)
131
+ else:
132
+ episode_steps = flatten_episode(episode)
133
+ # Filter too short
134
+ if len(episode_steps) < self.epsd_len_thresh_low:
135
+ continue
136
+ # Randomly sample too long
137
+ if len(episode_steps) > self.epsd_len_thresh_high:
138
+ episode_steps = random.sample(episode_steps, self.epsd_len_thresh_high)
139
+
140
+ yield episode_steps
141
+
142
+
143
+ if __name__ == "__main__":
144
+ dataset = VLADataset(0, 'finetune')
145
+ for episode in dataset:
146
+ print(episode[0])
147
+ break
encode_lang.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+
3
+ import torch
4
+ import yaml
5
+
6
+ from models.multimodal_encoder.t5_encoder import T5Embedder
7
+
8
+
9
+ GPU = 0
10
+ MODEL_PATH = "google/t5-v1_1-xxl"
11
+ CONFIG_PATH = "configs/base.yaml"
12
+ SAVE_DIR = "lang_embed/"
13
+
14
+ # Modify this to your task name and instruction
15
+ TASK_NAME = "anubis_carrot_to_bag"
16
+ # INSTRUCTION = "take the towel off the kirby doll"
17
+ # INSTRUCTION = "insert the brush to the dustpan"
18
+ INSTRUCTION = "pick up the carrot and put into the bag"
19
+
20
+ # Note: if your GPU VRAM is less than 24GB,
21
+ # it is recommended to enable offloading by specifying an offload directory.
22
+ # OFFLOAD_DIR = '/home/jellyho/OFFLOAD' # Specify your offload directory here, ensuring the directory exists.
23
+
24
+ def main():
25
+ with open(CONFIG_PATH, "r") as fp:
26
+ config = yaml.safe_load(fp)
27
+
28
+ device = torch.device(f"cuda:{GPU}")
29
+ text_embedder = T5Embedder(
30
+ from_pretrained=MODEL_PATH,
31
+ model_max_length=config["dataset"]["tokenizer_max_length"],
32
+ device=device,
33
+ # use_offload_folder=OFFLOAD_DIR
34
+ )
35
+ tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
36
+
37
+ tokens = tokenizer(
38
+ INSTRUCTION, return_tensors="pt",
39
+ padding="longest",
40
+ truncation=True
41
+ )["input_ids"].to(device)
42
+
43
+ tokens = tokens.view(1, -1)
44
+ with torch.no_grad():
45
+ pred = text_encoder(tokens).last_hidden_state.detach().cpu()
46
+
47
+ save_path = os.path.join(SAVE_DIR, f"{TASK_NAME}.pt")
48
+ # We save the embeddings in a dictionary format
49
+ torch.save({
50
+ "name": TASK_NAME,
51
+ "instruction": INSTRUCTION,
52
+ "embeddings": pred
53
+ }, save_path
54
+ )
55
+
56
+ print(f'\"{INSTRUCTION}\" from \"{TASK_NAME}\" is encoded by \"{MODEL_PATH}\" into shape {pred.shape} and saved to \"{save_path}\"')
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
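
The saved file is a plain PyTorch dictionary, so downstream code (for example, whatever is pointed at it via `--lang_embeddings_path` or used together with `--precomp_lang_embed`) can presumably load it back as sketched below; the path matches the `SAVE_DIR`/`TASK_NAME` defaults above:

```python
import torch

lang = torch.load("lang_embed/anubis_carrot_to_bag.pt")
print(lang["name"])               # "anubis_carrot_to_bag"
print(lang["instruction"])        # "pick up the carrot and put into the bag"
print(lang["embeddings"].shape)   # (1, num_tokens, hidden_dim) from the T5-XXL encoder
```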
finetune.sh ADDED
@@ -0,0 +1,57 @@
1
+ export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
2
+ export NCCL_IB_DISABLE=0
3
+ export NCCL_SOCKET_IFNAME=eth1
4
+ export NCCL_DEBUG=INFO
5
+ export NCCL_NVLS_ENABLE=0
6
+ export MASTER_PORT=$2
7
+ export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
8
+ export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
9
+ export OUTPUT_DIR="./checkpoints/$1"
10
+ export CFLAGS="-I/usr/include"
11
+ export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
12
+ export CUTLASS_PATH="/home/jellyho/cutlass"
13
+
14
+ export WANDB_PROJECT="robotics_diffusion_transformer"
15
+
16
+ if [ ! -d "$OUTPUT_DIR" ]; then
17
+ mkdir "$OUTPUT_DIR"
18
+ echo "Folder '$OUTPUT_DIR' created"
19
+ else
20
+ echo "Folder '$OUTPUT_DIR' already exists"
21
+ fi
22
+
23
+ # For run in a single node/machine
24
+ # accelerate launch main.py \
25
+ # --deepspeed="./configs/zero2.json" \
26
+ # ...
27
+ # --hostfile=hostfile.txt
28
+
29
+ accelerate launch --main_process_port $2 --num_processes 2 --num_machines 1 --mixed_precision bf16 main.py \
30
+ --deepspeed="./configs/zero2.json" \
31
+ --pretrained_model_name_or_path="robotics-diffusion-transformer/rdt-1b" \
32
+ --pretrained_text_encoder_name_or_path=$TEXT_ENCODER_NAME \
33
+ --pretrained_vision_encoder_name_or_path=$VISION_ENCODER_NAME \
34
+ --output_dir=$OUTPUT_DIR \
35
+ --train_batch_size=8 \
36
+ --sample_batch_size=8 \
37
+ --max_train_steps=50000 \
38
+ --checkpointing_period=5000 \
39
+ --sample_period=1000 \
40
+ --checkpoints_total_limit=10 \
41
+ --lr_scheduler="constant" \
42
+ --learning_rate=1e-4 \
43
+ --mixed_precision="bf16" \
44
+ --dataloader_num_workers=16 \
45
+ --image_aug \
46
+ --dataset_type="finetune" \
47
+ --gradient_accumulation_steps 1 \
48
+ --report_to=wandb \
49
+ --load_from_hdf5 \
50
+ --dataset_name $1 \
51
+ --precomp_lang_embed
52
+ # --resume_from_checkpoint="checkpoint-50000"
53
+
54
+ # Use this to resume training from some previous checkpoint
55
+ # --resume_from_checkpoint="checkpoint-36000" \
56
+ # Use this to load precomputed language instruction embeddings
57
+ # instead of computing them during training
finetune_maniskill.sh ADDED
@@ -0,0 +1,48 @@
1
+ export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
2
+ export NCCL_IB_DISABLE=0
3
+ export NCCL_SOCKET_IFNAME=bond0
4
+ export NCCL_DEBUG=INFO
5
+ export NCCL_NVLS_ENABLE=0
6
+
7
+ export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
8
+ export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
9
+ export OUTPUT_DIR="./checkpoints/rdt-finetune-1b-sim"
10
+ export CFLAGS="-I/usr/include"
11
+ export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
12
+ export CUTLASS_PATH="/data/lingxuan/cutlass"
13
+
14
+ export WANDB_PROJECT="robotic_diffusion_transformer"
15
+
16
+ if [ ! -d "$OUTPUT_DIR" ]; then
17
+ mkdir "$OUTPUT_DIR"
18
+ echo "Folder '$OUTPUT_DIR' created"
19
+ else
20
+ echo "Folder '$OUTPUT_DIR' already exists"
21
+ fi
22
+ # For run in a single node/machine
23
+ # accelerate launch main.py \
24
+ # --deepspeed="./configs/zero2.json" \
25
+ # ...
26
+
27
+ accelerate launch main.py \
28
+ --deepspeed="./configs/zero2.json" \
29
+ --pretrained_model_name_or_path="robotics-diffusion-transformer/rdt-1b" \
30
+ --pretrained_text_encoder_name_or_path=$TEXT_ENCODER_NAME \
31
+ --pretrained_vision_encoder_name_or_path=$VISION_ENCODER_NAME \
32
+ --output_dir=$OUTPUT_DIR \
33
+ --train_batch_size=24 \
34
+ --sample_batch_size=32 \
35
+ --max_train_steps=400000 \
36
+ --checkpointing_period=10000 \
37
+ --sample_period=500 \
38
+ --checkpoints_total_limit=40 \
39
+ --lr_scheduler="constant" \
40
+ --learning_rate=1e-4 \
41
+ --mixed_precision="bf16" \
42
+ --dataloader_num_workers=8 \
43
+ --image_aug \
44
+ --dataset_type="finetune" \
45
+ --state_noise_snr=40 \
46
+ --load_from_hdf5 \
47
+ --report_to=wandb
48
+
inference.sh ADDED
@@ -0,0 +1,9 @@
1
+ # Set --pretrained_model_name_or_path to your finetuned checkpoint, e.g.,
2
+ # checkpoints/rdt-finetune-1b/checkpoint-<STEP NUMBER> or
3
+ # checkpoints/rdt-finetune-1b/checkpoint-<STEP NUMBER>/pytorch_model/mp_rank_00_model_states.pt,
4
+ # and set --ctrl_freq to your control frequency.
5
+ python -m scripts.agilex_inference \
6
+ --use_actions_interpolation \
7
+ --pretrained_model_name_or_path="checkpoints/your_finetuned_ckpt.pt" \
8
+ --lang_embeddings_path="outs/lang_embeddings/your_instr.pt" \
9
+ --ctrl_freq=25
main.py ADDED
@@ -0,0 +1,301 @@
1
+ import argparse
2
+ import os
3
+ from train.train import train
4
+
5
+ from accelerate.logging import get_logger
6
+
7
+
8
+ def parse_args(input_args=None):
9
+ parser = argparse.ArgumentParser(description="Main script for training RDT.")
10
+ parser.add_argument(
11
+ "--config_path",
12
+ type=str,
13
+ default="configs/base.yaml",
14
+ help="Path to the configuration file. Default is `configs/base.yaml`.",
15
+ )
16
+ parser.add_argument(
17
+ "--deepspeed",
18
+ type=str,
19
+ default=None,
20
+ help="Enable DeepSpeed and pass the path to its config file or an already initialized DeepSpeed config dictionary",
21
+ )
22
+ parser.add_argument(
23
+ "--pretrained_text_encoder_name_or_path",
24
+ type=str,
25
+ default=None,
26
+ help="Pretrained text encoder name or path if not the same as model_name",
27
+ )
28
+ parser.add_argument(
29
+ "--pretrained_vision_encoder_name_or_path",
30
+ type=str,
31
+ default=None,
32
+ help="Pretrained vision encoder name or path if not the same as model_name",
33
+ )
34
+
35
+ parser.add_argument(
36
+ "--output_dir",
37
+ type=str,
38
+ default="checkpoints",
39
+ help="The output directory where the model predictions and checkpoints will be written.",
40
+ )
41
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
42
+
43
+ parser.add_argument(
44
+ "--load_from_hdf5",
45
+ action="store_true",
46
+ default=False,
47
+ help=(
48
+ "Whether to load the dataset directly from HDF5 files. "
49
+ "If False, the dataset will be loaded using producer-consumer pattern, "
50
+ "where the producer reads TFRecords and saves them to buffer, and the consumer reads from buffer."
51
+ )
52
+ )
53
+ parser.add_argument(
54
+ "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
55
+ )
56
+ parser.add_argument(
57
+ "--sample_batch_size", type=int, default=8, help="Batch size (per device) for the sampling dataloader."
58
+ )
59
+ parser.add_argument(
60
+ "--num_sample_batches", type=int, default=2, help="Number of batches to sample from the dataset."
61
+ )
62
+ parser.add_argument("--num_train_epochs", type=int, default=1)
63
+ parser.add_argument(
64
+ "--max_train_steps",
65
+ type=int,
66
+ default=None,
67
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
68
+ )
69
+ parser.add_argument(
70
+ "--checkpointing_period",
71
+ type=int,
72
+ default=500,
73
+ help=(
74
+ "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
75
+ "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
76
+ "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
77
+ "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
78
+ "instructions."
79
+ ),
80
+ )
81
+ parser.add_argument(
82
+ "--checkpoints_total_limit",
83
+ type=int,
84
+ default=None,
85
+ help=(
86
+ "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
87
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
88
+ " for more details"
89
+ ),
90
+ )
91
+ parser.add_argument(
92
+ "--resume_from_checkpoint",
93
+ type=str,
94
+ default=None,
95
+ help=(
96
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
97
+ ' `--checkpointing_period`, or `"latest"` to automatically select the last available checkpoint.'
98
+ ),
99
+ )
100
+ parser.add_argument(
101
+ "--pretrained_model_name_or_path",
102
+ type=str,
103
+ default=None,
104
+ help=(
105
+ "Path or name of a pretrained checkpoint to load the model from.\n"
106
+ " This can be either:\n"
107
+ " - a string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co, e.g., `robotics-diffusion-transformer/rdt-1b`,\n"
108
+ " - a path to a *directory* containing model weights saved using [`~RDTRunner.save_pretrained`] method, e.g., `./my_model_directory/`.\n"
109
+ " - a path to a model checkpoint (*.pt), e.g., `my_model_directory/checkpoint-10000/pytorch_model/mp_rank_00_model_states.pt`"
110
+ " - `None` if you are randomly initializing model using configuration at `config_path`."
111
+ )
112
+ )
113
+ parser.add_argument(
114
+ "--gradient_accumulation_steps",
115
+ type=int,
116
+ default=1,
117
+ help="Number of update steps to accumulate before performing a backward/update pass.",
118
+ )
119
+ parser.add_argument(
120
+ "--gradient_checkpointing",
121
+ action="store_true",
122
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
123
+ )
124
+ parser.add_argument(
125
+ "--learning_rate",
126
+ type=float,
127
+ default=5e-6,
128
+ help="Initial learning rate (after the potential warmup period) to use.",
129
+ )
130
+ parser.add_argument(
131
+ "--cond_mask_prob",
132
+ type=float,
133
+ default=0.1,
134
+ help=(
135
+ "The probability to randomly mask the conditions (except states) during training. "
136
+ "If set to 0, the conditions are not masked."
137
+ ),
138
+ )
139
+ parser.add_argument(
140
+ "--cam_ext_mask_prob",
141
+ type=float,
142
+ default=-1.0,
143
+ help=(
144
+ "The probability to randomly mask the external camera image during training. "
145
+ "If set to < 0, the external camera image is masked with the probability of `cond_mask_prob`."
146
+ ),
147
+ )
148
+ parser.add_argument(
149
+ "--state_noise_snr",
150
+ type=float,
151
+ default=None,
152
+ help=(
153
+ "The signal-to-noise ratio (SNR, unit: dB) for adding noise to the states. "
154
+ "Default is None, which means no noise is added."
155
+ ),
156
+ )
157
+ parser.add_argument(
158
+ "--image_aug",
159
+ action="store_true",
160
+ default=False,
161
+ help="Whether or not to apply image augmentation (ColorJitter, blur, noise, etc) to the input images.",
162
+ )
163
+ parser.add_argument(
164
+ "--precomp_lang_embed",
165
+ action="store_true",
166
+ default=False,
167
+ help="Whether or not to use precomputed language embeddings.",
168
+ )
169
+ parser.add_argument(
170
+ "--scale_lr",
171
+ action="store_true",
172
+ default=False,
173
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
174
+ )
175
+ parser.add_argument(
176
+ "--lr_scheduler",
177
+ type=str,
178
+ default="constant",
179
+ help=(
180
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
181
+ ' "constant", "constant_with_warmup"]'
182
+ ),
183
+ )
184
+ parser.add_argument(
185
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
186
+ )
187
+ parser.add_argument(
188
+ "--lr_num_cycles",
189
+ type=int,
190
+ default=1,
191
+ help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
192
+ )
193
+ parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
194
+ parser.add_argument(
195
+ "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
196
+ )
197
+ parser.add_argument(
198
+ "--dataloader_num_workers",
199
+ type=int,
200
+ default=0,
201
+ help=(
202
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
203
+ ),
204
+ )
205
+ parser.add_argument("--alpha", type=float, default=0.9, help="The moving average coefficient for each dataset's loss.")
206
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
207
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
208
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
209
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
210
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
211
+ parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
212
+ parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
213
+ parser.add_argument(
214
+ "--hub_model_id",
215
+ type=str,
216
+ default=None,
217
+ help="The name of the repository to keep in sync with the local `output_dir`.",
218
+ )
219
+ parser.add_argument(
220
+ "--logging_dir",
221
+ type=str,
222
+ default="logs",
223
+ help=(
224
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
225
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
226
+ ),
227
+ )
228
+ parser.add_argument(
229
+ "--allow_tf32",
230
+ action="store_true",
231
+ help=(
232
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
233
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
234
+ ),
235
+ )
236
+ parser.add_argument(
237
+ "--report_to",
238
+ type=str,
239
+ default="tensorboard",
240
+ help=(
241
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
242
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
243
+ ),
244
+ )
245
+ parser.add_argument(
246
+ "--sample_period",
247
+ type=int,
248
+ default=-1,
249
+ help=(
250
+ "Run sampling every X steps. During the sampling phase, the model will sample a trajectory"
251
+ " and report the error between the sampled trajectory and ground-truth trajectory"
252
+ " in the training batch."
253
+ ),
254
+ )
255
+ parser.add_argument(
256
+ "--mixed_precision",
257
+ type=str,
258
+ default=None,
259
+ choices=["no", "fp16", "bf16"],
260
+ help=(
261
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
262
+ " 1.10 and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
263
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
264
+ ),
265
+ )
266
+
267
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
268
+ parser.add_argument(
269
+ "--set_grads_to_none",
270
+ action="store_true",
271
+ help=(
272
+ "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
273
+ " behaviors, so disable this argument if it causes any problems. More info:"
274
+ " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
275
+ ),
276
+ )
277
+
278
+ parser.add_argument('--dataset_type',
279
+ type=str,
280
+ default="pretrain",
281
+ required=False,
282
+ help="Whether to load the pretrain dataset or finetune dataset."
283
+ )
284
+ parser.add_argument('--dataset_name', type=str)
285
+
286
+ if input_args is not None:
287
+ args = parser.parse_args(input_args)
288
+ else:
289
+ args = parser.parse_args()
290
+
291
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
292
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
293
+ args.local_rank = env_local_rank
294
+
295
+ return args
296
+
297
+
298
+ if __name__ == "__main__":
299
+ logger = get_logger(__name__)
300
+ args = parse_args()
301
+ train(args, logger)
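
Since `parse_args` accepts an explicit argument list, the entry point can also be driven programmatically. A sketch, assuming the repository's training dependencies are installed; the flag values are placeholders echoing finetune.sh:

```python
from accelerate.logging import get_logger

from main import parse_args
from train.train import train

args = parse_args([
    "--config_path", "configs/base.yaml",
    "--pretrained_model_name_or_path", "robotics-diffusion-transformer/rdt-1b",
    "--output_dir", "checkpoints/my_run",   # placeholder output directory
    "--train_batch_size", "8",
    "--max_train_steps", "50000",
    "--dataset_type", "finetune",
    "--load_from_hdf5",
    "--precomp_lang_embed",
])
train(args, get_logger(__name__))
```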
models/ema_model.py ADDED
@@ -0,0 +1,89 @@
1
+ # Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
2
+
3
+ import torch
4
+ from torch.nn.modules.batchnorm import _BatchNorm
5
+
6
+
7
+ class EMAModel:
8
+ """
9
+ Exponential Moving Average of model weights.
10
+ """
11
+ def __init__(
12
+ self,
13
+ model,
14
+ update_after_step=0,
15
+ inv_gamma=1.0,
16
+ power=2 / 3,
17
+ min_value=0.0,
18
+ max_value=0.9999
19
+ ):
20
+ """
21
+ @crowsonkb's notes on EMA Warmup:
22
+ If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
23
+ to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
24
+ gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
25
+ at 215.4k steps).
26
+ Args:
27
+ inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
28
+ power (float): Exponential factor of EMA warmup. Default: 2/3.
29
+ min_value (float): The minimum EMA decay rate. Default: 0.
30
+ """
31
+
32
+ self.averaged_model = model
33
+ self.averaged_model.eval()
34
+ self.averaged_model.requires_grad_(False)
35
+
36
+ self.update_after_step = update_after_step
37
+ self.inv_gamma = inv_gamma
38
+ self.power = power
39
+ self.min_value = min_value
40
+ self.max_value = max_value
41
+
42
+ self.decay = 0.0
43
+ self.optimization_step = 0
44
+
45
+ def get_decay(self, optimization_step):
46
+ """
47
+ Compute the decay factor for the exponential moving average.
48
+ """
49
+ step = max(0, optimization_step - self.update_after_step - 1)
50
+ value = 1 - (1 + step / self.inv_gamma) ** -self.power
51
+
52
+ if step <= 0:
53
+ return 0.0
54
+
55
+ return max(self.min_value, min(value, self.max_value))
56
+
57
+ @torch.no_grad()
58
+ def step(self, new_model):
59
+ self.decay = self.get_decay(self.optimization_step)
60
+
61
+ # old_all_dataptrs = set()
62
+ # for param in new_model.parameters():
63
+ # data_ptr = param.data_ptr()
64
+ # if data_ptr != 0:
65
+ # old_all_dataptrs.add(data_ptr)
66
+
67
+ all_dataptrs = set()
68
+ for module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
69
+ for param, ema_param in zip(module.parameters(recurse=False), ema_module.parameters(recurse=False)):
70
+ # Iterate over immediate parameters only.
71
+ if isinstance(param, dict):
72
+ raise RuntimeError('Dict parameter not supported')
73
+
74
+ # data_ptr = param.data_ptr()
75
+ # if data_ptr != 0:
76
+ # all_dataptrs.add(data_ptr)
77
+
78
+ if isinstance(module, _BatchNorm):
79
+ # skip batchnorms
80
+ ema_param.copy_(param.to(dtype=ema_param.dtype).data)
81
+ elif not param.requires_grad:
82
+ ema_param.copy_(param.to(dtype=ema_param.dtype).data)
83
+ else:
84
+ ema_param.mul_(self.decay)
85
+ ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
86
+
87
+ # verify that iterating over module and then parameters is identical to parameters recursively.
88
+ # assert old_all_dataptrs == all_dataptrs
89
+ self.optimization_step += 1
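
For context, `EMAModel.step` is meant to be called once per optimizer step. A minimal usage sketch with a toy network, passing a separate copy of the model to hold the averaged weights:

```python
import copy

import torch

from models.ema_model import EMAModel

model = torch.nn.Linear(16, 16)                    # toy network for illustration
ema = EMAModel(copy.deepcopy(model), power=2 / 3)  # the EMA keeps its own copy of the weights

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
for _ in range(100):
    x = torch.randn(8, 16)
    loss = model(x).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema.step(model)   # update the running average after every optimizer step

# ema.averaged_model now holds the smoothed weights for evaluation or checkpointing
```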