Upload policy weights, train config and readme

Browse files

Files changed (4)

README.md +4 -4
config.json +44 -27
model.safetensors +2 -2
train_config.json +59 -38

README.md CHANGED Viewed

@@ -2,20 +2,20 @@
 datasets: Beable/pushtModel-Collected-fixed
 library_name: lerobot
 license: apache-2.0
-model_name: act
 pipeline_tag: robotics
 tags:
 - lerobot
-- act
 - robotics
 ---
-# Model Card for act
 <!-- Provide a quick summary of what the model is/does. -->
-[Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).

 datasets: Beable/pushtModel-Collected-fixed
 library_name: lerobot
 license: apache-2.0
+model_name: diffusion
 pipeline_tag: robotics
 tags:
 - lerobot
+- diffusion
 - robotics
 ---
+# Model Card for diffusion
 <!-- Provide a quick summary of what the model is/does. -->
+[Diffusion Policy](https://huggingface.co/papers/2303.04137) treats visuomotor control as a generative diffusion process, producing smooth, multi-step action trajectories that excel at contact-rich manipulation.
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).

config.json CHANGED Viewed

@@ -1,10 +1,10 @@
 {
-    "type": "act",
-    "n_obs_steps": 1,
     "normalization_mapping": {
         "VISUAL": "MEAN_STD",
-        "STATE": "MEAN_STD",
-        "ACTION": "MEAN_STD"
     },
     "input_features": {
         "observation.image": {
@@ -37,28 +37,45 @@
     "private": null,
     "tags": null,
     "license": null,
-    "chunk_size": 100,
-    "n_action_steps": 100,
     "vision_backbone": "resnet18",
-    "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-    "replace_final_stride_with_dilation": false,
-    "pre_norm": false,
-    "dim_model": 512,
-    "n_heads": 8,
-    "dim_feedforward": 3200,
-    "feedforward_activation": "relu",
-    "n_encoder_layers": 4,
-    "n_decoder_layers": 1,
-    "use_vae": true,
-    "latent_dim": 32,
-    "n_vae_encoder_layers": 4,
-    "temporal_ensemble_coeff": null,
-    "dropout": 0.1,
-    "kl_weight": 100.0,
-    "predict_action_std": false,
-    "action_std_min": 1e-05,
-    "stochastic_inference": false,
-    "optimizer_lr": 5e-05,
-    "optimizer_weight_decay": 0.0001,
-    "optimizer_lr_backbone": 1e-05
 }

 {
+    "type": "diffusion",
+    "n_obs_steps": 2,
     "normalization_mapping": {
         "VISUAL": "MEAN_STD",
+        "STATE": "MIN_MAX",
+        "ACTION": "MIN_MAX"
     },
     "input_features": {
         "observation.image": {
     "private": null,
     "tags": null,
     "license": null,
+    "horizon": 80,
+    "n_action_steps": 60,
+    "drop_n_last_frames": 7,
     "vision_backbone": "resnet18",
+    "crop_shape": [
+        84,
+        84
+    ],
+    "crop_is_random": true,
+    "pretrained_backbone_weights": null,
+    "use_group_norm": true,
+    "spatial_softmax_num_keypoints": 32,
+    "use_separate_rgb_encoder_per_camera": false,
+    "down_dims": [
+        512,
+        1024,
+        2048
+    ],
+    "kernel_size": 5,
+    "n_groups": 8,
+    "diffusion_step_embed_dim": 128,
+    "use_film_scale_modulation": true,
+    "noise_scheduler_type": "DDPM",
+    "num_train_timesteps": 100,
+    "beta_schedule": "squaredcos_cap_v2",
+    "beta_start": 0.0001,
+    "beta_end": 0.02,
+    "prediction_type": "epsilon",
+    "clip_sample": true,
+    "clip_sample_range": 1.0,
+    "num_inference_steps": null,
+    "do_mask_loss_for_padding": false,
+    "optimizer_lr": 0.0001,
+    "optimizer_betas": [
+        0.95,
+        0.999
+    ],
+    "optimizer_eps": 1e-08,
+    "optimizer_weight_decay": 1e-06,
+    "scheduler_name": "cosine",
+    "scheduler_warmup_steps": 500
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1f427558f85e31ed7a864d66f6e99788faccf1f7af0a961c2abef1ac0c85e482
-size 206667888

 version https://git-lfs.github.com/spec/v1
+oid sha256:0c0cc527bfad2c591481140196de98bd16ecdff2809ed73aec885052e07715ab
+size 1050862408

train_config.json CHANGED Viewed

@@ -62,18 +62,18 @@
         },
         "revision": "v2.2",
         "use_imagenet_stats": true,
-        "video_backend": "pyav",
         "tolerance_s": 0.1,
         "timestamps_check": "warn"
     },
     "env": null,
     "policy": {
-        "type": "act",
-        "n_obs_steps": 1,
         "normalization_mapping": {
             "VISUAL": "MEAN_STD",
-            "STATE": "MEAN_STD",
-            "ACTION": "MEAN_STD"
         },
         "input_features": {
             "observation.image": {
@@ -106,55 +106,76 @@
         "private": null,
         "tags": null,
         "license": null,
-        "chunk_size": 100,
-        "n_action_steps": 100,
         "vision_backbone": "resnet18",
-        "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-        "replace_final_stride_with_dilation": false,
-        "pre_norm": false,
-        "dim_model": 512,
-        "n_heads": 8,
-        "dim_feedforward": 3200,
-        "feedforward_activation": "relu",
-        "n_encoder_layers": 4,
-        "n_decoder_layers": 1,
-        "use_vae": true,
-        "latent_dim": 32,
-        "n_vae_encoder_layers": 4,
-        "temporal_ensemble_coeff": null,
-        "dropout": 0.1,
-        "kl_weight": 100.0,
-        "predict_action_std": false,
-        "action_std_min": 1e-05,
-        "stochastic_inference": false,
-        "optimizer_lr": 5e-05,
-        "optimizer_weight_decay": 0.0001,
-        "optimizer_lr_backbone": 1e-05
     },
-    "output_dir": "outputs/train/2025-11-06/02-21-07_act",
-    "job_name": "act",
     "resume": false,
     "seed": 1000,
     "num_workers": 4,
-    "batch_size": 72,
-    "steps": 100000,
     "eval_freq": 20000,
     "log_freq": 200,
     "save_checkpoint": true,
-    "save_freq": 30000,
     "use_policy_training_preset": true,
     "optimizer": {
-        "type": "adamw",
-        "lr": 5e-05,
-        "weight_decay": 0.0001,
         "grad_clip_norm": 10.0,
         "betas": [
-            0.9,
             0.999
         ],
         "eps": 1e-08
     },
-    "scheduler": null,
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,

         },
         "revision": "v2.2",
         "use_imagenet_stats": true,
+        "video_backend": "torchcodec",
         "tolerance_s": 0.1,
         "timestamps_check": "warn"
     },
     "env": null,
     "policy": {
+        "type": "diffusion",
+        "n_obs_steps": 2,
         "normalization_mapping": {
             "VISUAL": "MEAN_STD",
+            "STATE": "MIN_MAX",
+            "ACTION": "MIN_MAX"
         },
         "input_features": {
             "observation.image": {
         "private": null,
         "tags": null,
         "license": null,
+        "horizon": 80,
+        "n_action_steps": 60,
+        "drop_n_last_frames": 7,
         "vision_backbone": "resnet18",
+        "crop_shape": [
+            84,
+            84
+        ],
+        "crop_is_random": true,
+        "pretrained_backbone_weights": null,
+        "use_group_norm": true,
+        "spatial_softmax_num_keypoints": 32,
+        "use_separate_rgb_encoder_per_camera": false,
+        "down_dims": [
+            512,
+            1024,
+            2048
+        ],
+        "kernel_size": 5,
+        "n_groups": 8,
+        "diffusion_step_embed_dim": 128,
+        "use_film_scale_modulation": true,
+        "noise_scheduler_type": "DDPM",
+        "num_train_timesteps": 100,
+        "beta_schedule": "squaredcos_cap_v2",
+        "beta_start": 0.0001,
+        "beta_end": 0.02,
+        "prediction_type": "epsilon",
+        "clip_sample": true,
+        "clip_sample_range": 1.0,
+        "num_inference_steps": null,
+        "do_mask_loss_for_padding": false,
+        "optimizer_lr": 0.0001,
+        "optimizer_betas": [
+            0.95,
+            0.999
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 1e-06,
+        "scheduler_name": "cosine",
+        "scheduler_warmup_steps": 500
     },
+    "output_dir": "outputs/train/2025-11-08/13-36-33_diffusion",
+    "job_name": "diffusion",
     "resume": false,
     "seed": 1000,
     "num_workers": 4,
+    "batch_size": 68,
+    "steps": 20000,
     "eval_freq": 20000,
     "log_freq": 200,
     "save_checkpoint": true,
+    "save_freq": 10000,
     "use_policy_training_preset": true,
     "optimizer": {
+        "type": "adam",
+        "lr": 0.0001,
+        "weight_decay": 1e-06,
         "grad_clip_norm": 10.0,
         "betas": [
+            0.95,
             0.999
         ],
         "eps": 1e-08
     },
+    "scheduler": {
+        "type": "diffuser",
+        "num_warmup_steps": 500,
+        "name": "cosine"
+    },
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,