Script to copy a subset of columns from an existing dataset and publish it as a new dataset repo on the Hugging Face Hub:

```python
from datasets import load_dataset
from huggingface_hub import HfApi
import os

# Configuration
source_dataset_name = "nvidia/Nemotron-Personas"  # example source dataset
columns_to_keep = ["persona", "sex", "age"]       # example columns to extract
new_dataset_name = "K00B404/personas"             # change this to your dataset repo name on HF
hf_token = os.getenv("HF_TOKEN")                  # set via the HF_TOKEN environment variable

# Step 1: Load the original dataset
dataset = load_dataset(source_dataset_name, split="train")  # or "test" / "validation"

# Step 2: Keep only the specified columns
new_data = dataset.remove_columns(
    [col for col in dataset.column_names if col not in columns_to_keep]
)

# Step 3: Save a local copy before pushing
new_data.save_to_disk("local_new_dataset")

# Step 4: Push the dataset to the Hugging Face Hub
api = HfApi()

# Create the dataset repo if it doesn't exist yet
try:
    api.create_repo(new_dataset_name, repo_type="dataset", private=False, token=hf_token)
    print(f"Created new dataset repo: {new_dataset_name}")
except Exception as e:
    print(f"Repo may already exist or failed to create: {e}")

# Push the dataset to the Hub
new_data.push_to_hub(new_dataset_name, token=hf_token)
print(f"Dataset pushed to https://huggingface.co/datasets/{new_dataset_name}")
```
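Side note: recent versions of the datasets library also provide `select_columns`, which expresses Step 2 as a single positive selection instead of computing the columns to drop; check that your installed version supports it before relying on it:

```python
# Equivalent to Step 2, on datasets versions that provide select_columns
new_data = dataset.select_columns(columns_to_keep)
```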