# Conversion of the Latest Dataframe to Parquet

We need to store our dataset in a data warehouse, so we convert the latest DataFrame to Parquet, a typed, columnar file format.

In [11]:
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import yaml

# Read the tab-separated, UTF-8 encoded dialogues file into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='dialogues.csv',
    delimiter='\t',
    encoding='utf-8',
)

In [12]:
# Convert Pandas DataFrame to Arrow Table
table = pa.Table.from_pandas(df)
# Specify the output file path for the Parquet file
parquet_file_path = './data/parquet/dialogues.parquet'

# Make sure the target directory exists: on a fresh checkout ./data/parquet/
# is missing and the write below would fail instead of creating it.
os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)

# Write the Arrow Table to a Parquet file
pq.write_table(table, parquet_file_path)

print(f'DataFrame saved to Parquet file: {parquet_file_path}')

DataFrame saved to Parquet file: ./data/parquet/dialogues.parquet


In [13]:
# Load the Parquet file written above back into an Arrow Table ...
table = pq.read_table(source=parquet_file_path)

# ... and turn that table into a Pandas DataFrame again
df = table.to_pandas()


In [15]:
# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,Description,Patient,Doctor
0,Q. What does abutment of the nerve root mean?,"Hi doctor,I am just wondering what is abutting...",Hi. I have gone through your query with dilige...
1,Q. What should I do to reduce my weight gained...,"Hi doctor, I am a 22-year-old female who was d...",Hi. You have really done well with the hypothy...
2,Q. I have started to get lots of acne on my fa...,Hi doctor! I used to have clear skin but since...,Hi there Acne has multifactorial etiology. Onl...
3,Q. Why do I have uncomfortable feeling between...,"Hello doctor,I am having an uncomfortable feel...",Hello. The popping and discomfort what you fel...
4,Q. My symptoms after intercourse threatns me e...,"Hello doctor,Before two years had sex with a c...",Hello. The HIV test uses a finger prick blood ...


In [20]:
import os
import pyarrow.parquet as pq

def generate_hf_metadata(parquet_file_path, dataset_name, split_name='train', split_path_pattern='data/train-*'):
    """Write a YAML metadata file describing a Parquet dataset.

    The file is saved in the current working directory as
    ``{dataset_name}_metadata.yaml`` and contains two top-level keys:
    ``configs`` (a single ``default`` config pointing ``split_name`` at
    ``split_path_pattern``) and ``dataset_info`` (column names/types, the
    split's size and example count).

    Args:
        parquet_file_path: Path of the Parquet file to describe.
        dataset_name: Prefix used for the output file name.
        split_name: Name of the single split recorded in the metadata.
        split_path_pattern: Path/glob recorded as the split's data files.

    Returns:
        None. The metadata is written to disk and the output path is printed.

    Errors raised while reading the Parquet file or writing the YAML file are
    not caught and propagate to the caller.
    """
    # The Arrow table already carries the row count and column types, so no
    # pandas copy of the data is needed.
    table = pq.read_table(parquet_file_path)

    # On-disk size of the Parquet file; reused below for every size field.
    num_bytes = os.path.getsize(parquet_file_path)
    num_examples = table.num_rows

    # Report Arrow type names (e.g. 'string', 'int64') rather than pandas dtype
    # names, which would label every text column as 'object'.
    # NOTE(review): the schema lists the columns physically stored in the file,
    # which may include a pandas index column if one was written - confirm.
    schema = table.schema
    features = [
        {'name': name, 'dtype': str(arrow_type)}
        for name, arrow_type in zip(schema.names, schema.types)
    ]

    # Create metadata dictionary without the 'metadata' key
    metadata = {
        'configs': [
            {
                'config_name': 'default',
                'data_files': [
                    {
                        'split': split_name,
                        'path': split_path_pattern
                    }
                ]
            }
        ],
        'dataset_info': {
            'features': features,
            'splits': [
                {
                    'name': split_name,
                    'num_bytes': num_bytes,
                    'num_examples': num_examples
                }
            ],
            'download_size': num_bytes,
            'dataset_size': num_bytes
        }
    }

    # Serialise as real YAML (not the Python repr of the dict) so the file is
    # valid for any YAML reader; sort_keys=False keeps the order defined above.
    metadata_file_path = f'{dataset_name}_metadata.yaml'
    with open(metadata_file_path, 'w', encoding='utf-8') as metadata_file:
        yaml.safe_dump(metadata, metadata_file, sort_keys=False)

    print(f'Metadata file saved at: {metadata_file_path}')



In [21]:
# Example usage: describe the dialogues Parquet file
parquet_file_path, dataset_name = './data/parquet/dialogues.parquet', 'dialogues'
generate_hf_metadata(parquet_file_path=parquet_file_path, dataset_name=dataset_name)

Metadata file saved at: dialogues_metadata.yaml


In [22]:
import yaml

def generate_markdown_from_metadata(yaml_file_path, dataset_name, dataset_card_path):
    """Build a Markdown dataset card from a YAML metadata file.

    The card consists of the metadata re-dumped as YAML between ``---``
    delimiters, a ``# Dataset Card for "<dataset_name>"`` heading and a
    "More Information needed" link.

    Args:
        yaml_file_path: Path of the YAML metadata file to read.
        dataset_name: Name inserted into the card heading.
        dataset_card_path: Path the Markdown card is written to.

    Returns:
        None. The card is written to disk and its path is printed.
    """
    # Parse the metadata file.
    # NOTE(review): yaml.safe_load would likely suffice if the file only holds
    # plain mappings, lists and scalars - confirm before switching.
    with open(yaml_file_path, 'r') as source:
        card_metadata = yaml.load(source, Loader=yaml.FullLoader)

    # Assemble the card text: YAML front matter first, then the stub body.
    front_matter = yaml.dump(card_metadata)
    card_text = f"---\n{front_matter}\n---\n# Dataset Card for \"{dataset_name}\"\n\n[More Information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)"

    # Persist the card.
    with open(dataset_card_path, 'w') as card_file:
        card_file.write(card_text)

    print(f'Markdown file saved at: {dataset_card_path}')

# Example usage: turn the dialogues metadata file into a dataset card
dataset_name = 'dialogues'
yaml_file_path = 'dialogues_metadata.yaml'
dataset_card_path = 'dialogues_dataset_card.md'
generate_markdown_from_metadata(
    yaml_file_path=yaml_file_path,
    dataset_name=dataset_name,
    dataset_card_path=dataset_card_path,
)


Markdown file saved at: dialogues_dataset_card.md
