# Dataset to process: first positional argument, default "ocp_subsampled_5G".
DATA=${1:-"ocp_subsampled_5G"}

#######################################
# Step 1: set up subsampled data for evaluation.
# Takes the first 1,050,000 lines of a source JSONL file and distributes them
# round-robin over 8 files named <out_dir>/ocp.chunk.<1-8>.jsonl.
# Arguments:
#   $1 - source JSONL file
#   $2 - output directory; when it already exists nothing is rebuilt
# Outputs:
#   A notice on stdout when the directory already exists; errors on stderr.
# Returns:
#   0 on success or when the directory already exists, 1 on failure.
#######################################
prepare_subsample() {
  local src=$1 out_dir=$2
  local tmp

  if [[ -d "$out_dir" ]]; then
    echo "Directory '$out_dir' already exists."
    return 0
  fi
  if [[ ! -r "$src" ]]; then
    echo "Cannot read source file '$src'." >&2
    return 1
  fi

  # Unique scratch file in the working directory, so that two concurrent runs
  # cannot overwrite each other's intermediate data.
  tmp=$(mktemp ./temp.jsonl.XXXXXX) || return 1
  if ! mkdir -- "$out_dir"; then
    rm -f -- "$tmp"
    return 1
  fi

  if head -n 1050000 -- "$src" > "$tmp" \
    && split -n r/8 --suffix-length=1 --numeric-suffixes=1 \
      --additional-suffix=.jsonl -- "$tmp" "${out_dir}/ocp.chunk."; then
    rm -f -- "$tmp"
    return 0
  fi

  # Drop the half-built directory: leaving it behind would make the next run
  # report "already exists" and silently work on empty or partial chunks.
  rm -f -- "$tmp"
  rm -rf -- "${out_dir:?}"
  echo "Failed to build '$out_dir' from '$src'." >&2
  return 1
}

# Per-dataset directories. input_dir holds the ocp.chunk.<N>.jsonl inputs;
# splits_dir and compression_dir name the outputs of the later stages.
case "$DATA" in
  ocp_subsampled_5G)
    input_dir="ocp_subsampled_5G"
    splits_dir="ocp_subsampled_5G_entropy87_splits"
    compression_dir="ocp_subsampled_5G_entropy87_enumerative_compression"
    prepare_subsample \
      /mnt/hdfs/user/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl \
      "$input_dir" || exit 1
    ;;
  opencoder_subsampled_5G)
    input_dir="opencoder_subsampled_5G"
    splits_dir="opencoder_subsampled_5G_entropy87_splits"
    compression_dir="opencoder_subsampled_5G_entropy87_enumerative_compression"
    # The chunks keep the "ocp.chunk." prefix here as well, because the later
    # stages address their inputs as ${input_dir}/ocp.chunk.<N>.jsonl.
    prepare_subsample \
      /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl \
      "$input_dir" || exit 1
    ;;
  ocp_subsampled_50G)
    # No setup step for this dataset.
    # NOTE(review): presumably the chunks already exist under this HDFS
    # directory - confirm.
    input_dir="/mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G"
    splits_dir="ocpython_subsampled_50G_entropy87_splits"
    compression_dir="ocpython_subsampled_50G_entropy87_enumerative_compression"
    ;;
  *)
    # An unrecognised dataset is an error: report it on stderr and exit
    # non-zero so that callers can detect it.
    echo "Unknown $DATA." >&2
    exit 1
    ;;
esac
# Model checkpoints. The entropy model and the compression model are passed to
# the workers as separate options; both currently point at the same checkpoint.
entropy_model_path=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_full/checkpoints/0000200000
compression_model_path=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_full/checkpoints/0000200000

# Job layout used by the stages below.
NUM_GPUS=4      # file JSONL_IDX runs on GPU (JSONL_IDX - 1) % NUM_GPUS
total_jsonls=8  # highest ocp.chunk.<N>.jsonl index that gets processed
total_jobs=1    # worker processes per JSONL file (passed as --num_processes)

# Barrier for background jobs. No background job is launched before this
# point, so it returns immediately.
wait
# Disabled stage: entropy-window splitting with --apply_line_split
# (0.90 quantiles), run in two waves of JSONL files (1-4, then 5-total_jsonls).
# for JSONL_IDX in $(seq 1 4); do
#   for index in $(seq 0 $((total_jobs - 1))); do
#     echo "Starting job $index..."
#     GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#     CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_entropy_window_split.py \
#       --input_file ${input_dir}/ocp.chunk.${JSONL_IDX}.jsonl \
#       --output_dir "ocpython_subsampled_50G_entropy90_splits_linesplit" \
#       --entropy_model_path $entropy_model_path --compression_model_path $compression_model_path \
#       --data_batch_size 256 --max_entropy_batch_size 8192 \
#       --num_workers 1 --process_id $index --num_processes $total_jobs \
#       --base_global_quantile 0.90 --base_monotonic_quantile 0.90 \
#       --apply_line_split > split_jsonl${JSONL_IDX}_process${index}_total${total_jobs}_lines.log 2>&1 &
#   done
# done
# wait
# for JSONL_IDX in $(seq 5 $total_jsonls); do
#   for index in $(seq 0 $((total_jobs - 1))); do
#     echo "Starting job $index..."
#     GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#     CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_entropy_window_split.py \
#       --input_file ${input_dir}/ocp.chunk.${JSONL_IDX}.jsonl \
#       --output_dir "ocpython_subsampled_50G_entropy90_splits_linesplit" \
#       --entropy_model_path $entropy_model_path --compression_model_path $compression_model_path \
#       --data_batch_size 256 --max_entropy_batch_size 8192 \
#       --num_workers 1 --process_id $index --num_processes $total_jobs \
#       --base_global_quantile 0.90 --base_monotonic_quantile 0.90 \
#       --apply_line_split > split_jsonl${JSONL_IDX}_process${index}_total${total_jobs}_lines.log 2>&1 &
#   done
# done
##############################################################
##############################################################
# Disabled stage: entropy-window splitting with --chunk_size 256
# (0.90 quantiles), run in two waves of JSONL files (1-4, then 5-total_jsonls).
# wait
# for JSONL_IDX in $(seq 1 4); do
#   for index in $(seq 0 $((total_jobs - 1))); do
#     echo "Starting job $index..."
#     GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#     CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_entropy_window_split.py \
#       --input_file ${input_dir}/ocp.chunk.${JSONL_IDX}.jsonl \
#       --output_dir "ocpython_subsampled_50G_entropy90_splits_chunk256" \
#       --entropy_model_path $entropy_model_path --compression_model_path $compression_model_path \
#       --data_batch_size 256 --max_entropy_batch_size 4096 \
#       --num_workers 1 --process_id $index --num_processes $total_jobs \
#       --base_global_quantile 0.90 --base_monotonic_quantile 0.90 \
#       --chunk_size 256 > split_jsonl${JSONL_IDX}_process${index}_total${total_jobs}_chunk256.log 2>&1 &
#   done
# done
# wait
# for JSONL_IDX in $(seq 5 $total_jsonls); do
#   for index in $(seq 0 $((total_jobs - 1))); do
#     echo "Starting job $index..."
#     GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#     CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_entropy_window_split.py \
#       --input_file ${input_dir}/ocp.chunk.${JSONL_IDX}.jsonl \
#       --output_dir "ocpython_subsampled_50G_entropy90_splits_chunk256" \
#       --entropy_model_path $entropy_model_path --compression_model_path $compression_model_path \
#       --data_batch_size 256 --max_entropy_batch_size 4096 \
#       --num_workers 1 --process_id $index --num_processes $total_jobs \
#       --base_global_quantile 0.90 --base_monotonic_quantile 0.90 \
#       --chunk_size 256 > split_jsonl${JSONL_IDX}_process${index}_total${total_jobs}_chunk256.log 2>&1 &
#   done
# done
##############################################################
##############################################################
#######################################
# Active stage: entropy-window splitting with --chunk_size 512.
# Launches one background offline_entropy_window_split.py process for every
# (JSONL file, process index) pair. File N runs on GPU (N - 1) % NUM_GPUS, and
# files are started in waves of NUM_GPUS with a barrier after each wave, so
# two files never share a GPU at the same time. With NUM_GPUS=4 and
# total_jsonls=8 this gives the waves 1-4 and 5-8.
# Globals (read):
#   input_dir, entropy_model_path, compression_model_path,
#   NUM_GPUS, total_jsonls, total_jobs
# Outputs:
#   Progress lines on stdout, failure notices on stderr, and one
#   split_jsonl<N>_process<P>_total<T>_chunk512.log per job in the current
#   directory.
# Returns:
#   0 when every job exited successfully, 1 otherwise.
#######################################
run_chunk512_split() {
  # NOTE(review): the output directory is fixed and does not follow $DATA or
  # $splits_dir (it says 50G / entropy90 whatever dataset was selected) -
  # confirm this is intended.
  local out_dir="ocpython_subsampled_50G_entropy90_splits_chunk512"
  local jsonl_idx proc_idx gpu_idx log_file i
  local -a pids=() logs=()
  local failed=0

  if ! (( NUM_GPUS > 0 )); then
    echo "NUM_GPUS must be a positive integer." >&2
    return 1
  fi

  # Let any background work started earlier finish before taking the GPUs.
  wait

  for ((jsonl_idx = 1; jsonl_idx <= total_jsonls; jsonl_idx++)); do
    for ((proc_idx = 0; proc_idx < total_jobs; proc_idx++)); do
      echo "Starting job $proc_idx..."
      gpu_idx=$(( (jsonl_idx - 1) % NUM_GPUS ))
      log_file="split_jsonl${jsonl_idx}_process${proc_idx}_total${total_jobs}_chunk512.log"
      CUDA_VISIBLE_DEVICES=$gpu_idx python3 offline_entropy_window_split.py \
        --input_file "${input_dir}/ocp.chunk.${jsonl_idx}.jsonl" \
        --output_dir "$out_dir" \
        --entropy_model_path "$entropy_model_path" \
        --compression_model_path "$compression_model_path" \
        --data_batch_size 256 --max_entropy_batch_size 2048 \
        --num_workers 1 --process_id "$proc_idx" --num_processes "$total_jobs" \
        --base_global_quantile 0.90 --base_monotonic_quantile 0.90 \
        --chunk_size 512 > "$log_file" 2>&1 &
      pids+=("$!")
      logs+=("$log_file")
    done

    # End of a wave (every GPU has a file) or end of the data: wait for each
    # job individually so that a failing worker is noticed and reported.
    if (( jsonl_idx % NUM_GPUS == 0 || jsonl_idx == total_jsonls )); then
      for i in "${!pids[@]}"; do
        if ! wait "${pids[i]}"; then
          echo "Split job failed; see ${logs[i]}" >&2
          failed=1
        fi
      done
      pids=()
      logs=()
    fi
  done

  return "$failed"
}

# While this is the last command executed, its status is also the script's
# exit status, so a failed worker makes the whole run fail.
run_chunk512_split
# Disabled stage: compression of the split outputs with
# offline_entropy_window_compress.py (reads ${splits_dir}, writes $compression_dir).
# wait
# NUM_GPUS=8
# total_jsonls=8
# total_jobs=3
# for JSONL_IDX in $(seq 1 $total_jsonls); do
#   for index in $(seq 0 $((total_jobs - 1))); do
#     echo "Starting job $index..."
#     GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#     CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_entropy_window_compress.py \
#       --input_file ${splits_dir}/ocp.chunk.${JSONL_IDX}_out_0.jsonl \
#       --output_dir $compression_dir \
#       --entropy_model_path $entropy_model_path --compression_model_path $compression_model_path \
#       --data_batch_size 1024 --max_compression_batch_size 3072 \
#       --num_workers 1 --process_id $index --num_processes $total_jobs > compress_jsonl${JSONL_IDX}_process${index}_total${total_jobs}.log 2>&1 &
#   done
# done