Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/bnc_spoken_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/childes_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/gutenberg_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/open_subtitles_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/simple_wiki_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/switchboard_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/childes_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/switchboard_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/bnc_spoken_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/childes_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/gutenberg_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/open_subtitles_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/simple_wiki_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/switchboard_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/childes_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/switchboard_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/bnc_spoken_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/childes_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/gutenberg_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/open_subtitles_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/simple_wiki_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/switchboard_unaffected.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/childes_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test +0 -0
- data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/switchboard_unaffected_sents.test +0 -0
- data/perturb.py +359 -0
- data/perturb.sh +35 -0
- data/perturb_llama.py +361 -0
- data/perturb_model.sh +40 -0
- data/perturb_qwen.py +361 -0
- data/tag.py +153 -0
- data/tag_1.py +166 -0
- data/tag_distributed.py +106 -0
- data/tag_single.py +140 -0
- perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-1000.csv +2 -0
- perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-10000.csv +2 -0
- perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-11500.csv +2 -0
- perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-1500.csv +2 -0
- perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-2000.csv +2 -0
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/bnc_spoken_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/childes_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/gutenberg_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/open_subtitles_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/simple_wiki_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected/switchboard_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/childes_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_unaffected_sents/switchboard_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/bnc_spoken_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/childes_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/gutenberg_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/open_subtitles_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/simple_wiki_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected/switchboard_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/childes_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_unaffected_sents/switchboard_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/bnc_spoken_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/childes_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/gutenberg_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/open_subtitles_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/simple_wiki_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected/switchboard_unaffected.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/childes_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test
ADDED - File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_unaffected_sents/switchboard_unaffected_sents.test
ADDED - File without changes
data/perturb.py
ADDED
@@ -0,0 +1,359 @@
# perturb.py
# Author: Julie Kallini

# For importing utils
import sys
sys.path.append("..")

from utils import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from glob import glob
import numpy as np
import itertools
import json
import os
import tqdm
import argparse
import pytest


def lines_equivalent_3pres(file1_path, file2_path):
    """Compare lines of two files after splitting them."""
    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting lists
            res1 = [i for i in line1.split() if int(
                i) not in (marker_sg_token, marker_pl_token)]
            res2 = [i for i in line2.split() if int(
                i) not in (marker_sg_token, marker_pl_token)]
            if res1 != res2:
                print(line1)
                print(line2)
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_3pres = [
    ("0tokens", "4tokens"),
    ("0tokens", "4words"),
    ("4tokens", "4words"),
]

# Yj: combination tests over the perturbation pairs related to third-person singular/plural agreement

test_data = itertools.product(
    ["100M", "dev", "test_affected", "test_unaffected"], GENRES.keys(), perturbation_pairs_3pres)  # Yj: generate the parameter combinations used in the tests

# Yj: used by test functions such as test_3pres_all_equivalent to generate the test combinations, covering the different perturbation strategies.
# Yj: distinguishes the affected and unaffected test subsets so the effect of the perturbation can be compared.


@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)  # the test function runs once for every parameter set in test_data
def test_3pres_all_equivalent(split, genre, perturbation_pair):  # Yj: the genres are the different corpora, listed in utils.py

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"  # Yj: the development set is similar to a validation set

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_3pres_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_3pres_{perturbation2}/babylm_{split}/{filename}"

    # Yj: compare the two files at the two paths
    assert lines_equivalent_3pres(path1, path2), f"File {filename} of " + \
        f"3pres_{perturbation1} and 3pres_{perturbation2} have non-equivalent lines!"


def lines_equivalent_reversal(rev_path, ident_path):
    """Compare lines of reversal file and identity file after splitting them."""
    with open(rev_path, 'r') as file1, open(ident_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting lists
            line1_tokens = line1.split()
            line2_tokens = line2.split()

            # Get REV marker index
            marker_index = line1_tokens.index(str(marker_rev_token))

            # Make sure tokens up to and including the marker are all the same
            if line1_tokens[:marker_index+1] != line2_tokens[:marker_index+1]:
                return False

            # Make sure reversal of rest of string is equal to identity
            line1_tokens_rev = line1_tokens[marker_index+1:].copy()
            line1_tokens_rev.reverse()
            if line1_tokens_rev != line2_tokens[marker_index+1:]:
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_reversal = [
    ("reversal", "reversal_identity"),
]
# Yj: combination tests over the reversal perturbation pairs

test_data = itertools.product(
    ["100M", "dev", "test_affected"], GENRES.keys(), perturbation_pairs_reversal)

@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)
def test_reversal_all_equivalent(split, genre, perturbation_pair):

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_{perturbation2}/babylm_{split}/{filename}"

    assert lines_equivalent_reversal(path1, path2), f"File {filename} of " + \
        f"{perturbation1} and {perturbation2} have non-equivalent lines!"


def lines_equivalent_determiner_swap(det_path, ident_path):
    """Compare lines of determiner-swap file and identity file after splitting them."""
    with open(det_path, 'r') as file1, open(ident_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting token sets
            line1_tokens = set(line1.split())
            line2_tokens = set(line2.split())
            if line1_tokens != line2_tokens:
                print(line1.split())
                print(line2.split())
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_determiner_swap = [
    ("determiner_swap", "determiner_swap_identity"),
]
test_data = itertools.product(
    ["100M", "dev", "test_affected", "test_unaffected"], GENRES.keys(), perturbation_pairs_determiner_swap)

@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)
def test_determiner_swap_all_equivalent(split, genre, perturbation_pair):

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_{perturbation2}/babylm_{split}/{filename}"

    assert lines_equivalent_determiner_swap(path1, path2), f"File {filename} of " + \
        f"{perturbation1} and {perturbation2} have non-equivalent lines!"


def flatten_list(l):
    """Function to flatten a nested list."""
    return list(itertools.chain.from_iterable(l))


def process_line(line):
    """
    Process a given line from the dataset, apply transformations to its sentences,
    and categorize them into affected or unaffected based on the transformation.

    Parameters:
    - line (dict): A dictionary representing a line from the dataset, which contains
      sentence annotations.

    Returns:
    - tuple: A tuple containing three lists:
        1. new_lines_affected (list of str): Sentences that were affected by the transformation.
        2. new_lines_unaffected (list of str): Sentences that were not affected by the transformation.
        3. sents_unaffected (list of str): The original text of the unaffected sentences.

    Note:
    - The transformation functions (`perturbation_function`, `affect_function`, `filter_function`)
      are expected to be available in the global scope.
    """

    new_lines_affected = []
    new_lines_unaffected = []
    sents_unaffected = []

    # Apply transformation to each sentence on line
    for sent in line["sent_annotations"]:  # Yj: unclear why the annotations are used here rather than the raw text?

        tokens = perturbation_function(sent)
        if len([tok for tok in tokens if tok not in MARKER_TOKEN_IDS]) <= 1:
            continue

        token_line = " ".join([str(tok) for tok in tokens])

        # Check if sent is affected
        if affect_function(sent):

            # Check if this affected sentence should be filtered or not
            if filter_function(sent):
                new_lines_affected.append(token_line + "\n")

        else:  # Unaffected sentences
            new_lines_unaffected.append(token_line + "\n")
            sents_unaffected.append(sent["sent_text"] + "\n")

    return new_lines_affected, new_lines_unaffected, sents_unaffected


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog='Perturb BabyLM dataset',
        description='Perturb BabyLM dataset by altering POS-tagged data')
    parser.add_argument('perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform BabyLM dataset')
    parser.add_argument('babylm_dataset',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=BABYLM_SPLITS,
                        help='BabyLM dataset choice')

    # Get args
    args = parser.parse_args()

    # Load dataset (only json files containing tagged data)
    babylm_dataset = args.babylm_dataset
    json_ext = "_parsed.json"
    # babylm_data = glob(f"{BABYLM_DATA_PATH}/babylm_data/babylm_{babylm_dataset}/*{json_ext}")
    babylm_data = glob(f"babylm_data/babylm_{babylm_dataset}/*{json_ext}")
    print("babylm_data:", babylm_data)

    # Get perturbation, affect, and filter functions
    perturbation_function = PERTURBATIONS[args.perturbation_type]['perturbation_function']
    affect_function = PERTURBATIONS[args.perturbation_type]['affect_function']
    filter_function = PERTURBATIONS[args.perturbation_type]['filter_function']
    gpt2_tokenizer = PERTURBATIONS[args.perturbation_type]['gpt2_tokenizer']

    if babylm_dataset == "test":  # Yj: why is babylm_dataset "test" here? BABYLM_SPLITS = ['100M', '10M', 'dev', 'test', 'unittest']

        # Iterate over files and do transform
        for file in babylm_data:
            print(file)
            f = open(file)
            data = json.load(f)
            f.close()

            # Perturb data iteratively
            results = []
            for line in tqdm.tqdm(data):
                results.append(process_line(line))

            new_lines_affected, new_lines_unaffected, unaffected_sents = zip(
                *results)
            new_lines_affected = flatten_list(new_lines_affected)
            new_lines_unaffected = flatten_list(new_lines_unaffected)
            unaffected_sents = flatten_list(unaffected_sents)

            # Name new file
            new_file_affected = os.path.basename(
                file).replace(json_ext, "_affected.test")
            new_file_unaffected = os.path.basename(
                file).replace(json_ext, "_unaffected.test")
            file_unaffected_sents = os.path.basename(
                file).replace(json_ext, "_unaffected_sents.test")

            # Create directory
            data_write_directory = f"{BABYLM_DATA_PATH}/babylm_data_perturbed"
            directory_affected = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_affected/"
            if not os.path.exists(directory_affected):
                os.makedirs(directory_affected)
            directory_unaffected = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_unaffected/"
            if not os.path.exists(directory_unaffected):
                os.makedirs(directory_unaffected)
            directory_unaffected_sents = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_unaffected_sents/"
            if not os.path.exists(directory_unaffected_sents):
                os.makedirs(directory_unaffected_sents)

            # Write files
            write_file(directory_affected,
                       new_file_affected, new_lines_affected)
            write_file(directory_unaffected,
                       new_file_unaffected, new_lines_unaffected)
            write_file(directory_unaffected_sents,
                       file_unaffected_sents, unaffected_sents)

    else:
        # Yj: BABYLM_SPLITS = ['100M', '10M', 'dev', 'test', 'unittest']
        # Iterate over files and do transform
        for file in babylm_data:
            print(file)
            f = open(file)
            data = json.load(f)
            f.close()

            # Perturb data iteratively
            results = []
            for line in tqdm.tqdm(data):
                results.append(process_line(line))

            new_lines_affected, new_lines_unaffected, _ = zip(
                *results)

            new_lines_affected = flatten_list(new_lines_affected)
            new_lines_unaffected = flatten_list(new_lines_unaffected)

            # Combine affected and unaffected sentences
            new_lines = new_lines_unaffected + new_lines_affected

            # Name new file
            if babylm_dataset == "dev":
                new_file = os.path.basename(file).replace(json_ext, ".dev")
            elif babylm_dataset == 'unittest':
                new_file = os.path.basename(file).replace(json_ext, ".test")

                # Print strings for unittest
                new_lines_decoded = [gpt2_tokenizer.decode(
                    [int(tok) for tok in line.split()]) + "\n" for line in new_lines]
                new_lines_with_strings = []
                for tokens, line in list(zip(new_lines, new_lines_decoded)):
                    new_lines_with_strings.append(tokens)
                    new_lines_with_strings.append(line)
                new_lines = new_lines_with_strings

            else:
                new_file = os.path.basename(file).replace(json_ext, ".train")  # '10M'/'100M' are the training splits

            # Create directory and write file
            directory = f"{BABYLM_DATA_PATH}/babylm_data_perturbed/babylm_{args.perturbation_type}/babylm_{babylm_dataset}/"
            if not os.path.exists(directory):
                os.makedirs(directory)
            write_file(directory, new_file, new_lines)
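
A minimal sketch of the PERTURBATIONS entry shape that perturb.py assumes (hypothetical helper and field names; the real definitions live in utils.py and its model-specific variants, and are not part of this commit):

# Hypothetical sketch only; perturb.py reads exactly these four keys from each PERTURBATIONS[name] dict.
from transformers import GPT2TokenizerFast

gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def reverse_full_perturbation(sent):
    # Placeholder: a real perturbation returns the (possibly reordered) token ids of one
    # sentence annotation; the "sent_tokens" field name used here is an assumption.
    return list(reversed(sent["sent_tokens"]))

PERTURBATIONS = {
    "reverse_full": {
        "perturbation_function": reverse_full_perturbation,  # sentence annotation -> list of token ids
        "affect_function": lambda sent: True,                # is this sentence changed by the perturbation?
        "filter_function": lambda sent: True,                # keep this affected sentence in test_affected?
        "gpt2_tokenizer": gpt2_tokenizer,                    # decodes token ids for the unittest split
    },
}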
data/perturb.sh
ADDED
@@ -0,0 +1,35 @@
#!/bin/sh
# perturb.sh
# author: Julie Kallini

echo "
-------------------------------------------------------------------------------
Arguments
-------------------------------------------------------------------------------
"
echo "Perturbation type: $1"
echo "Train set: $2"


# Create perturbed dataset for all splits
echo "
-------------------------------------------------------------------------------
Creating perturbed dataset for all splits
-------------------------------------------------------------------------------
"

cd ../data

echo "python3 perturb.py $1 $2"
python3 perturb.py $1 $2
echo "
python3 perturb.py $1 dev"
python3 perturb.py $1 dev
echo "
python3 perturb.py $1 test"
python3 perturb.py $1 test
echo "
python3 perturb.py $1 unittest"
python3 perturb.py $1 unittest

cd ..
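
Usage note (not part of the committed file): perturb.sh takes the perturbation name and the training split as positional arguments and then regenerates the dev, test, and unittest splits for the same perturbation, so a typical invocation would be something like `sh perturb.sh reverse_full 100M`, run from a directory that sits next to data/ (the script begins with `cd ../data`).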
data/perturb_llama.py
ADDED
@@ -0,0 +1,361 @@
# perturb_llama.py
# Author: Julie Kallini

# For importing utils
import sys
sys.path.append("..")

from utils_llama import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from glob import glob
import numpy as np
import itertools
import json
import os
import tqdm
import argparse
import pytest

MODEL_NAME = "Llama-3.2-3B"

def lines_equivalent_3pres(file1_path, file2_path):
    """Compare lines of two files after splitting them."""
    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting lists
            res1 = [i for i in line1.split() if int(
                i) not in (marker_sg_token, marker_pl_token)]
            res2 = [i for i in line2.split() if int(
                i) not in (marker_sg_token, marker_pl_token)]
            if res1 != res2:
                print(line1)
                print(line2)
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_3pres = [
    ("0tokens", "4tokens"),
    ("0tokens", "4words"),
    ("4tokens", "4words"),
]

# Yj: combination tests over the perturbation pairs related to third-person singular/plural agreement

test_data = itertools.product(
    ["100M", "dev", "test_affected", "test_unaffected"], GENRES.keys(), perturbation_pairs_3pres)  # Yj: generate the parameter combinations used in the tests

# Yj: used by test functions such as test_3pres_all_equivalent to generate the test combinations, covering the different perturbation strategies.
# Yj: distinguishes the affected and unaffected test subsets so the effect of the perturbation can be compared.


@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)  # the test function runs once for every parameter set in test_data
def test_3pres_all_equivalent(split, genre, perturbation_pair):  # Yj: the genres are the different corpora, listed in utils.py

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"  # Yj: the development set is similar to a validation set

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_llama/babylm_3pres_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_llama/babylm_3pres_{perturbation2}/babylm_{split}/{filename}"

    # Yj: compare the two files at the two paths
    assert lines_equivalent_3pres(path1, path2), f"File {filename} of " + \
        f"3pres_{perturbation1} and 3pres_{perturbation2} have non-equivalent lines!"


def lines_equivalent_reversal(rev_path, ident_path):
    """Compare lines of reversal file and identity file after splitting them."""
    with open(rev_path, 'r') as file1, open(ident_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting lists
            line1_tokens = line1.split()
            line2_tokens = line2.split()

            # Get REV marker index
            marker_index = line1_tokens.index(str(marker_rev_token))

            # Make sure tokens up to and including the marker are all the same
            if line1_tokens[:marker_index+1] != line2_tokens[:marker_index+1]:
                return False

            # Make sure reversal of rest of string is equal to identity
            line1_tokens_rev = line1_tokens[marker_index+1:].copy()
            line1_tokens_rev.reverse()
            if line1_tokens_rev != line2_tokens[marker_index+1:]:
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_reversal = [
    ("reversal", "reversal_identity"),
]
# Yj: combination tests over the reversal perturbation pairs

test_data = itertools.product(
    ["100M", "dev", "test_affected"], GENRES.keys(), perturbation_pairs_reversal)

@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)
def test_reversal_all_equivalent(split, genre, perturbation_pair):

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_llama/babylm_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_llama/babylm_{perturbation2}/babylm_{split}/{filename}"

    assert lines_equivalent_reversal(path1, path2), f"File {filename} of " + \
        f"{perturbation1} and {perturbation2} have non-equivalent lines!"


def lines_equivalent_determiner_swap(det_path, ident_path):
    """Compare lines of determiner-swap file and identity file after splitting them."""
    with open(det_path, 'r') as file1, open(ident_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting token sets
            line1_tokens = set(line1.split())
            line2_tokens = set(line2.split())
            if line1_tokens != line2_tokens:
                print(line1.split())
                print(line2.split())
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_determiner_swap = [
    ("determiner_swap", "determiner_swap_identity"),
]
test_data = itertools.product(
    ["100M", "dev", "test_affected", "test_unaffected"], GENRES.keys(), perturbation_pairs_determiner_swap)

@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)
def test_determiner_swap_all_equivalent(split, genre, perturbation_pair):

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_llama/babylm_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_llama/babylm_{perturbation2}/babylm_{split}/{filename}"

    assert lines_equivalent_determiner_swap(path1, path2), f"File {filename} of " + \
        f"{perturbation1} and {perturbation2} have non-equivalent lines!"


def flatten_list(l):
    """Function to flatten a nested list."""
    return list(itertools.chain.from_iterable(l))


def process_line(line):
    """
    Process a given line from the dataset, apply transformations to its sentences,
    and categorize them into affected or unaffected based on the transformation.

    Parameters:
    - line (dict): A dictionary representing a line from the dataset, which contains
      sentence annotations.

    Returns:
    - tuple: A tuple containing three lists:
        1. new_lines_affected (list of str): Sentences that were affected by the transformation.
        2. new_lines_unaffected (list of str): Sentences that were not affected by the transformation.
        3. sents_unaffected (list of str): The original text of the unaffected sentences.

    Note:
    - The transformation functions (`perturbation_function`, `affect_function`, `filter_function`)
      are expected to be available in the global scope.
    """

    new_lines_affected = []
    new_lines_unaffected = []
    sents_unaffected = []

    # Apply transformation to each sentence on line
    for sent in line["sent_annotations"]:  # Yj: unclear why the annotations are used here rather than the raw text?

        tokens = perturbation_function(sent)
        if len([tok for tok in tokens if tok not in MARKER_TOKEN_IDS]) <= 1:
            continue

        token_line = " ".join([str(tok) for tok in tokens])

        # Check if sent is affected
        if affect_function(sent):

            # Check if this affected sentence should be filtered or not
            if filter_function(sent):
                new_lines_affected.append(token_line + "\n")

        else:  # Unaffected sentences
            new_lines_unaffected.append(token_line + "\n")
            sents_unaffected.append(sent["sent_text"] + "\n")

    return new_lines_affected, new_lines_unaffected, sents_unaffected


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog='Perturb BabyLM dataset',
        description='Perturb BabyLM dataset by altering POS-tagged data')
    parser.add_argument('perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform BabyLM dataset')
    parser.add_argument('babylm_dataset',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=BABYLM_SPLITS,
                        help='BabyLM dataset choice')

    # Get args
    args = parser.parse_args()

    # Load dataset (only json files containing tagged data)
    babylm_dataset = args.babylm_dataset
    json_ext = "_parsed.json"
    # babylm_data = glob(f"{BABYLM_DATA_PATH}/babylm_data/babylm_{babylm_dataset}/*{json_ext}")
    babylm_data = glob(f"babylm_data/babylm_{babylm_dataset}/*{json_ext}")
    print("babylm_data:", babylm_data)

    # Get perturbation, affect, and filter functions
    perturbation_function = PERTURBATIONS[args.perturbation_type]['perturbation_function']
    affect_function = PERTURBATIONS[args.perturbation_type]['affect_function']
    filter_function = PERTURBATIONS[args.perturbation_type]['filter_function']
    llama_tokenizer = PERTURBATIONS[args.perturbation_type]['llama_tokenizer']

    if babylm_dataset == "test":  # Yj: why is babylm_dataset "test" here? BABYLM_SPLITS = ['100M', '10M', 'dev', 'test', 'unittest']

        # Iterate over files and do transform
        for file in babylm_data:
            print(file)
            f = open(file)
            data = json.load(f)
            f.close()

            # Perturb data iteratively
            results = []
            for line in tqdm.tqdm(data):
                results.append(process_line(line))

            new_lines_affected, new_lines_unaffected, unaffected_sents = zip(
                *results)
            new_lines_affected = flatten_list(new_lines_affected)
            new_lines_unaffected = flatten_list(new_lines_unaffected)
            unaffected_sents = flatten_list(unaffected_sents)

            # Name new file
            new_file_affected = os.path.basename(
                file).replace(json_ext, "_affected.test")
            new_file_unaffected = os.path.basename(
                file).replace(json_ext, "_unaffected.test")
            file_unaffected_sents = os.path.basename(
                file).replace(json_ext, "_unaffected_sents.test")

            # Create directory
            data_write_directory = f"{BABYLM_DATA_PATH}/Perturbed_data/{MODEL_NAME}"
            directory_affected = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_affected/"
            if not os.path.exists(directory_affected):
                os.makedirs(directory_affected)
            directory_unaffected = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_unaffected/"
            if not os.path.exists(directory_unaffected):
                os.makedirs(directory_unaffected)
            directory_unaffected_sents = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_unaffected_sents/"
            if not os.path.exists(directory_unaffected_sents):
                os.makedirs(directory_unaffected_sents)

            # Write files
            write_file(directory_affected,
                       new_file_affected, new_lines_affected)
            write_file(directory_unaffected,
                       new_file_unaffected, new_lines_unaffected)
            write_file(directory_unaffected_sents,
                       file_unaffected_sents, unaffected_sents)

    else:
        # Yj: BABYLM_SPLITS = ['100M', '10M', 'dev', 'test', 'unittest']
        # Iterate over files and do transform
        for file in babylm_data:
            print(file)
            f = open(file)
            data = json.load(f)
            f.close()

            # Perturb data iteratively
            results = []
            for line in tqdm.tqdm(data):
                results.append(process_line(line))

            new_lines_affected, new_lines_unaffected, _ = zip(
                *results)

            new_lines_affected = flatten_list(new_lines_affected)
            new_lines_unaffected = flatten_list(new_lines_unaffected)

            # Combine affected and unaffected sentences
            new_lines = new_lines_unaffected + new_lines_affected

            # Name new file
            if babylm_dataset == "dev":
                new_file = os.path.basename(file).replace(json_ext, ".dev")
            elif babylm_dataset == 'unittest':
                new_file = os.path.basename(file).replace(json_ext, ".test")

                # Print strings for unittest
                new_lines_decoded = [llama_tokenizer.decode(
                    [int(tok) for tok in line.split()]) + "\n" for line in new_lines]
                new_lines_with_strings = []
                for tokens, line in list(zip(new_lines, new_lines_decoded)):
                    new_lines_with_strings.append(tokens)
                    new_lines_with_strings.append(line)
                new_lines = new_lines_with_strings

            else:
                new_file = os.path.basename(file).replace(json_ext, ".train")  # '10M'/'100M' are the training splits

            # Create directory and write file
            directory = f"{BABYLM_DATA_PATH}/Perturbed_data/{MODEL_NAME}/babylm_{args.perturbation_type}/babylm_{babylm_dataset}/"
            print("directory:", directory)
            if not os.path.exists(directory):
                os.makedirs(directory)
            write_file(directory, new_file, new_lines)
data/perturb_model.sh
ADDED
@@ -0,0 +1,40 @@
#!/bin/bash

# Define your perturbations and BabyLM splits
PERTURBATIONS=("hop_control" "hop_tokens4" "hop_words4" "reverse_control" "reverse_partial" "reverse_full" "shuffle_control"
    "shuffle_nondeterministic" "shuffle_deterministic21" "shuffle_deterministic57" "shuffle_deterministic84" "shuffle_local3"
    "shuffle_local5" "shuffle_local10" "shuffle_even_odd")

# BABYLM_SPLITS=("100M" "10M" "dev" "test" "unittest") # Add more splits as needed
BABYLM_SPLITS=("dev")

# Specify the GPUs to use
SPECIFIED_GPUS=(1 2 3 4 5 6 7) # Set these to the GPUs you want to use

# Store PIDs and GPU mapping to track running processes
declare -A GPU_PROCESS_MAP

# Iterate over all combinations of perturbations and splits
for perturbation in "${PERTURBATIONS[@]}"; do
    for split in "${BABYLM_SPLITS[@]}"; do

        # Check for a free GPU
        while true; do
            for gpu in "${SPECIFIED_GPUS[@]}"; do
                # Check if there's no process associated with this GPU
                if ! ps -p ${GPU_PROCESS_MAP[$gpu]} > /dev/null 2>&1; then
                    # Run the Python perturbation script on the available GPU
                    CUDA_VISIBLE_DEVICES=$gpu python perturb_llama.py "$perturbation" "$split" &
                    GPU_PROCESS_MAP[$gpu]=$!
                    echo "Running on GPU $gpu: Perturbation=$perturbation, Split=$split, PID=$!"
                    break 2 # Break out of the loops once a GPU is assigned
                fi
            done
            sleep 1 # Wait a second before checking again
        done
    done
done

# Wait for all processes to finish
wait
echo "All tasks completed."
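
Design note (not part of the committed file): perturb_model.sh is a simple polling scheduler. It walks the perturbation/split grid, hands each job to the first GPU in SPECIFIED_GPUS whose last recorded PID has exited, and sleeps one second between checks. As written it always launches perturb_llama.py, so a Qwen run would need that command changed to perturb_qwen.py.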
data/perturb_qwen.py
ADDED
@@ -0,0 +1,361 @@
# perturb_qwen.py
# Author: Julie Kallini

# For importing utils
import sys
sys.path.append("..")

from utils_qwen import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from glob import glob
import numpy as np
import itertools
import json
import os
import tqdm
import argparse
import pytest

MODEL_NAME = "Qwen2.5-7B"

def lines_equivalent_3pres(file1_path, file2_path):
    """Compare lines of two files after splitting them."""
    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting lists
            res1 = [i for i in line1.split() if int(
                i) not in (marker_sg_token, marker_pl_token)]
            res2 = [i for i in line2.split() if int(
                i) not in (marker_sg_token, marker_pl_token)]
            if res1 != res2:
                print(line1)
                print(line2)
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_3pres = [
    ("0tokens", "4tokens"),
    ("0tokens", "4words"),
    ("4tokens", "4words"),
]

# Yj: combination tests over the perturbation pairs related to third-person singular/plural agreement

test_data = itertools.product(
    ["100M", "dev", "test_affected", "test_unaffected"], GENRES.keys(), perturbation_pairs_3pres)  # Yj: generate the parameter combinations used in the tests

# Yj: used by test functions such as test_3pres_all_equivalent to generate the test combinations, covering the different perturbation strategies.
# Yj: distinguishes the affected and unaffected test subsets so the effect of the perturbation can be compared.


@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)  # the test function runs once for every parameter set in test_data
def test_3pres_all_equivalent(split, genre, perturbation_pair):  # Yj: the genres are the different corpora, listed in utils.py

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"  # Yj: the development set is similar to a validation set

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_qwen/babylm_3pres_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_qwen/babylm_3pres_{perturbation2}/babylm_{split}/{filename}"

    # Yj: compare the two files at the two paths
    assert lines_equivalent_3pres(path1, path2), f"File {filename} of " + \
        f"3pres_{perturbation1} and 3pres_{perturbation2} have non-equivalent lines!"


def lines_equivalent_reversal(rev_path, ident_path):
    """Compare lines of reversal file and identity file after splitting them."""
    with open(rev_path, 'r') as file1, open(ident_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting lists
            line1_tokens = line1.split()
            line2_tokens = line2.split()

            # Get REV marker index
            marker_index = line1_tokens.index(str(marker_rev_token))

            # Make sure tokens up to and including the marker are all the same
            if line1_tokens[:marker_index+1] != line2_tokens[:marker_index+1]:
                return False

            # Make sure reversal of rest of string is equal to identity
            line1_tokens_rev = line1_tokens[marker_index+1:].copy()
            line1_tokens_rev.reverse()
            if line1_tokens_rev != line2_tokens[marker_index+1:]:
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_reversal = [
    ("reversal", "reversal_identity"),
]
# Yj: combination tests over the reversal perturbation pairs

test_data = itertools.product(
    ["100M", "dev", "test_affected"], GENRES.keys(), perturbation_pairs_reversal)

@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)
def test_reversal_all_equivalent(split, genre, perturbation_pair):

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_qwen/babylm_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_qwen/babylm_{perturbation2}/babylm_{split}/{filename}"

    assert lines_equivalent_reversal(path1, path2), f"File {filename} of " + \
        f"{perturbation1} and {perturbation2} have non-equivalent lines!"


def lines_equivalent_determiner_swap(det_path, ident_path):
    """Compare lines of determiner-swap file and identity file after splitting them."""
    with open(det_path, 'r') as file1, open(ident_path, 'r') as file2:
        for line1, line2 in zip(file1, file2):
            # Split each line and compare the resulting token sets
            line1_tokens = set(line1.split())
            line2_tokens = set(line2.split())
            if line1_tokens != line2_tokens:
                print(line1.split())
                print(line2.split())
                return False

        # Check if one file has more lines than the other
        if file1.readline() or file2.readline():
            return False

    return True


perturbation_pairs_determiner_swap = [
    ("determiner_swap", "determiner_swap_identity"),
]
test_data = itertools.product(
    ["100M", "dev", "test_affected", "test_unaffected"], GENRES.keys(), perturbation_pairs_determiner_swap)

@pytest.mark.parametrize("split, genre, perturbation_pair", test_data)
def test_determiner_swap_all_equivalent(split, genre, perturbation_pair):

    perturbation1, perturbation2 = perturbation_pair

    if split in ("100M", "10M"):
        filename = f"{genre}.train"
    elif split == "test_affected":
        filename = f"{genre}_affected.test"
    elif split == "test_unaffected":
        filename = f"{genre}_unaffected.test"
    elif split == "dev":
        filename = f"{genre}.dev"

    path1 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_qwen/babylm_{perturbation1}/babylm_{split}/{filename}"
    path2 = f"{BABYLM_DATA_PATH}/babylm_data_perturbed_qwen/babylm_{perturbation2}/babylm_{split}/{filename}"

    assert lines_equivalent_determiner_swap(path1, path2), f"File {filename} of " + \
        f"{perturbation1} and {perturbation2} have non-equivalent lines!"


def flatten_list(l):
    """Function to flatten a nested list."""
    return list(itertools.chain.from_iterable(l))


def process_line(line):
    """
    Process a given line from the dataset, apply transformations to its sentences,
    and categorize them into affected or unaffected based on the transformation.

    Parameters:
    - line (dict): A dictionary representing a line from the dataset, which contains
      sentence annotations.

    Returns:
    - tuple: A tuple containing three lists:
        1. new_lines_affected (list of str): Sentences that were affected by the transformation.
        2. new_lines_unaffected (list of str): Sentences that were not affected by the transformation.
        3. sents_unaffected (list of str): The original text of the unaffected sentences.

    Note:
    - The transformation functions (`perturbation_function`, `affect_function`, `filter_function`)
      are expected to be available in the global scope.
    """

    new_lines_affected = []
    new_lines_unaffected = []
    sents_unaffected = []

    # Apply transformation to each sentence on line
    for sent in line["sent_annotations"]:  # Yj: unclear why the annotations are used here rather than the raw text?

        tokens = perturbation_function(sent)
        if len([tok for tok in tokens if tok not in MARKER_TOKEN_IDS]) <= 1:
            continue

        token_line = " ".join([str(tok) for tok in tokens])

        # Check if sent is affected
        if affect_function(sent):

            # Check if this affected sentence should be filtered or not
            if filter_function(sent):
                new_lines_affected.append(token_line + "\n")

        else:  # Unaffected sentences
            new_lines_unaffected.append(token_line + "\n")
            sents_unaffected.append(sent["sent_text"] + "\n")

    return new_lines_affected, new_lines_unaffected, sents_unaffected


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog='Perturb BabyLM dataset',
        description='Perturb BabyLM dataset by altering POS-tagged data')
    parser.add_argument('perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform BabyLM dataset')
    parser.add_argument('babylm_dataset',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=BABYLM_SPLITS,
                        help='BabyLM dataset choice')

    # Get args
    args = parser.parse_args()

    # Load dataset (only json files containing tagged data)
    babylm_dataset = args.babylm_dataset
    json_ext = "_parsed.json"
    # babylm_data = glob(f"{BABYLM_DATA_PATH}/babylm_data/babylm_{babylm_dataset}/*{json_ext}")
    babylm_data = glob(f"babylm_data/babylm_{babylm_dataset}/*{json_ext}")
    print("babylm_data:", babylm_data)

    # Get perturbation, affect, and filter functions
    perturbation_function = PERTURBATIONS[args.perturbation_type]['perturbation_function']
    affect_function = PERTURBATIONS[args.perturbation_type]['affect_function']
    filter_function = PERTURBATIONS[args.perturbation_type]['filter_function']
    qwen_tokenizer = PERTURBATIONS[args.perturbation_type]['qwen_tokenizer']

    if babylm_dataset == "test":  # Yj: why is babylm_dataset "test" here? BABYLM_SPLITS = ['100M', '10M', 'dev', 'test', 'unittest']

        # Iterate over files and do transform
        for file in babylm_data:
            print(file)
            f = open(file)
            data = json.load(f)
            f.close()

            # Perturb data iteratively
            results = []
            for line in tqdm.tqdm(data):
                results.append(process_line(line))

            new_lines_affected, new_lines_unaffected, unaffected_sents = zip(
                *results)
            new_lines_affected = flatten_list(new_lines_affected)
            new_lines_unaffected = flatten_list(new_lines_unaffected)
            unaffected_sents = flatten_list(unaffected_sents)

            # Name new file
            new_file_affected = os.path.basename(
|
289 |
+
file).replace(json_ext, "_affected.test")
|
290 |
+
new_file_unaffected = os.path.basename(
|
291 |
+
file).replace(json_ext, "_unaffected.test")
|
292 |
+
file_unaffected_sents = os.path.basename(
|
293 |
+
file).replace(json_ext, "_unaffected_sents.test")
|
294 |
+
|
295 |
+
# Create directory
|
296 |
+
data_write_directory = f"{BABYLM_DATA_PATH}/Qwen_perturbed_data/{MODEL_NAME}"
|
297 |
+
directory_affected = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_affected/"
|
298 |
+
if not os.path.exists(directory_affected):
|
299 |
+
os.makedirs(directory_affected)
|
300 |
+
directory_unaffected = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_unaffected/"
|
301 |
+
if not os.path.exists(directory_unaffected):
|
302 |
+
os.makedirs(directory_unaffected)
|
303 |
+
directory_unaffected_sents = f"{data_write_directory}/babylm_{args.perturbation_type}/babylm_test_unaffected_sents/"
|
304 |
+
if not os.path.exists(directory_unaffected_sents):
|
305 |
+
os.makedirs(directory_unaffected_sents)
|
306 |
+
|
307 |
+
# Write files
|
308 |
+
write_file(directory_affected,
|
309 |
+
new_file_affected, new_lines_affected)
|
310 |
+
write_file(directory_unaffected,
|
311 |
+
new_file_unaffected, new_lines_unaffected)
|
312 |
+
write_file(directory_unaffected_sents,
|
313 |
+
file_unaffected_sents, unaffected_sents)
|
314 |
+
|
315 |
+
else:
|
316 |
+
# Yj: BABYLM_SPLITS = ['100M', '10M', 'dev', 'test', 'unittest']
|
317 |
+
# Iterate over files and do transform
|
318 |
+
for file in babylm_data:
|
319 |
+
print(file)
|
320 |
+
f = open(file)
|
321 |
+
data = json.load(f)
|
322 |
+
f.close()
|
323 |
+
|
324 |
+
# Perturb data iteratively
|
325 |
+
results = []
|
326 |
+
for line in tqdm.tqdm(data):
|
327 |
+
results.append(process_line(line))
|
328 |
+
|
329 |
+
new_lines_affected, new_lines_unaffected, _ = zip(
|
330 |
+
*results)
|
331 |
+
|
332 |
+
new_lines_affected = flatten_list(new_lines_affected)
|
333 |
+
new_lines_unaffected = flatten_list(new_lines_unaffected)
|
334 |
+
|
335 |
+
# Combine affected and unaffected sentences
|
336 |
+
new_lines = new_lines_unaffected + new_lines_affected
|
337 |
+
|
338 |
+
# Name new file
|
339 |
+
if babylm_dataset == "dev":
|
340 |
+
new_file = os.path.basename(file).replace(json_ext, ".dev")
|
341 |
+
elif babylm_dataset == 'unittest':
|
342 |
+
new_file = os.path.basename(file).replace(json_ext, ".test")
|
343 |
+
|
344 |
+
# Print strings for unittest
|
345 |
+
new_lines_decoded = [qwen_tokenizer.decode(
|
346 |
+
[int(tok) for tok in line.split()]) + "\n" for line in new_lines]
|
347 |
+
new_lines_with_strings = []
|
348 |
+
for tokens, line in list(zip(new_lines, new_lines_decoded)):
|
349 |
+
new_lines_with_strings.append(tokens)
|
350 |
+
new_lines_with_strings.append(line)
|
351 |
+
new_lines = new_lines_with_strings
|
352 |
+
|
353 |
+
else:
|
354 |
+
new_file = os.path.basename(file).replace(json_ext, ".train") # '10M 100M' is training set
|
355 |
+
|
356 |
+
# Create directory and write file
|
357 |
+
directory = f"{BABYLM_DATA_PATH}/Perturbed_data/{MODEL_NAME}/babylm_{args.perturbation_type}/babylm_{babylm_dataset}/"
|
358 |
+
print("directory:", directory)
|
359 |
+
if not os.path.exists(directory):
|
360 |
+
os.makedirs(directory)
|
361 |
+
write_file(directory, new_file, new_lines)
|
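Editor's note: a minimal, standalone sketch of the set-based comparison that lines_equivalent_determiner_swap relies on. The token-ID strings below are invented purely for illustration; the point is that swapping two determiners within a line leaves the set of whitespace-separated tokens unchanged even though their order differs.

    # Hypothetical token-ID lines (made-up IDs): the determiner-swap output moves
    # determiners around, but the set of tokens per line should stay the same.
    det_line = "1996 3899 2003 1037 4937"    # e.g. "the dog is a cat"
    ident_line = "1037 3899 2003 1996 4937"  # determiners swapped: "a dog is the cat"

    assert set(det_line.split()) == set(ident_line.split())  # same token sets -> equivalent
    assert det_line.split() != ident_line.split()            # order differs, lists do not match
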
data/tag.py
ADDED
@@ -0,0 +1,153 @@
# tag.py
# Author: Julie Kallini

# For importing utils
import sys
sys.path.append("..")

import pytest
import glob
import tqdm
import os
import argparse
import stanza
import json


test_all_files = sorted(glob.glob("babylm_data/babylm_*/*"))
test_original_files = [f for f in test_all_files if ".json" not in f]
test_json_files = [f for f in test_all_files if "_parsed.json" in f]
test_cases = list(zip(test_original_files, test_json_files))


@pytest.mark.parametrize("original_file, json_file", test_cases)
def test_equivalent_lines(original_file, json_file):

    # Read lines of file and remove all whitespace
    original_file = open(original_file)
    original_data = "".join(original_file.readlines())
    original_data = "".join(original_data.split())

    json_file = open(json_file)
    json_lines = json.load(json_file)
    json_data = ""
    for line in json_lines:
        for sent in line["sent_annotations"]:
            json_data += sent["sent_text"]
    json_data = "".join(json_data.split())

    # Test equivalence
    assert (original_data == json_data)


def __get_constituency_parse(sent, nlp):

    # Try parsing the doc
    try:
        parse_doc = nlp(sent.text)
    except:
        return None

    # Get set of constituency parse trees
    parse_trees = [str(sent.constituency) for sent in parse_doc.sentences]

    # Join parse trees and add ROOT
    constituency_parse = "(ROOT " + " ".join(parse_trees) + ")"
    return constituency_parse


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog='Tag BabyLM dataset',
        description='Tag BabyLM dataset using Stanza')
    parser.add_argument('path', type=argparse.FileType('r'),
                        nargs='+', help="Path to file(s)")
    parser.add_argument('-p', '--parse', action='store_true',
                        help="Include constituency parse")

    # Get args
    args = parser.parse_args()

    # Init Stanza NLP tools
    nlp1 = stanza.Pipeline(
        lang='en',
        processors='tokenize, pos, lemma',
        package="default_accurate",
        use_gpu=True)

    # If constituency parse is needed, init second Stanza parser
    if args.parse:
        nlp2 = stanza.Pipeline(lang='en',
                               processors='tokenize,pos,constituency',
                               package="default_accurate",
                               use_gpu=True)

    # BATCH_SIZE = 5000
    BATCH_SIZE = 100

    # Iterate over BabyLM files
    for file in args.path:

        print(file.name)
        lines = file.readlines()

        # Strip lines and join text
        print("Concatenating lines...")
        lines = [l.strip() for l in lines]
        line_batches = [lines[i:i + BATCH_SIZE]
                        for i in range(0, len(lines), BATCH_SIZE)]
        text_batches = [" ".join(l) for l in line_batches]

        # Iterate over lines in file and track annotations
        line_annotations = []
        print("Segmenting and parsing text batches...")
        for text in tqdm.tqdm(text_batches):
            # Tokenize text with stanza
            doc = nlp1(text)

            # Iterate over sents in the line and track annotations
            sent_annotations = []
            for sent in doc.sentences:

                # Iterate over words in sent and track annotations
                word_annotations = []
                for token, word in zip(sent.tokens, sent.words):
                    wa = {
                        'id': word.id,
                        'text': word.text,
                        'lemma': word.lemma,
                        'upos': word.upos,
                        'xpos': word.xpos,
                        'feats': word.feats,
                        'start_char': token.start_char,
                        'end_char': token.end_char
                    }
                    word_annotations.append(wa)  # Track word annotation

                # Get constituency parse if needed
                if args.parse:
                    constituency_parse = __get_constituency_parse(sent, nlp2)
                    sa = {
                        'sent_text': sent.text,
                        'constituency_parse': constituency_parse,
                        'word_annotations': word_annotations,
                    }
                else:
                    sa = {
                        'sent_text': sent.text,
                        'word_annotations': word_annotations,
                    }
                sent_annotations.append(sa)  # Track sent annotation

            la = {
                'sent_annotations': sent_annotations
            }
            line_annotations.append(la)  # Track line annotation

        # Write annotations to file as a JSON
        print("Writing JSON outfile...")
        ext = '_parsed.json' if args.parse else '.json'
        json_filename = os.path.splitext(file.name)[0] + ext
        with open(json_filename, "w") as outfile:
            json.dump(line_annotations, outfile, indent=4)

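Editor's note: a short sketch of how the JSON that tag.py writes can be read back. The structure (one entry per line, each with sent_annotations holding sent_text, word_annotations, and an optional constituency_parse) comes straight from the code above; the file path and printed values are hypothetical.

    import json

    # Hypothetical path; real outputs sit next to the source text and end in ".json" or "_parsed.json"
    with open("babylm_data/babylm_dev/example_parsed.json") as f:
        line_annotations = json.load(f)

    first_sent = line_annotations[0]["sent_annotations"][0]
    print(first_sent["sent_text"])                    # raw sentence text
    print(first_sent["word_annotations"][0]["upos"])  # e.g. "DET"
    print(first_sent.get("constituency_parse"))       # "(ROOT (S ...))" when tagged with --parse
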
data/tag_1.py
ADDED
@@ -0,0 +1,166 @@
# tag.py
# Author: Julie Kallini

# For importing utils
import sys
sys.path.append("..")

import pytest
import glob
import tqdm
import os
import argparse
import stanza
import json
from transformers import AutoTokenizer

# Define the function to chunk text
def chunk_text(text, tokenizer, max_length=512):
    tokens = tokenizer(text)['input_ids']
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

# Test case for checking equivalence of original and parsed files
test_all_files = sorted(glob.glob("babylm_data/babylm_*/*"))
test_original_files = [f for f in test_all_files if ".json" not in f]
test_json_files = [f for f in test_all_files if "_parsed.json" in f]
test_cases = list(zip(test_original_files, test_json_files))

@pytest.mark.parametrize("original_file, json_file", test_cases)
def test_equivalent_lines(original_file, json_file):

    # Read lines of file and remove all whitespace
    original_file = open(original_file)
    original_data = "".join(original_file.readlines())
    original_data = "".join(original_data.split())

    json_file = open(json_file)
    json_lines = json.load(json_file)
    json_data = ""
    for line in json_lines:
        for sent in line["sent_annotations"]:
            json_data += sent["sent_text"]
    json_data = "".join(json_data.split())

    # Test equivalence
    assert (original_data == json_data)

# Constituency parsing function
def __get_constituency_parse(sent, nlp):

    # Try parsing the doc
    try:
        parse_doc = nlp(sent.text)
    except:
        return None

    # Get set of constituency parse trees
    parse_trees = [str(sent.constituency) for sent in parse_doc.sentences]

    # Join parse trees and add ROOT
    constituency_parse = "(ROOT " + " ".join(parse_trees) + ")"
    return constituency_parse

# Main function
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog='Tag BabyLM dataset',
        description='Tag BabyLM dataset using Stanza')
    parser.add_argument('path', type=argparse.FileType('r'),
                        nargs='+', help="Path to file(s)")
    parser.add_argument('-p', '--parse', action='store_true',
                        help="Include constituency parse")

    # Get args
    args = parser.parse_args()

    # Init Stanza NLP tools
    nlp1 = stanza.Pipeline(
        lang='en',
        processors='tokenize, pos, lemma',
        package="default_accurate",
        use_gpu=True)

    # If constituency parse is needed, init second Stanza parser
    if args.parse:
        nlp2 = stanza.Pipeline(lang='en',
                               processors='tokenize,pos,constituency',
                               package="default_accurate",
                               use_gpu=True)

    BATCH_SIZE = 100

    # Tokenizer for splitting long text
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Iterate over BabyLM files
    for file in args.path:

        print(file.name)
        lines = file.readlines()

        # Strip lines and join text
        print("Concatenating lines...")
        lines = [l.strip() for l in lines]
        line_batches = [lines[i:i + BATCH_SIZE]
                        for i in range(0, len(lines), BATCH_SIZE)]
        text_batches = [" ".join(l) for l in line_batches]

        # Iterate over lines in file and track annotations
        line_annotations = []
        print("Segmenting and parsing text batches...")
        for text in tqdm.tqdm(text_batches):
            # Split the text into chunks if it exceeds the max length
            text_chunks = chunk_text(text, tokenizer)

            # Iterate over each chunk
            for chunk in text_chunks:
                # Tokenize text with stanza
                doc = nlp1(chunk)

                # Iterate over sentences in the line and track annotations
                sent_annotations = []
                for sent in doc.sentences:

                    # Iterate over words in the sentence and track annotations
                    word_annotations = []
                    for token, word in zip(sent.tokens, sent.words):
                        wa = {
                            'id': word.id,
                            'text': word.text,
                            'lemma': word.lemma,
                            'upos': word.upos,
                            'xpos': word.xpos,
                            'feats': word.feats,
                            'start_char': token.start_char,
                            'end_char': token.end_char
                        }
                        word_annotations.append(wa)  # Track word annotation

                    # Get constituency parse if needed
                    if args.parse:
                        constituency_parse = __get_constituency_parse(sent, nlp2)
                        sa = {
                            'sent_text': sent.text,
                            'constituency_parse': constituency_parse,
                            'word_annotations': word_annotations,
                        }
                    else:
                        sa = {
                            'sent_text': sent.text,
                            'word_annotations': word_annotations,
                        }
                    sent_annotations.append(sa)  # Track sent annotation

                la = {
                    'sent_annotations': sent_annotations
                }
                line_annotations.append(la)  # Track line annotation

        # Write annotations to file as a JSON
        print("Writing JSON outfile...")
        ext = '_parsed.json' if args.parse else '.json'
        json_filename = os.path.splitext(file.name)[0] + ext
        with open(json_filename, "w") as outfile:
            json.dump(line_annotations, outfile, indent=4)

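Editor's note: the chunk_text helper above is what tag_1.py adds over tag.py: it caps each Stanza input at 512 subword tokens of the bert-base-uncased tokenizer. A small usage sketch (the helper is copied here for self-containment; the sample text is invented):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def chunk_text(text, tokenizer, max_length=512):
        tokens = tokenizer(text)['input_ids']
        chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
        return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

    long_text = " ".join(["the cat sat on the mat"] * 200)  # invented sample text, well over 512 subwords
    chunks = chunk_text(long_text, tokenizer)
    print(len(chunks))  # several chunks, each decoded from at most 512 subword IDs
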
data/tag_distributed.py
ADDED
@@ -0,0 +1,106 @@
# Files can be processed on different GPUs; each file is handled by one GPU.
import torch
import torch.distributed as dist
import sys
sys.path.append("..")

import pytest
import glob
import tqdm
import os
import argparse
import stanza
import json
from transformers import AutoTokenizer

def chunk_text(text, tokenizer, max_length=512):
    tokens = tokenizer(text)['input_ids']
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

def init_distributed_mode():
    dist.init_process_group(backend='nccl')
    rank = dist.get_rank()
    torch.cuda.set_device(rank)  # pin this process to the GPU matching its rank
    return rank

def run_on_gpu(rank, args, tokenizer, nlp1, nlp2):
    print(f"Running on Rank {rank}, using GPU {torch.cuda.current_device()}")
    print(f"Rank {rank}, GPU {torch.cuda.current_device()} started")
    files_per_gpu = len(args.path) // dist.get_world_size()
    start_idx = rank * files_per_gpu
    end_idx = start_idx + files_per_gpu if rank != dist.get_world_size() - 1 else len(args.path)
    gpu_files = args.path[start_idx:end_idx]

    for file in gpu_files:
        print(f"GPU {rank}: Processing {file.name}")
        lines = file.readlines()

        lines = [l.strip() for l in lines]
        line_batches = [lines[i:i + BATCH_SIZE] for i in range(0, len(lines), BATCH_SIZE)]
        text_batches = [" ".join(l) for l in line_batches]

        line_annotations = []
        for text in tqdm.tqdm(text_batches, desc=f"GPU {rank}"):
            text_chunks = chunk_text(text, tokenizer)
            for chunk in text_chunks:
                doc = nlp1(chunk)
                sent_annotations = []
                for sent in doc.sentences:
                    word_annotations = []
                    for token, word in zip(sent.tokens, sent.words):
                        wa = {
                            'id': word.id,
                            'text': word.text,
                            'lemma': word.lemma,
                            'upos': word.upos,
                            'xpos': word.xpos,
                            'feats': word.feats,
                            'start_char': token.start_char,
                            'end_char': token.end_char
                        }
                        word_annotations.append(wa)

                    sa = {
                        'sent_text': sent.text,
                        'word_annotations': word_annotations
                    }
                    if args.parse:
                        sa['constituency_parse'] = __get_constituency_parse(sent, nlp2)

                    sent_annotations.append(sa)
                line_annotations.append({'sent_annotations': sent_annotations})

        # Parentheses are needed so only the extension (not the whole filename) is conditional
        json_filename = os.path.splitext(file.name)[0] + ('_parsed.json' if args.parse else '.json')
        with open(json_filename, "w") as outfile:
            json.dump(line_annotations, outfile, indent=4)

def __get_constituency_parse(sent, nlp):
    try:
        parse_doc = nlp(sent.text)
    except:
        return None
    parse_trees = [str(sent.constituency) for sent in parse_doc.sentences]
    return "(ROOT " + " ".join(parse_trees) + ")"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='Tag BabyLM dataset',
        description='Tag BabyLM dataset using Stanza')
    parser.add_argument('path', type=argparse.FileType('r'),
                        nargs='+', help="Path to file(s)")
    parser.add_argument('-p', '--parse', action='store_true',
                        help="Include constituency parse")
    args = parser.parse_args()

    rank = init_distributed_mode()

    BATCH_SIZE = 1000
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    nlp1 = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', package="default_accurate", use_gpu=True)

    nlp2 = None
    if args.parse:
        nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', package="default_accurate", use_gpu=True)

    run_on_gpu(rank, args, tokenizer, nlp1, nlp2)

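Editor's note: run_on_gpu assigns every rank len(path) // world_size files and hands any remainder to the last rank. A self-contained sketch of that arithmetic, with invented file names and a 3-process world size:

    files = ["f0.txt", "f1.txt", "f2.txt", "f3.txt", "f4.txt", "f5.txt", "f6.txt"]  # invented
    world_size = 3  # invented

    for rank in range(world_size):
        files_per_gpu = len(files) // world_size
        start_idx = rank * files_per_gpu
        end_idx = start_idx + files_per_gpu if rank != world_size - 1 else len(files)
        print(rank, files[start_idx:end_idx])
    # rank 0 -> ['f0.txt', 'f1.txt'], rank 1 -> ['f2.txt', 'f3.txt'], rank 2 -> ['f4.txt', 'f5.txt', 'f6.txt']
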
data/tag_single.py
ADDED
@@ -0,0 +1,140 @@
# A single file can be split into smaller shards and processed on different GPUs.
import torch
import torch.distributed as dist
import sys
sys.path.append("..")

import pytest
import glob
import tqdm
import os
import argparse
import stanza
import json
from transformers import AutoTokenizer

def chunk_text(text, tokenizer, max_length=512):
    tokens = tokenizer(text)['input_ids']
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

def init_distributed_mode():
    dist.init_process_group(backend='nccl')
    rank = dist.get_rank()
    torch.cuda.set_device(rank)  # pin this process to the GPU matching its rank
    return rank

def process_single_file(file, rank, tokenizer, nlp1, nlp2):
    print(f"GPU {rank}: Processing {file.name}")
    lines = file.readlines()

    # Split the work across GPUs by line count
    num_lines = len(lines)
    num_gpus = dist.get_world_size()

    lines_per_gpu = (num_lines + num_gpus - 1) // num_gpus
    start_idx = rank * lines_per_gpu
    end_idx = min(start_idx + lines_per_gpu, num_lines)
    gpu_lines = lines[start_idx:end_idx]

    line_batches = [gpu_lines[i:i + BATCH_SIZE] for i in range(0, len(gpu_lines), BATCH_SIZE)]
    text_batches = [" ".join(l) for l in line_batches]

    line_annotations = []
    for text in tqdm.tqdm(text_batches, desc=f"GPU {rank}"):
        text_chunks = chunk_text(text, tokenizer)
        for chunk in text_chunks:
            doc = nlp1(chunk)
            sent_annotations = []
            for sent in doc.sentences:
                word_annotations = []
                for token, word in zip(sent.tokens, sent.words):
                    wa = {
                        'id': word.id,
                        'text': word.text,
                        'lemma': word.lemma,
                        'upos': word.upos,
                        'xpos': word.xpos,
                        'feats': word.feats,
                        'start_char': token.start_char,
                        'end_char': token.end_char
                    }
                    word_annotations.append(wa)

                sa = {
                    'sent_text': sent.text,
                    'word_annotations': word_annotations
                }
                if args.parse:
                    sa['constituency_parse'] = __get_constituency_parse(sent, nlp2)

                sent_annotations.append(sa)
            line_annotations.append({'sent_annotations': sent_annotations})

    # Temporarily store this GPU's output
    temp_filename = os.path.splitext(file.name)[0] + f'_rank{rank}.json'
    with open(temp_filename, "w") as outfile:
        json.dump(line_annotations, outfile, indent=4)

    return temp_filename

def merge_files(temp_files, output_file):
    merged_data = []
    for file in temp_files:
        with open(file, "r") as infile:
            data = json.load(infile)
        merged_data.extend(data)
        os.remove(file)  # delete the temporary file

    with open(output_file, "w") as outfile:
        json.dump(merged_data, outfile, indent=4)

def run_on_gpu(rank, args, tokenizer, nlp1, nlp2):
    print(f"Running on Rank {rank}, using GPU {torch.cuda.current_device()}")

    temp_files = []
    if len(args.path) == 1:
        temp_files.append(process_single_file(args.path[0], rank, tokenizer, nlp1, nlp2))
        dist.barrier()  # wait for all processes to finish their shard
        if rank == 0:
            # Merge the per-rank files. Their names follow the deterministic
            # "<base>_rank{r}.json" pattern, so rank 0 reconstructs the full list
            # rather than merging only its own temp file.
            base = os.path.splitext(args.path[0].name)[0]
            all_temp_files = [f"{base}_rank{r}.json" for r in range(dist.get_world_size())]
            final_output = base + '_merged.json'
            merge_files(all_temp_files, final_output)
    else:
        files_per_gpu = len(args.path) // dist.get_world_size()
        start_idx = rank * files_per_gpu
        end_idx = start_idx + files_per_gpu if rank != dist.get_world_size() - 1 else len(args.path)
        gpu_files = args.path[start_idx:end_idx]

        for file in gpu_files:
            process_single_file(file, rank, tokenizer, nlp1, nlp2)

def __get_constituency_parse(sent, nlp):
    try:
        parse_doc = nlp(sent.text)
    except:
        return None
    parse_trees = [str(sent.constituency) for sent in parse_doc.sentences]
    return "(ROOT " + " ".join(parse_trees) + ")"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='Tag BabyLM dataset',
        description='Tag BabyLM dataset using Stanza')
    parser.add_argument('path', type=argparse.FileType('r'),
                        nargs='+', help="Path to file(s)")
    parser.add_argument('-p', '--parse', action='store_true',
                        help="Include constituency parse")
    args = parser.parse_args()

    rank = init_distributed_mode()

    BATCH_SIZE = 1000
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    nlp1 = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', package="default_accurate", use_gpu=True)

    nlp2 = None
    if args.parse:
        nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', package="default_accurate", use_gpu=True)

    run_on_gpu(rank, args, tokenizer, nlp1, nlp2)

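Editor's note: a minimal sketch of the per-rank shard naming and merge step used by tag_single.py, with invented shard contents and filenames. It mirrors merge_files: the per-rank JSON lists are concatenated into one file and the temporary shards are removed.

    import json, os

    # Invented shards following the script's "<base>_rank{r}.json" pattern
    shards = {"corpus_rank0.json": [{"sent_annotations": []}],
              "corpus_rank1.json": [{"sent_annotations": []}]}
    for name, data in shards.items():
        with open(name, "w") as f:
            json.dump(data, f)

    merged = []
    for name in ["corpus_rank0.json", "corpus_rank1.json"]:
        with open(name) as f:
            merged.extend(json.load(f))
        os.remove(name)  # temp shards are deleted after merging, as in merge_files

    with open("corpus_merged.json", "w") as f:
        json.dump(merged, f, indent=4)
    print(len(merged))  # 2 line annotations in the merged output
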
perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-1000.csv
ADDED
@@ -0,0 +1,2 @@
Perplexity
12239897.0
perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-10000.csv
ADDED
@@ -0,0 +1,2 @@
Perplexity
100086656.0
perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-11500.csv
ADDED
@@ -0,0 +1,2 @@
Perplexity
86934072.0
perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-1500.csv
ADDED
@@ -0,0 +1,2 @@
Perplexity
221982080.0
perplexities/perplexity_results/Qwen2.5-0.5B/reverse_full/Qwen2.5-0.5B_seed1_test_reverse_full_checkpoint-2000.csv
ADDED
@@ -0,0 +1,2 @@
Perplexity
389647168.0