Upload folder using huggingface_hub
- .gitattributes +9 -0
- README.md +5 -0
- requirements.txt +13 -0
- src/__init__.py +0 -0
- src/config.py +66 -0
- src/exp.py +276 -0
- whisper-alignment-results/base-to-large-probes.safetensors +3 -0
- whisper-alignment-results/base-to-medium-probes.safetensors +3 -0
- whisper-alignment-results/base-to-small-probes.safetensors +3 -0
- whisper-alignment-results/base-vs-large-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/base-vs-medium-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/base-vs-small-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/small-to-large-probes.safetensors +3 -0
- whisper-alignment-results/small-to-medium-probes.safetensors +3 -0
- whisper-alignment-results/small-vs-large-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/small-vs-medium-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/tiny-to-base-probes.safetensors +3 -0
- whisper-alignment-results/tiny-to-large-probes.safetensors +3 -0
- whisper-alignment-results/tiny-to-medium-probes.safetensors +3 -0
- whisper-alignment-results/tiny-to-small-probes.safetensors +3 -0
- whisper-alignment-results/tiny-vs-base-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/tiny-vs-large-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/tiny-vs-medium-temporal-linear-mse-log.png +3 -0
- whisper-alignment-results/tiny-vs-small-temporal-linear-mse-log.png +3 -0

.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/base-vs-large-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/base-vs-medium-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/base-vs-small-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/small-vs-large-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/small-vs-medium-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/tiny-vs-base-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/tiny-vs-large-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/tiny-vs-medium-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text
+whisper-alignment-results/tiny-vs-small-temporal-linear-mse-log.png filter=lfs diff=lfs merge=lfs -text

README.md ADDED
@@ -0,0 +1,5 @@
+# Layer Alignment Analysis for Whisper Encoders
+
+This project analyzes and compares the internal representations of OpenAI Whisper encoder models; it is designed for research on model interpretability and transferability.
+
+All settings are adjustable in `src/config.py`.

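For a quick smoke test, the run can be shrunk before launching (a sketch; the values below are illustrative, the names are the ones defined in `src/config.py`):

    # example smoke-test overrides for src/config.py (illustrative values)
    MAX_SAMPLES = 50        # limit the dataset, as the config comment suggests
    TRAINING_STEPS = 50     # fewer optimization steps per probe
    ENABLED_MODELS = {"tiny": "openai/whisper-tiny", "base": "openai/whisper-base"}

Then install the packages listed in `requirements.txt` and launch `python -m src.exp` from the repository root; the relative import `from .config import *` in `src/exp.py` requires the module-style invocation.
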
requirements.txt ADDED
@@ -0,0 +1,13 @@
+joblib
+matplotlib
+numpy
+torch
+tqdm
+transformers
+datasets
+librosa
+soundfile
+safetensors
+
+isort
+black

src/__init__.py ADDED
File without changes

src/config.py ADDED
@@ -0,0 +1,66 @@
+# Memory Configuration for Whisper Alignment Analysis
+
+# Batch processing settings
+BATCH_SIZE = 16  # Reduce if you get OOM errors; increase for faster processing
+TRAINING_STEPS = 200  # Number of training steps for linear probes
+LEARNING_RATE = 1e-3
+
+# Model selection
+ENABLED_MODELS = {
+    "tiny": "openai/whisper-tiny",  # ~39M parameters
+    "base": "openai/whisper-base",  # ~74M parameters
+    "small": "openai/whisper-small",  # ~244M parameters
+    "medium": "openai/whisper-medium",  # ~769M parameters
+    "large": "openai/whisper-large-v3-turbo",  # ~809M parameters
+}
+
+# Memory optimization settings
+USE_HALF_PRECISION = (
+    True  # Use half precision (bfloat16 preferred, float16 fallback) instead of float32
+)
+AGGRESSIVE_CLEANUP = False  # Clear GPU cache after each operation
+
+# Dataset settings
+MAX_SAMPLES = None  # Set to a number to limit dataset size for testing (e.g., 50)
+
+# Output settings
+OUTPUT_DIR = "whisper-alignment-results"
+SAVE_PLOTS = True
+PLOT_DPI = 300
+
+
+# Half precision dtype selection (bfloat16 preferred if available, fallback to float16)
+def get_half_precision_dtype():
+    """
+    Determine the best half precision dtype based on hardware support.
+    bfloat16 is preferred when available as it has better numerical stability.
+    """
+    import torch
+
+    if not USE_HALF_PRECISION:
+        return torch.float32
+
+    # Check if bfloat16 is supported
+    if torch.cuda.is_available():
+        # Check GPU support for bfloat16
+        device_capability = torch.cuda.get_device_capability()
+        # bfloat16 is supported on Ampere (8.x) and newer GPUs
+        if device_capability[0] >= 8:
+            return torch.bfloat16
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        # Apple Silicon supports bfloat16
+        return torch.bfloat16
+    elif (
+        hasattr(torch, "backends")
+        and hasattr(torch.backends, "cpu")
+        and hasattr(torch.backends.cpu, "supports_bfloat16")
+    ):
+        # Check CPU support for bfloat16 (newer PyTorch versions)
+        if torch.backends.cpu.supports_bfloat16:
+            return torch.bfloat16
+
+    # Fallback to float16
+    return torch.float16
+
+
+HALF_PRECISION_DTYPE = get_half_precision_dtype()

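A quick way to see which dtype the heuristic resolves to on a given machine (a sketch; `check_dtype.py` is a hypothetical helper, run from the repository root so the `src` package imports cleanly):

    # check_dtype.py (hypothetical helper, not part of the repo)
    from src.config import HALF_PRECISION_DTYPE, USE_HALF_PRECISION

    # Prints torch.bfloat16 on Ampere-or-newer GPUs and Apple Silicon,
    # torch.float16 otherwise, and torch.float32 if USE_HALF_PRECISION is False.
    print(USE_HALF_PRECISION, HALF_PRECISION_DTYPE)
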
src/exp.py ADDED
@@ -0,0 +1,276 @@
+import os
+from itertools import combinations
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+from datasets import Audio, load_dataset
+from safetensors.torch import save_file
+from tqdm import tqdm
+from transformers import AutoFeatureExtractor, WhisperModel
+
+from .config import *
+
+model_ids = ENABLED_MODELS
+
+# Load dataset
+dataset = load_dataset("JacobLinCool/cv161-en-zh-subset-200", split="train")
+if MAX_SAMPLES is not None:
+    dataset = dataset.select(range(min(MAX_SAMPLES, len(dataset))))
+    print(f"Limited dataset to {len(dataset)} samples for testing")
+
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
+
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps" if torch.backends.mps.is_available() else "cpu"
+)
+print(f"Using device: {device}")
+
+
+def extract_layer_reps_generator(model_id, batch_size=4):
+    """
+    Process samples in batches with a generator, avoiding loading all hidden states into memory at once.
+    Yields (sample_idx, layer_reps) pairs, where layer_reps is a list of all layer representations for the sample.
+    """
+    model = WhisperModel.from_pretrained(model_id).to(device)
+    feat_ext = AutoFeatureExtractor.from_pretrained(model_id)
+    model.eval()
+
+    for i in tqdm(
+        range(0, len(dataset), batch_size), desc=f"Processing {model_id} in batches"
+    ):
+        batch_end = min(i + batch_size, len(dataset))
+        batch_samples = dataset.select(range(i, batch_end))
+
+        # Process each sample in the batch
+        for j, sample in enumerate(batch_samples):
+            audio = sample["audio"]
+            samples = audio["array"]
+            sr = audio["sampling_rate"]
+
+            inputs = feat_ext(
+                samples, sampling_rate=sr, return_tensors="pt"
+            ).input_features.to(device)
+            with torch.no_grad():
+                outputs = model.encoder(
+                    inputs, return_dict=True, output_hidden_states=True
+                )
+
+            # Keep the full sequence for each layer; optionally cast to half precision to save memory
+            layer_reps_for_sample = []
+            for hs in outputs.hidden_states:
+                # hs: [1, T, D] -> [T, D]
+                layer_rep = hs.squeeze(0)
+                if USE_HALF_PRECISION:
+                    layer_rep = layer_rep.to(HALF_PRECISION_DTYPE)
+                layer_reps_for_sample.append(layer_rep)
+
+            yield i + j, layer_reps_for_sample
+
+            # Clean up GPU memory
+            del outputs, inputs
+            if AGGRESSIVE_CLEANUP and torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+    # Clean up model memory
+    del model, feat_ext
+    if AGGRESSIVE_CLEANUP and torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+def compute_linear_mse_matrix_temporal_memory_efficient(
+    model_a_id, model_b_id, n_steps=200, lr=1e-3, batch_size=4
+):
+    """
+    Memory-efficient version: for each layer pair (i, j), trains a 1x1 convolution as a linear probe and computes MSE.
+    Uses a generator to process in batches, avoiding loading all representations into memory at once.
+    Returns an MSE matrix of shape (layers_a, layers_b) and all trained probes.
+    """
+    print(f"Computing alignment between {model_a_id} and {model_b_id}...")
+
+    # First, get the number of layers
+    sample_gen_a = extract_layer_reps_generator(model_a_id, batch_size=1)
+    _, sample_reps_a = next(sample_gen_a)
+    layers_a = len(sample_reps_a)
+
+    sample_gen_b = extract_layer_reps_generator(model_b_id, batch_size=1)
+    _, sample_reps_b = next(sample_gen_b)
+    layers_b = len(sample_reps_b)
+
+    mse_mat = np.zeros((layers_a, layers_b))
+    trained_probes = {}
+
+    pbar = tqdm(total=layers_a * layers_b, desc="Comparing layer pairs")
+
+    # Re-initialize generators to process all samples
+    gen_a = extract_layer_reps_generator(model_a_id, batch_size=batch_size)
+    gen_b = extract_layer_reps_generator(model_b_id, batch_size=batch_size)
+
+    # Collect all sample representations for every layer
+    reps_a_dict_all = {}
+    for sample_idx, layer_reps in gen_a:
+        reps_a_dict_all[sample_idx] = layer_reps
+
+    reps_b_dict_all = {}
+    for sample_idx, layer_reps in gen_b:
+        reps_b_dict_all[sample_idx] = layer_reps
+
+    for i in range(layers_a):
+        for j in range(layers_b):
+            # Collect all sample representations for the specified layer
+            reps_a_dict = {}
+            for sample_idx, layer_reps in reps_a_dict_all.items():
+                if i < len(layer_reps):
+                    reps_a_dict[sample_idx] = layer_reps[i]
+
+            reps_b_dict = {}
+            for sample_idx, layer_reps in reps_b_dict_all.items():
+                if j < len(layer_reps):
+                    reps_b_dict[sample_idx] = layer_reps[j]
+
+            # Concatenate representations in sample order
+            X_list = [reps_a_dict[idx] for idx in sorted(reps_a_dict.keys())]
+            Y_list = [reps_b_dict[idx] for idx in sorted(reps_b_dict.keys())]
+
+            # Concatenate all tokens for this layer pair and move to the compute device
+            X_cat = torch.cat(X_list, dim=0).to(device)
+            Y_cat = torch.cat(Y_list, dim=0).to(device)
+
+            dim_a = X_cat.shape[1]
+            dim_b = Y_cat.shape[1]
+
+            # 1. For Conv1d, reshape to [Batch, Channels, Length]
+            X = X_cat.T.unsqueeze(0)  # [1, Dim_A, Total_Tokens]
+            Y = Y_cat.T.unsqueeze(0)  # [1, Dim_B, Total_Tokens]
+
+            # 2. Define and train the linear probe (1x1 Conv)
+            probe = nn.Conv1d(
+                in_channels=dim_a, out_channels=dim_b, kernel_size=1, bias=False
+            ).to(device=device, dtype=HALF_PRECISION_DTYPE)
+            probe.train()
+
+            optimizer = torch.optim.Adam(probe.parameters(), lr=lr)
+            loss_fn = nn.MSELoss()
+
+            for step in tqdm(range(n_steps), desc=f"Training probe {i}->{j}"):
+                optimizer.zero_grad()
+                Y_pred = probe(X)
+                loss = loss_fn(Y_pred, Y)
+                loss.backward()
+                optimizer.step()
+
+            # 3. Record the final MSE and the trained probe
+            final_mse = loss.item()
+            mse_mat[i, j] = final_mse
+            trained_probes[f"layer_{i}_to_{j}"] = probe.state_dict()["weight"]
+
+            # Clean up memory
+            del (
+                X_cat,
+                Y_cat,
+                X,
+                Y,
+                probe,
+                optimizer,
+                reps_a_dict,
+                reps_b_dict,
+                X_list,
+                Y_list,
+            )
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            pbar.update(1)
+            pbar.set_postfix({"layer_a": i, "layer_b": j, "mse": f"{final_mse:.4f}"})
+
+    pbar.close()
+    return mse_mat, trained_probes
+
+
+if __name__ == "__main__":
+    print("Memory optimization settings:")
+    print(f"  Batch size: {BATCH_SIZE}")
+    print(f"  Training steps: {TRAINING_STEPS}")
+    if USE_HALF_PRECISION:
+        dtype_name = "bfloat16" if HALF_PRECISION_DTYPE == torch.bfloat16 else "float16"
+        print(f"  Half precision: {USE_HALF_PRECISION} ({dtype_name})")
+    else:
+        print(f"  Half precision: {USE_HALF_PRECISION}")
+    print(f"  Aggressive cleanup: {AGGRESSIVE_CLEANUP}")
+    print(f"  Models: {list(model_ids.keys())}")
+    print(f"  Dataset size: {len(dataset)} samples")
+
+    # Create results directory
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    # Compare all model pairs using the memory-efficient method
+    model_names = list(model_ids.keys())
+    all_pairs = list(combinations(model_names, 2))
+
+    print(
+        f"\nProcessing {len(all_pairs)} model pairs with memory-efficient approach..."
+    )
+
+    for pair_idx, (model_a, model_b) in enumerate(all_pairs):
+        print(
+            f"\n[{pair_idx + 1}/{len(all_pairs)}] Computing temporal linear MSE for whisper-{model_a} vs whisper-{model_b}..."
+        )
+
+        # Compute linear MSE along the temporal dimension and get the trained probes
+        mse_mat_temporal, trained_probes = (
+            compute_linear_mse_matrix_temporal_memory_efficient(
+                model_ids[model_a],
+                model_ids[model_b],
+                n_steps=TRAINING_STEPS,
+                lr=LEARNING_RATE,
+                batch_size=BATCH_SIZE,
+            )
+        )
+
+        # Save the trained probes
+        model_save_path = f"{OUTPUT_DIR}/{model_a}-to-{model_b}-probes.safetensors"
+        save_file(
+            trained_probes,
+            model_save_path,
+            {
+                "from_model": model_a,
+                "to_model": model_b,
+                "from_layers": str(len(mse_mat_temporal)),
+                "to_layers": str(len(mse_mat_temporal[0])),
+            },
+        )
+        print(f"Saved trained probes to: {model_save_path}")
+
+        if SAVE_PLOTS:
+            # Visualize results
+            # Avoid log(0) by adding a small value
+            eps = 1e-10
+            log_mse_mat = -np.log10(mse_mat_temporal + eps)
+
+            plt.figure(figsize=(8, 6))
+            plt.imshow(
+                log_mse_mat, aspect="auto", origin="lower"
+            )  # origin='lower' is more standard for matrices
+            plt.colorbar(label="-log10(MSE)")
+            plt.title(
+                f"Temporal Linear MSE (log scale): whisper-{model_a} vs whisper-{model_b}"
+            )
+            plt.xlabel(f"whisper-{model_b} layers")
+            plt.ylabel(f"whisper-{model_a} layers")
+            plt.tight_layout()
+
+            # Save visualization results
+            plot_save_path = (
+                f"{OUTPUT_DIR}/{model_a}-vs-{model_b}-temporal-linear-mse-log.png"
+            )
+            plt.savefig(plot_save_path, dpi=PLOT_DPI)
+            plt.close()  # Close the figure to free memory
+            print(f"Saved plot to: {plot_save_path}")
+
+    print(f"\nAll experiments complete! Results saved to '{OUTPUT_DIR}' directory")
+    print(
+        f"Generated {len(all_pairs)} visualization plots and {len(all_pairs)} trained probe models"
+    )

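Since each probe is a bias-free Conv1d with kernel size 1, every saved tensor is a plain linear map between layer dimensions. A minimal sketch of loading a probe back and applying it (assumes a probe file produced by this script; the key names follow the `layer_{i}_to_{j}` pattern above, and the input shape is a stand-in):

    # load_probe.py (hypothetical helper, not part of the repo)
    import torch
    from safetensors.torch import load_file

    probes = load_file("whisper-alignment-results/tiny-to-base-probes.safetensors")
    W = probes["layer_0_to_0"]               # Conv1d weight, shape [dim_b, dim_a, 1]
    x = torch.randn(1500, W.shape[1])        # stand-in for whisper-tiny layer-0 reps, [T, dim_a]
    y_hat = x.to(W.dtype) @ W.squeeze(-1).T  # mapped into whisper-base space, [T, dim_b]

The `squeeze(-1)` makes explicit that a kernel-size-1 convolution applies the same `dim_a -> dim_b` linear transform independently at every time step, which is why the probes double as layer-to-layer translation maps.
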
whisper-alignment-results/base-to-large-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:010e2238c4fd6584ed9e8b3cbd9a3379633c585a3d8f4dee2617679002193809
+size 302797200

whisper-alignment-results/base-to-medium-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f12bfaa489bcf81f3d0e51109e81e0ff2b03a31507c8742da5f4ea6e53b996b1
+size 183516544

whisper-alignment-results/base-to-small-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab25b9c2c210ebc1e0c3a035025322f949dc449b9f9e4a119dcb21e527545425
+size 71573320

whisper-alignment-results/base-vs-large-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/base-vs-medium-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/base-vs-small-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/small-to-large-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e025459eeb94c6209e04d348e18eb3cb3912f128ba68adc544df9b3ce8c3ae3
+size 843487312

whisper-alignment-results/small-to-medium-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4225bd80384af8155badfbc9ace80114339e18c893c68ba787811aab776cf59a
+size 511210280

whisper-alignment-results/small-vs-large-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/small-vs-medium-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/tiny-to-base-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b10f010c37f5b884e9b2e5a05cbb87226d3a660c51c894e4ce175bd1093da7e
+size 13765648

whisper-alignment-results/tiny-to-large-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d363b8111d23d57e5c45a49e716a6c58c68c8f8c637175c4617ec0bed9f75cd9
+size 162216440

whisper-alignment-results/tiny-to-medium-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8879301313af0f749dee9587fd917bf6923477ede8984b1a4a8dbedb169d828f
+size 98315144

whisper-alignment-results/tiny-to-small-probes.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27d2ac798074676cf69de46554572c37279fc92ba55b459f892fee4eba686ca2
+size 38344296

whisper-alignment-results/tiny-vs-base-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/tiny-vs-large-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/tiny-vs-medium-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)

whisper-alignment-results/tiny-vs-small-temporal-linear-mse-log.png ADDED
(binary image, stored with Git LFS)
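
The pointer sizes can be cross-checked against the probe shapes (a sketch; it assumes whisper-tiny's encoder has 4 layers with width 384 and whisper-base has 6 layers with width 512, so 5 x 7 hidden-state pairs counting the embedding output, stored in bfloat16):

    # size_check.py (hypothetical sanity check, not part of the repo)
    n_probes = 5 * 7                   # one probe per (layer_a, layer_b) pair
    bytes_per_probe = 512 * 384 * 2    # [dim_b, dim_a, 1] weights, 2 bytes per bfloat16
    print(n_probes * bytes_per_probe)  # 13762560, vs. 13765648 on disk for
                                       # tiny-to-base-probes.safetensors; the small
                                       # difference is the safetensors header/metadata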