supervised finetuning error
I am new to working with Hugging Face models and LLMs in general so any help will be appreciated.
I am trying to run a supervised fine-tuning experiment with phi2 on my custom dataset. I have collected data samples of the form {"instruction": ... , "input":...,"output":...}.
I am getting this error during the training process and I am unable to understand where it is coming from. The model starts training and every time after running on 2-3 input sequences it crashes with this error. 
File "/huggingface/modules/transformers_modules/phi-2/modeling_phi.py", line 158, in _apply_rotary_emb_qkv
    q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
                       ~~~^~~
RuntimeError: The size of tensor a (328) must match the size of tensor b (319) at non-singleton dimension 1
I am attaching my code for supervised fine-tuning:
import glob
import re
from transformers import AutoTokenizer, AutoModelForCausalLM,TrainingArguments,Trainer,BitsAndBytesConfig
import torch 
import torch.nn as nn
from torch.cuda.amp import autocast
from datasets import Dataset,load_dataset
import json 
import peft
from trl import SFTTrainer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from peft import LoraConfig
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
bnb_config = BitsAndBytesConfig(
 load_in_4bit=True,
 bnb_4bit_quant_type="nf4",
 bnb_4bit_compute_dtype='float16',
 bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained("phi-2", quantization_config = bnb_config, trust_remote_code=True, load_in_8bit = True, torch_dtype=torch.float16, revision="refs/pr/1")
model.config.use_cache = False
print(model)
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM',
    # target_modules=["out_proj", "Wqkv"]
    target_modules = ["Wqkv"] #,"fc1","fc2"]
)
model = peft.get_peft_model(model, peft_config)
model = accelerator.prepare_model(model)
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True
model.print_trainable_parameters()
tokenizer = AutoTokenizer.from_pretrained("phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
custom_dataset = load_dataset("json",data_files="sft_dataset.json",split='train')
def formatting_prompts_func(examples):
    output_text = []
    for i in range(len(examples["instruction"])):
        instruction = examples["instruction"][i]
        input_text = examples["input"][i]
        response = examples["output"][i]
        if len(input_text) >= 2:
            text = f'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
            
            ### Instruction:
            {instruction}
            
            ### Input:
            {input_text}
            
            ### Response:
            {response}
            '''
        else:
            text = f'''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
            
            ### Instruction:
            {instruction}
            
            ### Response:
            {response}
            '''
        output_text.append(text)
    return output_text
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=1,
    remove_unused_columns=True,
    gradient_accumulation_steps=4,
    # gradient_checkpointing=True,
    bf16=False,
    fp16 = True,
    lr_scheduler_type="cosine",
    optim = "paged_adamw_8bit",
    max_grad_norm=0.3,
    learning_rate=2.5e-5,
)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=custom_dataset,
    packing=False,
    max_seq_length=2048,
    # eval_dataset=custom_dataset,
    # peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("fine_tuned_model")
Another query: Is it possible to run this experiment on 2 8GB GPUs? I have been trying to setup another code based on another notebook on the internet but one of the GPUs keeps going out of memory.
