import torch
from torch.utils.data import Dataset

class NerDataset(Dataset):
    """Wraps sequences of token embeddings and their NER label sequences."""

    def __init__(self, embeddings, labels):
        super().__init__()
        self.embeddings = embeddings  # list of [seq_len x emb_dim] tensors
        self.labels = labels          # list of [seq_len] long tensors

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

def collate_fn(batch):
    """Pads a batch of variable-length sequences to the length of the longest one.

    Returns padded embeddings of shape [batch_size x max_len x 768],
    padded labels of shape [batch_size x max_len], and the original lengths.
    """
    embeddings, labels = zip(*batch)
    lengths = [e.size(0) for e in embeddings]
    max_len = max(lengths)

    # Zero-pad each embedding sequence up to max_len along the time dimension.
    padded_embs = torch.stack([
        torch.cat([e, torch.zeros(max_len - e.size(0), e.size(1))]) for e in embeddings
    ])

    # Pad labels with -1 so padded positions can be ignored in the loss
    # (e.g. via CrossEntropyLoss(ignore_index=-1)).
    padded_labels = torch.stack([
        torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels
    ])

    return padded_embs, padded_labels, lengths
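

if __name__ == "__main__":
    # Minimal usage sketch with random data (illustrative only: the label
    # vocabulary size of 9 and the sequence lengths below are assumptions,
    # not part of this module; the 768 embedding dimension matches the
    # shape noted in collate_fn).
    from torch.utils.data import DataLoader

    embeddings = [torch.randn(n, 768) for n in (5, 8, 3)]
    labels = [torch.randint(0, 9, (n,)) for n in (5, 8, 3)]

    loader = DataLoader(NerDataset(embeddings, labels), batch_size=2,
                        shuffle=False, collate_fn=collate_fn)
    for padded_embs, padded_labels, lengths in loader:
        # padded_embs: [batch x max_len x 768], padded_labels: [batch x max_len]
        print(padded_embs.shape, padded_labels.shape, lengths)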