# nanoDiffusion/dataloader.py at main · pythonlearner1025/nanoDiffusion · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from torch.utils.data import DataLoader, Sampler, Dataset
from torch.nn.utils.rnn import pad_sequence
import random
import torch
import math
# --- GPT ---
# Feature uses this id both as the first decoder input token and as the
# final label token of every sequence.
EOS_ID = 50256

# --- BERT ---
# Presumably the [SEP] and [PAD] ids of the BERT vocabulary in use -- confirm
# against the tokenizer.
SEP_ID = 102
PAD_ID = 0

# --- T5 ---
# Presumably the pad and end-of-sequence ids of the T5 vocabulary in use --
# confirm against the tokenizer.
PAD_ID_T5 = 0
SEP_ID_T5 = 1

class Feature:
    """One training example holding encoder ids, decoder ids and labels.

    Attributes:
        input_ids_bert: ``bert_ids`` stored unchanged.
        input_ids_dec: ``gpt2_ids`` with ``EOS_ID`` prepended.
        lm_labels: ``gpt2_ids`` with ``EOS_ID`` appended, i.e. the decoder
            inputs shifted left by one position.
        cond: only present when a non-``None`` ``cond`` was supplied;
            ``FeatureDataset.collate`` tests for it with ``hasattr``.
    """

    def __init__(self, bert_ids, gpt2_ids, raw_text, cond=None):
        # NOTE: ``raw_text`` is accepted (so feature dicts can be splatted
        # into the constructor) but it is not stored on the instance.
        self.input_ids_bert = bert_ids

        # Inputs and labels are built as new lists; ``gpt2_ids`` itself is
        # left untouched.
        self.lm_labels = gpt2_ids + [EOS_ID]
        self.input_ids_dec = [EOS_ID] + gpt2_ids

        # Deliberately leave the attribute undefined when there is no
        # conditioning signal.
        if cond is None:
            return
        self.cond = cond

class FeatureDataset(Dataset):
    """Map-style dataset that turns stored feature dicts into ``Feature`` objects.

    ``features`` is an indexable collection of dicts whose keys match the
    keyword arguments of ``Feature.__init__``.
    """

    def __init__(self, features):
        self.features = features

    def __getitem__(self, i):
        """Return a ``Feature`` built from the ``i``-th feature dict."""
        feat_dict = self.features[i]
        return Feature(**feat_dict)

    def __len__(self):
        """Return the number of stored feature dicts."""
        return len(self.features)

    @staticmethod
    def collate(features):
        """Pad a list of feature objects into batch tensors.

        Args:
            features: non-empty list of objects exposing ``input_ids_bert``,
                ``input_ids_dec`` and ``lm_labels`` (sequences of ints) and,
                optionally, ``cond``.

        Returns:
            A tuple ``(input_ids_bert, input_ids_dec, lm_labels, cond)``.
            The first three are ``torch.long`` tensors padded along the
            sequence axis with ``batch_first=True``; ids are padded with 0
            and labels with -1. ``cond`` is a list of ``None`` when the first
            feature has no ``cond`` attribute, a plain list when the first
            feature's ``cond`` is an ``int`` or ``str``, and otherwise a
            zero-padded tensor.

        Note:
            Only the first feature is inspected to decide how ``cond`` is
            collated, so a batch is expected to be homogeneous in that
            respect.
        """
        # dtype is forced to long for all id tensors: without it an empty id
        # list becomes a float32 tensor and, if it comes first in the batch,
        # pad_sequence would make the whole padded batch float.
        input_ids_bert = pad_sequence(
            [torch.tensor(f.input_ids_bert, dtype=torch.long)
             for f in features],
            batch_first=True, padding_value=0)
        input_ids_dec = pad_sequence(
            [torch.tensor(f.input_ids_dec, dtype=torch.long)
             for f in features],
            batch_first=True, padding_value=0)
        # Labels are padded with -1 rather than 0 -- presumably the ignore
        # index of the training loss; confirm against the loss definition.
        lm_labels = pad_sequence(
            [torch.tensor(f.lm_labels, dtype=torch.long)
             for f in features],
            batch_first=True, padding_value=-1)

        first = features[0]
        if not hasattr(first, 'cond'):
            cond = [None] * len(features)
        elif isinstance(first.cond, (int, str)):
            # Scalar / categorical condition: keep as a plain Python list.
            cond = [f.cond for f in features]
        else:
            # Continuous (sequence-valued) condition: pad into one tensor.
            cond = pad_sequence([torch.tensor(f.cond) for f in features],
                                batch_first=True, padding_value=0)

        return (input_ids_bert, input_ids_dec, lm_labels, cond)

class BucketSampler(Sampler):
    """Batch sampler that groups indices of similar sequence length.

    Indices are (optionally) shuffled, cut into consecutive buckets of
    ``bucket_size`` indices, sorted by descending length inside each bucket
    and finally sliced into batches of ``batch_size`` indices. Iterating
    yields lists of indices, one list per batch.

    Args:
        lens: sequence length of every example, indexed by example id.
        bucket_size: number of indices per bucket.
        batch_size: number of indices per batch.
        droplast: when true, batches shorter than ``batch_size`` are dropped.
        shuffle: when true, both the index order (before bucketing) and the
            final batch order are shuffled with the ``random`` module.
    """

    def __init__(self, lens, bucket_size, batch_size,
                 droplast=False, shuffle=True):
        self._lens = lens
        self._bucket_size = bucket_size
        self._batch_size = batch_size
        self._droplast = droplast
        self._shuf = shuffle

    def __iter__(self):
        """Return an iterator over batches (lists of example indices)."""
        order = list(range(len(self._lens)))
        if self._shuf:
            random.shuffle(order)

        batches = []
        for start in range(0, len(order), self._bucket_size):
            # Longest sequences first within each bucket.
            bucket = sorted(order[start:start + self._bucket_size],
                            key=lambda idx: self._lens[idx], reverse=True)
            for offset in range(0, len(bucket), self._batch_size):
                batch = bucket[offset:offset + self._batch_size]
                if self._droplast and len(batch) != self._batch_size:
                    continue
                batches.append(batch)

        if self._shuf:
            random.shuffle(batches)
        return iter(batches)

    def __len__(self):
        """Return the number of batches one pass over the sampler yields."""
        n_full, leftover = divmod(len(self._lens), self._bucket_size)
        if self._droplast:
            per_full = self._bucket_size // self._batch_size
            per_leftover = leftover // self._batch_size
        else:
            per_full = math.ceil(self._bucket_size / self._batch_size)
            per_leftover = math.ceil(leftover / self._batch_size)
        # Every full bucket contributes the same count; the trailing partial
        # bucket (possibly empty) contributes its own.
        return n_full * per_full + per_leftover

def get_data(dataset):
    """Placeholder dispatcher keyed on a dataset name.

    No branch is implemented yet: whatever ``dataset`` is, the function
    does nothing and returns ``None``.
    """
    # TODO: fill in the per-dataset loading logic.
    if dataset == 'summary':
        return None
    if dataset == 'commonsense':
        return None
    return None

from datasets import load_dataset
from torch.utils.data import DataLoader

def load_winogrande(subset="winogrande_debiased"):
    """Load the "winogrande" dataset and return its train and test splits.

    Args:
        subset: configuration name forwarded to ``load_dataset``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple taken from the "train" and
        "test" entries of the loaded dataset. These are the dataset splits
        themselves; no ``DataLoader`` is created here.
    """
    splits = load_dataset("winogrande", subset)
    return splits["train"], splits["test"]

def load_gsm8k():
    """Load the "gsm8k" dataset ("main" configuration).

    Returns:
        A ``(train_dataset, test_dataset)`` tuple taken from the "train" and
        "test" entries of the loaded dataset.
    """
    splits = load_dataset("gsm8k", "main")
    return splits["train"], splits["test"]

def load_tripadvisor(train_ratio=0.9):
    """Load the TripAdvisor hotel reviews and split them into train/test.

    Only the "train" entry of the loaded dataset is used; it is divided
    randomly with ``torch.utils.data.random_split``. No generator or seed
    is passed, so the split is not fixed by this function.

    Args:
        train_ratio: fraction of examples assigned to the training part;
            the training size is ``int(train_ratio * total)`` and the test
            part receives the remainder.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple as returned by
        ``random_split``.
    """
    reviews = load_dataset("argilla/tripadvisor-hotel-reviews")["train"]

    n_train = int(train_ratio * len(reviews))
    n_test = len(reviews) - n_train

    train_dataset, test_dataset = torch.utils.data.random_split(
        reviews, [n_train, n_test])
    return train_dataset, test_dataset