gpt2_50t_1M_256d_8l / tokenizer.json
jumelet's picture
Training in progress, step 1758
0b0915f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "WhitespaceSplit"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
3
],
"tokens": [
"<bos>"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"<unk>": 0,
"<pad>": 1,
"<mask>": 2,
"<bos>": 3,
".": 4,
",": 5,
"the": 6,
"was": 7,
"and": 8,
"to": 9,
"a": 10,
"Noah": 11,
"he": 12,
"his": 13,
"of": 14,
"n<apostrophe>t": 15,
"back": 16,
"<apostrophe><apostrophe>": 17,
"The": 18,
"in": 19,
"it": 20,
"boy": 21,
"had": 22,
"``": 23,
"<apostrophe>s": 24,
"up": 25,
"you": 26,
"Bible": 27,
"one": 28,
"that": 29,
"she": 30,
"I": 31,
"She": 32,
"there": 33,
"on": 34,
"be": 35,
"from": 36,
"friend": 37,
"get": 38,
"over": 39,
"would": 40,
"really": 41,
"about": 42,
"And": 43,
"He": 44,
"off": 45,
"this": 46,
"here": 47,
"never": 48,
"just": 49,
"good": 50,
"with": 51,
"by": 52,
"ditch": 53,
":": 54,
"happened": 55,
"left": 56,
"came": 57,
"But": 58,
"but": 59,
"It": 60,
"they": 61,
"him": 62,
"did": 63,
"were": 64,
"<apostrophe>ll": 65,
"her": 66,
"Quechua": 67,
"now": 68,
"then": 69,
"dead": 70,
"like": 71,
"higher": 72,
"guy": 73,
"for": 74,
"chin": 75,
"into": 76,
"best": 77,
"so": 78,
"them": 79,
"out": 80,
"leave": 81,
"looking": 82,
"Cochabamba": 83,
"eyes": 84,
"grass": 85,
"Oh": 86,
"?": 87,
"asked": 88,
"saw": 89,
"knew": 90,
"slid": 91,
"began": 92,
"quite": 93,
"fell": 94,
"looked": 95,
"used": 96,
"forced": 97,
"intended": 98,
"wore": 99,
"is": 100,
"warmed": 101,
"heard": 102,
"pushed": 103,
"cared": 104,
"Plus": 105,
"meant": 106,
"They": 107,
"no": 108,
"gold": 109,
"we": 110,
"breathe": 111,
"make": 112,
"school": 113,
"give": 114,
"teach": 115,
"speak": 116,
"march": 117,
"try": 118,
"help": 119,
"justice": 120,
"seat": 121,
"could": 122,
"surfer": 123,
"stop": 124,
"stand": 125,
"sleeve": 126,
"position": 127,
"unmistakably": 128,
"care": 129,
"badly": 130,
"eye": 131,
"should": 132,
"find": 133,
"freezing": 134,
"not": 135,
"downright": 136,
"crumpled": 137,
"himself": 138,
"garden": 139,
"dude": 140,
"teaching": 141,
"can": 142,
"Another": 143,
"have": 144,
"class": 145,
"together": 146,
"face-up": 147,
"shaking": 148,
"soon": 149,
"waking": 150,
"none": 151,
"everything": 152,
"blackened": 153,
"sprawled": 154,
"been": 155,
"moved": 156,
"muscle": 157,
"mattered": 158,
"might": 159,
"aside": 160,
"going": 161,
"hardly": 162,
"glinting": 163,
"Today": 164,
"finally": 165,
"anymore": 166,
"gone": 167,
"Tuesday": 168,
"Did": 169,
"Word": 170,
"told": 171,
"climbing": 172,
"mutter": 173,
"stepping": 174,
"dripping": 175,
"Wiping": 176,
"flung": 177,
"town": 178,
"know": 179,
"cap": 180,
"Hearst": 181,
"Ohio": 182,
"are": 183,
"stones": 184,
"native": 185,
"four": 186,
"Bolivian": 187,
"potatoes": 188,
"turquoise": 189,
"lot": 190,
"hat": 191,
"eight": 192,
"right": 193,
"lucky": 194,
"moonlight": 195,
"country": 196,
"street": 197,
"classes": 198,
"calculations": 199,
"knees": 200,
"luggage": 201,
"aura": 202,
"world": 203,
"His": 204,
"cows": 205,
"road": 206,
"hypocrite": 207,
"green": 208,
"By": 209,
"clouds": 210,
"freaky": 211,
"white": 212,
"onto": 213,
"fingers": 214,
"bloodied": 215,
"months": 216,
"tears": 217,
"finger": 218,
"fat": 219,
"full": 220,
"sure": 221,
"surfers": 222,
"Her": 223,
"women": 224,
"things": 225,
"At": 226,
"hours": 227,
"Because": 228,
"under": 229,
"first": 230,
"vibrant": 231,
"nice": 232,
"embroidered": 233,
"emerald": 234,
"sharp": 235,
"muddy": 236,
"rough": 237,
"stuff": 238,
"able": 239,
"low": 240,
"swollen": 241,
"content": 242,
"knit": 243,
"long": 244,
"city": 245,
"hike": 246,
"literacy": 247,
"face": 248,
"cross": 249,
"big": 250,
"glance": 251,
"nap": 252,
"moan": 253,
"uniform": 254,
"perch": 255,
"breath": 256,
"ride": 257,
"virtue": 258,
"towards": 259,
"because": 260,
"Despite": 261,
"since": 262,
"as": 263,
"at": 264,
"what": 265,
"where": 266,
"how": 267,
"when": 268,
"grimaced": 269,
"narrowed": 270,
"rose": 271,
"gave": 272,
"thought": 273,
"tried": 274,
"crisped": 275,
"anybody": 276,
"tossed": 277,
"skin": 278,
"hazel": 279,
"house": 280,
"addition": 281,
"path": 282,
"forward": 283,
"dirt": 284,
"too": 285,
"Not": 286,
"seriously": 287,
"Jeep": 288,
"again": 289,
"sun": 290,
"conference": 291,
"even": 292,
"place": 293,
"child": 294,
"Seriously": 295,
"baby": 296,
"looks": 297,
"This": 298,
"tree": 299,
"lip": 300,
"sky": 301,
"lunch": 302,
"all": 303,
"light": 304,
"stretch": 305,
"staying": 306,
"rising": 307,
"take": 308,
"worried": 309,
"carrying": 310,
"Sounding": 311,
"studied": 312,
"being": 313,
"murky": 314,
"quivered": 315,
"beckoning": 316,
"red": 317,
"guess": 318,
"than": 319,
"Twilight": 320,
"skeletal": 321,
"Spain": 322,
"Spanish": 323,
"strong": 324,
"blond": 325,
"Zone": 326,
"fine": 327,
"own": 328,
"In": 329,
"tall": 330,
"single": 331,
"silky": 332,
"parents": 333,
"our": 334,
"thick": 335,
"shadows": 336,
"gravel": 337,
"remains": 338,
"darker": 339,
"lower": 340,
"If": 341,
"outside": 342,
"<apostrophe>": 343,
"who": 344
},
"unk_token": "<unk>"
}
}