[model]
@architectures = "TransformerDecoder"
n_blocks = 20
block_size = 4096
prenorm = "True"
rope_base = 10000
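The rope_base = 10000 entry indicates rotary position embeddings (RoPE) with the standard base of 10000, covering up to block_size = 4096 positions. As a rough illustration only (not the repository's actual code), a minimal PyTorch sketch of the usual cos/sin table construction, assuming a per-head dimension of 768 / 12 = 64 from the attention section below; all function names here are hypothetical:

```python
import torch

def rope_tables(head_dim: int = 64, block_size: int = 4096, base: float = 10000.0):
    # Inverse frequencies base^(-2i/d), one per rotated 2-D pair.
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    positions = torch.arange(block_size).float()
    angles = torch.outer(positions, inv_freq)  # (block_size, head_dim / 2)
    return torch.cos(angles), torch.sin(angles)

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (..., seq_len, head_dim); queries and keys are rotated the same way,
    # so attention scores depend only on relative positions.
    seq_len = x.size(-2)
    cos, sin = cos[:seq_len], sin[:seq_len]
    x1, x2 = x[..., 0::2], x[..., 1::2]
    return torch.cat((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1)
```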
[model.attention]
@layers = "CausalSelfAttention"
n_in = 768
n_heads = 12
n_query_groups = 12
q_bias = "False"
k_bias = "False"
v_bias = "False"
o_bias = "False"
[model.embedding]
@layers = "TokenEmbedding"
n_embeddings = 21178
embedding_size = 768
[model.feedforward]
@layers = "SwiGLU"
n_in = 768
n_hidden = 3072
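A minimal sketch of a SwiGLU feed-forward layer with n_in = 768 and n_hidden = 3072. The bias-free projections and the silu(gate) * up gating follow the common formulation; the config does not spell these out, so they are assumptions:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    """Sketch of a gated feed-forward block: down(silu(gate(x)) * up(x))."""

    def __init__(self, n_in: int = 768, n_hidden: int = 3072):
        super().__init__()
        self.gate = nn.Linear(n_in, n_hidden, bias=False)
        self.up = nn.Linear(n_in, n_hidden, bias=False)
        self.down = nn.Linear(n_hidden, n_in, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down(F.silu(self.gate(x)) * self.up(x))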
[model.head]
@layers = "ParametrizedLinear"
n_in = 768
n_out = 21178
bias = "False"
[model.norm]
@layers = "RMSNorm"
n_in = 768
eps = 0.000001
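A minimal sketch of RMSNorm with the configured n_in = 768 and eps = 1e-6, assuming the standard formulation with a learned scale and no bias:

```python
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Sketch of RMSNorm: rescale by the inverse root-mean-square, then
    apply a learned per-channel gain."""

    def __init__(self, n_in: int = 768, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(n_in))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * rms)
```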