JohannesGaessler committed
Commit 8d3b3c1 · 1 Parent(s): 0612f1f

llama/ggml: add LLM training support (llama/10544)


* llama/ggml: add LLM training support

more compact progress bar

llama_save_model_to_file

llama_opt_param_filter

ggml_graph_dup force_grads

refactor ggml_opt, fix test-opt

* remove logits_all

* refactor CUDA implementation for ACC

* reset graph at beginning of opt period
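For context, the change from ggml_opt_forward/ggml_opt_forward_backward to the new two-step API means a single training step now looks roughly like the sketch below (a minimal illustration only; opt_ctx, dataset, and result are assumed to have been created elsewhere via ggml_opt_init, ggml_opt_dataset_init, and ggml_opt_result_init):

// Hypothetical helper mirroring the updated ggml_opt_epoch loop further down in this diff.
#include "ggml-opt.h"

static void train_one_batch(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset,
                            ggml_opt_result_t result, int64_t ibatch) {
    // allocate the forward + backward graph for this step
    ggml_opt_alloc(opt_ctx, /*backward =*/ true);

    // copy the next batch into the input/label tensors of the optimization context
    ggml_opt_dataset_get_batch(dataset, ggml_opt_inputs(opt_ctx), ggml_opt_labels(opt_ctx), ibatch);

    // forward pass, gradient accumulation, and (at the end of an opt period) the optimizer step
    ggml_opt_eval(opt_ctx, result);
}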

ggml/include/ggml-opt.h CHANGED
@@ -37,13 +37,16 @@ extern "C" {
     // ====== Dataset ======
 
     GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
-            int64_t ne_datapoint, // number of elements per datapoint
-            int64_t ne_label,     // number of elements per label
-            int64_t ndata,        // total number of datapoints/labels
-            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+            enum ggml_type type_data,    // the type for the internal data tensor
+            enum ggml_type type_label,   // the type for the internal labels tensor
+            int64_t        ne_datapoint, // number of elements per datapoint
+            int64_t        ne_label,     // number of elements per label
+            int64_t        ndata,        // total number of datapoints/labels
+            int64_t        ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
     GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
 
     // get underlying tensors that store the data
+    GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
     GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
     GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
 
@@ -56,13 +59,19 @@ extern "C" {
             struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
             struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
             int64_t              ibatch);
+    GGML_API void ggml_opt_dataset_get_batch_host(
+            ggml_opt_dataset_t   dataset,
+            void               * data_batch,
+            size_t               nb_data_batch,
+            void               * labels_batch,
+            int64_t              ibatch);
 
     // ====== Model / Context ======
 
     enum ggml_opt_build_type {
-        GGML_OPT_BUILD_TYPE_FORWARD,
-        GGML_OPT_BUILD_TYPE_GRAD,
-        GGML_OPT_BUILD_TYPE_OPT,
+        GGML_OPT_BUILD_TYPE_FORWARD = 10,
+        GGML_OPT_BUILD_TYPE_GRAD    = 20,
+        GGML_OPT_BUILD_TYPE_OPT     = 30,
     };
 
     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
@@ -81,20 +90,22 @@ extern "C" {
     // userdata can be used to pass arbitrary data
     typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
 
-    // returns the default optimizer params (constant)
+    // returns the default optimizer params (constant, hard-coded values)
     // userdata is not used
     GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
 
+    // casts userdata to ggml_opt_optimizer_params and returns it
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
+
     // parameters for initializing a new optimization context
     struct ggml_opt_params {
         ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
 
-        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
-
-        // the forward graph is defined by inputs and outputs
-        // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
-        struct ggml_tensor * inputs;
-        struct ggml_tensor * outputs;
+        // by default the forward graph needs to be reconstructed for each eval
+        // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
+        struct ggml_context * ctx_compute;
+        struct ggml_tensor  * inputs;
+        struct ggml_tensor  * outputs;
 
         enum ggml_opt_loss_type  loss_type;
         enum ggml_opt_build_type build_type;
@@ -107,12 +118,9 @@ extern "C" {
 
     // get parameters for an optimization context with defaults set where possible
     // parameters for which no sensible defaults exist are supplied as arguments to this function
-    GGML_API ggml_opt_params ggml_opt_default_params(
-            ggml_backend_sched_t      backend_sched,
-            struct ggml_context     * ctx_compute,
-            struct ggml_tensor      * inputs,
-            struct ggml_tensor      * outputs,
-            enum ggml_opt_loss_type   loss_type);
+    GGML_API struct ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t      backend_sched,
+            enum ggml_opt_loss_type   loss_type);
 
     GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
     GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
@@ -121,6 +129,7 @@ extern "C" {
     GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
 
     // get underlying tensors that store data
+    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
     GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
     GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
     GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
@@ -128,11 +137,12 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
     GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
 
+    // get the gradient accumulator for a node from the forward graph
     GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
 
     // ====== Optimization Result ======
 
-    GGML_API ggml_opt_result_t ggml_opt_result_init();
+    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
     GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
     GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
 
@@ -144,11 +154,20 @@ extern "C" {
 
     // ====== Computation ======
 
-    // do forward pass, increment result if not NULL
-    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // if not using static graphs, this function must be called prior to ggml_opt_alloc
+    GGML_API void ggml_opt_prepare_alloc(
+            ggml_opt_context_t    opt_ctx,
+            struct ggml_context * ctx_compute,
+            struct ggml_cgraph  * gf,
+            struct ggml_tensor  * inputs,
+            struct ggml_tensor  * outputs);
+
+    // allocate the next graph for evaluation, either forward or forward + backward
+    // must be called exactly once prior to calling ggml_opt_eval
+    GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
 
-    // do forward pass, increment result if not NULL, do backward pass
-    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // do forward pass, increment result if not NULL, do backward pass if allocated
+    GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
 
     // ############################################################################
     // ## The high-level functions start here. They do not depend on any private ##
@@ -200,9 +219,9 @@ extern "C" {
     // fit model defined by inputs and outputs to dataset
     GGML_API void ggml_opt_fit(
             ggml_backend_sched_t          backend_sched, // backend scheduler for constructing the compute graphs
-            ggml_context                * ctx_compute,   // context with temporarily allocated tensors to calculate the outputs
-            ggml_tensor                 * inputs,        // input tensor with shape [ne_datapoint, ndata_batch]
-            ggml_tensor                 * outputs,       // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            struct ggml_context         * ctx_compute,   // context with temporarily allocated tensors to calculate the outputs
+            struct ggml_tensor          * inputs,        // input tensor with shape [ne_datapoint, ndata_batch]
+            struct ggml_tensor          * outputs,       // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t             dataset,       // dataset with data and optionally also labels
            enum ggml_opt_loss_type        loss_type,     // loss to minimize
            ggml_opt_get_optimizer_params  get_opt_pars,  // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
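
Taken together, the header changes above move graph construction out of ggml_opt_default_params and into optional fields of ggml_opt_params. A minimal setup sketch, assuming backend_sched, ctx_compute, inputs, and outputs already exist and with placeholder sizes:

// Sketch only: the sizes and types below are illustrative, not taken from the commit.
#include "ggml-opt.h"

static ggml_opt_context_t make_opt_ctx(ggml_backend_sched_t backend_sched,
                                       struct ggml_context * ctx_compute,
                                       struct ggml_tensor  * inputs,
                                       struct ggml_tensor  * outputs) {
    // datasets now carry explicit tensor types instead of being hard-coded to F32
    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
        GGML_TYPE_F32, GGML_TYPE_F32,
        /*ne_datapoint =*/ 784, /*ne_label =*/ 10, /*ndata =*/ 60000, /*ndata_shard =*/ 100);
    (void) dataset; // would later be passed to ggml_opt_epoch or ggml_opt_fit

    // the compute graph tensors are no longer arguments of ggml_opt_default_params ...
    struct ggml_opt_params params = ggml_opt_default_params(backend_sched, GGML_OPT_LOSS_TYPE_CROSS_ENTROPY);

    // ... setting them explicitly opts into statically allocated graphs;
    // leaving them as nullptr means the graphs are rebuilt per eval via ggml_opt_prepare_alloc.
    params.ctx_compute = ctx_compute;
    params.inputs      = inputs;
    params.outputs     = outputs;
    return ggml_opt_init(params);
}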
ggml/include/ggml.h CHANGED
@@ -768,7 +768,7 @@ extern "C" {
     // Tensor flags
     GGML_API void ggml_set_input(struct ggml_tensor * tensor);
     GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_tensor * tensor);
     GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
 
     //
@@ -938,7 +938,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_repeat_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
 
     // concat a and b along dim
     // used in stable-diffusion
@@ -2049,15 +2049,14 @@ extern "C" {
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(
-        struct ggml_context *  ctx_static,  // context for static gradients (loss + gradient accumulation)
-        struct ggml_context *  ctx_compute, // context for gradient computation
-        struct ggml_cgraph  *  cgraph,
-        bool                   accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
+        struct ggml_context  * ctx,    // context for gradient computation
+        struct ggml_cgraph   * cgraph,
+        struct ggml_tensor  ** grad_accs);
 
     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
     GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
     GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
     GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
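
The ggml.h changes track the same refactor at the autograd level: parameters are flagged without a context, graph duplication can force gradient metadata, and ggml_build_backward_expand now takes an array of per-node gradient accumulators instead of an accumulate flag. A rough sketch of the new call pattern, mirroring how ggml-opt.cpp (below) fills grad_accs with nullptr entries for nodes that need no accumulator; ctx, weights, and loss are assumed to exist and to be valid for this use:

// Sketch under the assumptions above; not the literal code from this commit.
#include <vector>
#include "ggml.h"

static struct ggml_cgraph * build_backward(struct ggml_context * ctx,
                                           struct ggml_tensor  * weights,
                                           struct ggml_tensor  * loss) {
    ggml_set_param(weights); // no ggml_context argument anymore
    ggml_set_loss(loss);

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    ggml_build_forward_expand(gf, loss);

    // duplicate the forward graph, keeping gradient metadata, then expand it in place;
    // a nullptr entry in grad_accs means "no gradient accumulator for this node"
    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf, /*force_grads =*/ true);
    std::vector<struct ggml_tensor *> grad_accs(ggml_graph_n_nodes(gb), nullptr);
    ggml_build_backward_expand(ctx, gb, grad_accs.data());
    return gb;
}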
ggml/src/ggml-backend.cpp CHANGED
@@ -1111,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             const int node_backend_id = tensor_backend_id(node);
 
-            assert(node_backend_id != -1); // all nodes should be assigned by now
+            assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
 
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
ggml/src/ggml-cuda/acc.cu CHANGED
@@ -1,47 +1,61 @@
 #include "acc.cuh"
 
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+    const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
+    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
     if (i >= ne) {
         return;
     }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
+
+    int64_t src1_idx = i - offset;
+
+    int64_t tmp = src1_idx;
+    const int64_t i13 = tmp / s13;
+    tmp -= i13 * s13;
+    const int64_t i12 = tmp / s12;
+    tmp -= i12 * s12;
+    const int64_t i11 = tmp / s11;
+    tmp -= i11 * s11;
+    const int64_t i10 = tmp;
+
+    float val = x[i];
+    if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
+        val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
     }
+    dst[i] = val;
 }
 
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
+    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+    const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
+    const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
 }
 
 void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
 
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
+    GGML_ASSERT(ggml_is_contiguously_allocated(dst));
+
+    const int64_t s1     = dst->op_params[0] / sizeof(float);
+    const int64_t s2     = dst->op_params[1] / sizeof(float);
+    const int64_t s3     = dst->op_params[2] / sizeof(float);
+    const int64_t offset = dst->op_params[3] / sizeof(float);
 
-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
 }
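
The key change in the kernel is that the flat destination index is now decomposed into four src1 coordinates using the strides from op_params (converted from bytes to float counts), instead of the previous 3D-only modulo arithmetic. A host-side mirror of that index math, for illustration only (names follow the kernel above):

// Reference helper: for a flat dst index i, compute the value the refactored kernel stores.
#include <cstdint>

static float acc_reference(const float * x, const float * y, int64_t i,
                           int64_t ne10, int64_t ne11, int64_t ne12, int64_t ne13,
                           int64_t s11, int64_t s12, int64_t s13, int64_t offset) {
    const int64_t src1_idx = i - offset;

    // peel off the coordinates from the flat src1 index, highest dimension first
    int64_t tmp = src1_idx;
    const int64_t i13 = tmp / s13; tmp -= i13 * s13;
    const int64_t i12 = tmp / s12; tmp -= i12 * s12;
    const int64_t i11 = tmp / s11; tmp -= i11 * s11;
    const int64_t i10 = tmp;

    // only add a src1 value if the decomposed coordinates fall inside src1
    float val = x[i];
    if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
        val += y[((i13*ne12 + i12)*ne11 + i11)*ne10 + i10];
    }
    return val;
}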
ggml/src/ggml-cuda/sum.cu CHANGED
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));
 
     const float * src0_d = (const float *) src0->data;
     float * dst_d = (float *) dst->data;
ggml/src/ggml-opt.cpp CHANGED
@@ -28,16 +28,19 @@ struct ggml_opt_dataset {
28
  };
29
 
30
  struct ggml_opt_context {
31
- ggml_backend_sched_t backend_sched = nullptr;
32
- ggml_cgraph * allocated_graph = nullptr;
33
- ggml_cgraph * allocated_graph_copy = nullptr;
34
- struct ggml_context * ctx_static = nullptr;
35
- struct ggml_context * ctx_static_cpu = nullptr;
36
- struct ggml_context * ctx_compute = nullptr;
37
- struct ggml_context * ctx_copy = nullptr;
38
- ggml_backend_buffer_t buf_static = nullptr;
39
- ggml_backend_buffer_t buf_static_cpu = nullptr;
40
- std::mt19937 rng;
 
 
 
41
 
42
  struct ggml_tensor * inputs = nullptr;
43
  struct ggml_tensor * outputs = nullptr;
@@ -50,6 +53,11 @@ struct ggml_opt_context {
50
  struct ggml_cgraph * gf = nullptr;
51
  struct ggml_cgraph * gb_grad = nullptr;
52
  struct ggml_cgraph * gb_opt = nullptr;
 
 
 
 
 
53
 
54
  int64_t iter = 1;
55
  int32_t opt_period = 1;
@@ -73,7 +81,13 @@ struct ggml_opt_result {
73
 
74
  // ====== Dataset ======
75
 
76
- ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
 
 
 
 
 
 
77
  GGML_ASSERT(ne_datapoint > 0);
78
  GGML_ASSERT(ne_label >= 0);
79
  GGML_ASSERT(ndata > 0);
@@ -92,11 +106,11 @@ ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label,
92
  result->ctx = ggml_init(params);
93
  }
94
 
95
- result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
96
  result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
97
 
98
  if (ne_label > 0) {
99
- result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
100
  result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
101
  } else {
102
  result->labels = nullptr;
@@ -119,6 +133,10 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
119
  delete dataset;
120
  }
121
 
 
 
 
 
122
  struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
123
  return dataset->data;
124
  }
@@ -144,6 +162,8 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
144
  GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch));
145
  GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
146
  GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
 
 
147
 
148
  const size_t nb_data_batch = ggml_nbytes(data_batch);
149
  GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
@@ -171,6 +191,31 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
171
  }
172
  }
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  // ====== Model / Context ======
175
 
176
  struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
@@ -187,17 +232,18 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
187
  return result;
188
  }
189
 
 
 
 
 
190
  struct ggml_opt_params ggml_opt_default_params(
191
  ggml_backend_sched_t backend_sched,
192
- struct ggml_context * ctx_compute,
193
- struct ggml_tensor * inputs,
194
- struct ggml_tensor * outputs,
195
  enum ggml_opt_loss_type loss_type) {
196
  return {
197
  /*backend_sched =*/ backend_sched,
198
- /*ctx_compute =*/ ctx_compute,
199
- /*inputs =*/ inputs,
200
- /*logits =*/ outputs,
201
  /*loss_type =*/ loss_type,
202
  /*build_type =*/ GGML_OPT_BUILD_TYPE_OPT,
203
  /*opt_period =*/ 1,
@@ -266,195 +312,246 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
266
  return dst;
267
  }
268
 
269
- static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
270
- GGML_ASSERT(graph);
271
- if (opt_ctx->allocated_graph == graph) {
272
- return;
273
- }
274
-
275
- ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
276
-
277
- {
278
- ggml_init_params params = {
279
- /*.mem_size =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
280
- /*.mem_buffer =*/ nullptr,
281
- /*.no_alloc =*/ true,
282
- };
283
- ggml_free(opt_ctx->ctx_copy);
284
- opt_ctx->ctx_copy = ggml_init(params);
285
- }
286
-
287
- opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
288
-
289
- ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
290
- opt_ctx->allocated_graph = graph;
291
- }
292
-
293
- ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
294
- ggml_opt_context_t result = new struct ggml_opt_context;
295
- result->backend_sched = params.backend_sched;
296
- result->ctx_compute = params.ctx_compute;
297
- result->inputs = params.inputs;
298
- result->outputs = params.outputs;
299
- result->opt_period = params.opt_period;
300
- result->get_opt_pars = params.get_opt_pars;
301
- result->get_opt_pars_ud = params.get_opt_pars_ud;
302
-
303
- GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
304
- GGML_ASSERT(result->opt_period >= 1);
305
-
306
- const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
307
- (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
308
 
309
- ggml_set_input(result->inputs);
310
- ggml_set_output(result->outputs);
311
 
312
- result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
313
- ggml_build_forward_expand(result->gf, result->outputs);
314
 
315
  int n_param = 0;
316
- for (int i = 0; i < result->gf->n_nodes; ++i) {
317
- if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
 
318
  n_param++;
319
  }
 
320
  }
321
 
322
- {
323
  // The static context is used for:
324
- // - gradients (1 tensor per param if using gradient accumulation)
325
  // - optimizer momenta (2 tensors per param)
326
- // - labels
327
- // - loss + its gradient (up to 5 tensors)
328
- // - pred
329
- // - ncorrect (2 tensors).
330
- const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
331
- const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
 
 
 
332
  struct ggml_init_params params = {
333
  /*.mem_size =*/ size_meta,
334
  /*.mem_buffer =*/ nullptr,
335
  /*.no_alloc =*/ true,
336
  };
337
- result->ctx_static = ggml_init(params);
338
  }
 
 
339
  {
340
- // The static cpu context is used for:
341
- // - optimizer parameters (1 for the entire context)
 
342
  const size_t size_meta = 1 * ggml_tensor_overhead();
343
  struct ggml_init_params params = {
344
  /*.mem_size =*/ size_meta,
345
  /*.mem_buffer =*/ nullptr,
346
  /*.no_alloc =*/ true,
347
  };
348
- result->ctx_static_cpu = ggml_init(params);
 
 
 
 
349
  }
350
 
 
351
 
352
- switch (params.loss_type) {
353
  case GGML_OPT_LOSS_TYPE_MEAN: {
354
- result->loss = ggml_sum(result->ctx_static, result->outputs);
355
- ggml_set_name(result->loss, "loss_sum");
356
- const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
357
- result->loss = ggml_scale(result->ctx_static, result->loss, scale);
358
- ggml_set_name(result->loss, "loss_mean");
359
- result->loss_per_datapoint = true;
360
  break;
361
  }
362
  case GGML_OPT_LOSS_TYPE_SUM: {
363
- result->loss = ggml_sum(result->ctx_static, result->outputs);
364
- ggml_set_name(result->loss, "loss_sum");
365
- result->loss_per_datapoint = false;
366
  break;
367
  }
368
  case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
369
- result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
370
- ggml_set_input(result->labels);
371
- ggml_set_name(result->labels, "labels");
372
- result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
373
- ggml_set_name(result->loss, "loss_cross_entropy");
374
- if (result->opt_period > 1) {
375
- result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
376
- ggml_set_name(result->loss, "loss_cross_entropy_scaled");
377
  }
378
- result->loss_per_datapoint = true;
379
  break;
380
  }
381
  case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
382
- result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
383
- ggml_set_input(result->labels);
384
- ggml_set_name(result->labels, "labels");
385
- result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
386
- ggml_set_name(result->loss, "loss_error");
387
- result->loss = ggml_sqr(result->ctx_static, result->loss);
388
- ggml_set_name(result->loss, "loss_squared_error");
389
- result->loss = ggml_sum(result->ctx_static, result->loss);
390
- ggml_set_name(result->loss, "loss_sum_squared_error");
391
- const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
392
- result->loss = ggml_scale(result->ctx_static, result->loss, scale);
393
- ggml_set_name(result->loss, "loss_mean_squared_error");
394
- result->loss_per_datapoint = true;
395
  break;
396
  }
397
  }
398
- ggml_set_output(result->loss);
399
- ggml_set_loss(result->loss);
400
- ggml_build_forward_expand(result->gf, result->loss);
401
-
402
- result->pred = ggml_argmax(result->ctx_static, result->outputs);
403
- ggml_set_name(result->pred, "pred");
404
- ggml_set_output(result->pred);
405
- ggml_build_forward_expand(result->gf, result->pred);
 
 
 
 
 
 
 
406
 
407
- if (result->labels) {
408
- result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
409
- ggml_set_name(result->ncorrect, "ncorrect");
410
- ggml_set_output(result->ncorrect);
411
- ggml_build_forward_expand(result->gf, result->ncorrect);
412
- } else {
413
- result->ncorrect = nullptr;
 
414
  }
415
 
416
- if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
417
- result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
418
- return result;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  }
420
 
421
  // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
422
- result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
423
- ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
424
 
425
- if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
426
- result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
427
- ggml_graph_reset(result->gb_grad);
428
- return result;
 
 
 
429
  }
430
 
431
- GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
432
 
433
  // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
434
- result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
435
 
436
- result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
437
- ggml_set_input(result->adamw_params);
438
- ggml_set_name(result->adamw_params, "adamw_params");
439
 
440
- for (int i = result->gf->n_nodes-1; i >= 0; --i) {
441
- struct ggml_tensor * node = result->gb_opt->nodes[i];
442
- struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
443
 
444
- if (node->flags & GGML_TENSOR_FLAG_PARAM) {
445
- struct ggml_tensor * m = ggml_dup_tensor(result->ctx_static, node);
446
- struct ggml_tensor * v = ggml_dup_tensor(result->ctx_static, node);
447
- struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
448
- ggml_build_forward_expand(result->gb_opt, opt_step);
 
 
 
 
 
449
  }
450
  }
451
 
452
- result->buf_static = ggml_backend_alloc_ctx_tensors(
453
- result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
 
 
 
454
 
455
- result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
 
456
 
457
- ggml_graph_reset(result->gb_opt);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
 
459
  return result;
460
  }
@@ -464,9 +561,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
464
  return;
465
  }
466
  ggml_backend_buffer_free(opt_ctx->buf_static);
467
- ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
468
  ggml_free(opt_ctx->ctx_static);
469
- ggml_free(opt_ctx->ctx_static_cpu);
470
  delete opt_ctx;
471
  }
472
 
@@ -582,8 +679,79 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl
582
 
583
  // ====== Computation ======
584
 
585
- static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
586
- if (graph != opt_ctx->gf) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
  struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
588
 
589
  GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
@@ -609,9 +777,19 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
609
  adamw_par_data[6] = beta2h;
610
  }
611
 
612
- ggml_opt_alloc_graph(opt_ctx, graph);
613
  ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
614
  opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
 
 
 
 
 
 
 
 
 
 
 
615
 
616
  if (!result) {
617
  return;
@@ -635,12 +813,14 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
635
  ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
636
  result->loss.push_back(loss);
637
 
638
- GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
639
- std::vector<int32_t> pred(ndata);
640
- ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
641
- result->pred.insert(result->pred.end(), pred.begin(), pred.end());
 
 
642
 
643
- if (!opt_ctx->labels || result->ncorrect < 0) {
644
  result->ncorrect = -1;
645
  return;
646
  }
@@ -652,26 +832,6 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
652
  result->ncorrect += ncorrect;
653
  }
654
 
655
- void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
656
- ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
657
- }
658
-
659
- void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
660
- if (opt_ctx->opt_period == 1) {
661
- ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
662
- return;
663
- }
664
-
665
- const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
666
- if (opt_i_next == 0) {
667
- ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
668
- ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
669
- } else {
670
- ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
671
- }
672
- opt_ctx->opt_i = opt_i_next;
673
- }
674
-
675
  // ====== High-Level Functions ======
676
 
677
  void ggml_opt_epoch(
@@ -700,16 +860,18 @@ void ggml_opt_epoch(
700
  int64_t ibatch = 0;
701
  int64_t t_loop_start = ggml_time_us();
702
  for (; ibatch < ibatch_split; ++ibatch) {
 
703
  ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
704
- ggml_opt_forward_backward(opt_ctx, result_train);
705
  if (callback_train) {
706
  callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
707
  }
708
  }
709
  t_loop_start = ggml_time_us();
710
  for (; ibatch < nbatches; ++ibatch) {
 
711
  ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
712
- ggml_opt_forward(opt_ctx, result_eval);
713
  if (callback_eval) {
714
  callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
715
  }
@@ -726,13 +888,26 @@ void ggml_opt_epoch_callback_progress_bar(
726
  int64_t t_start_us) {
727
  fprintf(stderr, "%s[", train ? "train: " : "val: ");
728
 
729
- constexpr int64_t bar_length = 25;
 
 
730
  for (int64_t j = 0; j < bar_length; ++j) {
731
- const int64_t ibatch_j = ibatch_max * j/bar_length;
732
- if (ibatch_j < ibatch) {
733
- fprintf(stderr, "=");
734
- } else if (ibatch_max * (j - 1)/bar_length < ibatch) {
735
- fprintf(stderr, ">");
 
 
 
 
 
 
 
 
 
 
 
736
  } else {
737
  fprintf(stderr, " ");
738
  }
@@ -764,8 +939,8 @@ void ggml_opt_epoch_callback_progress_bar(
764
  const int64_t t_eta_m = t_eta_s / 60;
765
  t_eta_s -= t_eta_m * 60;
766
 
767
- fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
768
- "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
769
  idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
770
  t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
771
  if (ibatch == ibatch_max) {
@@ -806,7 +981,10 @@ void ggml_opt_fit(
806
 
807
  int64_t epoch = 1;
808
 
809
- ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
 
 
 
810
  params.opt_period = opt_period;
811
  params.get_opt_pars = get_opt_pars;
812
  params.get_opt_pars_ud = &epoch;
 
28
  };
29
 
30
  struct ggml_opt_context {
31
+ ggml_backend_sched_t backend_sched = nullptr;
32
+ ggml_cgraph * allocated_graph = nullptr;
33
+ ggml_cgraph * allocated_graph_copy = nullptr;
34
+ struct ggml_context * ctx_static = nullptr;
35
+ struct ggml_context * ctx_cpu = nullptr;
36
+ struct ggml_context * ctx_compute = nullptr;
37
+ struct ggml_context * ctx_copy = nullptr;
38
+ ggml_backend_buffer_t buf_static = nullptr;
39
+ ggml_backend_buffer_t buf_cpu = nullptr;
40
+ std::mt19937 rng;
41
+ enum ggml_opt_loss_type loss_type;
42
+ enum ggml_opt_build_type build_type;
43
+ enum ggml_opt_build_type build_type_alloc;
44
 
45
  struct ggml_tensor * inputs = nullptr;
46
  struct ggml_tensor * outputs = nullptr;
 
53
  struct ggml_cgraph * gf = nullptr;
54
  struct ggml_cgraph * gb_grad = nullptr;
55
  struct ggml_cgraph * gb_opt = nullptr;
56
+ bool static_graphs = false;
57
+ bool eval_ready = false;
58
+ std::vector<struct ggml_tensor *> grad_accs;
59
+ std::vector<struct ggml_tensor *> grad_m;
60
+ std::vector<struct ggml_tensor *> grad_v;
61
 
62
  int64_t iter = 1;
63
  int32_t opt_period = 1;
 
81
 
82
  // ====== Dataset ======
83
 
84
+ ggml_opt_dataset_t ggml_opt_dataset_init(
85
+ enum ggml_type type_data,
86
+ enum ggml_type type_label,
87
+ int64_t ne_datapoint,
88
+ int64_t ne_label,
89
+ int64_t ndata,
90
+ int64_t ndata_shard) {
91
  GGML_ASSERT(ne_datapoint > 0);
92
  GGML_ASSERT(ne_label >= 0);
93
  GGML_ASSERT(ndata > 0);
 
106
  result->ctx = ggml_init(params);
107
  }
108
 
109
+ result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata);
110
  result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
111
 
112
  if (ne_label > 0) {
113
+ result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata);
114
  result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
115
  } else {
116
  result->labels = nullptr;
 
133
  delete dataset;
134
  }
135
 
136
+ int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) {
137
+ return dataset->ndata;
138
+ }
139
+
140
  struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
141
  return dataset->data;
142
  }
 
162
  GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch));
163
  GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
164
  GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
165
+ GGML_ASSERT( data_batch->type == dataset->data->type);
166
+ GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type);
167
 
168
  const size_t nb_data_batch = ggml_nbytes(data_batch);
169
  GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
 
191
  }
192
  }
193
 
194
+ void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) {
195
+ GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
196
+ GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
197
+
198
+ const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
199
+
200
+ GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
201
+
202
+ for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
203
+ const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
204
+
205
+ const char * ptr_data = (const char *) dataset->data->data + ishard *dataset->nbs_data;
206
+ char * ptr_data_batch = (char *) data_batch + ishard_batch*dataset->nbs_data;
207
+ memcpy(ptr_data_batch, ptr_data, dataset->nbs_data);
208
+
209
+ if (!labels_batch) {
210
+ continue;
211
+ }
212
+
213
+ const char * ptr_labels = (const char *) dataset->labels->data + ishard *dataset->nbs_labels;
214
+ char * ptr_labels_batch = (char *) labels_batch + ishard_batch*dataset->nbs_labels;
215
+ memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels);
216
+ }
217
+ }
218
+
219
  // ====== Model / Context ======
220
 
221
  struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
 
232
  return result;
233
  }
234
 
235
+ struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
236
+ return *((struct ggml_opt_optimizer_params *) userdata);
237
+ }
238
+
239
  struct ggml_opt_params ggml_opt_default_params(
240
  ggml_backend_sched_t backend_sched,
 
 
 
241
  enum ggml_opt_loss_type loss_type) {
242
  return {
243
  /*backend_sched =*/ backend_sched,
244
+ /*ctx_compute =*/ nullptr,
245
+ /*inputs =*/ nullptr,
246
+ /*logits =*/ nullptr,
247
  /*loss_type =*/ loss_type,
248
  /*build_type =*/ GGML_OPT_BUILD_TYPE_OPT,
249
  /*opt_period =*/ 1,
 
312
  return dst;
313
  }
314
 
315
+ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
316
+ GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
317
+ GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
320
+ !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
321
 
322
+ ggml_set_input(opt_ctx->inputs);
323
+ ggml_set_output(opt_ctx->outputs);
324
 
325
  int n_param = 0;
326
+ for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) {
327
+ const struct ggml_tensor * node = opt_ctx->gf->nodes[i];
328
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
329
  n_param++;
330
  }
331
+ GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented");
332
  }
333
 
334
+ if (!opt_ctx->ctx_static) {
335
  // The static context is used for:
336
+ // - gradients (1 per loss, 1 tensor per param if using gradient accumulation)
337
  // - optimizer momenta (2 tensors per param)
338
+ // - labels (if using static graphs)
339
+ // - loss (if using static graphs, up to 5 tensors)
340
+ // - pred (if using static graphs)
341
+ // - ncorrect (if using static graphs, 2 tensors).
342
+ constexpr size_t n_loss = 1;
343
+ const size_t tensors_per_param = (accumulate ? 1 : 0) +
344
+ (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
345
+ const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
346
+ const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
347
  struct ggml_init_params params = {
348
  /*.mem_size =*/ size_meta,
349
  /*.mem_buffer =*/ nullptr,
350
  /*.no_alloc =*/ true,
351
  };
352
+ opt_ctx->ctx_static = ggml_init(params);
353
  }
354
+ GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc);
355
+
356
  {
357
+ // The cpu context is allocated statically if using static graphs, dynamically otherwise.
358
+ // It is used for:
359
+ // - optimizer parameters (1 shared for all optimizer invocations)
360
  const size_t size_meta = 1 * ggml_tensor_overhead();
361
  struct ggml_init_params params = {
362
  /*.mem_size =*/ size_meta,
363
  /*.mem_buffer =*/ nullptr,
364
  /*.no_alloc =*/ true,
365
  };
366
+ ggml_free(opt_ctx->ctx_cpu);
367
+ opt_ctx->ctx_cpu = ggml_init(params);
368
+
369
+ ggml_backend_buffer_free(opt_ctx->buf_cpu);
370
+ opt_ctx->buf_cpu = nullptr;
371
  }
372
 
373
+ struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute;
374
 
375
+ switch (opt_ctx->loss_type) {
376
  case GGML_OPT_LOSS_TYPE_MEAN: {
377
+ opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
378
+ ggml_set_name(opt_ctx->loss, "loss_sum");
379
+ const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
380
+ opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
381
+ ggml_set_name(opt_ctx->loss, "loss_mean");
382
+ opt_ctx->loss_per_datapoint = true;
383
  break;
384
  }
385
  case GGML_OPT_LOSS_TYPE_SUM: {
386
+ opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
387
+ ggml_set_name(opt_ctx->loss, "loss_sum");
388
+ opt_ctx->loss_per_datapoint = false;
389
  break;
390
  }
391
  case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
392
+ opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
393
+ ggml_set_input(opt_ctx->labels);
394
+ ggml_set_name(opt_ctx->labels, "labels");
395
+ opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels);
396
+ ggml_set_name(opt_ctx->loss, "loss_cross_entropy");
397
+ if (opt_ctx->opt_period > 1) {
398
+ opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period);
399
+ ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled");
400
  }
401
+ opt_ctx->loss_per_datapoint = true;
402
  break;
403
  }
404
  case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
405
+ opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
406
+ ggml_set_input(opt_ctx->labels);
407
+ ggml_set_name(opt_ctx->labels, "labels");
408
+ opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels);
409
+ ggml_set_name(opt_ctx->loss, "loss_error");
410
+ opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss);
411
+ ggml_set_name(opt_ctx->loss, "loss_squared_error");
412
+ opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss);
413
+ ggml_set_name(opt_ctx->loss, "loss_sum_squared_error");
414
+ const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
415
+ opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
416
+ ggml_set_name(opt_ctx->loss, "loss_mean_squared_error");
417
+ opt_ctx->loss_per_datapoint = true;
418
  break;
419
  }
420
  }
421
+ ggml_set_output(opt_ctx->loss);
422
+ ggml_set_loss(opt_ctx->loss);
423
+ ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss);
424
+
425
+ if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) {
426
+ opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs);
427
+ ggml_set_name(opt_ctx->pred, "pred");
428
+ ggml_set_output(opt_ctx->pred);
429
+ ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred);
430
+
431
+ opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels));
432
+ ggml_set_name(opt_ctx->ncorrect, "ncorrect");
433
+ ggml_set_output(opt_ctx->ncorrect);
434
+ ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect);
435
+ }
436
 
437
+ if (opt_ctx->buf_static) {
438
+ if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
439
+ return;
440
+ }
441
+ } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) {
442
+ opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
443
+ opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
444
+ return;
445
  }
446
 
447
+ if (opt_ctx->grad_accs.empty()) {
448
+ GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD);
449
+
450
+ const int n_nodes = opt_ctx->gf->n_nodes;
451
+ opt_ctx->grad_accs.resize(n_nodes);
452
+ for (int i = 0; i < n_nodes; ++i) {
453
+ ggml_tensor * node = opt_ctx->gf->nodes[i];
454
+ if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
455
+ opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
456
+ } else {
457
+ opt_ctx->grad_accs[i] = nullptr;
458
+ }
459
+ }
460
+
461
+ if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
462
+ opt_ctx->grad_m.resize(n_nodes);
463
+ opt_ctx->grad_v.resize(n_nodes);
464
+ for (int i = 0; i < n_nodes; ++i) {
465
+ ggml_tensor * node = opt_ctx->gf->nodes[i];
466
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
467
+ opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
468
+ opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
469
+ } else {
470
+ opt_ctx->grad_m[i] = nullptr;
471
+ opt_ctx->grad_v[i] = nullptr;
472
+ }
473
+ }
474
+ }
475
  }
476
 
477
  // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
478
+ opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
479
+ ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());
480
 
481
+ if (opt_ctx->buf_static) {
482
+ if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) {
483
+ return;
484
+ }
485
+ } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) {
486
+ opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
487
+ ggml_graph_reset(opt_ctx->gb_grad);
488
  }
489
 
490
+ GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT);
491
 
492
  // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
493
+ opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
494
 
495
+ opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
496
+ ggml_set_input(opt_ctx->adamw_params);
497
+ ggml_set_name(opt_ctx->adamw_params, "adamw_params");
498
 
499
+ for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
500
+ struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
501
+ struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
502
 
503
+ if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
504
+ struct ggml_tensor * m = opt_ctx->grad_m[i];
505
+ struct ggml_tensor * v = opt_ctx->grad_v[i];
506
+ struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
507
+
508
+ ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
509
+ ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
510
+ ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
511
+
512
+ ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
513
  }
514
  }
515
 
516
+ if (!opt_ctx->buf_static) {
517
+ opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
518
+ opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
519
+ ggml_graph_reset(opt_ctx->gb_opt);
520
+ }
521
 
522
+ opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type());
523
+ }
524
 
525
+ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
526
+ ggml_opt_context_t result = new struct ggml_opt_context;
527
+ result->backend_sched = params.backend_sched;
528
+ result->ctx_compute = params.ctx_compute;
529
+ result->loss_type = params.loss_type;
530
+ result->build_type = params.build_type;
531
+ result->build_type_alloc = params.build_type;
532
+ result->inputs = params.inputs;
533
+ result->outputs = params.outputs;
534
+ result->opt_period = params.opt_period;
535
+ result->get_opt_pars = params.get_opt_pars;
536
+ result->get_opt_pars_ud = params.get_opt_pars_ud;
537
+
538
+ GGML_ASSERT(result->opt_period >= 1);
539
+
540
+ result->static_graphs = result->ctx_compute;
541
+
542
+ if (!result->static_graphs) {
543
+ GGML_ASSERT(!result->inputs);
544
+ GGML_ASSERT(!result->outputs);
545
+ return result;
546
+ }
547
+
548
+ GGML_ASSERT(result->inputs);
549
+ GGML_ASSERT(result->outputs);
550
+
551
+ result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
552
+ ggml_build_forward_expand(result->gf, result->outputs);
553
+
554
+ ggml_opt_build(result);
555
 
556
  return result;
557
  }
 
561
  return;
562
  }
563
  ggml_backend_buffer_free(opt_ctx->buf_static);
564
+ ggml_backend_buffer_free(opt_ctx->buf_cpu);
565
  ggml_free(opt_ctx->ctx_static);
566
+ ggml_free(opt_ctx->ctx_cpu);
567
  delete opt_ctx;
568
  }
569
 
 
679
 
680
  // ====== Computation ======
681
 
682
+ void ggml_opt_prepare_alloc(
683
+ ggml_opt_context_t opt_ctx,
684
+ struct ggml_context * ctx_compute,
685
+ struct ggml_cgraph * gf,
686
+ struct ggml_tensor * inputs,
687
+ struct ggml_tensor * outputs) {
688
+ GGML_ASSERT(!opt_ctx->static_graphs);
689
+ opt_ctx->ctx_compute = ctx_compute;
690
+ opt_ctx->gf = gf;
691
+ opt_ctx->inputs = inputs;
692
+ opt_ctx->outputs = outputs;
693
+ }
694
+
695
+ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
696
+ GGML_ASSERT(!opt_ctx->eval_ready);
697
+ if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
698
+ ggml_graph_reset(opt_ctx->gb_grad);
699
+ }
700
+ if (backward) {
701
+ const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
702
+ opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
703
+ } else {
704
+ opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD;
705
+ }
706
+
707
+ if (!opt_ctx->static_graphs) {
708
+ ggml_opt_build(opt_ctx);
709
+ }
710
+
711
+ struct ggml_cgraph * graph = nullptr;
712
+ switch (opt_ctx->build_type) {
713
+ case GGML_OPT_BUILD_TYPE_FORWARD: {
714
+ graph = opt_ctx->gf;
715
+ } break;
716
+ case GGML_OPT_BUILD_TYPE_GRAD: {
717
+ graph = opt_ctx->gb_grad;
718
+ } break;
719
+ case GGML_OPT_BUILD_TYPE_OPT: {
720
+ graph = opt_ctx->gb_opt;
721
+ } break;
722
+ }
723
+ GGML_ASSERT(graph);
724
+
725
+ if (opt_ctx->allocated_graph == graph) {
726
+ opt_ctx->eval_ready = true;
727
+ return;
728
+ }
729
+
730
+ ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
731
+
732
+ if (opt_ctx->static_graphs) {
733
+ ggml_init_params params = {
734
+ /*.mem_size =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads),
735
+ /*.mem_buffer =*/ nullptr,
736
+ /*.no_alloc =*/ true,
737
+ };
738
+ ggml_free(opt_ctx->ctx_copy);
739
+ opt_ctx->ctx_copy = ggml_init(params);
740
+
741
+ opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
742
+ } else {
743
+ opt_ctx->allocated_graph_copy = graph;
744
+ }
745
+
746
+ ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
747
+ opt_ctx->allocated_graph = graph;
748
+
749
+ opt_ctx->eval_ready = true;
750
+ }
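
As a concrete reading of the selection logic above: with opt_period = 4 and backward = true, the calls with opt_i = 0, 1, 2 pick GGML_OPT_BUILD_TYPE_GRAD and only accumulate gradients in gb_grad, the call with opt_i = 3 picks GGML_OPT_BUILD_TYPE_OPT and applies the optimizer step, and the accumulated gradients are reset on the first call of the next period (opt_i == 0).
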
751
+
752
+ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
753
+ GGML_ASSERT(opt_ctx->eval_ready);
754
+ if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
755
  struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
756
 
757
  GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
 
777
  adamw_par_data[6] = beta2h;
778
  }
779
 
 
780
  ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
781
  opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
782
+ opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
783
+
784
+ if (!opt_ctx->static_graphs) {
785
+ opt_ctx->gf = nullptr;
786
+ opt_ctx->gb_grad = nullptr;
787
+ opt_ctx->gb_opt = nullptr;
788
+ opt_ctx->allocated_graph = nullptr;
789
+ opt_ctx->allocated_graph_copy = nullptr;
790
+ }
791
+
792
+ opt_ctx->eval_ready = false;
793
 
794
  if (!result) {
795
  return;
 
813
  ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
814
  result->loss.push_back(loss);
815
 
816
+ if (opt_ctx->pred) {
817
+ GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
818
+ std::vector<int32_t> pred(ndata);
819
+ ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
820
+ result->pred.insert(result->pred.end(), pred.begin(), pred.end());
821
+ }
822
 
823
+ if (!opt_ctx->ncorrect || result->ncorrect < 0) {
824
  result->ncorrect = -1;
825
  return;
826
  }
 
832
  result->ncorrect += ncorrect;
833
  }
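
After one or more ggml_opt_eval() calls, the aggregated statistics can be read back through the ggml_opt_result_* accessors declared in ggml-opt.h; a sketch, assuming their current signatures:

    // Sketch: read back aggregated loss/accuracy (accessor signatures assumed from ggml-opt.h).
    double loss, loss_unc, accuracy, accuracy_unc;
    ggml_opt_result_loss(result, &loss, &loss_unc);
    ggml_opt_result_accuracy(result, &accuracy, &accuracy_unc); // undefined if ncorrect tracking was disabled
    fprintf(stderr, "loss=%.5lf±%.5lf acc=%.2lf%%\n", loss, loss_unc, 100.0*accuracy);
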
834
 
835
  // ====== High-Level Functions ======
836
 
837
  void ggml_opt_epoch(
 
860
  int64_t ibatch = 0;
861
  int64_t t_loop_start = ggml_time_us();
862
  for (; ibatch < ibatch_split; ++ibatch) {
863
+ ggml_opt_alloc(opt_ctx, /*backward =*/ true);
864
  ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
865
+ ggml_opt_eval(opt_ctx, result_train);
866
  if (callback_train) {
867
  callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
868
  }
869
  }
870
  t_loop_start = ggml_time_us();
871
  for (; ibatch < nbatches; ++ibatch) {
872
+ ggml_opt_alloc(opt_ctx, /*backward =*/ false);
873
  ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
874
+ ggml_opt_eval(opt_ctx, result_eval);
875
  if (callback_eval) {
876
  callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
877
  }
 
888
  int64_t t_start_us) {
889
  fprintf(stderr, "%s[", train ? "train: " : "val: ");
890
 
891
+ // The progress bar consists of partially filled blocks; Unicode has 8 separate fill levels.
892
+ constexpr int64_t bar_length = 8;
893
+ const int64_t ibatch8 = 8 * ibatch;
894
  for (int64_t j = 0; j < bar_length; ++j) {
895
+ if (ibatch_max * (8*j + 8) / bar_length < ibatch8) {
896
+ fprintf(stderr, "\u2588"); // full block
897
+ } else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) {
898
+ fprintf(stderr, "\u2589"); // 7/8 filled
899
+ } else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) {
900
+ fprintf(stderr, "\u258A"); // 6/8 filled
901
+ } else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) {
902
+ fprintf(stderr, "\u258B"); // 5/8 filled
903
+ } else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) {
904
+ fprintf(stderr, "\u258C"); // 4/8 filled
905
+ } else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) {
906
+ fprintf(stderr, "\u258D"); // 3/8 filled
907
+ } else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) {
908
+ fprintf(stderr, "\u258E"); // 2/8 filled
909
+ } else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) {
910
+ fprintf(stderr, "\u258F"); // 1/8 filled
911
  } else {
912
  fprintf(stderr, " ");
913
  }
 
939
  const int64_t t_eta_m = t_eta_s / 60;
940
  t_eta_s -= t_eta_m * 60;
941
 
942
+ fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% "
943
+ "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r",
944
  idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
945
  t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
946
  if (ibatch == ibatch_max) {
 
981
 
982
  int64_t epoch = 1;
983
 
984
+ ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type);
985
+ params.ctx_compute = ctx_compute;
986
+ params.inputs = inputs;
987
+ params.outputs = outputs;
988
  params.opt_period = opt_period;
989
  params.get_opt_pars = get_opt_pars;
990
  params.get_opt_pars_ud = &epoch;
ggml/src/ggml.c CHANGED
@@ -5499,7 +5499,7 @@ static void ggml_compute_backward(
5499
  // tensor = src0 * 1 + src1 * 0
5500
  if (src0_needs_grads) {
5501
  // dsrc0 = dtensor * 1
5502
- ggml_add_or_set(ctx, cgraph, isrc0, grad);
5503
  }
5504
  if (src1_needs_grads) {
5505
  // dsrc1 = dtensor * 0 -> noop
@@ -5780,10 +5780,9 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
5780
  }
5781
 
5782
  void ggml_build_backward_expand(
5783
- struct ggml_context * ctx_static,
5784
- struct ggml_context * ctx_compute,
5785
- struct ggml_cgraph * cgraph,
5786
- bool accumulate) {
5787
  GGML_ASSERT(cgraph->n_nodes > 0);
5788
  GGML_ASSERT(cgraph->grads);
5789
  GGML_ASSERT(cgraph->grad_accs);
@@ -5856,21 +5855,24 @@ void ggml_build_backward_expand(
5856
  GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
5857
  node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
5858
 
5859
- const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
5860
- GGML_ASSERT(igrad != GGML_HASHSET_FULL);
5861
- GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
5862
- if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
5863
- cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
5864
- cgraph->grads[igrad] = cgraph->grad_accs[igrad];
5865
- ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
 
 
 
5866
  }
5867
- grads_needed[igrad] = true;
5868
  }
5869
 
5870
  for (int i = n_nodes_f - 1; i >= 0; --i) {
5871
  // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
5872
  // use allocator to automatically make inplace operations
5873
- ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
5874
  }
5875
 
5876
  free(grads_needed);
@@ -6016,8 +6018,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
6016
  }
6017
  }
6018
 
6019
- struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
6020
- struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
6021
  ggml_graph_cpy(cgraph, result);
6022
  return result;
6023
  }
@@ -6036,6 +6038,9 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
6036
  }
6037
 
6038
  void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 
 
 
6039
  GGML_ASSERT(cgraph->grads != NULL);
6040
 
6041
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6345,8 +6350,8 @@ void ggml_set_output(struct ggml_tensor * tensor) {
6345
  tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
6346
  }
6347
 
6348
- void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
6349
- GGML_UNUSED(ctx); // TODO: remove this parameter
6350
  tensor->flags |= GGML_TENSOR_FLAG_PARAM;
6351
  }
6352
 
 
5499
  // tensor = src0 * 1 + src1 * 0
5500
  if (src0_needs_grads) {
5501
  // dsrc0 = dtensor * 1
5502
+ ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad, src0));
5503
  }
5504
  if (src1_needs_grads) {
5505
  // dsrc1 = dtensor * 0 -> noop
 
5780
  }
5781
 
5782
  void ggml_build_backward_expand(
5783
+ struct ggml_context * ctx,
5784
+ struct ggml_cgraph * cgraph,
5785
+ struct ggml_tensor ** grad_accs) {
 
5786
  GGML_ASSERT(cgraph->n_nodes > 0);
5787
  GGML_ASSERT(cgraph->grads);
5788
  GGML_ASSERT(cgraph->grad_accs);
 
5855
  GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
5856
  node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
5857
 
5858
+ const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
5859
+ GGML_ASSERT(ihash != GGML_HASHSET_FULL);
5860
+ GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash));
5861
+ if (grad_accs && grad_accs[i]) {
5862
+ cgraph->grad_accs[ihash] = grad_accs[i];
5863
+ cgraph->grads[ihash] = cgraph->grad_accs[ihash];
5864
+ } else if (node->flags & GGML_TENSOR_FLAG_LOSS) {
5865
+ // loss tensors always need a gradient accumulator
5866
+ cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
5867
+ cgraph->grads[ihash] = cgraph->grad_accs[ihash];
5868
  }
5869
+ grads_needed[ihash] = true;
5870
  }
5871
 
5872
  for (int i = n_nodes_f - 1; i >= 0; --i) {
5873
  // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
5874
  // use allocator to automatically make inplace operations
5875
+ ggml_compute_backward(ctx, cgraph, i, grads_needed);
5876
  }
5877
 
5878
  free(grads_needed);
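
With the new signature the caller passes per-node gradient accumulators explicitly (or NULL for plain, non-accumulating gradients), and the graph must already carry gradient bookkeeping. A minimal sketch of the non-accumulating case, with the forward output tensor assumed to exist:

    // Sketch: build a backward graph with the new 3-argument signature (grad_accs = NULL).
    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    ggml_build_forward_expand(gb, outputs); // forward nodes; `outputs` assumed to exist
    ggml_build_backward_expand(ctx, gb, /*grad_accs =*/ NULL);
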
 
6018
  }
6019
  }
6020
 
6021
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) {
6022
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads || force_grads);
6023
  ggml_graph_cpy(cgraph, result);
6024
  return result;
6025
  }
 
6038
  }
6039
 
6040
  void ggml_graph_reset(struct ggml_cgraph * cgraph) {
6041
+ if (!cgraph) {
6042
+ return;
6043
+ }
6044
  GGML_ASSERT(cgraph->grads != NULL);
6045
 
6046
  for (int i = 0; i < cgraph->n_nodes; i++) {
 
6350
  tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
6351
  }
6352
 
6353
+ void ggml_set_param(struct ggml_tensor * tensor) {
6354
+ GGML_ASSERT(tensor->op == GGML_OP_NONE);
6355
  tensor->flags |= GGML_TENSOR_FLAG_PARAM;
6356
  }
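
The ggml_context argument is dropped and a new assertion restricts ggml_set_param to leaf tensors (op == GGML_OP_NONE), i.e. tensors created directly rather than produced by an operation. A short sketch with a hypothetical weight tensor:

    // Sketch: only leaf tensors (weights) may be flagged; results of ops would trip the assert.
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out); // hypothetical weight
    ggml_set_param(w);                        // new signature: no ggml_context argument
    // ggml_set_param(ggml_mul_mat(ctx, w, x)); // would assert: op != GGML_OP_NONE
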
6357