KitaitiMakoto committed commit 54b2b95 (unverified) · 1 Parent(s): 005390a

ruby : fix bindings (#2484)

* Improve Rakefile

* Remove intermediate files

* Remove unnecessary manipulations from extconf.rb

* Add README and LICENSE to source files

* Manage ext source files using YAML file

* Use extsources.yaml to include files into gem package file

* Add git-managed source files to build dependency

* Add test task

* Download model for test if not exists

* Add test for build

* Ignore gem package directory

* Enable GitHub action for Ruby binding

* Fix model name

* Build lib file for test

* Use extension for each platform

* Use extension for each platform on testing

* Move built lib file rather than copy

* Add intermediate files to clean targets

.github/workflows/bindings-ruby.yml ADDED
@@ -0,0 +1,65 @@
+ name: Bindings Tests (Ruby)
+ on:
+   push:
+     paths:
+       - bindings/ruby/**
+       - src/whisper.cpp
+       - include/whisper.h
+       - ggml/src/ggml.c
+       - ggml/src/ggml-impl.h
+       - ggml/src/ggml-aarch64.h
+       - ggml/src/ggml-aarch64.c
+       - ggml/src/ggml-alloc.c
+       - ggml/src/ggml-backend-impl.h
+       - ggml/src/ggml-backend.cpp
+       - ggml/src/ggml-common.h
+       - ggml/src/ggml-quants.h
+       - ggml/src/ggml-quants.c
+       - ggml/src/ggml-cpu-impl.h
+       - ggml/include/ggml.h
+       - ggml/include/ggml-alloc.h
+       - ggml/include/ggml-backend.h
+       - ggml/include/ggml-cuda.h
+       - ggml/include/ggml-kompute.h
+       - ggml/include/ggml-metal.h
+       - ggml/include/ggml-sycl.h
+       - ggml/include/ggml-vulkan.h
+       - examples/dr_wav.h
+   pull_request:
+     paths:
+       - bindings/ruby/**
+       - src/whisper.cpp
+       - include/whisper.h
+       - ggml/src/ggml.c
+       - ggml/src/ggml-impl.h
+       - ggml/src/ggml-aarch64.h
+       - ggml/src/ggml-aarch64.c
+       - ggml/src/ggml-alloc.c
+       - ggml/src/ggml-backend-impl.h
+       - ggml/src/ggml-backend.cpp
+       - ggml/src/ggml-common.h
+       - ggml/src/ggml-quants.h
+       - ggml/src/ggml-quants.c
+       - ggml/src/ggml-cpu-impl.h
+       - ggml/include/ggml.h
+       - ggml/include/ggml-alloc.h
+       - ggml/include/ggml-backend.h
+       - ggml/include/ggml-cuda.h
+       - ggml/include/ggml-kompute.h
+       - ggml/include/ggml-metal.h
+       - ggml/include/ggml-sycl.h
+       - ggml/include/ggml-vulkan.h
+       - examples/dr_wav.h
+
+ jobs:
+   ubuntu-latest:
+     runs-on: ubuntu-latest
+     defaults:
+       run:
+         working-directory: bindings/ruby
+     steps:
+       - uses: ruby/setup-ruby@v1
+         with:
+           ruby-version: '3.0'
+       - uses: actions/checkout@v4
+       - run: rake test
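The new job does one thing after checkout: run rake test from bindings/ruby. A minimal local equivalent is sketched below; it assumes a full checkout of the repository with the rake gem installed and is not part of this commit.

# Hypothetical local equivalent of the CI job: load bindings/ruby/Rakefile
# and invoke the same "test" task the workflow runs.
require "rake"

Dir.chdir "bindings/ruby" do
  app = Rake.application
  app.init            # parse (empty) command-line options
  app.load_rakefile   # picks up ./Rakefile
  app["test"].invoke  # same as `rake test`: fetches the model, builds the lib, runs tests
end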
.github/workflows/bindings-ruby.yml.disabled DELETED
@@ -1,23 +0,0 @@
- # TODO: fix this workflow file, disabled for now
- name: Bindings Tests (Ruby)
- on:
-   push:
-     paths:
-       - bindings/ruby/**
-       - whisper.h
-   pull_request:
-     paths:
-       - bindings/ruby/**
-       - whisper.h
-
- jobs:
-   ubuntu-latest:
-     runs-on: ubuntu-latest
-     steps:
-       - uses: ruby/setup-ruby@v1
-         with:
-           ruby-version: '3.0'
-       - uses: actions/checkout@v1
-       - run: |
-           cd bindings/ruby/ext
-           ruby extconf.rb && make
bindings/ruby/.gitignore ADDED
@@ -0,0 +1,4 @@
+ README.md
+ LICENSE
+ pkg/
+ lib/whisper.*
bindings/ruby/Rakefile CHANGED
@@ -1,12 +1,55 @@
  require 'rake/clean'
- require 'rubygems/package'
-
- desc 'Build gem'
- task :package do
-   spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
-   spec = nil
-   # see: http://gist.github.com/16215
-   Thread.new { spec = eval("#{spec_source}") }.join
-   spec.validate
-   Gem::Package.build(spec)
+ require "bundler/gem_tasks"
+ require "pathname"
+ require "yaml"
+ require "rake/testtask"
+
+ extsources = YAML.load_file("extsources.yaml")
+ extsources.each_pair do |src_dir, dests|
+   dests.each do |dest|
+     src = Pathname(src_dir)/File.basename(dest)
+
+     file src
+     file dest => src do |t|
+       cp t.source, t.name
+     end
+   end
+ end
+ SOURCES = extsources.values.flatten
+ CLEAN.include SOURCES
+ CLEAN.include FileList["ext/*.o", "ext/whisper.so", "ext/whisper.bundle", "ext/whisper.dll"]
+
+ task build: SOURCES + FileList[
+   "ext/extconf.rb",
+   "ext/ruby_whisper.h",
+   "ext/ruby_whisper.cpp",
+   "whispercpp.gemspec",
+ ]
+
+ directory "pkg"
+ CLOBBER.include "pkg"
+
+ TEST_MODEL = "../../models/ggml-base.en.bin"
+ LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
+ LIB_FILE = File.join("lib", LIB_NAME)
+
+ directory "lib"
+ task LIB_FILE => SOURCES + ["lib"] do |t|
+   Dir.chdir "ext" do
+     sh "ruby extconf.rb"
+     sh "make"
+   end
+   mv "ext/#{LIB_NAME}", t.name
+ end
+ CLEAN.include LIB_FILE
+
+ Rake::TestTask.new do |t|
+   t.test_files = FileList["tests/test_*.rb"]
+ end
+ task test: [TEST_MODEL, LIB_FILE]
+
+ file TEST_MODEL do
+   Dir.chdir "../.." do
+     sh "./models/download-ggml-model.sh base.en"
+   end
  end
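The new Rakefile is driven by extsources.yaml, whose contents are not shown in this diff. From the way it is consumed (each_pair over source directories, then Pathname(src_dir)/File.basename(dest)), it is presumably a mapping from a source directory in the main repository to the list of destination paths inside the binding. The sketch below is an assumed example of that shape, not the actual file, and the directory names in it are hypothetical.

# Assumed shape of extsources.yaml and how the Rakefile turns it into copy rules.
require "yaml"
require "pathname"

yaml = <<~YAML
  ../..:                  # hypothetical source dir, relative to bindings/ruby
    - ext/whisper.cpp
    - ext/whisper.h
  ../../examples:
    - ext/dr_wav.h
YAML

extsources = YAML.load(yaml)
extsources.each_pair do |src_dir, dests|
  dests.each do |dest|
    src = Pathname(src_dir) / File.basename(dest)
    # The Rakefile declares: file dest => src { |t| cp t.source, t.name }
    puts "#{dest} <- #{src}"
  end
end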
bindings/ruby/ext/.gitignore CHANGED
@@ -3,7 +3,26 @@ ggml.c
  ggml.h
  ggml-alloc.c
  ggml-alloc.h
- whisper.bundle
+ ggml-aarch64.c
+ ggml-aarch64.h
+ ggml-backend.cpp
+ ggml-backend-impl.h
+ ggml-backend.c
+ ggml-backend.h
+ ggml-common.h
+ ggml-cpu-impl.h
+ ggml-cuda.h
+ ggml-impl.h
+ ggml-kompute.h
+ ggml-metal.h
+ ggml-opencl.h
+ ggml-quants.c
+ ggml-quants.h
+ ggml-sycl.h
+ ggml-vulkan.h
  whisper.cpp
  whisper.h
  dr_wav.h
+ whisper.bundle
+ whisper.so
+ whisper.dll
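The three ignored build artifacts at the end (whisper.bundle, whisper.so, whisper.dll) correspond to the "use extension for each platform" items in the commit message: the Rakefile now names the built library with RbConfig::CONFIG["DLEXT"] instead of hard-coding one suffix. A small illustration (typical values only; the exact suffix depends on the Ruby build):

# What "whisper".ext(RbConfig::CONFIG["DLEXT"]) in the Rakefile resolves to.
require "rbconfig"
require "rake"   # provides String#ext

dlext = RbConfig::CONFIG["DLEXT"]   # e.g. "so" on Linux, "bundle" on macOS
puts "whisper".ext(dlext)           # => "whisper.so", "whisper.bundle", ...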
bindings/ruby/ext/extconf.rb CHANGED
@@ -1,21 +1,4 @@
  require 'mkmf'
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-aarch64.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-aarch64.c')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
-

  # need to use c++ compiler flags
  $CXXFLAGS << ' -std=c++11'
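extconf.rb no longer copies sources by hand; the Rakefile's file tasks stage them before the build, and Rake::TestTask then runs anything matching tests/test_*.rb. The tests added by the commit are not shown in this diff; a hypothetical minimal test file of that shape could look like the sketch below (file name, class name, and assertion are assumptions for illustration).

# tests/test_whisper.rb -- hypothetical example matching the tests/test_*.rb glob.
require "test/unit"
require "whisper"   # the native extension the Rakefile builds into lib/

class TestWhisper < Test::Unit::TestCase
  # ../../../models from bindings/ruby/tests is the repository's models/ dir,
  # matching TEST_MODEL = "../../models/ggml-base.en.bin" in the Rakefile.
  MODEL = File.join(__dir__, "..", "..", "..", "models", "ggml-base.en.bin")

  def test_model_file_present
    assert File.exist?(MODEL), "run `rake test` so the model gets downloaded"
  end
end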
bindings/ruby/ext/ggml-backend-impl.h DELETED
@@ -1,141 +0,0 @@
1
- #pragma once
2
-
3
- // ggml-backend internal header
4
-
5
- #include "ggml-backend.h"
6
-
7
- #ifdef __cplusplus
8
- extern "C" {
9
- #endif
10
-
11
- //
12
- // Backend buffer
13
- //
14
-
15
- // buffer type
16
- typedef void * ggml_backend_buffer_type_context_t;
17
-
18
- struct ggml_backend_buffer_type_i {
19
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
20
- ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
21
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
22
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
23
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
24
- bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
25
- // check if tensor data is in host memory
26
- // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
27
- bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
28
- };
29
-
30
- struct ggml_backend_buffer_type {
31
- struct ggml_backend_buffer_type_i iface;
32
- ggml_backend_buffer_type_context_t context;
33
- };
34
-
35
- // buffer
36
- typedef void * ggml_backend_buffer_context_t;
37
-
38
- struct ggml_backend_buffer_i {
39
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
40
- void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
41
- void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
42
- void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
43
- void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
44
- void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
45
- bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
46
- void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
47
- void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
48
- };
49
-
50
- struct ggml_backend_buffer {
51
- struct ggml_backend_buffer_i iface;
52
- ggml_backend_buffer_type_t buft;
53
- ggml_backend_buffer_context_t context;
54
- size_t size;
55
- enum ggml_backend_buffer_usage usage;
56
- };
57
-
58
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
59
- ggml_backend_buffer_type_t buft,
60
- struct ggml_backend_buffer_i iface,
61
- ggml_backend_buffer_context_t context,
62
- size_t size);
63
-
64
- // do not use directly, use ggml_backend_tensor_copy instead
65
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
66
-
67
- // buffer that contains a collection of buffers
68
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
69
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
70
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
71
-
72
- //
73
- // Backend
74
- //
75
-
76
- typedef void * ggml_backend_context_t;
77
-
78
- struct ggml_backend_i {
79
- const char * (*GGML_CALL get_name)(ggml_backend_t backend);
80
-
81
- void (*GGML_CALL free)(ggml_backend_t backend);
82
-
83
- // buffer allocation
84
- ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
85
-
86
- // (optional) asynchronous tensor data access
87
- void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
88
- void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
89
- bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
90
-
91
- // (optional) complete all pending operations
92
- void (*GGML_CALL synchronize)(ggml_backend_t backend);
93
-
94
- // compute graph with a plan (not used currently)
95
- ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
96
- void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
97
-
98
- // compute graph with a plan
99
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
- // compute graph without a plan (async)
101
- enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
102
-
103
- // check if the backend supports an operation
104
- bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105
-
106
- // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
107
- // these should be expensive operations with large batch sizes that may benefit from running on this backend
108
- // even if the weight has to be copied from the CPU temporarily
109
- bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
-
111
- // (optional) event synchronization
112
- ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
113
- void (*GGML_CALL event_free) (ggml_backend_event_t event);
114
- void (*GGML_CALL event_record) (ggml_backend_event_t event);
115
- void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
116
- void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
117
- };
118
-
119
- struct ggml_backend {
120
- ggml_guid_t guid;
121
-
122
- struct ggml_backend_i iface;
123
- ggml_backend_context_t context;
124
- };
125
-
126
- struct ggml_backend_event {
127
- ggml_backend_t backend;
128
- void * context;
129
- };
130
-
131
- //
132
- // Backend registry
133
- //
134
-
135
- typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
136
-
137
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
138
-
139
- #ifdef __cplusplus
140
- }
141
- #endif
bindings/ruby/ext/ggml-backend.c DELETED
@@ -1,2095 +0,0 @@
1
- #include "ggml-backend-impl.h"
2
- #include "ggml-alloc.h"
3
- #include "ggml-impl.h"
4
-
5
- #include <assert.h>
6
- #include <limits.h>
7
- #include <stdarg.h>
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
11
-
12
-
13
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
-
15
- // backend buffer type
16
-
17
- const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
18
- return buft->iface.get_name(buft);
19
- }
20
-
21
- GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
22
- return buft->iface.alloc_buffer(buft, size);
23
- }
24
-
25
- size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
26
- return buft->iface.get_alignment(buft);
27
- }
28
-
29
- size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
30
- // get_max_size is optional, defaults to SIZE_MAX
31
- if (buft->iface.get_max_size) {
32
- return buft->iface.get_max_size(buft);
33
- }
34
- return SIZE_MAX;
35
- }
36
-
37
- GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
38
- // get_alloc_size is optional, defaults to ggml_nbytes
39
- if (buft->iface.get_alloc_size) {
40
- size_t size = buft->iface.get_alloc_size(buft, tensor);
41
- assert(size >= ggml_nbytes(tensor));
42
- return size;
43
- }
44
- return ggml_nbytes(tensor);
45
- }
46
-
47
- bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
48
- return buft->iface.supports_backend(buft, backend);
49
- }
50
-
51
- bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
52
- if (buft->iface.is_host) {
53
- return buft->iface.is_host(buft);
54
- }
55
- return false;
56
- }
57
-
58
- // backend buffer
59
-
60
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
61
- ggml_backend_buffer_type_t buft,
62
- struct ggml_backend_buffer_i iface,
63
- ggml_backend_buffer_context_t context,
64
- size_t size) {
65
- ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
66
-
67
- (*buffer) = (struct ggml_backend_buffer) {
68
- /* .interface = */ iface,
69
- /* .buft = */ buft,
70
- /* .context = */ context,
71
- /* .size = */ size,
72
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
73
- };
74
-
75
- return buffer;
76
- }
77
-
78
- const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
79
- return buffer->iface.get_name(buffer);
80
- }
81
-
82
- void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
83
- if (buffer == NULL) {
84
- return;
85
- }
86
-
87
- if (buffer->iface.free_buffer != NULL) {
88
- buffer->iface.free_buffer(buffer);
89
- }
90
- free(buffer);
91
- }
92
-
93
- size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
94
- return buffer->size;
95
- }
96
-
97
- void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
98
- void * base = buffer->iface.get_base(buffer);
99
-
100
- GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
101
-
102
- return base;
103
- }
104
-
105
- GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
106
- // init_tensor is optional
107
- if (buffer->iface.init_tensor) {
108
- buffer->iface.init_tensor(buffer, tensor);
109
- }
110
- }
111
-
112
- size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
113
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
114
- }
115
-
116
- size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
117
- return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
118
- }
119
-
120
- size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
121
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
122
- }
123
-
124
- void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
125
- buffer->iface.clear(buffer, value);
126
- }
127
-
128
- bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
129
- return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
130
- }
131
-
132
- void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
133
- buffer->usage = usage;
134
-
135
- // FIXME: add a generic callback to the buffer interface
136
- if (ggml_backend_buffer_is_multi_buffer(buffer)) {
137
- ggml_backend_multi_buffer_set_usage(buffer, usage);
138
- }
139
- }
140
-
141
- ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
142
- return buffer->buft;
143
- }
144
-
145
- void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
146
- if (buffer->iface.reset) {
147
- buffer->iface.reset(buffer);
148
- }
149
- }
150
-
151
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
152
- ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
- if (dst_buf->iface.cpy_tensor) {
154
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
155
- }
156
- return false;
157
- }
158
-
159
- // backend
160
-
161
- ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
162
- if (backend == NULL) {
163
- return NULL;
164
- }
165
- return backend->guid;
166
- }
167
-
168
- const char * ggml_backend_name(ggml_backend_t backend) {
169
- if (backend == NULL) {
170
- return "NULL";
171
- }
172
- return backend->iface.get_name(backend);
173
- }
174
-
175
- void ggml_backend_free(ggml_backend_t backend) {
176
- if (backend == NULL) {
177
- return;
178
- }
179
-
180
- backend->iface.free(backend);
181
- }
182
-
183
- ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
184
- return backend->iface.get_default_buffer_type(backend);
185
- }
186
-
187
- ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
188
- return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
189
- }
190
-
191
- size_t ggml_backend_get_alignment(ggml_backend_t backend) {
192
- return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
193
- }
194
-
195
- size_t ggml_backend_get_max_size(ggml_backend_t backend) {
196
- return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
197
- }
198
-
199
- void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
200
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
201
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
202
-
203
- if (backend->iface.set_tensor_async == NULL) {
204
- ggml_backend_tensor_set(tensor, data, offset, size);
205
- } else {
206
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
207
- }
208
- }
209
-
210
- void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
211
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
212
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
213
-
214
- if (backend->iface.get_tensor_async == NULL) {
215
- ggml_backend_tensor_get(tensor, data, offset, size);
216
- } else {
217
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
218
- }
219
- }
220
-
221
- GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
222
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
223
-
224
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
225
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
226
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
227
-
228
- if (!size) {
229
- return;
230
- }
231
-
232
- buf->iface.set_tensor(buf, tensor, data, offset, size);
233
- }
234
-
235
- GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
236
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
237
-
238
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
239
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
240
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
241
-
242
- if (!size) {
243
- return;
244
- }
245
-
246
- buf->iface.get_tensor(buf, tensor, data, offset, size);
247
- }
248
-
249
- void ggml_backend_synchronize(ggml_backend_t backend) {
250
- if (backend->iface.synchronize == NULL) {
251
- return;
252
- }
253
-
254
- backend->iface.synchronize(backend);
255
- }
256
-
257
- ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
258
- GGML_ASSERT(backend->iface.graph_plan_create != NULL);
259
-
260
- return backend->iface.graph_plan_create(backend, cgraph);
261
- }
262
-
263
- void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
264
- GGML_ASSERT(backend->iface.graph_plan_free != NULL);
265
-
266
- backend->iface.graph_plan_free(backend, plan);
267
- }
268
-
269
- enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
270
- GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
271
-
272
- return backend->iface.graph_plan_compute(backend, plan);
273
- }
274
-
275
- enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
276
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
277
- ggml_backend_synchronize(backend);
278
- return err;
279
- }
280
-
281
- enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
282
- return backend->iface.graph_compute(backend, cgraph);
283
- }
284
-
285
- bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
286
- return backend->iface.supports_op(backend, op);
287
- }
288
-
289
- bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
290
- if (backend->iface.offload_op != NULL) {
291
- return backend->iface.offload_op(backend, op);
292
- }
293
- return false;
294
- }
295
-
296
- // backend copy
297
-
298
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
299
- if (a->type != b->type) {
300
- return false;
301
- }
302
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
303
- if (a->ne[i] != b->ne[i]) {
304
- return false;
305
- }
306
- if (a->nb[i] != b->nb[i]) {
307
- return false;
308
- }
309
- }
310
- return true;
311
- }
312
-
313
- void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
314
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
315
-
316
- if (src == dst) {
317
- return;
318
- }
319
-
320
- if (ggml_backend_buffer_is_host(src->buffer)) {
321
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
322
- } else if (ggml_backend_buffer_is_host(dst->buffer)) {
323
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
324
- } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
325
- #ifndef NDEBUG
326
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
327
- #endif
328
- size_t nbytes = ggml_nbytes(src);
329
- void * data = malloc(nbytes);
330
- ggml_backend_tensor_get(src, data, 0, nbytes);
331
- ggml_backend_tensor_set(dst, data, 0, nbytes);
332
- free(data);
333
- }
334
- }
335
-
336
- void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
337
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
338
-
339
- if (src == dst) {
340
- return;
341
- }
342
-
343
- if (backend_dst->iface.cpy_tensor_async != NULL) {
344
- if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
345
- return;
346
- }
347
- }
348
-
349
- // an async copy would normally happen after all the queued operations on both backends are completed
350
- // sync src, set_async dst
351
- if (ggml_backend_buffer_is_host(src->buffer)) {
352
- ggml_backend_synchronize(backend_src);
353
- ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
354
- } else {
355
- ggml_backend_synchronize(backend_src);
356
- ggml_backend_tensor_copy(src, dst);
357
- ggml_backend_synchronize(backend_dst);
358
- }
359
- }
360
-
361
- // events
362
-
363
- ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
364
- if (backend->iface.event_new == NULL) {
365
- return NULL;
366
- }
367
- return backend->iface.event_new(backend);
368
- }
369
-
370
- void ggml_backend_event_free(ggml_backend_event_t event) {
371
- if (event == NULL) {
372
- return;
373
- }
374
- event->backend->iface.event_free(event);
375
- }
376
-
377
- void ggml_backend_event_record(ggml_backend_event_t event) {
378
- GGML_ASSERT(event->backend->iface.event_record != NULL);
379
-
380
- event->backend->iface.event_record(event);
381
- }
382
-
383
- void ggml_backend_event_synchronize(ggml_backend_event_t event) {
384
- GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
385
-
386
- event->backend->iface.event_synchronize(event);
387
- }
388
-
389
- void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
390
- GGML_ASSERT(backend->iface.event_wait != NULL);
391
-
392
- backend->iface.event_wait(backend, event);
393
- }
394
-
395
- // backend registry
396
-
397
- #define GGML_REG_MAX_BACKENDS 16
398
-
399
- struct ggml_backend_reg {
400
- char name[128];
401
- ggml_backend_init_fn init_fn;
402
- ggml_backend_buffer_type_t default_buffer_type;
403
- void * user_data;
404
- };
405
-
406
- static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
407
- static size_t ggml_backend_registry_count = 0;
408
-
409
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
410
-
411
- GGML_CALL static void ggml_backend_registry_init(void) {
412
- static bool initialized = false;
413
-
414
- if (initialized) {
415
- return;
416
- }
417
-
418
- initialized = true;
419
-
420
- ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
421
-
422
- // add forward decls here to avoid including the backend headers
423
- #ifdef GGML_USE_CUDA
424
- extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
425
- ggml_backend_cuda_reg_devices();
426
- #endif
427
-
428
- #ifdef GGML_USE_SYCL
429
- extern void ggml_backend_sycl_reg_devices(void);
430
- ggml_backend_sycl_reg_devices();
431
- #endif
432
-
433
- #ifdef GGML_USE_METAL
434
- extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
435
- extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
436
- ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
437
- #endif
438
-
439
- #ifdef GGML_USE_VULKAN
440
- extern GGML_CALL int ggml_backend_vk_reg_devices(void);
441
- ggml_backend_vk_reg_devices();
442
- #endif
443
-
444
- #ifdef GGML_USE_KOMPUTE
445
- extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
446
- ggml_backend_kompute_reg_devices();
447
- #endif
448
- }
449
-
450
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
451
- GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
452
-
453
- size_t id = ggml_backend_registry_count;
454
-
455
- ggml_backend_registry[id] = (struct ggml_backend_reg) {
456
- /* .name = */ {0},
457
- /* .fn = */ init_fn,
458
- /* .default_buffer_type = */ default_buffer_type,
459
- /* .user_data = */ user_data,
460
- };
461
-
462
- snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
463
-
464
- #ifndef NDEBUG
465
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
466
- #endif
467
-
468
- ggml_backend_registry_count++;
469
- }
470
-
471
- size_t ggml_backend_reg_get_count(void) {
472
- ggml_backend_registry_init();
473
-
474
- return ggml_backend_registry_count;
475
- }
476
-
477
- size_t ggml_backend_reg_find_by_name(const char * name) {
478
- ggml_backend_registry_init();
479
-
480
- for (size_t i = 0; i < ggml_backend_registry_count; i++) {
481
- // TODO: case insensitive in a portable way
482
- if (strcmp(ggml_backend_registry[i].name, name) == 0) {
483
- return i;
484
- }
485
- }
486
-
487
- // not found
488
- return SIZE_MAX;
489
- }
490
-
491
- // init from backend:params string
492
- ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
493
- ggml_backend_registry_init();
494
-
495
- const char * params = strchr(backend_str, ':');
496
- char backend_name[128];
497
- if (params == NULL) {
498
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
499
- params = "";
500
- } else {
501
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
502
- params++;
503
- }
504
-
505
- size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
506
-
507
- if (backend_i == SIZE_MAX) {
508
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
509
- return NULL;
510
- }
511
-
512
- return ggml_backend_reg_init_backend(backend_i, params);
513
- }
514
-
515
- const char * ggml_backend_reg_get_name(size_t i) {
516
- ggml_backend_registry_init();
517
-
518
- GGML_ASSERT(i < ggml_backend_registry_count);
519
- return ggml_backend_registry[i].name;
520
- }
521
-
522
- ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
523
- ggml_backend_registry_init();
524
-
525
- GGML_ASSERT(i < ggml_backend_registry_count);
526
- return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
527
- }
528
-
529
- ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
530
- ggml_backend_registry_init();
531
-
532
- GGML_ASSERT(i < ggml_backend_registry_count);
533
- return ggml_backend_registry[i].default_buffer_type;
534
- }
535
-
536
- ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
537
- ggml_backend_registry_init();
538
-
539
- GGML_ASSERT(i < ggml_backend_registry_count);
540
- return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
541
- }
542
-
543
- // backend CPU
544
-
545
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
546
-
547
- GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
548
- return "CPU";
549
-
550
- GGML_UNUSED(buffer);
551
- }
552
-
553
- GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
554
- uintptr_t data = (uintptr_t)buffer->context;
555
-
556
- // align the buffer
557
- if (data % TENSOR_ALIGNMENT != 0) {
558
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
559
- }
560
-
561
- return (void *)data;
562
- }
563
-
564
- GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
565
- free(buffer->context);
566
- }
567
-
568
- GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
569
- memcpy((char *)tensor->data + offset, data, size);
570
-
571
- GGML_UNUSED(buffer);
572
- }
573
-
574
- GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
575
- memcpy(data, (const char *)tensor->data + offset, size);
576
-
577
- GGML_UNUSED(buffer);
578
- }
579
-
580
- GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
581
- if (ggml_backend_buffer_is_host(src->buffer)) {
582
- memcpy(dst->data, src->data, ggml_nbytes(src));
583
- return true;
584
- }
585
- return false;
586
-
587
- GGML_UNUSED(buffer);
588
- }
589
-
590
- GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
591
- memset(buffer->context, value, buffer->size);
592
- }
593
-
594
- static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
595
- /* .get_name = */ ggml_backend_cpu_buffer_name,
596
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
597
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
598
- /* .init_tensor = */ NULL, // no initialization required
599
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
600
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
601
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
602
- /* .clear = */ ggml_backend_cpu_buffer_clear,
603
- /* .reset = */ NULL,
604
- };
605
-
606
- // for buffers from ptr, free is not called
607
- static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
608
- /* .get_name = */ ggml_backend_cpu_buffer_name,
609
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
610
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
611
- /* .init_tensor = */ NULL, // no initialization required
612
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
613
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
614
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
615
- /* .clear = */ ggml_backend_cpu_buffer_clear,
616
- /* .reset = */ NULL,
617
- };
618
-
619
- GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
620
- return "CPU";
621
-
622
- GGML_UNUSED(buft);
623
- }
624
-
625
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
626
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
627
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
628
- if (data == NULL) {
629
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
630
- return NULL;
631
- }
632
-
633
- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
634
- }
635
-
636
- GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
637
- return TENSOR_ALIGNMENT;
638
-
639
- GGML_UNUSED(buft);
640
- }
641
-
642
- GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
643
- return ggml_backend_is_cpu(backend);
644
-
645
- GGML_UNUSED(buft);
646
- }
647
-
648
- GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
649
- return true;
650
-
651
- GGML_UNUSED(buft);
652
- }
653
-
654
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
655
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
656
- /* .iface = */ {
657
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
658
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
659
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
660
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
661
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
662
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
663
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
664
- },
665
- /* .context = */ NULL,
666
- };
667
-
668
- return &ggml_backend_cpu_buffer_type;
669
- }
670
-
671
- #ifdef GGML_USE_CPU_HBM
672
-
673
- // buffer type HBM
674
-
675
- #include <hbwmalloc.h>
676
-
677
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
678
- return "CPU_HBM";
679
-
680
- GGML_UNUSED(buft);
681
- }
682
-
683
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
684
- return "CPU_HBM";
685
-
686
- GGML_UNUSED(buf);
687
- }
688
-
689
- GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
690
- hbw_free(buffer->context);
691
- }
692
-
693
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
694
- //void * ptr = hbw_malloc(size);
695
- void * ptr;
696
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
697
- if (result != 0) {
698
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
699
- return NULL;
700
- }
701
-
702
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
703
- buffer->buft = buft;
704
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
705
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
706
-
707
- return buffer;
708
- }
709
-
710
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
711
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
712
- /* .iface = */ {
713
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
714
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
715
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
716
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
717
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
718
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
719
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
720
- },
721
- /* .context = */ NULL,
722
- };
723
-
724
- return &ggml_backend_cpu_buffer_type_hbm;
725
- }
726
- #endif
727
-
728
- struct ggml_backend_cpu_context {
729
- int n_threads;
730
- void * work_data;
731
- size_t work_size;
732
-
733
- ggml_abort_callback abort_callback;
734
- void * abort_callback_data;
735
- };
736
-
737
- GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
738
- return "CPU";
739
-
740
- GGML_UNUSED(backend);
741
- }
742
-
743
- GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
744
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
745
- free(cpu_ctx->work_data);
746
- free(cpu_ctx);
747
- free(backend);
748
- }
749
-
750
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
751
- return ggml_backend_cpu_buffer_type();
752
-
753
- GGML_UNUSED(backend);
754
- }
755
-
756
- struct ggml_backend_plan_cpu {
757
- struct ggml_cplan cplan;
758
- struct ggml_cgraph cgraph;
759
- };
760
-
761
- GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
762
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
763
-
764
- struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
765
-
766
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
767
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
768
-
769
- if (cpu_plan->cplan.work_size > 0) {
770
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
771
- if (cpu_plan->cplan.work_data == NULL) {
772
- free(cpu_plan);
773
- return NULL;
774
- }
775
- }
776
-
777
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
778
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
779
-
780
- return cpu_plan;
781
- }
782
-
783
- GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
784
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
785
-
786
- free(cpu_plan->cplan.work_data);
787
- free(cpu_plan);
788
-
789
- GGML_UNUSED(backend);
790
- }
791
-
792
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
793
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
794
-
795
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
796
-
797
- GGML_UNUSED(backend);
798
- }
799
-
800
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
801
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
802
-
803
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
804
-
805
- if (cpu_ctx->work_size < cplan.work_size) {
806
- free(cpu_ctx->work_data);
807
- cpu_ctx->work_data = malloc(cplan.work_size);
808
- if (cpu_ctx->work_data == NULL) {
809
- cpu_ctx->work_size = 0;
810
- return GGML_STATUS_ALLOC_FAILED;
811
- }
812
- cpu_ctx->work_size = cplan.work_size;
813
- }
814
- cplan.work_data = cpu_ctx->work_data;
815
-
816
- cplan.abort_callback = cpu_ctx->abort_callback;
817
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
818
-
819
- return ggml_graph_compute(cgraph, &cplan);
820
- }
821
-
822
- GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
823
- switch (op->op) {
824
- case GGML_OP_CPY:
825
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
826
- case GGML_OP_MUL_MAT:
827
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
828
- default:
829
- return true;
830
- }
831
-
832
- GGML_UNUSED(backend);
833
- }
834
-
835
- static struct ggml_backend_i cpu_backend_i = {
836
- /* .get_name = */ ggml_backend_cpu_name,
837
- /* .free = */ ggml_backend_cpu_free,
838
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
839
- /* .set_tensor_async = */ NULL,
840
- /* .get_tensor_async = */ NULL,
841
- /* .cpy_tensor_async = */ NULL,
842
- /* .synchronize = */ NULL,
843
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
844
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
845
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
846
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
847
- /* .supports_op = */ ggml_backend_cpu_supports_op,
848
- /* .offload_op = */ NULL,
849
- /* .event_new = */ NULL,
850
- /* .event_free = */ NULL,
851
- /* .event_record = */ NULL,
852
- /* .event_wait = */ NULL,
853
- /* .event_synchronize = */ NULL,
854
- };
855
-
856
- static ggml_guid_t ggml_backend_cpu_guid(void) {
857
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
858
- return &guid;
859
- }
860
-
861
- ggml_backend_t ggml_backend_cpu_init(void) {
862
- struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
863
- if (ctx == NULL) {
864
- return NULL;
865
- }
866
-
867
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
868
- ctx->work_data = NULL;
869
- ctx->work_size = 0;
870
- ctx->abort_callback = NULL;
871
- ctx->abort_callback_data = NULL;
872
-
873
- ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
874
- if (cpu_backend == NULL) {
875
- free(ctx);
876
- return NULL;
877
- }
878
-
879
- *cpu_backend = (struct ggml_backend) {
880
- /* .guid = */ ggml_backend_cpu_guid(),
881
- /* .interface = */ cpu_backend_i,
882
- /* .context = */ ctx
883
- };
884
- return cpu_backend;
885
- }
886
-
887
- GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
888
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
889
- }
890
-
891
- void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
892
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
893
-
894
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
895
- ctx->n_threads = n_threads;
896
- }
897
-
898
- void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
899
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
900
-
901
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
902
- ctx->abort_callback = abort_callback;
903
- ctx->abort_callback_data = abort_callback_data;
904
- }
905
-
906
- GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
907
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
908
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
909
- }
910
-
911
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
912
- return ggml_backend_cpu_init();
913
-
914
- GGML_UNUSED(params);
915
- GGML_UNUSED(user_data);
916
- }
917
-
918
- // multi-buffer buffer
919
-
920
- struct ggml_backend_multi_buffer_context {
921
- ggml_backend_buffer_t * buffers;
922
- size_t n_buffers;
923
- };
924
-
925
- typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
926
-
927
- GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
928
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
929
-
930
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
931
- }
932
-
933
- GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
934
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
935
- for (size_t i = 0; i < ctx->n_buffers; i++) {
936
- ggml_backend_buffer_free(ctx->buffers[i]);
937
- }
938
-
939
- free(ctx->buffers);
940
- free(ctx);
941
- }
942
-
943
- GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
944
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
945
- for (size_t i = 0; i < ctx->n_buffers; i++) {
946
- ggml_backend_buffer_clear(ctx->buffers[i], value);
947
- }
948
- }
949
-
950
- static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
951
- static struct ggml_backend_buffer_i multi_backend_buffer_i = {
952
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
953
- /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
954
- /* .get_base = */ NULL,
955
- /* .init_tensor = */ NULL,
956
- /* .set_tensor = */ NULL,
957
- /* .get_tensor = */ NULL,
958
- /* .cpy_tensor = */ NULL,
959
- /* .clear = */ ggml_backend_multi_buffer_clear,
960
- /* .reset = */ NULL,
961
- };
962
-
963
- return multi_backend_buffer_i;
964
- }
965
-
966
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
967
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
968
- ctx->n_buffers = n_buffers;
969
- ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
970
-
971
- GGML_ASSERT(ctx->buffers != NULL);
972
-
973
- size_t total_size = 0;
974
- for (size_t i = 0; i < n_buffers; i++) {
975
- ctx->buffers[i] = buffers[i];
976
- total_size += ggml_backend_buffer_get_size(buffers[i]);
977
- }
978
-
979
- return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
980
- }
981
-
982
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
983
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
984
- }
985
-
986
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
987
- GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
988
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
989
- for (size_t i = 0; i < ctx->n_buffers; i++) {
990
- ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
991
- }
992
- }
993
-
994
- // creates a copy of the tensor with the same memory layout
995
- static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
996
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
997
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
998
- dup->nb[i] = tensor->nb[i];
999
- }
1000
- return dup;
1001
- }
1002
-
1003
- static bool ggml_is_view_op(enum ggml_op op) {
1004
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
1005
- }
1006
-
1007
- // scheduler
1008
-
1009
- #ifndef GGML_SCHED_MAX_BACKENDS
1010
- #define GGML_SCHED_MAX_BACKENDS 16
1011
- #endif
1012
-
1013
- #ifndef GGML_SCHED_MAX_SPLITS
1014
- #define GGML_SCHED_MAX_SPLITS 2048
1015
- #endif
1016
-
1017
- #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1018
- #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1019
- #endif
1020
-
1021
- #ifndef GGML_SCHED_MAX_COPIES
1022
- #define GGML_SCHED_MAX_COPIES 4
1023
- #endif
1024
-
1025
- struct ggml_backend_sched_split {
1026
- int backend_id;
1027
- int i_start;
1028
- int i_end;
1029
- struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1030
- int n_inputs;
1031
- // graph view of this split
1032
- struct ggml_cgraph graph;
1033
- };
1034
-
1035
- struct ggml_backend_sched {
1036
- bool is_reset; // true if the scheduler has been reset since the last graph split
1037
- bool is_alloc;
1038
-
1039
- int n_backends;
1040
-
1041
- ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
1042
- ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1043
- ggml_gallocr_t galloc;
1044
-
1045
- // hash keys of the nodes in the graph
1046
- struct ggml_hash_set hash_set;
1047
- // hash values
1048
- int * tensor_backend_id;
1049
- struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1050
-
1051
- int * node_backend_ids; // [graph_size]
1052
- int * leaf_backend_ids; // [graph_size]
1053
-
1054
- // copy of the graph with modified inputs
1055
- struct ggml_cgraph * graph;
1056
-
1057
- // graph splits
1058
- struct ggml_backend_sched_split * splits;
1059
- int n_splits;
1060
- int splits_capacity;
1061
-
1062
- // pipeline parallelism support
1063
- int n_copies;
1064
- int cur_copy;
1065
- ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1066
- struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1067
- int n_graph_inputs;
1068
-
1069
- struct ggml_context * ctx;
1070
-
1071
- ggml_backend_sched_eval_callback callback_eval;
1072
- void * callback_eval_user_data;
1073
-
1074
- // align context_buffer to GGML_MEM_ALIGN
1075
- #ifdef _MSC_VER
1076
- __declspec(align(GGML_MEM_ALIGN))
1077
- #else
1078
- __attribute__((aligned(GGML_MEM_ALIGN)))
1079
- #endif
1080
- char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
1081
- };
1082
-
1083
- #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
1084
- #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
1085
-
1086
- // returns the priority of the backend, lower id is higher priority
1087
- static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
1088
- for (int i = 0; i < sched->n_backends; i++) {
1089
- if (sched->backends[i] == backend) {
1090
- return i;
1091
- }
1092
- }
1093
- return -1;
1094
- }
1095
-
1096
- static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
1097
- ggml_backend_buffer_t buffer = tensor->buffer;
1098
- if (buffer == NULL) {
1099
- return -1;
1100
- }
1101
-
1102
- // find highest prio backend that supports the buffer type
1103
- for (int i = 0; i < sched->n_backends; i++) {
1104
- if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
1105
- return i;
1106
- }
1107
- }
1108
-
1109
- fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
1110
- __func__, ggml_backend_buffer_name(buffer), tensor->name);
1111
- GGML_ASSERT(false);
1112
-
1113
- return -1;
1114
- }
1115
-
1116
- #if 0
1117
- static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1118
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1119
- #define GET_CAUSE(node) causes[hash_id(node)]
1120
- #else
1121
- #define SET_CAUSE(node, ...)
1122
- #define GET_CAUSE(node) ""
1123
- #endif
1124
-
1125
- // returns the backend that should be used for the node based on the current locations
1126
- static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1127
- // TODO: use supports_op to check if the backend supports the op
1128
-
1129
- // assign pre-allocated nodes to their backend
1130
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
1131
- if (cur_backend_id != -1) {
1132
- SET_CAUSE(tensor, "1.dst");
1133
- return cur_backend_id;
1134
- }
1135
-
1136
- // view_src
1137
- if (tensor->view_src != NULL) {
1138
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
1139
- if (cur_backend_id != -1) {
1140
- SET_CAUSE(tensor, "1.vsrc");
1141
- return cur_backend_id;
1142
- }
1143
- }
1144
-
1145
- // graph input
1146
- if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1147
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1148
- SET_CAUSE(tensor, "1.inp");
1149
- return cur_backend_id;
1150
- }
1151
-
1152
- // assign nodes that use weights to the backend of the weights
1153
- // operations with weights are preferably run on the same backend as the weights
1154
- for (int i = 0; i < GGML_MAX_SRC; i++) {
1155
- const struct ggml_tensor * src = tensor->src[i];
1156
- if (src == NULL) {
1157
- continue;
1158
- }
1159
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1160
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
1161
- // check if a backend with higher prio wants to offload the op
1162
- if (src_backend_id == sched->n_backends - 1) {
1163
- for (int b = 0; b < src_backend_id; b++) {
1164
- if (ggml_backend_offload_op(sched->backends[b], tensor)) {
1165
- SET_CAUSE(tensor, "1.off");
1166
- return b;
1167
- }
1168
- }
1169
- }
1170
- SET_CAUSE(tensor, "1.wgt%d", i);
1171
- return src_backend_id;
1172
- }
1173
- }
1174
-
1175
- return -1;
1176
- }
1177
-
1178
- static char * fmt_size(size_t size) {
1179
- static char buffer[128];
1180
- if (size >= 1024*1024) {
1181
- sprintf(buffer, "%zuM", size/1024/1024);
1182
- } else {
1183
- sprintf(buffer, "%zuK", size/1024);
1184
- }
1185
- return buffer;
1186
- }
1187
-
1188
- static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1189
- int cur_split = 0;
1190
- for (int i = 0; i < graph->n_nodes; i++) {
1191
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1192
- ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1193
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1194
- sched->splits[cur_split].n_inputs);
1195
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1196
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1197
- fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
1198
- }
1199
- fprintf(stderr, "\n");
1200
- cur_split++;
1201
- }
1202
- struct ggml_tensor * node = graph->nodes[i];
1203
- if (ggml_is_view_op(node->op)) {
1204
- continue;
1205
- }
1206
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1207
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1208
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1209
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1210
- struct ggml_tensor * src = node->src[j];
1211
- if (src == NULL) {
1212
- continue;
1213
- }
1214
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1215
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1216
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1217
- }
1218
- fprintf(stderr, "\n");
1219
- }
1220
- }
1221
-
1222
- //#define DEBUG_PASS1
1223
- //#define DEBUG_PASS2
1224
- //#define DEBUG_PASS3
1225
- //#define DEBUG_PASS4
1226
-
1227
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1228
- static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1229
- // reset splits
1230
- sched->n_splits = 0;
1231
- sched->n_graph_inputs = 0;
1232
- sched->is_reset = false;
1233
-
1234
- struct ggml_init_params params = {
1235
- /* .mem_size = */ sizeof(sched->context_buffer),
1236
- /* .mem_buffer = */ sched->context_buffer,
1237
- /* .no_alloc = */ true
1238
- };
1239
-
1240
- ggml_free(sched->ctx);
1241
-
1242
- sched->ctx = ggml_init(params);
1243
- if (sched->ctx == NULL) {
1244
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
1245
- GGML_ASSERT(false);
1246
- }
1247
-
1248
- // pass 1: assign backends to ops with pre-allocated inputs
1249
- for (int i = 0; i < graph->n_leafs; i++) {
1250
- struct ggml_tensor * leaf = graph->leafs[i];
1251
- int * leaf_backend_id = &tensor_backend_id(leaf);
1252
- if (*leaf_backend_id != -1) {
1253
- // do not overwrite user assignments
1254
- continue;
1255
- }
1256
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1257
- }
1258
-
1259
- for (int i = 0; i < graph->n_nodes; i++) {
1260
- struct ggml_tensor * node = graph->nodes[i];
1261
- int * node_backend_id = &tensor_backend_id(node);
1262
- if (*node_backend_id != -1) {
1263
- // do not overwrite user assignments
1264
- continue;
1265
- }
1266
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1267
- // src
1268
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1269
- struct ggml_tensor * src = node->src[j];
1270
- if (src == NULL) {
1271
- continue;
1272
- }
1273
- int * src_backend_id = &tensor_backend_id(src);
1274
- if (*src_backend_id == -1) {
1275
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
1276
- }
1277
- }
1278
- }
1279
- #ifdef DEBUG_PASS1
1280
- fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1281
- #endif
1282
-
1283
- // pass 2: expand current backend assignments
1284
- // assign the same backend to adjacent nodes
1285
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1286
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1287
-
1288
-
1289
- // pass 2.2 expand gpu down
1290
- {
1291
- int cur_backend_id = -1;
1292
- for (int i = 0; i < graph->n_nodes; i++) {
1293
- struct ggml_tensor * node = graph->nodes[i];
1294
- if (ggml_is_view_op(node->op)) {
1295
- continue;
1296
- }
1297
- int * node_backend_id = &tensor_backend_id(node);
1298
- if (*node_backend_id != -1) {
1299
- if (*node_backend_id == sched->n_backends - 1) {
1300
- // skip cpu (lowest prio backend)
1301
- cur_backend_id = -1;
1302
- } else {
1303
- cur_backend_id = *node_backend_id;
1304
- }
1305
- } else {
1306
- *node_backend_id = cur_backend_id;
1307
- SET_CAUSE(node, "2.2");
1308
- }
1309
- }
1310
- }
1311
- // pass 2.1 expand gpu up
1312
- {
1313
- int cur_backend_id = -1;
1314
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1315
- struct ggml_tensor * node = graph->nodes[i];
1316
- if (ggml_is_view_op(node->op)) {
1317
- continue;
1318
- }
1319
- int * node_backend_id = &tensor_backend_id(node);
1320
- if (*node_backend_id != -1) {
1321
- if (*node_backend_id == sched->n_backends - 1) {
1322
- // skip cpu (lowest prio backend)
1323
- cur_backend_id = -1;
1324
- } else {
1325
- cur_backend_id = *node_backend_id;
1326
- }
1327
- } else {
1328
- *node_backend_id = cur_backend_id;
1329
- SET_CAUSE(node, "2.1");
1330
- }
1331
- }
1332
- }
1333
- // pass 2.4 expand rest down
1334
- {
1335
- int cur_backend_id = -1;
1336
- for (int i = 0; i < graph->n_nodes; i++) {
1337
- struct ggml_tensor * node = graph->nodes[i];
1338
- if (ggml_is_view_op(node->op)) {
1339
- continue;
1340
- }
1341
- int * node_backend_id = &tensor_backend_id(node);
1342
- if (*node_backend_id != -1) {
1343
- cur_backend_id = *node_backend_id;
1344
- } else {
1345
- *node_backend_id = cur_backend_id;
1346
- SET_CAUSE(node, "2.4");
1347
- }
1348
- }
1349
- }
1350
- // pass 2.3 expand rest up
1351
- {
1352
- int cur_backend_id = -1;
1353
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1354
- struct ggml_tensor * node = graph->nodes[i];
1355
- if (ggml_is_view_op(node->op)) {
1356
- continue;
1357
- }
1358
- int * node_backend_id = &tensor_backend_id(node);
1359
- if (*node_backend_id != -1) {
1360
- cur_backend_id = *node_backend_id;
1361
- } else {
1362
- *node_backend_id = cur_backend_id;
1363
- SET_CAUSE(node, "2.3");
1364
- }
1365
- }
1366
- }
1367
-
1368
- #ifdef DEBUG_PASS2
1369
- fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1370
- #endif
1371
-
1372
- // pass 3: assign backends to remaining src from dst and view_src
1373
- for (int i = 0; i < graph->n_nodes; i++) {
1374
- struct ggml_tensor * node = graph->nodes[i];
1375
- int * cur_backend_id = &tensor_backend_id(node);
1376
- if (node->view_src != NULL && *cur_backend_id == -1) {
1377
- *cur_backend_id = tensor_backend_id(node->view_src);
1378
- SET_CAUSE(node, "3.vsrc");
1379
- }
1380
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1381
- struct ggml_tensor * src = node->src[j];
1382
- if (src == NULL) {
1383
- continue;
1384
- }
1385
- int * src_backend_id = &tensor_backend_id(src);
1386
- if (*src_backend_id == -1) {
1387
- if (src->view_src != NULL) {
1388
- // views are always on the same backend as the source
1389
- *src_backend_id = tensor_backend_id(src->view_src);
1390
- SET_CAUSE(src, "3.vsrc");
1391
- } else {
1392
- *src_backend_id = *cur_backend_id;
1393
- SET_CAUSE(src, "3.cur");
1394
- }
1395
- }
1396
- }
1397
- }
1398
- #ifdef DEBUG_PASS3
1399
- fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1400
- #endif
1401
-
1402
- // pass 4: split graph, find tensors that need to be copied
1403
- {
1404
- int i_split = 0;
1405
- struct ggml_backend_sched_split * split = &sched->splits[0];
1406
- // find the backend of the first split, skipping view ops
1407
- for (int i = 0; i < graph->n_nodes; i++) {
1408
- struct ggml_tensor * node = graph->nodes[i];
1409
- if (!ggml_is_view_op(node->op)) {
1410
- split->backend_id = tensor_backend_id(node);
1411
- break;
1412
- }
1413
- }
1414
- split->i_start = 0;
1415
- split->n_inputs = 0;
1416
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
1417
- int cur_backend_id = split->backend_id;
1418
- for (int i = 0; i < graph->n_nodes; i++) {
1419
- struct ggml_tensor * node = graph->nodes[i];
1420
-
1421
- if (ggml_is_view_op(node->op)) {
1422
- continue;
1423
- }
1424
-
1425
- const int node_backend_id = tensor_backend_id(node);
1426
-
1427
- GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
1428
-
1429
- // check if we should start a new split based on the sources of the current node
1430
- bool need_new_split = false;
1431
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1432
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1433
- struct ggml_tensor * src = node->src[j];
1434
- if (src == NULL) {
1435
- continue;
1436
- }
1437
- // check if a weight is on a different backend
1438
- // by starting a new split, the memory of the previously offloaded weights can be reused
1439
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1440
- int src_backend_id = tensor_backend_id(src);
1441
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
1442
- need_new_split = true;
1443
- break;
1444
- }
1445
- }
1446
- // check if the split has too many inputs
1447
- if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1448
- const size_t id = hash_id(src);
1449
- int src_backend_id = sched->tensor_backend_id[id];
1450
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
1451
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1452
- need_new_split = true;
1453
- break;
1454
- }
1455
- }
1456
- }
1457
- }
1458
-
1459
- if (node_backend_id != cur_backend_id || need_new_split) {
1460
- split->i_end = i;
1461
- i_split++;
1462
- if (i_split >= sched->splits_capacity) {
1463
- sched->splits_capacity *= 2;
1464
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1465
- GGML_ASSERT(sched->splits != NULL);
1466
- }
1467
- GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
1468
- split = &sched->splits[i_split];
1469
- split->backend_id = node_backend_id;
1470
- split->i_start = i;
1471
- split->n_inputs = 0;
1472
- cur_backend_id = node_backend_id;
1473
- }
1474
-
1475
- // find inputs that are not on the same backend
1476
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1477
- struct ggml_tensor * src = node->src[j];
1478
- if (src == NULL) {
1479
- continue;
1480
- }
1481
-
1482
- const int src_backend_id = tensor_backend_id(src);
1483
- assert(src_backend_id != -1); // all inputs should be assigned by now
1484
-
1485
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1486
- size_t id = hash_id(src);
1487
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1488
- ggml_backend_t backend = sched->backends[src_backend_id];
1489
- for (int c = 0; c < sched->n_copies; c++) {
1490
- struct ggml_tensor * tensor_copy;
1491
- if (c == sched->cur_copy) {
1492
- tensor_copy = src; // use the original tensor as the current copy
1493
- } else {
1494
- tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1495
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1496
- }
1497
- if (sched->n_copies > 1) {
1498
- ggml_set_input(tensor_copy);
1499
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1500
- }
1501
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
1502
- SET_CAUSE(tensor_copy, "4.cpy");
1503
- }
1504
- int n_graph_inputs = sched->n_graph_inputs++;
1505
- GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1506
- sched->graph_inputs[n_graph_inputs] = src;
1507
- }
1508
- }
1509
-
1510
- if (src_backend_id != node_backend_id) {
1511
- // create a copy of the input in the split's backend
1512
- const size_t id = hash_id(src);
1513
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1514
- ggml_backend_t backend = sched->backends[cur_backend_id];
1515
- for (int c = 0; c < sched->n_copies; c++) {
1516
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1517
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1518
- if (sched->n_copies > 1) {
1519
- ggml_set_input(tensor_copy);
1520
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1521
- }
1522
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
1523
- SET_CAUSE(tensor_copy, "4.cpy");
1524
- }
1525
- int n_inputs = split->n_inputs++;
1526
- GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1527
- split->inputs[n_inputs] = src;
1528
- }
1529
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1530
- }
1531
- }
1532
- }
1533
- split->i_end = graph->n_nodes;
1534
- sched->n_splits = i_split + 1;
1535
- }
1536
- #ifdef DEBUG_PASS4
1537
- fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1538
- #endif
1539
-
1540
- // create copies of the graph for each split
1541
- // TODO: avoid this copy
1542
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
1543
- for (int i = 0; i < sched->n_splits; i++) {
1544
- struct ggml_backend_sched_split * split = &sched->splits[i];
1545
- split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1546
-
1547
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1548
- for (int j = 0; j < split->n_inputs; j++) {
1549
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1550
-
1551
- struct ggml_tensor * input = split->inputs[j];
1552
- const size_t input_id = hash_id(input);
1553
- struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
1554
-
1555
- // add a dependency to the input source so that it is not freed before the copy is done
1556
- struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1557
- input_dep->src[0] = input;
1558
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
1559
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1560
-
1561
- // add a dependency to the input copy so that it is allocated at the start of the split
1562
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1563
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1564
- }
1565
-
1566
- for (int j = split->i_start; j < split->i_end; j++) {
1567
- assert(graph_copy->size > graph_copy->n_nodes);
1568
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1569
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1570
- }
1571
- }
1572
-
1573
- if (sched->n_copies > 1) {
1574
- // add input copies as leafs so that they are allocated first
1575
- for (int i = 0; i < sched->n_graph_inputs; i++) {
1576
- struct ggml_tensor * input = sched->graph_inputs[i];
1577
- size_t id = hash_id(input);
1578
- int backend_id = tensor_backend_id(input);
1579
- for (int c = 0; c < sched->n_copies; c++) {
1580
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1581
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1582
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1583
- }
1584
- }
1585
-
1586
- for (int i = 0; i < sched->n_splits; i++) {
1587
- struct ggml_backend_sched_split * split = &sched->splits[i];
1588
- int backend_id = split->backend_id;
1589
- for (int j = 0; j < split->n_inputs; j++) {
1590
- struct ggml_tensor * input = split->inputs[j];
1591
- size_t id = hash_id(input);
1592
- for (int c = 0; c < sched->n_copies; c++) {
1593
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1594
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1595
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1596
- }
1597
- }
1598
- }
1599
- }
1600
-
1601
- // add leafs from the original graph
1602
- for (int i = 0; i < graph->n_leafs; i++) {
1603
- struct ggml_tensor * leaf = graph->leafs[i];
1604
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1605
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1606
- }
1607
-
1608
- sched->graph = graph_copy;
1609
- }
1610
-
1611
- static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1612
- // allocate graph
1613
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1614
- // the re-allocation may cause the split inputs to be moved to a different address
1615
- ggml_backend_sched_synchronize(sched);
1616
- #ifndef NDEBUG
1617
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
1618
- #endif
1619
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1620
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1621
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1622
- return false;
1623
- }
1624
- }
1625
-
1626
- return true;
1627
- }
1628
-
1629
- static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1630
- struct ggml_backend_sched_split * splits = sched->splits;
1631
-
1632
- for (int i = 0; i < sched->n_splits; i++) {
1633
- struct ggml_backend_sched_split * split = &splits[i];
1634
- int split_backend_id = split->backend_id;
1635
- ggml_backend_t split_backend = sched->backends[split_backend_id];
1636
-
1637
- // copy the input tensors to the split backend
1638
- for (int j = 0; j < split->n_inputs; j++) {
1639
- ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1640
- struct ggml_tensor * input = split->inputs[j];
1641
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
1642
-
1643
- if (input->flags & GGML_TENSOR_FLAG_INPUT) {
1644
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1645
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1646
- ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1647
- } else {
1648
- ggml_backend_synchronize(split_backend);
1649
- }
1650
- ggml_backend_tensor_copy(input, input_cpy);
1651
- } else {
1652
- // wait for the split backend to finish using the input before overwriting it
1653
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1654
- ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1655
- } else {
1656
- ggml_backend_synchronize(split_backend);
1657
- }
1658
- ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1659
- }
1660
- }
1661
-
1662
- if (!sched->callback_eval) {
1663
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
1664
- if (ec != GGML_STATUS_SUCCESS) {
1665
- return ec;
1666
- }
1667
- } else {
1668
- // similar to ggml_backend_compare_graph_backend
1669
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1670
- struct ggml_tensor * t = split->graph.nodes[j0];
1671
-
1672
- // check if the user needs data from this node
1673
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1674
-
1675
- int j1 = j0;
1676
-
1677
- // determine the range [j0, j1] of nodes that can be computed together
1678
- while (!need && j1 < split->graph.n_nodes - 1) {
1679
- t = split->graph.nodes[++j1];
1680
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1681
- }
1682
-
1683
- struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1684
-
1685
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
1686
- if (ec != GGML_STATUS_SUCCESS) {
1687
- return ec;
1688
- }
1689
-
1690
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1691
- ggml_backend_synchronize(split_backend);
1692
-
1693
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1694
- break;
1695
- }
1696
-
1697
- j0 = j1;
1698
- }
1699
- }
1700
-
1701
- // record the event of this copy
1702
- if (split->n_inputs > 0) {
1703
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1704
- ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
1705
- }
1706
- }
1707
- }
1708
-
1709
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1710
-
1711
- return GGML_STATUS_SUCCESS;
1712
- }
1713
-
1714
- ggml_backend_sched_t ggml_backend_sched_new(
1715
- ggml_backend_t * backends,
1716
- ggml_backend_buffer_type_t * bufts,
1717
- int n_backends,
1718
- size_t graph_size,
1719
- bool parallel) {
1720
- GGML_ASSERT(n_backends > 0);
1721
- GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1722
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1723
-
1724
- struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1725
-
1726
- // initialize hash table
1727
- sched->hash_set = ggml_hash_set_new(graph_size);
1728
- sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1729
- sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1730
-
1731
- const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1732
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
1733
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
1734
-
1735
- sched->n_backends = n_backends;
1736
-
1737
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1738
-
1739
- const int initial_splits_capacity = 16;
1740
- sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
1741
- sched->splits_capacity = initial_splits_capacity;
1742
-
1743
- for (int b = 0; b < n_backends; b++) {
1744
- sched->backends[b] = backends[b];
1745
- sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
1746
- GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
1747
- if (sched->n_copies > 1) {
1748
- for (int c = 0; c < sched->n_copies; c++) {
1749
- sched->events[b][c] = ggml_backend_event_new(backends[b]);
1750
- }
1751
- }
1752
- }
1753
-
1754
- sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
1755
-
1756
- ggml_backend_sched_reset(sched);
1757
-
1758
- return sched;
1759
- }
1760
-
1761
- void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1762
- if (sched == NULL) {
1763
- return;
1764
- }
1765
- for (int b = 0; b < sched->n_backends; b++) {
1766
- for (int c = 0; c < sched->n_copies; c++) {
1767
- ggml_backend_event_free(sched->events[b][c]);
1768
- }
1769
- }
1770
- ggml_gallocr_free(sched->galloc);
1771
- ggml_free(sched->ctx);
1772
- free(sched->splits);
1773
- free(sched->hash_set.keys);
1774
- free(sched->tensor_backend_id);
1775
- free(sched->tensor_copies);
1776
- free(sched->node_backend_ids);
1777
- free(sched->leaf_backend_ids);
1778
- free(sched);
1779
- }
1780
-
1781
- void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1782
- // reset state for the next run
1783
- size_t hash_size = sched->hash_set.size;
1784
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1785
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1786
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1787
-
1788
- sched->is_reset = true;
1789
- sched->is_alloc = false;
1790
- }
1791
-
1792
- bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1793
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
1794
-
1795
- ggml_backend_sched_split_graph(sched, measure_graph);
1796
-
1797
- // TODO: extract this to a separate function
1798
- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1799
- return false;
1800
- }
1801
-
1802
- ggml_backend_sched_reset(sched);
1803
- ggml_backend_sched_synchronize(sched);
1804
-
1805
- return true;
1806
- }
1807
-
1808
- bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1809
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
1810
-
1811
- ggml_backend_sched_split_graph(sched, graph);
1812
-
1813
- if (!ggml_backend_sched_alloc_splits(sched)) {
1814
- return false;
1815
- }
1816
-
1817
- sched->is_alloc = true;
1818
-
1819
- return true;
1820
- }
1821
-
1822
- enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1823
- enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
1824
- ggml_backend_sched_synchronize(sched);
1825
- return err;
1826
- }
1827
-
1828
- enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1829
- if (!sched->is_reset && !sched->is_alloc) {
1830
- ggml_backend_sched_reset(sched);
1831
- }
1832
-
1833
- if (!sched->is_alloc) {
1834
- if (!ggml_backend_sched_alloc_graph(sched, graph)) {
1835
- return GGML_STATUS_ALLOC_FAILED;
1836
- }
1837
- }
1838
-
1839
- return ggml_backend_sched_compute_splits(sched);
1840
- }
1841
-
1842
- void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1843
- for (int i = 0; i < sched->n_backends; i++) {
1844
- ggml_backend_synchronize(sched->backends[i]);
1845
- }
1846
- }
1847
-
1848
- void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1849
- sched->callback_eval = callback;
1850
- sched->callback_eval_user_data = user_data;
1851
- }
1852
-
1853
- int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1854
- return sched->n_splits;
1855
- }
1856
-
1857
- int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
1858
- return sched->n_copies;
1859
- }
1860
-
1861
- size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
1862
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
1863
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1864
-
1865
- return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1866
- }
1867
-
1868
- void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1869
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
1870
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1871
- tensor_backend_id(node) = backend_index;
1872
- }
1873
-
1874
- ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1875
- int backend_index = tensor_backend_id(node);
1876
- if (backend_index == -1) {
1877
- return NULL;
1878
- }
1879
- return sched->backends[backend_index];
1880
- }
1881
-
1882
- // utils
1883
-
1884
- void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1885
- GGML_ASSERT(tensor->buffer == NULL);
1886
- GGML_ASSERT(tensor->view_src != NULL);
1887
- GGML_ASSERT(tensor->view_src->buffer != NULL);
1888
- GGML_ASSERT(tensor->view_src->data != NULL);
1889
-
1890
- tensor->buffer = buffer;
1891
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1892
- tensor->backend = tensor->view_src->backend;
1893
- ggml_backend_buffer_init_tensor(buffer, tensor);
1894
- }
1895
-
1896
- void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
1897
- GGML_ASSERT(tensor->buffer == NULL);
1898
- GGML_ASSERT(tensor->data == NULL);
1899
- GGML_ASSERT(tensor->view_src == NULL);
1900
- GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
1901
- GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
1902
- (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
1903
-
1904
- tensor->buffer = buffer;
1905
- tensor->data = addr;
1906
- ggml_backend_buffer_init_tensor(buffer, tensor);
1907
- }
1908
-
1909
- static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
1910
- struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
1911
-
1912
- GGML_ASSERT(src != NULL);
1913
- GGML_ASSERT(src->data && "graph must be allocated");
1914
-
1915
- size_t id = ggml_hash_insert(hash_set, src);
1916
- if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
1917
- return node_copies[ggml_hash_find(hash_set, src)];
1918
- }
1919
-
1920
- struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1921
- if (src->view_src != NULL) {
1922
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1923
- dst->view_offs = src->view_offs;
1924
- }
1925
- dst->op = src->op;
1926
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
1927
- ggml_set_name(dst, src->name);
1928
-
1929
- // copy src
1930
- for (int i = 0; i < GGML_MAX_SRC; i++) {
1931
- struct ggml_tensor * s = src->src[i];
1932
- if (s == NULL) {
1933
- continue;
1934
- }
1935
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1936
- }
1937
-
1938
- node_copies[id] = dst;
1939
- return dst;
1940
- }
1941
-
1942
- static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
1943
- size_t id = ggml_hash_find(hash_set, src);
1944
- if (node_init[id]) {
1945
- return;
1946
- }
1947
- node_init[id] = true;
1948
-
1949
- struct ggml_tensor * dst = node_copies[id];
1950
- if (dst->view_src != NULL) {
1951
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1952
- ggml_backend_view_init(dst->view_src->buffer, dst);
1953
- }
1954
- else {
1955
- ggml_backend_tensor_copy(src, dst);
1956
- }
1957
-
1958
- // init src
1959
- for (int i = 0; i < GGML_MAX_SRC; i++) {
1960
- struct ggml_tensor * s = src->src[i];
1961
- if (s == NULL) {
1962
- continue;
1963
- }
1964
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1965
- }
1966
- }
1967
-
1968
- struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1969
- struct ggml_hash_set hash_set = {
1970
- /* .size = */ graph->visited_hash_table.size,
1971
- /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
1972
- };
1973
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
1974
- bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
1975
-
1976
- struct ggml_init_params params = {
1977
- /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
1978
- /* .mem_buffer = */ NULL,
1979
- /* .no_alloc = */ true
1980
- };
1981
-
1982
- struct ggml_context * ctx_allocated = ggml_init(params);
1983
- struct ggml_context * ctx_unallocated = ggml_init(params);
1984
-
1985
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1986
- fprintf(stderr, "failed to allocate context for graph copy\n");
1987
- free(hash_set.keys);
1988
- free(node_copies);
1989
- free(node_init);
1990
- ggml_free(ctx_allocated);
1991
- ggml_free(ctx_unallocated);
1992
- return (struct ggml_backend_graph_copy) {
1993
- /* .buffer = */ NULL,
1994
- /* .ctx_allocated = */ NULL,
1995
- /* .ctx_unallocated = */ NULL,
1996
- /* .graph = */ NULL,
1997
- };
1998
- }
1999
-
2000
- // dup nodes
2001
- for (int i = 0; i < graph->n_nodes; i++) {
2002
- struct ggml_tensor * node = graph->nodes[i];
2003
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2004
- }
2005
-
2006
- // allocate nodes
2007
- ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2008
- if (buffer == NULL) {
2009
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2010
- free(hash_set.keys);
2011
- free(node_copies);
2012
- free(node_init);
2013
- ggml_free(ctx_allocated);
2014
- ggml_free(ctx_unallocated);
2015
- return (struct ggml_backend_graph_copy) {
2016
- /* .buffer = */ NULL,
2017
- /* .ctx_allocated = */ NULL,
2018
- /* .ctx_unallocated = */ NULL,
2019
- /* .graph = */ NULL,
2020
- };
2021
- }
2022
-
2023
- //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2024
-
2025
- // copy data and init views
2026
- for (int i = 0; i < graph->n_nodes; i++) {
2027
- struct ggml_tensor * node = graph->nodes[i];
2028
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
2029
- }
2030
-
2031
- // build graph copy
2032
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
2033
- for (int i = 0; i < graph->n_nodes; i++) {
2034
- struct ggml_tensor * node = graph->nodes[i];
2035
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
2036
- graph_copy->nodes[i] = node_copy;
2037
- }
2038
- graph_copy->n_nodes = graph->n_nodes;
2039
-
2040
- free(hash_set.keys);
2041
- free(node_copies);
2042
- free(node_init);
2043
-
2044
- return (struct ggml_backend_graph_copy) {
2045
- /* .buffer = */ buffer,
2046
- /* .ctx_allocated = */ ctx_allocated,
2047
- /* .ctx_unallocated = */ ctx_unallocated,
2048
- /* .graph = */ graph_copy,
2049
- };
2050
- }
2051
-
2052
- void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
2053
- ggml_backend_buffer_free(copy.buffer);
2054
- ggml_free(copy.ctx_allocated);
2055
- ggml_free(copy.ctx_unallocated);
2056
- }
2057
-
2058
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
2059
- struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
2060
- if (copy.buffer == NULL) {
2061
- return false;
2062
- }
2063
-
2064
- struct ggml_cgraph * g1 = graph;
2065
- struct ggml_cgraph * g2 = copy.graph;
2066
-
2067
- assert(g1->n_nodes == g2->n_nodes);
2068
-
2069
- for (int i = 0; i < g1->n_nodes; i++) {
2070
- //printf("eval %d/%d\n", i, g1->n_nodes);
2071
- struct ggml_tensor * t1 = g1->nodes[i];
2072
- struct ggml_tensor * t2 = g2->nodes[i];
2073
-
2074
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
2075
-
2076
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
2077
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
2078
-
2079
- ggml_backend_graph_compute(backend1, &g1v);
2080
- ggml_backend_graph_compute(backend2, &g2v);
2081
-
2082
- if (ggml_is_view_op(t1->op)) {
2083
- continue;
2084
- }
2085
-
2086
- // compare results, calculate rms etc
2087
- if (!callback(i, t1, t2, user_data)) {
2088
- break;
2089
- }
2090
- }
2091
-
2092
- ggml_backend_graph_copy_free(copy);
2093
-
2094
- return true;
2095
- }
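For orientation, the scheduler implementation removed above is normally driven through a handful of calls (ggml_backend_sched_new, _reserve, _graph_compute, _free). The sketch below shows that call pattern only, using functions defined in this file; run_with_sched is a hypothetical wrapper, and the backend array and graph are assumed to be built elsewhere.

```c
/* Hypothetical driver for the scheduler implemented above; backend creation
 * and graph construction are assumed to happen elsewhere. */
#include "ggml.h"
#include "ggml-backend.h"

static enum ggml_status run_with_sched(ggml_backend_t * backends, int n_backends,
                                       struct ggml_cgraph * graph) {
    // the last backend in the array must be the CPU backend (asserted in ggml_backend_sched_new)
    ggml_backend_sched_t sched =
        ggml_backend_sched_new(backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE, false);

    // reserve compute buffers from a worst-case graph, then split and compute it
    ggml_backend_sched_reserve(sched, graph);
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
    return status;
}
```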
bindings/ruby/ext/ggml-backend.h DELETED
@@ -1,233 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-alloc.h"
5
-
6
- #ifdef __cplusplus
7
- extern "C" {
8
- #endif
9
-
10
- typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
11
- typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
12
- typedef struct ggml_backend_event * ggml_backend_event_t;
13
- typedef struct ggml_backend * ggml_backend_t;
14
- typedef void * ggml_backend_graph_plan_t;
15
-
16
- //
17
- // Backend buffer
18
- //
19
-
20
- // buffer type
21
- GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
22
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
23
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
24
- GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
25
- GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
26
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
27
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
28
-
29
- // buffer
30
- enum ggml_backend_buffer_usage {
31
- GGML_BACKEND_BUFFER_USAGE_ANY = 0,
32
- GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
33
- };
34
-
35
- GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
36
- GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
37
- GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
38
- GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
39
- GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
40
- GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
41
- GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
42
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
43
- GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
44
- GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
45
- GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
46
- GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
47
- GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
48
-
49
- //
50
- // Backend
51
- //
52
-
53
- GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
54
- GGML_API const char * ggml_backend_name(ggml_backend_t backend);
55
- GGML_API void ggml_backend_free(ggml_backend_t backend);
56
-
57
- GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
58
- GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
59
- GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
60
- GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
61
-
62
- GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
63
- GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
64
-
65
- GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
66
- GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
67
-
68
- GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
69
-
70
- GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
71
- GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
72
-
73
- GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
74
- GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
75
- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
76
- GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
77
- GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
78
-
79
- // tensor copy between different backends
80
- GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
81
-
82
- // asynchronous copy
83
- // the copy is performed after all the currently queued operations in backend_src
84
- // backend_dst will wait for the copy to complete before performing other operations
85
- // automatic fallback to sync copy if async is not supported
86
- GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
87
-
88
- // events
89
- GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
90
- GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
91
- GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
92
- GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
93
- GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
94
-
95
- //
96
- // CPU backend
97
- //
98
-
99
- GGML_API ggml_backend_t ggml_backend_cpu_init(void);
100
-
101
- GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
102
- GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
103
- GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
104
-
105
- // Create a backend buffer from an existing pointer
106
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
107
-
108
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
109
-
110
- #ifdef GGML_USE_CPU_HBM
111
- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
112
- #endif
113
-
114
- //
115
- // Backend registry
116
- //
117
-
118
- // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
119
-
120
- GGML_API size_t ggml_backend_reg_get_count(void);
121
- GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
122
- GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
123
- GGML_API const char * ggml_backend_reg_get_name(size_t i);
124
- GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
125
- GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
126
- GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
127
-
128
- //
129
- // Backend scheduler
130
- //
131
-
132
- // The backend scheduler allows for multiple backends to be used together
133
- // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
134
- // The backends are selected based on:
135
- // - the backend that supports the operation
136
- // - the location of the pre-allocated tensors (e.g. the weights)
137
- /*
138
- Example usage:
139
-
140
- // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
141
- // preferably to run on the same backend as the buffer
142
- ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
143
-
144
- sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
145
-
146
- // initialize buffers from a max size graph (optional)
147
- reserve_graph = build_graph(sched, max_batch_size);
148
-
149
- // manually assign nodes to a backend (optional, should not be needed in most cases)
150
- struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
151
- ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
152
-
153
- ggml_backend_sched_reserve(sched, reserve_graph);
154
-
155
- // compute
156
- graph = build_graph(sched);
157
- ggml_backend_sched_graph_compute(sched, graph);
158
-
159
- // if there are graph inputs:
160
- ggml_backend_sched_reset(sched);
161
- ggml_backend_sched_alloc_graph(sched, graph);
162
- ggml_backend_tensor_set(input_tensor, ...);
163
- ggml_backend_sched_graph_compute(sched, graph);
164
- }
165
- */
166
-
167
- struct ggml_backend_sched;
168
- typedef struct ggml_backend_sched * ggml_backend_sched_t;
169
-
170
- // when ask == true, the scheduler wants to know if the user wants to observe this node
171
- // this allows the scheduler to batch nodes together in order to evaluate them in a single call
172
- //
173
- // when ask == false, the scheduler is passing the node tensor to the user for observation
174
- // if the user returns false, the scheduler will cancel the graph compute
175
- //
176
- typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
177
-
178
- // Initialize a backend scheduler
179
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
180
- GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
181
-
182
- // Initialize backend buffers from a measure graph
183
- GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
184
-
185
- // Get the number of splits of the last graph
186
- GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
187
- GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
188
-
189
- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
190
-
191
- GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
192
- GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
193
-
194
- // Allocate and compute graph on the backend scheduler
195
- GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
196
- GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
197
- GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
198
- GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
199
-
200
- // Reset all assignments and allocators - must be called before changing the node backends
201
- GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
202
-
203
- // Set a callback to be called for each resulting node during graph compute
204
- GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
205
-
206
- //
207
- // Utils
208
- //
209
-
210
- struct ggml_backend_graph_copy {
211
- ggml_backend_buffer_t buffer;
212
- struct ggml_context * ctx_allocated;
213
- struct ggml_context * ctx_unallocated;
214
- struct ggml_cgraph * graph;
215
- };
216
-
217
- // Copy a graph to a different backend
218
- GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
219
- GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
220
-
221
- typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
222
-
223
- // Compare the output of two backends
224
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
225
-
226
- // Tensor initialization
227
- GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
228
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
229
-
230
-
231
- #ifdef __cplusplus
232
- }
233
- #endif
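As a reference point, here is a minimal sketch of the plain CPU-backend path declared in this header: initialize the CPU backend, allocate a context's tensors on it, upload data, and compute a small graph. The shapes and values are invented for illustration, and the core calls (ggml_init, ggml_add, ggml_new_graph, ggml_backend_alloc_ctx_tensors) come from ggml.h / ggml-alloc.h rather than from this header.

```c
/* Minimal sketch of the CPU-backend API declared above; shapes and data are illustrative. */
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(cpu, 4);

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor data is allocated by the backend below
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // place all context tensors in a buffer owned by the CPU backend
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, cpu);

    const float a_data[4] = { 1,  2,  3,  4};
    const float b_data[4] = {10, 20, 30, 40};
    ggml_backend_tensor_set(a, a_data, 0, sizeof(a_data));
    ggml_backend_tensor_set(b, b_data, 0, sizeof(b_data));

    ggml_backend_graph_compute(cpu, gf);

    float out[4];
    ggml_backend_tensor_get(c, out, 0, sizeof(out));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(cpu);
    return 0;
}
```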
bindings/ruby/ext/ggml-common.h DELETED
The diff for this file is too large to render.
 
bindings/ruby/ext/ggml-cuda.h DELETED
@@ -1,43 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #ifdef GGML_USE_HIPBLAS
7
- #define GGML_CUDA_NAME "ROCm"
8
- #define GGML_CUBLAS_NAME "hipBLAS"
9
- #else
10
- #define GGML_CUDA_NAME "CUDA"
11
- #define GGML_CUBLAS_NAME "cuBLAS"
12
- #endif
13
-
14
- #ifdef __cplusplus
15
- extern "C" {
16
- #endif
17
-
18
- #define GGML_CUDA_MAX_DEVICES 16
19
-
20
- // backend API
21
- GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
22
-
23
- GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
24
-
25
- // device buffer
26
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
27
-
28
- // split tensor buffer that splits matrices by rows across multiple devices
29
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
30
-
31
- // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
32
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
33
-
34
- GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
35
- GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
36
- GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
37
-
38
- GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
39
- GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
40
-
41
- #ifdef __cplusplus
42
- }
43
- #endif
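A short sketch of device enumeration with the CUDA API declared above; it is only meaningful in a CUDA-enabled build, and init_first_cuda_device is a hypothetical helper, not part of this diff.

```c
/* Hypothetical helper using only the declarations above (CUDA-enabled builds only). */
#include <stdio.h>
#include "ggml-cuda.h"

static ggml_backend_t init_first_cuda_device(void) {
    const int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; i++) {
        char   desc[128];
        size_t free_mem, total_mem;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s (%zu/%zu bytes free)\n", i, desc, free_mem, total_mem);
    }
    // initialize a backend on device 0 if any device is present
    return n_devices > 0 ? ggml_backend_cuda_init(0) : NULL;
}
```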
bindings/ruby/ext/ggml-impl.h DELETED
@@ -1,272 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
-
5
- // GGML internal header
6
-
7
- #include <assert.h>
8
- #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
9
- #include <stddef.h>
10
- #include <stdbool.h>
11
- #include <string.h> // memcpy
12
- #include <math.h> // fabsf
13
-
14
- #ifdef __cplusplus
15
- extern "C" {
16
- #endif
17
-
18
- // static_assert should be a #define, but if it's not,
19
- // fall back to the _Static_assert C11 keyword.
20
- // if C99 - static_assert is noop
21
- // ref: https://stackoverflow.com/a/53923785/4039976
22
- #ifndef __cplusplus
23
- #ifndef static_assert
24
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
25
- #define static_assert(cond, msg) _Static_assert(cond, msg)
26
- #else
27
- #define static_assert(cond, msg) struct global_scope_noop_trick
28
- #endif
29
- #endif
30
- #endif
31
-
32
- // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
33
- #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
34
- #ifndef __FMA__
35
- #define __FMA__
36
- #endif
37
- #ifndef __F16C__
38
- #define __F16C__
39
- #endif
40
- #endif
41
-
42
- // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
43
- #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
44
- #ifndef __SSE3__
45
- #define __SSE3__
46
- #endif
47
- #ifndef __SSSE3__
48
- #define __SSSE3__
49
- #endif
50
- #endif
51
-
52
- // 16-bit float
53
- // on Arm, we use __fp16
54
- // on x86, we use uint16_t
55
- #if defined(__ARM_NEON) && !defined(_MSC_VER)
56
-
57
- // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
58
- //
59
- // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
60
- //
61
- #include <arm_neon.h>
62
-
63
- typedef __fp16 ggml_fp16_internal_t;
64
-
65
- #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
66
- #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
67
-
68
- #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
69
-
70
- static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
71
- ggml_fp16_internal_t tmp;
72
- memcpy(&tmp, &h, sizeof(ggml_fp16_t));
73
- return (float)tmp;
74
- }
75
-
76
- static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
77
- ggml_fp16_t res;
78
- ggml_fp16_internal_t tmp = f;
79
- memcpy(&res, &tmp, sizeof(ggml_fp16_t));
80
- return res;
81
- }
82
-
83
- #else
84
-
85
- typedef uint16_t ggml_fp16_internal_t;
86
-
87
- #ifdef __wasm_simd128__
88
- #include <wasm_simd128.h>
89
- #else
90
- #ifdef __POWER9_VECTOR__
91
- #include <altivec.h>
92
- #undef bool
93
- #define bool _Bool
94
- #else
95
- #if defined(_MSC_VER) || defined(__MINGW32__)
96
- #include <intrin.h>
97
- #else
98
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
99
- #if !defined(__riscv)
100
- #include <immintrin.h>
101
- #endif
102
- #endif
103
- #endif
104
- #endif
105
- #endif
106
-
107
- #ifdef __riscv_v_intrinsic
108
- #include <riscv_vector.h>
109
- #endif
110
-
111
- #ifdef __F16C__
112
-
113
- #ifdef _MSC_VER
114
- #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
115
- #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
116
- #else
117
- #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
118
- #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
119
- #endif
120
-
121
- #elif defined(__POWER9_VECTOR__)
122
-
123
- #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
124
- #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
125
- /* the inline asm below is about 12% faster than the lookup method */
126
- #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
127
- #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
128
-
129
- static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
130
- register float f;
131
- register double d;
132
- __asm__(
133
- "mtfprd %0,%2\n"
134
- "xscvhpdp %0,%0\n"
135
- "frsp %1,%0\n" :
136
- /* temp */ "=d"(d),
137
- /* out */ "=f"(f):
138
- /* in */ "r"(h));
139
- return f;
140
- }
141
-
142
- static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
143
- register double d;
144
- register ggml_fp16_t r;
145
- __asm__( /* xscvdphp can work on double or single precision */
146
- "xscvdphp %0,%2\n"
147
- "mffprd %1,%0\n" :
148
- /* temp */ "=d"(d),
149
- /* out */ "=r"(r):
150
- /* in */ "f"(f));
151
- return r;
152
- }
153
-
154
- #else
155
-
156
- // FP16 <-> FP32
157
- // ref: https://github.com/Maratyszcza/FP16
158
-
159
- static inline float fp32_from_bits(uint32_t w) {
160
- union {
161
- uint32_t as_bits;
162
- float as_value;
163
- } fp32;
164
- fp32.as_bits = w;
165
- return fp32.as_value;
166
- }
167
-
168
- static inline uint32_t fp32_to_bits(float f) {
169
- union {
170
- float as_value;
171
- uint32_t as_bits;
172
- } fp32;
173
- fp32.as_value = f;
174
- return fp32.as_bits;
175
- }
176
-
177
- static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
178
- const uint32_t w = (uint32_t) h << 16;
179
- const uint32_t sign = w & UINT32_C(0x80000000);
180
- const uint32_t two_w = w + w;
181
-
182
- const uint32_t exp_offset = UINT32_C(0xE0) << 23;
183
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
184
- const float exp_scale = 0x1.0p-112f;
185
- #else
186
- const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
187
- #endif
188
- const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
189
-
190
- const uint32_t magic_mask = UINT32_C(126) << 23;
191
- const float magic_bias = 0.5f;
192
- const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
193
-
194
- const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
195
- const uint32_t result = sign |
196
- (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
197
- return fp32_from_bits(result);
198
- }
199
-
200
- static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
201
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
202
- const float scale_to_inf = 0x1.0p+112f;
203
- const float scale_to_zero = 0x1.0p-110f;
204
- #else
205
- const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
206
- const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
207
- #endif
208
- float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
209
-
210
- const uint32_t w = fp32_to_bits(f);
211
- const uint32_t shl1_w = w + w;
212
- const uint32_t sign = w & UINT32_C(0x80000000);
213
- uint32_t bias = shl1_w & UINT32_C(0xFF000000);
214
- if (bias < UINT32_C(0x71000000)) {
215
- bias = UINT32_C(0x71000000);
216
- }
217
-
218
- base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
219
- const uint32_t bits = fp32_to_bits(base);
220
- const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
221
- const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
222
- const uint32_t nonsign = exp_bits + mantissa_bits;
223
- return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
224
- }
225
-
226
- #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
227
- #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
228
-
229
- #endif // __F16C__
230
-
231
- #endif // __ARM_NEON
232
-
233
- // precomputed f32 table for f16 (256 KB)
234
- // defined in ggml.c, initialized in ggml_init()
235
- extern float ggml_table_f32_f16[1 << 16];
236
-
237
- // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
238
- // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
239
- // This is also true for POWER9.
240
- #if !defined(GGML_FP16_TO_FP32)
241
- inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
242
- uint16_t s;
243
- memcpy(&s, &f, sizeof(uint16_t));
244
- return ggml_table_f32_f16[s];
245
- }
246
-
247
- #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
248
- #endif
249
-
250
- #if !defined(GGML_FP32_TO_FP16)
251
- #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
252
- #endif
253
-
254
- #define GGML_HASHTABLE_FULL ((size_t)-1)
255
- #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
256
-
257
- struct ggml_hash_set ggml_hash_set_new(size_t size);
258
-
259
- bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
260
-
261
- // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
262
- size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
263
-
264
- // returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
265
- size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
266
-
267
- // return index, asserts if table is full
268
- size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
269
-
270
- #ifdef __cplusplus
271
- }
272
- #endif
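
The header deleted above documents how ggml moves between FP16 and FP32: native `__fp16` on ARM NEON, F16C/POWER9 intrinsics where available, and otherwise a bit-twiddling path plus the `ggml_table_f32_f16` lookup table. For orientation only, here is a plain-Ruby reference decoder for IEEE half-precision values. It is a readable restatement of the format the macros above operate on, not the ggml fast path.

```ruby
# Minimal sketch: decode a 16-bit IEEE half-precision value to a Float.
# This mirrors the format handled by GGML_FP16_TO_FP32 above, but not its
# implementation (ggml uses intrinsics or a precomputed table instead).
def fp16_to_f32(h)
  sign = (h >> 15) & 0x1
  exp  = (h >> 10) & 0x1f
  mant = h & 0x3ff

  value =
    if exp == 0x1f                      # Inf / NaN
      mant.zero? ? Float::INFINITY : Float::NAN
    elsif exp.zero?                     # subnormal: mant * 2^-24
      mant * 2.0**-24
    else                                # normal: (1 + mant/1024) * 2^(exp - 15)
      (1 + mant / 1024.0) * 2.0**(exp - 15)
    end

  sign == 1 ? -value : value
end

p fp16_to_f32(0x3C00)  # => 1.0
p fp16_to_f32(0xC000)  # => -2.0
```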
 
bindings/ruby/ext/ggml-kompute.h DELETED
@@ -1,46 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #include <stdbool.h>
7
- #include <stddef.h>
8
- #include <stdint.h>
9
-
10
- #ifdef __cplusplus
11
- extern "C" {
12
- #endif
13
-
14
- struct ggml_vk_device {
15
- int index;
16
- int type; // same as VkPhysicalDeviceType
17
- size_t heapSize;
18
- const char * name;
19
- const char * vendor;
20
- int subgroupSize;
21
- uint64_t bufferAlignment;
22
- uint64_t maxAlloc;
23
- };
24
-
25
- struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
26
- bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
27
- bool ggml_vk_has_vulkan(void);
28
- bool ggml_vk_has_device(void);
29
- struct ggml_vk_device ggml_vk_current_device(void);
30
-
31
- //
32
- // backend API
33
- //
34
-
35
- // forward declaration
36
- typedef struct ggml_backend * ggml_backend_t;
37
-
38
- GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
39
-
40
- GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
41
-
42
- GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
43
-
44
- #ifdef __cplusplus
45
- }
46
- #endif
 
bindings/ruby/ext/ggml-metal.h DELETED
@@ -1,66 +0,0 @@
1
- // An interface allowing to compute ggml_cgraph with Metal
2
- //
3
- // This is a fully functional interface that extends ggml with GPU support for Apple devices.
4
- // A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
5
- //
6
- // How it works?
7
- //
8
- // As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
9
- // interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
10
- // use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
11
- //
12
- // You only need to make sure that all memory buffers that you used during the graph creation
13
- // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
14
- // used during the graph evaluation to determine the arguments of the compute kernels.
15
- //
16
- // Synchronization between device and host memory (for example for input and output tensors)
17
- // is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
18
- //
19
-
20
- #pragma once
21
-
22
- #include "ggml.h"
23
- #include "ggml-backend.h"
24
-
25
- #include <stddef.h>
26
- #include <stdbool.h>
27
-
28
- // max memory buffers that can be mapped to the device
29
- #define GGML_METAL_MAX_BUFFERS 64
30
-
31
- struct ggml_tensor;
32
- struct ggml_cgraph;
33
-
34
- #ifdef __cplusplus
35
- extern "C" {
36
- #endif
37
-
38
- //
39
- // backend API
40
- // user-code should use only these functions
41
- //
42
-
43
- GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
44
-
45
- GGML_API ggml_backend_t ggml_backend_metal_init(void);
46
-
47
- GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
48
-
49
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
50
-
51
- GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
52
-
53
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
54
-
55
- // helper to check if the device supports a specific family
56
- // ideally, the user code should be doing these checks
57
- // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
58
- GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
59
-
60
- // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
61
- GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
62
-
63
- #ifdef __cplusplus
64
- }
65
- #endif
66
-
 
bindings/ruby/ext/ggml-opencl.h DELETED
@@ -1,36 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #ifdef __cplusplus
7
- extern "C" {
8
- #endif
9
-
10
- GGML_API void ggml_cl_init(void);
11
-
12
- GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
13
- GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
14
- GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
15
- GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
16
- GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
17
-
18
- // GGML_API void * ggml_cl_host_malloc(size_t size);
19
- // GGML_API void ggml_cl_host_free(void * ptr);
20
-
21
- GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
22
-
23
- GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
24
-
25
- // backend API
26
-
27
- // GGML_API ggml_backend_t ggml_backend_opencl_init(void);
28
-
29
- // GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
30
-
31
- GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
32
- // GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
33
-
34
- #ifdef __cplusplus
35
- }
36
- #endif
 
bindings/ruby/ext/ggml-quants.c DELETED
The diff for this file is too large to render. See raw diff
 
bindings/ruby/ext/ggml-quants.h DELETED
@@ -1,133 +0,0 @@
1
- #pragma once
2
-
3
- #define GGML_COMMON_DECL_C
4
- #include "ggml-common.h"
5
-
6
- #include "ggml.h"
7
-
8
- // GGML internal header
9
-
10
- #ifdef __cplusplus
11
- extern "C" {
12
- #endif
13
-
14
- // Quantization
15
- void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
16
- void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
17
- void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
18
- void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
19
- void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
20
- void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
21
-
22
- void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
23
- void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
24
- void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
25
- void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
26
- void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
27
- void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
28
-
29
- void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
30
- void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
31
- void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
32
- void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
33
- void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
34
-
35
- void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
36
- void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
37
- void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
38
- void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
39
- void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
40
- void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
41
-
42
- void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
43
- void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
44
- void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
45
- void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
46
- void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
47
- void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
48
-
49
- void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
50
- void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
51
- void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
52
- void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
53
- void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
54
-
55
- // Dequantization
56
- void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
57
- void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
58
- void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
59
- void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
60
- void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
61
- //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
62
-
63
- void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
64
- void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
65
- void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
66
- void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
67
- void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
68
- void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
69
-
70
- void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
71
- void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
72
- void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
73
- void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
74
- void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
75
- void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
76
- void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
77
- void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
78
- void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
79
-
80
- // Dot product
81
- void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
82
- void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
83
- void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
84
- void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
85
- void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
86
-
87
- void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
88
- void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
89
- void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
90
- void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
91
- void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
92
-
93
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
94
- void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
95
- void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
96
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
97
- void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
98
- void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
99
- void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
100
- void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
101
- void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
102
-
103
- // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
104
- size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
105
- size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
106
- size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
107
- size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
108
- size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
109
- size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
110
- size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
111
- size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
112
- size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
113
-
114
- size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
115
- size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
116
- size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
117
- size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
118
- size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
119
- size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
120
- size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
121
- size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
122
- size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
123
- size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
124
-
125
- void iq2xs_init_impl(enum ggml_type type);
126
- void iq2xs_free_impl(enum ggml_type type);
127
- void iq3xs_init_impl(int grid_size);
128
- void iq3xs_free_impl(int grid_size);
129
-
130
- #ifdef __cplusplus
131
- }
132
- #endif
133
-
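
The quantization header removed above declares the block codecs (`quantize_row_*`, `dequantize_row_*`, `ggml_vec_dot_*`). As a rough illustration of what one of those codecs does, below is a plain-Ruby sketch of Q4_0 dequantization, assuming the usual Q4_0 block layout: one FP16 scale `d` followed by 16 bytes holding 32 packed 4-bit quants, each decoding to `(q - 8) * d`. It illustrates the format; it is not a drop-in replacement for `dequantize_row_q4_0`.

```ruby
QK4_0 = 32  # values per Q4_0 block (defined in ggml-common.h, not shown here)

# Sketch: expand one Q4_0 block back to floats.
#   d  - the block scale, already decoded from FP16 to a Float
#   qs - an Array of QK4_0 / 2 bytes, each packing two 4-bit quants
def dequantize_q4_0_block(d, qs)
  out = Array.new(QK4_0)
  qs.each_with_index do |byte, j|
    out[j]             = ((byte & 0x0F) - 8) * d  # low nibble  -> first half
    out[j + QK4_0 / 2] = ((byte >> 4)   - 8) * d  # high nibble -> second half
  end
  out
end

# A block whose bytes are all 0x88 decodes to 32 zeros (both nibbles are 8).
p dequantize_q4_0_block(0.1, [0x88] * 16).all?(&:zero?)  # => true
```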
 
bindings/ruby/ext/ggml-sycl.h DELETED
@@ -1,49 +0,0 @@
1
- //
2
- // MIT license
3
- // Copyright (C) 2024 Intel Corporation
4
- // SPDX-License-Identifier: MIT
5
- //
6
-
7
- #pragma once
8
-
9
- #include "ggml.h"
10
- #include "ggml-backend.h"
11
-
12
- #ifdef __cplusplus
13
- extern "C" {
14
- #endif
15
-
16
- #define GGML_SYCL_MAX_DEVICES 48
17
- #define GGML_SYCL_NAME "SYCL"
18
-
19
- // backend API
20
- GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
21
-
22
- // devide buffer
23
- GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
24
-
25
- // split tensor buffer that splits matrices by rows across multiple devices
26
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
27
-
28
- // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
29
- GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
30
-
31
- GGML_API void ggml_backend_sycl_print_sycl_devices(void);
32
- GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
33
- GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
34
- GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
35
- GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
36
- GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
37
-
38
- // TODO: these are temporary
39
- // ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
40
- GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
41
- GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
42
- GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
43
-
44
- // SYCL doesn't support registering host memory, keep here for reference
45
- // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
46
- // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
47
- #ifdef __cplusplus
48
- }
49
- #endif
 
bindings/ruby/ext/ggml-vulkan.h DELETED
@@ -1,29 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #ifdef __cplusplus
7
- extern "C" {
8
- #endif
9
-
10
- #define GGML_VK_NAME "Vulkan"
11
- #define GGML_VK_MAX_DEVICES 16
12
-
13
- GGML_API void ggml_vk_instance_init(void);
14
-
15
- // backend API
16
- GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
17
-
18
- GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
19
- GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
20
- GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
21
- GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
22
-
23
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
24
- // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
25
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
26
-
27
- #ifdef __cplusplus
28
- }
29
- #endif
 
bindings/ruby/extsources.yaml ADDED
@@ -0,0 +1,32 @@
 
1
+ ---
2
+ ../../src:
3
+ - ext/whisper.cpp
4
+ ../../include:
5
+ - ext/whisper.h
6
+ ../../ggml/src:
7
+ - ext/ggml.c
8
+ - ext/ggml-impl.h
9
+ - ext/ggml-aarch64.h
10
+ - ext/ggml-aarch64.c
11
+ - ext/ggml-alloc.c
12
+ - ext/ggml-backend-impl.h
13
+ - ext/ggml-backend.cpp
14
+ - ext/ggml-common.h
15
+ - ext/ggml-quants.h
16
+ - ext/ggml-quants.c
17
+ - ext/ggml-cpu-impl.h
18
+ ../../ggml/include:
19
+ - ext/ggml.h
20
+ - ext/ggml-alloc.h
21
+ - ext/ggml-backend.h
22
+ - ext/ggml-cuda.h
23
+ - ext/ggml-kompute.h
24
+ - ext/ggml-metal.h
25
+ - ext/ggml-sycl.h
26
+ - ext/ggml-vulkan.h
27
+ ../../examples:
28
+ - ext/dr_wav.h
29
+ ../..:
30
+ - README.md
31
+ - LICENSE
32
+
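
extsources.yaml maps upstream directories (relative to bindings/ruby) to the paths those sources should occupy inside the gem. The Rakefile logic that consumes it is not part of this hunk, but a minimal sketch of the copy step it implies could look like the following; the FileUtils-based approach is an assumption for illustration, not the committed implementation.

```ruby
require "yaml"
require "fileutils"

# Hypothetical sketch: mirror each upstream source listed in extsources.yaml
# into the gem tree (run from bindings/ruby). The committed Rakefile may do
# this differently, e.g. via file tasks that also act as build dependencies.
YAML.load_file("extsources.yaml").each do |src_dir, dest_paths|
  dest_paths.each do |dest|
    src = File.join(src_dir, File.basename(dest))
    FileUtils.mkdir_p File.dirname(dest)
    FileUtils.cp src, dest
  end
end
```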
bindings/ruby/tests/test_whisper.rb CHANGED
@@ -1,11 +1,10 @@
1
  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
2
- EXTDIR = File.join(TOPDIR, 'ext')
3
- #$LIBDIR = File.join(TOPDIR, 'lib')
4
- #$:.unshift(LIBDIR)
5
- $:.unshift(EXTDIR)
6
 
7
  require 'whisper'
8
  require 'test/unit'
 
 
 
9
 
10
  class TestWhisper < Test::Unit::TestCase
11
  def setup
@@ -128,4 +127,25 @@ class TestWhisper < Test::Unit::TestCase
128
  }
129
  end
130
 
131
  end
 
1
  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
 
 
 
 
2
 
3
  require 'whisper'
4
  require 'test/unit'
5
+ require 'tempfile'
6
+ require 'tmpdir'
7
+ require 'shellwords'
8
 
9
  class TestWhisper < Test::Unit::TestCase
10
  def setup
 
127
  }
128
  end
129
 
130
+ def test_build
131
+ Tempfile.create do |file|
132
+ assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
133
+ assert_path_exist file.to_path
134
+ end
135
+ end
136
+
137
+ sub_test_case "Building binary on installation" do
138
+ def setup
139
+ system "rake", "build", exception: true
140
+ end
141
+
142
+ def test_install
143
+ filename = `rake -Tbuild`.match(/(whispercpp-(?:.+)\.gem)/)[1]
144
+ basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
145
+ Dir.mktmpdir do |dir|
146
+ system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
147
+ assert_path_exist File.join(dir, "gems/whispercpp-1.3.0/lib", basename)
148
+ end
149
+ end
150
+ end
151
  end
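
The new `test_install` case derives the platform-specific shared-library name from RbConfig, so the assertion works wherever the extension builds. A quick way to see what it expects on the current machine (the values mentioned in the comment are typical, not guaranteed):

```ruby
require "rbconfig"

# DLEXT is the suffix mkmf gives compiled extensions: commonly "so" on Linux
# and "bundle" on macOS, so the test looks for whisper.so / whisper.bundle.
puts "whisper.#{RbConfig::CONFIG["DLEXT"]}"
```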
bindings/ruby/whispercpp.gemspec CHANGED
@@ -1,3 +1,5 @@
 
 
1
  Gem::Specification.new do |s|
2
  s.name = "whispercpp"
3
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@@ -7,10 +9,8 @@ Gem::Specification.new do |s|
7
  s.email = '[email protected]'
8
  s.extra_rdoc_files = ['LICENSE', 'README.md']
9
 
10
- s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
11
 
12
- #### Load-time details
13
- s.require_paths = ['lib','ext']
14
  s.summary = %q{Ruby whisper.cpp bindings}
15
  s.test_files = ["tests/test_whisper.rb"]
16
 
 
1
+ require "yaml"
2
+
3
  Gem::Specification.new do |s|
4
  s.name = "whispercpp"
5
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
 
9
  s.email = '[email protected]'
10
  s.extra_rdoc_files = ['LICENSE', 'README.md']
11
 
12
+ s.files = `git ls-files . -z`.split("\x0") + YAML.load_file("extsources.yaml").values.flatten
13
 
 
 
14
  s.summary = %q{Ruby whisper.cpp bindings}
15
  s.test_files = ["tests/test_whisper.rb"]
16
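
The rewritten `s.files` line assembles the package contents from two sources: whatever git tracks under bindings/ruby, plus every destination path listed in extsources.yaml. A small inspection snippet along these lines (run from bindings/ruby; it is not part of the gemspec) can be used to sanity-check the resulting list before packaging:

```ruby
require "yaml"

# Inspect what the gemspec's s.files expression evaluates to: git-tracked
# files plus the copied-in sources declared in extsources.yaml.
tracked = `git ls-files . -z`.split("\x0")
copied  = YAML.load_file("extsources.yaml").values.flatten
puts(tracked + copied)
```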