Spaces:
Sleeping
Sleeping
bench : pass memcpy threads from cli
Browse files
- whisper.cpp +8 -8
whisper.cpp
CHANGED
|
@@ -6138,7 +6138,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6138 |
|
| 6139 |
// multi-thread
|
| 6140 |
|
| 6141 |
-
for (uint32_t
|
| 6142 |
char * src = (char *) malloc(size);
|
| 6143 |
char * dst = (char *) malloc(size);
|
| 6144 |
|
|
@@ -6149,8 +6149,8 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6149 |
double tsum = 0.0;
|
| 6150 |
|
| 6151 |
auto helper = [&](int th) {
|
| 6152 |
-
const int64_t i0 = (th + 0)*size/
|
| 6153 |
-
const int64_t i1 = (th + 1)*size/
|
| 6154 |
|
| 6155 |
for (size_t i = 0; i < n; i++) {
|
| 6156 |
memcpy(dst + i0, src + i0, i1 - i0);
|
|
@@ -6161,14 +6161,14 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6161 |
|
| 6162 |
const int64_t t0 = ggml_time_us();
|
| 6163 |
|
| 6164 |
-
std::vector<std::thread> threads(
|
| 6165 |
-
for (uint32_t th = 0; th <
|
| 6166 |
threads[th] = std::thread(helper, th);
|
| 6167 |
}
|
| 6168 |
|
| 6169 |
-
helper(
|
| 6170 |
|
| 6171 |
-
for (uint32_t th = 0; th <
|
| 6172 |
threads[th].join();
|
| 6173 |
}
|
| 6174 |
|
|
@@ -6176,7 +6176,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|
| 6176 |
|
| 6177 |
tsum += (t1 - t0)*1e-6;
|
| 6178 |
|
| 6179 |
-
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9),
|
| 6180 |
s += strbuf;
|
| 6181 |
|
| 6182 |
// needed to prevent the compiler from optimizing the memcpy away
|
|
|
|
| 6138 |
|
| 6139 |
// multi-thread
|
| 6140 |
|
| 6141 |
+
for (uint32_t k = 1; k <= n_threads; k++) {
|
| 6142 |
char * src = (char *) malloc(size);
|
| 6143 |
char * dst = (char *) malloc(size);
|
| 6144 |
|
|
|
|
| 6149 |
double tsum = 0.0;
|
| 6150 |
|
| 6151 |
auto helper = [&](int th) {
|
| 6152 |
+
const int64_t i0 = (th + 0)*size/k;
|
| 6153 |
+
const int64_t i1 = (th + 1)*size/k;
|
| 6154 |
|
| 6155 |
for (size_t i = 0; i < n; i++) {
|
| 6156 |
memcpy(dst + i0, src + i0, i1 - i0);
|
|
|
|
| 6161 |
|
| 6162 |
const int64_t t0 = ggml_time_us();
|
| 6163 |
|
| 6164 |
+
std::vector<std::thread> threads(k - 1);
|
| 6165 |
+
for (uint32_t th = 0; th < k - 1; ++th) {
|
| 6166 |
threads[th] = std::thread(helper, th);
|
| 6167 |
}
|
| 6168 |
|
| 6169 |
+
helper(k - 1);
|
| 6170 |
|
| 6171 |
+
for (uint32_t th = 0; th < k - 1; ++th) {
|
| 6172 |
threads[th].join();
|
| 6173 |
}
|
| 6174 |
|
|
|
|
| 6176 |
|
| 6177 |
tsum += (t1 - t0)*1e-6;
|
| 6178 |
|
| 6179 |
+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), k);
|
| 6180 |
s += strbuf;
|
| 6181 |
|
| 6182 |
// needed to prevent the compiler from optimizing the memcpy away
|