ggerganov committed on
Commit
027d207
·
unverified ·
1 Parent(s): c0ee6b3

bench : pass memcpy threads from cli

Browse files
Files changed (1) hide show
  1. whisper.cpp +8 -8
whisper.cpp CHANGED
@@ -6138,7 +6138,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6138
 
6139
  // multi-thread
6140
 
6141
- for (uint32_t n_threads = 1; n_threads <= std::thread::hardware_concurrency(); n_threads++) {
6142
  char * src = (char *) malloc(size);
6143
  char * dst = (char *) malloc(size);
6144
 
@@ -6149,8 +6149,8 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6149
  double tsum = 0.0;
6150
 
6151
  auto helper = [&](int th) {
6152
- const int64_t i0 = (th + 0)*size/n_threads;
6153
- const int64_t i1 = (th + 1)*size/n_threads;
6154
 
6155
  for (size_t i = 0; i < n; i++) {
6156
  memcpy(dst + i0, src + i0, i1 - i0);
@@ -6161,14 +6161,14 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6161
 
6162
  const int64_t t0 = ggml_time_us();
6163
 
6164
- std::vector<std::thread> threads(n_threads - 1);
6165
- for (uint32_t th = 0; th < n_threads - 1; ++th) {
6166
  threads[th] = std::thread(helper, th);
6167
  }
6168
 
6169
- helper(n_threads - 1);
6170
 
6171
- for (uint32_t th = 0; th < n_threads - 1; ++th) {
6172
  threads[th].join();
6173
  }
6174
 
@@ -6176,7 +6176,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6176
 
6177
  tsum += (t1 - t0)*1e-6;
6178
 
6179
- snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), n_threads);
6180
  s += strbuf;
6181
 
6182
  // needed to prevent the compiler from optimizing the memcpy away
 
6138
 
6139
  // multi-thread
6140
 
6141
+ for (uint32_t k = 1; k <= n_threads; k++) {
6142
  char * src = (char *) malloc(size);
6143
  char * dst = (char *) malloc(size);
6144
 
 
6149
  double tsum = 0.0;
6150
 
6151
  auto helper = [&](int th) {
6152
+ const int64_t i0 = (th + 0)*size/k;
6153
+ const int64_t i1 = (th + 1)*size/k;
6154
 
6155
  for (size_t i = 0; i < n; i++) {
6156
  memcpy(dst + i0, src + i0, i1 - i0);
 
6161
 
6162
  const int64_t t0 = ggml_time_us();
6163
 
6164
+ std::vector<std::thread> threads(k - 1);
6165
+ for (uint32_t th = 0; th < k - 1; ++th) {
6166
  threads[th] = std::thread(helper, th);
6167
  }
6168
 
6169
+ helper(k - 1);
6170
 
6171
+ for (uint32_t th = 0; th < k - 1; ++th) {
6172
  threads[th].join();
6173
  }
6174
 
 
6176
 
6177
  tsum += (t1 - t0)*1e-6;
6178
 
6179
+ snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), k);
6180
  s += strbuf;
6181
 
6182
  // needed to prevent the compiler from optimizing the memcpy away