aixsatoshi Shinnosuke Takagi committed on
Commit
c8284f2
·
1 Parent(s): 8fca6dd

ggml-rpc: chunk send()/recv() to avoid EINVAL for very large tensors over RPC (macOS & others) (llama/15188)

Browse files

* ggml-rpc: chunk send()/recv() to avoid EINVAL for very large tensors over RPC (macOS & others). Fixes #15055

* ggml-rpc: rename RPC_IO_CHUNK->MAX_CHUNK_SIZE, use std::min() for cap, switch to GGML_LOG_ERROR, handle 0-length send/recv

* rpc: drop n==0 special case in send_data(); retry in loop per review

* rpc: remove trailing whitespace in send_data()

---------

Co-authored-by: Shinnosuke Takagi <[email protected]>

Files changed (1) hide show
  1. ggml/src/ggml-rpc/ggml-rpc.cpp +18 -5
ggml/src/ggml-rpc/ggml-rpc.cpp CHANGED
@@ -29,9 +29,12 @@
29
  #include <cstring>
30
  #include <fstream>
31
  #include <filesystem>
 
32
 
33
  namespace fs = std::filesystem;
34
 
 
 
35
  #ifdef _WIN32
36
  typedef SOCKET sockfd_t;
37
  using ssize_t = __int64;
@@ -323,11 +326,14 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
323
  static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
324
  size_t bytes_sent = 0;
325
  while (bytes_sent < size) {
326
- ssize_t n = send(sockfd, (const char *)data + bytes_sent, size - bytes_sent, 0);
 
327
  if (n < 0) {
 
 
328
  return false;
329
  }
330
- bytes_sent += n;
331
  }
332
  return true;
333
  }
@@ -335,11 +341,18 @@ static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
335
  static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
336
  size_t bytes_recv = 0;
337
  while (bytes_recv < size) {
338
- ssize_t n = recv(sockfd, (char *)data + bytes_recv, size - bytes_recv, 0);
339
- if (n <= 0) {
 
 
 
 
 
 
 
340
  return false;
341
  }
342
- bytes_recv += n;
343
  }
344
  return true;
345
  }
 
29
  #include <cstring>
30
  #include <fstream>
31
  #include <filesystem>
32
+ #include <algorithm>
33
 
34
  namespace fs = std::filesystem;
35
 
36
+ static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
37
+
38
  #ifdef _WIN32
39
  typedef SOCKET sockfd_t;
40
  using ssize_t = __int64;
 
326
  // Sends exactly `size` bytes starting at `data` over `sockfd`, looping until
  // bytes_sent reaches size. Each individual send() call is limited to at most
  // MAX_CHUNK_SIZE bytes rather than the full remaining length.
  // Returns true when all bytes have been sent, false (after logging) as soon
  // as send() returns a negative value.
  // NOTE(review): any negative return is treated as fatal, including one caused
  // by an interrupted call — confirm that no errno-based retry is wanted here.
  static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
327
  size_t bytes_sent = 0;
328
  while (bytes_sent < size) {
329
  // Cap the request so a very large buffer is transferred in several calls.
+ size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
330
+ ssize_t n = send(sockfd, (const char *)data + bytes_sent, size_to_send, 0);
331
  if (n < 0) {
332
+ GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
333
+ bytes_sent, size_to_send);
334
  return false;
335
  }
336
  // n is non-negative here; a return of 0 adds nothing and the loop simply retries.
+ bytes_sent += (size_t)n;
337
  }
338
  return true;
339
  }
 
341
  // Receives exactly `size` bytes from `sockfd` into `data`, looping until
  // bytes_recv reaches size. Each individual recv() call asks for at most
  // MAX_CHUNK_SIZE bytes rather than the full remaining length.
  // Returns true when the buffer has been completely filled, false (after
  // logging) if recv() returns a negative value or returns 0.
  // NOTE(review): any negative return is treated as fatal, including one caused
  // by an interrupted call — confirm that no errno-based retry is wanted here.
  static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
342
  size_t bytes_recv = 0;
343
  while (bytes_recv < size) {
344
  // Cap the request so a very large buffer is transferred in several calls.
+ size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
345
+ ssize_t n = recv(sockfd, (char *)data + bytes_recv, size_to_recv, 0);
346
  // Negative return: the call itself failed.
+ if (n < 0) {
347
+ GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
348
+ bytes_recv, size_to_recv);
349
+ return false;
350
+ }
351
  // Zero return while bytes are still outstanding: logged separately and
  // treated as failure instead of retrying (unlike send_data's 0 case).
+ if (n == 0) {
352
+ GGML_LOG_ERROR("recv returned 0 (peer closed?)\n");
353
  return false;
354
  }
355
  // n is strictly positive here, so the cast to size_t is value-preserving.
+ bytes_recv += (size_t)n;
356
  }
357
  return true;
358
  }