rgerganov committed on
Commit
691c071
·
1 Parent(s): cf46d5c

rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (llama/12943)

Browse files

RPC_CMD_SET_TENSOR always returns an empty response, and we send this command 4
times per token. We can improve token-generation (TG) speed if we don't wait for
this empty response.

The performance impact of this change depends on the network latency.

ggml/include/ggml-rpc.h CHANGED
@@ -7,7 +7,7 @@
7
  extern "C" {
8
  #endif
9
 
10
- #define RPC_PROTO_MAJOR_VERSION 1
11
  #define RPC_PROTO_MINOR_VERSION 0
12
  #define RPC_PROTO_PATCH_VERSION 0
13
  #define GGML_RPC_MAX_SERVERS 16
 
7
  extern "C" {
8
  #endif
9
 
10
+ #define RPC_PROTO_MAJOR_VERSION 2
11
  #define RPC_PROTO_MINOR_VERSION 0
12
  #define RPC_PROTO_PATCH_VERSION 0
13
  #define GGML_RPC_MAX_SERVERS 16
ggml/src/ggml-rpc/ggml-rpc.cpp CHANGED
@@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
378
  }
379
 
380
  // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
381
- // RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
382
- static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
383
  uint8_t cmd_byte = cmd;
384
  if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
385
  return false;
@@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
390
  if (!send_data(sock->fd, input, input_size)) {
391
  return false;
392
  }
 
 
 
 
 
 
 
 
 
393
  // TODO: currently the output_size is always known, do we need support for commands with variable output size?
394
  // even if we do, we can skip sending output_size from the server for commands with known output size
395
  uint64_t out_size;
@@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
555
  memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
556
  memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
557
  memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
558
- bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
559
  GGML_ASSERT(status);
560
  }
561
 
@@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
1428
  if (!server.set_tensor(input)) {
1429
  return;
1430
  }
1431
- if (!send_msg(sockfd, nullptr, 0)) {
1432
- return;
1433
- }
1434
  break;
1435
  }
1436
  case RPC_CMD_SET_TENSOR_HASH: {
 
378
  }
379
 
380
  // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
381
+ // No response
382
+ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
383
  uint8_t cmd_byte = cmd;
384
  if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
385
  return false;
 
390
  if (!send_data(sock->fd, input, input_size)) {
391
  return false;
392
  }
393
+ return true;
394
+ }
395
+
396
+ // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
397
+ // RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
398
+ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
399
+ if (!send_rpc_cmd(sock, cmd, input, input_size)) {
400
+ return false;
401
+ }
402
  // TODO: currently the output_size is always known, do we need support for commands with variable output size?
403
  // even if we do, we can skip sending output_size from the server for commands with known output size
404
  uint64_t out_size;
 
564
  memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
565
  memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
566
  memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
567
+ bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
568
  GGML_ASSERT(status);
569
  }
570
 
 
1437
  if (!server.set_tensor(input)) {
1438
  return;
1439
  }
 
 
 
1440
  break;
1441
  }
1442
  case RPC_CMD_SET_TENSOR_HASH: {