Spaces:
Running
Running
rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (llama/12943)
Browse files
RPC_CMD_SET_TENSOR always returns an empty response and we send this 4
times per token. We can improve TG speed if we don't wait for this empty
response.
The performance impact of this change depends on the network latency.
- ggml/include/ggml-rpc.h +1 -1
- ggml/src/ggml-rpc/ggml-rpc.cpp +12 -6
ggml/include/ggml-rpc.h
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
extern "C" {
|
| 8 |
#endif
|
| 9 |
|
| 10 |
-
#define RPC_PROTO_MAJOR_VERSION 1
|
| 11 |
#define RPC_PROTO_MINOR_VERSION 0
|
| 12 |
#define RPC_PROTO_PATCH_VERSION 0
|
| 13 |
#define GGML_RPC_MAX_SERVERS 16
|
|
|
|
| 7 |
extern "C" {
|
| 8 |
#endif
|
| 9 |
|
| 10 |
+
#define RPC_PROTO_MAJOR_VERSION 2
|
| 11 |
#define RPC_PROTO_MINOR_VERSION 0
|
| 12 |
#define RPC_PROTO_PATCH_VERSION 0
|
| 13 |
#define GGML_RPC_MAX_SERVERS 16
|
ggml/src/ggml-rpc/ggml-rpc.cpp
CHANGED
|
@@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
|
|
| 378 |
}
|
| 379 |
|
| 380 |
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
| 381 |
-
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
| 382 |
-
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
| 383 |
uint8_t cmd_byte = cmd;
|
| 384 |
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
| 385 |
return false;
|
|
@@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|
| 390 |
if (!send_data(sock->fd, input, input_size)) {
|
| 391 |
return false;
|
| 392 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
| 394 |
// even if we do, we can skip sending output_size from the server for commands with known output size
|
| 395 |
uint64_t out_size;
|
|
@@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
|
| 555 |
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
| 556 |
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
| 557 |
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
| 558 |
-
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
|
| 559 |
GGML_ASSERT(status);
|
| 560 |
}
|
| 561 |
|
|
@@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|
| 1428 |
if (!server.set_tensor(input)) {
|
| 1429 |
return;
|
| 1430 |
}
|
| 1431 |
-
if (!send_msg(sockfd, nullptr, 0)) {
|
| 1432 |
-
return;
|
| 1433 |
-
}
|
| 1434 |
break;
|
| 1435 |
}
|
| 1436 |
case RPC_CMD_SET_TENSOR_HASH: {
|
|
|
|
| 378 |
}
|
| 379 |
|
| 380 |
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
| 381 |
+
// No response
|
| 382 |
+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
|
| 383 |
uint8_t cmd_byte = cmd;
|
| 384 |
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
| 385 |
return false;
|
|
|
|
| 390 |
if (!send_data(sock->fd, input, input_size)) {
|
| 391 |
return false;
|
| 392 |
}
|
| 393 |
+
return true;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
| 397 |
+
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
| 398 |
+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
| 399 |
+
if (!send_rpc_cmd(sock, cmd, input, input_size)) {
|
| 400 |
+
return false;
|
| 401 |
+
}
|
| 402 |
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
| 403 |
// even if we do, we can skip sending output_size from the server for commands with known output size
|
| 404 |
uint64_t out_size;
|
|
|
|
| 564 |
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
| 565 |
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
| 566 |
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
| 567 |
+
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
| 568 |
GGML_ASSERT(status);
|
| 569 |
}
|
| 570 |
|
|
|
|
| 1437 |
if (!server.set_tensor(input)) {
|
| 1438 |
return;
|
| 1439 |
}
|
|
|
|
|
|
|
|
|
|
| 1440 |
break;
|
| 1441 |
}
|
| 1442 |
case RPC_CMD_SET_TENSOR_HASH: {
|