Spaces:
Running
Running
rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (llama/12943)
Browse files
RPC_CMD_SET_TENSOR always returns an empty response and we send this 4
times per token. We can improve TG speed if we don't wait for this empty
response.
The performance impact of this change depends on the network latency.
- ggml/include/ggml-rpc.h +1 -1
- ggml/src/ggml-rpc/ggml-rpc.cpp +12 -6
ggml/include/ggml-rpc.h
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
extern "C" {
|
| 8 |
#endif
|
| 9 |
|
| 10 |
-
#define RPC_PROTO_MAJOR_VERSION 1
|
| 11 |
#define RPC_PROTO_MINOR_VERSION 0
|
| 12 |
#define RPC_PROTO_PATCH_VERSION 0
|
| 13 |
#define GGML_RPC_MAX_SERVERS 16
|
|
|
|
| 7 |
extern "C" {
|
| 8 |
#endif
|
| 9 |
|
| 10 |
+
#define RPC_PROTO_MAJOR_VERSION 2
|
| 11 |
#define RPC_PROTO_MINOR_VERSION 0
|
| 12 |
#define RPC_PROTO_PATCH_VERSION 0
|
| 13 |
#define GGML_RPC_MAX_SERVERS 16
|
ggml/src/ggml-rpc/ggml-rpc.cpp
CHANGED
|
@@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
|
|
| 378 |
}
|
| 379 |
|
| 380 |
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
| 381 |
-
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
| 382 |
-
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
| 383 |
uint8_t cmd_byte = cmd;
|
| 384 |
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
| 385 |
return false;
|
|
@@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|
| 390 |
if (!send_data(sock->fd, input, input_size)) {
|
| 391 |
return false;
|
| 392 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
| 394 |
// even if we do, we can skip sending output_size from the server for commands with known output size
|
| 395 |
uint64_t out_size;
|
|
@@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
|
| 555 |
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
| 556 |
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
| 557 |
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
| 558 |
-
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
|
| 559 |
GGML_ASSERT(status);
|
| 560 |
}
|
| 561 |
|
|
@@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|
| 1428 |
if (!server.set_tensor(input)) {
|
| 1429 |
return;
|
| 1430 |
}
|
| 1431 |
-
if (!send_msg(sockfd, nullptr, 0)) {
|
| 1432 |
-
return;
|
| 1433 |
-
}
|
| 1434 |
break;
|
| 1435 |
}
|
| 1436 |
case RPC_CMD_SET_TENSOR_HASH: {
|
|
|
|
| 378 |
}
|
| 379 |
|
| 380 |
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
| 381 |
+
// No response
|
| 382 |
+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
|
| 383 |
uint8_t cmd_byte = cmd;
|
| 384 |
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
|
| 385 |
return false;
|
|
|
|
| 390 |
if (!send_data(sock->fd, input, input_size)) {
|
| 391 |
return false;
|
| 392 |
}
|
| 393 |
+
return true;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
| 397 |
+
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
| 398 |
+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
|
| 399 |
+
if (!send_rpc_cmd(sock, cmd, input, input_size)) {
|
| 400 |
+
return false;
|
| 401 |
+
}
|
| 402 |
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
|
| 403 |
// even if we do, we can skip sending output_size from the server for commands with known output size
|
| 404 |
uint64_t out_size;
|
|
|
|
| 564 |
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
| 565 |
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
| 566 |
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
| 567 |
+
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
| 568 |
GGML_ASSERT(status);
|
| 569 |
}
|
| 570 |
|
|
|
|
| 1437 |
if (!server.set_tensor(input)) {
|
| 1438 |
return;
|
| 1439 |
}
|
|
|
|
|
|
|
|
|
|
| 1440 |
break;
|
| 1441 |
}
|
| 1442 |
case RPC_CMD_SET_TENSOR_HASH: {
|