litong bobqianic committed on
Commit
30cdb60
·
unverified ·
1 Parent(s): fa72f91

Examples: Add save audio to file option in stream.cpp (#1310)

Browse files

* save the recorded audio to a file

* Alignment -help

* Save the correct audio

* change to a consistent coding style

* Correct typo

* Update examples/stream/stream.cpp

* Update examples/stream/stream.cpp

* Correct variable misuse

* Update examples/stream/stream.cpp

* Update examples/stream/stream.cpp

* Update examples/stream/stream.cpp

* Update examples/stream/stream.cpp

---------

Co-authored-by: bobqianic <[email protected]>

Files changed (1) hide show
  1. examples/stream/stream.cpp +75 -7
examples/stream/stream.cpp CHANGED
@@ -2,7 +2,7 @@
2
  //
3
  // A very quick-n-dirty implementation serving mainly as a proof of concept.
4
  //
5
-
6
  #include "common-sdl.h"
7
  #include "common.h"
8
  #include "whisper.h"
@@ -13,7 +13,60 @@
13
  #include <thread>
14
  #include <vector>
15
  #include <fstream>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
 
 
 
 
 
 
17
  // 500 -> 00:05.000
18
  // 6000 -> 01:00.000
19
  std::string to_timestamp(int64_t t) {
@@ -52,6 +105,7 @@ struct whisper_params {
52
  std::string language = "en";
53
  std::string model = "models/ggml-base.en.bin";
54
  std::string fname_out;
 
55
  };
56
 
57
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -82,6 +136,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
82
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
83
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
84
  else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
 
85
 
86
  else {
87
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -117,6 +172,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
117
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
118
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
119
  fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
 
120
  fprintf(stderr, "\n");
121
  }
122
 
@@ -154,7 +210,6 @@ int main(int argc, char ** argv) {
154
  audio.resume();
155
 
156
  // whisper init
157
-
158
  if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
159
  fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
160
  whisper_print_usage(argc, argv, params);
@@ -211,15 +266,28 @@ int main(int argc, char ** argv) {
211
  return 1;
212
  }
213
  }
214
-
215
- printf("[Start speaking]");
 
 
 
 
 
 
 
 
 
 
216
  fflush(stdout);
217
 
218
- auto t_last = std::chrono::high_resolution_clock::now();
219
  const auto t_start = t_last;
220
 
221
  // main audio loop
222
  while (is_running) {
 
 
 
223
  // handle Ctrl + C
224
  is_running = sdl_poll_events();
225
 
@@ -371,7 +439,7 @@ int main(int argc, char ** argv) {
371
  fout << std::endl;
372
  }
373
 
374
- if (use_vad){
375
  printf("\n");
376
  printf("### Transcription %d END\n", n_iter);
377
  }
@@ -408,4 +476,4 @@ int main(int argc, char ** argv) {
408
  whisper_free(ctx);
409
 
410
  return 0;
411
- }
 
2
  //
3
  // A very quick-n-dirty implementation serving mainly as a proof of concept.
4
  //
5
+ #include <fstream>
6
  #include "common-sdl.h"
7
  #include "common.h"
8
  #include "whisper.h"
 
13
  #include <thread>
14
  #include <vector>
15
  #include <fstream>
16
+ #include <ctime>
17
+
18
// Minimal streaming writer for 16-bit PCM WAV files.
//
// The constructor emits a canonical 44-byte RIFF/WAVE header with zeroed size
// fields. Every call to writeData() appends samples and then re-patches both
// size fields, so the file on disk describes a well-formed WAV after each
// call rather than only after the destructor runs.
//
// Samples are always stored as signed 16-bit little-endian integers,
// regardless of the bitsPerSample value recorded in the header; callers are
// expected to pass bitsPerSample == 16.
class SimpleWavWriter {
private:
    std::ofstream file;
    uint32_t dataSize = 0; // number of sample bytes written after the header

    // Writes the low `numBytes` bytes of `value` in little-endian order, as
    // required by the RIFF format, independent of the host byte order.
    void writeLE(uint32_t value, int numBytes) {
        char bytes[4];
        for (int i = 0; i < numBytes; ++i) {
            bytes[i] = static_cast<char>((value >> (8 * i)) & 0xFF);
        }
        file.write(bytes, numBytes);
    }

public:
    // Opens `filename` for binary output and writes the WAV header.
    //   sampleRate    - samples per second, per channel
    //   bitsPerSample - bit depth stored in the header (see class note)
    //   channels      - number of interleaved channels
    // If the file cannot be opened the writer stays inert: writeData()
    // becomes a no-op.
    SimpleWavWriter(const std::string &filename, int sampleRate, int bitsPerSample, int channels) {
        file.open(filename, std::ios::binary);
        if (!file.is_open()) {
            return;
        }

        const uint32_t subChunkSize = 16; // size of the PCM "fmt " payload
        const uint32_t audioFormat  = 1;  // PCM format
        const uint32_t byteRate     = static_cast<uint32_t>(sampleRate * channels * bitsPerSample / 8);
        const uint32_t blockAlign   = static_cast<uint32_t>(channels * bitsPerSample / 8);

        file.write("RIFF", 4);
        writeLE(0, 4); // Placeholder for file size
        file.write("WAVE", 4);
        file.write("fmt ", 4);
        writeLE(subChunkSize, 4);
        writeLE(audioFormat, 2);
        writeLE(static_cast<uint32_t>(channels), 2);
        writeLE(static_cast<uint32_t>(sampleRate), 4);
        writeLE(byteRate, 4);
        writeLE(blockAlign, 2);
        writeLE(static_cast<uint32_t>(bitsPerSample), 2);
        file.write("data", 4);
        writeLE(0, 4); // Placeholder for data size
    }

    // Appends `length` float samples, converted to 16-bit PCM, and updates
    // the size fields in the header.
    // Samples are clamped to [-1, 1] before scaling (NaN becomes silence) so
    // that out-of-range input cannot overflow the 16-bit conversion.
    // Does nothing when the file is not open, `data` is null or `length` is 0.
    void writeData(const float *data, size_t length) {
        if (!file.is_open() || data == nullptr || length == 0) {
            return;
        }

        // Convert the whole chunk first so it can be written in one call.
        std::vector<char> pcm(length * sizeof(int16_t));
        for (size_t i = 0; i < length; ++i) {
            float sample = data[i];
            if (sample != sample) { // NaN
                sample = 0.0f;
            } else if (sample > 1.0f) {
                sample = 1.0f;
            } else if (sample < -1.0f) {
                sample = -1.0f;
            }
            const int16_t  intSample = static_cast<int16_t>(sample * 32767);
            const uint16_t raw       = static_cast<uint16_t>(intSample);
            pcm[2 * i]     = static_cast<char>(raw & 0xFF);
            pcm[2 * i + 1] = static_cast<char>((raw >> 8) & 0xFF);
        }

        file.write(pcm.data(), static_cast<std::streamsize>(pcm.size()));
        if (!file) {
            return; // write failed: leave the recorded sizes untouched
        }
        dataSize += static_cast<uint32_t>(pcm.size());

        // Patch the RIFF chunk size (offset 4) and the data chunk size
        // (offset 40), then return to the end for the next append.
        file.seekp(4, std::ios::beg);
        writeLE(36 + dataSize, 4);
        file.seekp(40, std::ios::beg);
        writeLE(dataSize, 4);
        file.seekp(0, std::ios::end);
        file.flush();
    }

    // Closes the file if it is still open.
    ~SimpleWavWriter() {
        if (file.is_open()) {
            file.close();
        }
    }
};
70
  // 500 -> 00:05.000
71
  // 6000 -> 01:00.000
72
  std::string to_timestamp(int64_t t) {
 
105
  std::string language = "en";
106
  std::string model = "models/ggml-base.en.bin";
107
  std::string fname_out;
108
+ bool save_audio = false; // save audio to wav file
109
  };
110
 
111
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 
136
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
137
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
138
  else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
139
+ else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
140
 
141
  else {
142
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
 
172
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
173
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
174
  fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
175
+ fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
176
  fprintf(stderr, "\n");
177
  }
178
 
 
210
  audio.resume();
211
 
212
  // whisper init
 
213
  if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
214
  fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
215
  whisper_print_usage(argc, argv, params);
 
266
  return 1;
267
  }
268
  }
269
+ // save wav file
270
+ SimpleWavWriter *wavWriter = nullptr;
271
+ if (params.save_audio) {
272
+ // Get current date/time for filename
273
+ time_t now = time(0);
274
+ char buffer[80];
275
+ strftime(buffer, sizeof(buffer), "%Y%m%d%H%M%S", localtime(&now));
276
+ std::string filename = std::string(buffer) + ".wav";
277
+
278
+ wavWriter = new SimpleWavWriter(filename, WHISPER_SAMPLE_RATE, 16, 1);
279
+ }
280
+ printf("[Start speaking]\n");
281
  fflush(stdout);
282
 
283
+ auto t_last = std::chrono::high_resolution_clock::now();
284
  const auto t_start = t_last;
285
 
286
  // main audio loop
287
  while (is_running) {
288
+ if (params.save_audio && wavWriter) {
289
+ wavWriter->writeData(pcmf32_new.data(), pcmf32_new.size());
290
+ }
291
  // handle Ctrl + C
292
  is_running = sdl_poll_events();
293
 
 
439
  fout << std::endl;
440
  }
441
 
442
+ if (use_vad) {
443
  printf("\n");
444
  printf("### Transcription %d END\n", n_iter);
445
  }
 
476
  whisper_free(ctx);
477
 
478
  return 0;
479
+ }