ggerganov committed on
Commit
0ac55b8
·
unverified ·
1 Parent(s): 09b6ef7

whisper : fix UB when reading buffer of length 0 bytes (#265)

Browse files
Files changed (1) hide show
  1. whisper.cpp +10 -3
whisper.cpp CHANGED
@@ -549,13 +549,20 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
549
  //}
550
 
551
  std::string word;
 
552
  for (int i = 0; i < n_vocab; i++) {
553
  uint32_t len;
554
  read_safe(fin, len);
555
 
556
- std::vector<char> tmp(len); // create a buffer
557
- fin.read( &tmp[0], tmp.size() ); // read to buffer
558
- word.assign(&tmp[0], tmp.size());
 
 
 
 
 
 
559
 
560
  vocab.token_to_id[word] = i;
561
  vocab.id_to_token[i] = word;
 
549
  //}
550
 
551
  std::string word;
552
+ std::vector<char> tmp;
553
  for (int i = 0; i < n_vocab; i++) {
554
  uint32_t len;
555
  read_safe(fin, len);
556
 
557
+ if (len > 0) {
558
+ tmp.resize(len);
559
+ fin.read(&tmp[0], tmp.size()); // read to buffer
560
+ word.assign(&tmp[0], tmp.size());
561
+ } else {
562
+ // seems like we have an empty-string token in multi-language models (i = 50256)
563
+ //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
564
+ word = "";
565
+ }
566
 
567
  vocab.token_to_id[word] = i;
568
  vocab.id_to_token[i] = word;