Spaces:
Running
Running
File size: 12,284 Bytes
c0943fb dabc473 0ab5514 dabc473 c0943fb dabc473 c0943fb a3ee28a ce2cd6d c0943fb e4f586b 9ed1355 11bb554 e4f586b 11bb554 e4f586b 11bb554 e4f586b 9ed1355 ce2cd6d c0943fb ce2cd6d c0943fb ce2cd6d c0943fb 11bb554 c0943fb ce2cd6d c0943fb ce2cd6d c0943fb 11bb554 c0943fb ce2cd6d c0943fb 11bb554 c0943fb ce2cd6d c0943fb 11bb554 c0943fb ce2cd6d e4f586b 8311a60 c0943fb ce2cd6d c0943fb 6113fbd ce2cd6d 6113fbd c0943fb ce2cd6d c0943fb ce2cd6d c0943fb ce2cd6d 8d1f7e9 c0943fb ce2cd6d c0943fb d0b1d9e c0943fb 11bb554 c0943fb 99f2636 c0943fb 83926f7 e4f586b 83926f7 d311de4 c0943fb 99f2636 c0943fb 9ed1355 11bb554 c0943fb dabc473 606a6bc 11bb554 5ef0168 dabc473 9a513f5 c0943fb e4f586b e48ba5c e4f586b bec875e 11bb554 bec875e 28726dd 8ad3dbf c0943fb 99f2636 83926f7 d311de4 c0943fb 99f2636 dabc473 ce2cd6d c0943fb 11bb554 c0943fb 84ba527 d5d2d41 11bb554 d5d2d41 ce2cd6d dabc473 ce2cd6d dabc473 ce2cd6d dabc473 6f2f468 83926f7 6f2f468 b777dde e4f586b b777dde 6f2f468 c0943fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 |
#ifndef WHISPER_H
#define WHISPER_H
#include <stdint.h>
#include <stdbool.h>
// Decoration applied to every public declaration in this header.
//   - WHISPER_SHARED not defined:               expands to nothing
//   - WHISPER_SHARED defined, not _WIN32:       default-visibility attribute
//   - WHISPER_SHARED and _WIN32 defined:        dllexport when WHISPER_BUILD is defined,
//                                               dllimport otherwise
#if !defined(WHISPER_SHARED)
#    define WHISPER_API
#elif !defined(_WIN32)
#    define WHISPER_API __attribute__ ((visibility ("default")))
#elif defined(WHISPER_BUILD)
#    define WHISPER_API __declspec(dllexport)
#else
#    define WHISPER_API __declspec(dllimport)
#endif
// Fixed numeric constants exposed by this header.
// NOTE(review): units are not stated in this file; the names suggest a sample rate,
// an FFT size, a mel band count, a hop length and a chunk size - confirm against whisper.cpp.
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT       400
#define WHISPER_N_MEL       80    // same value whisper_set_mel() documents for n_mel
#define WHISPER_HOP_LENGTH  160
#define WHISPER_CHUNK_SIZE  30
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
// concurrently.
//
// Basic usage:
//
// #include "whisper.h"
//
// ...
//
// struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
// struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
// return 7;
// }
//
// const int n_segments = whisper_full_n_segments(ctx);
// for (int i = 0; i < n_segments; ++i) {
// const char * text = whisper_full_get_segment_text(ctx, i);
// printf("%s", text);
// }
//
// whisper_free(ctx);
//
// ...
//
// This is a demonstration of the most straightforward usage of the library.
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
//
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
// understanding of how the model works.
//
// Declared here without a definition; this header only passes it around by pointer.
struct whisper_context;

// Integer id of a token.
typedef int whisper_token;

// Per-token record returned by value from the sampling functions and from
// whisper_full_get_token_data().
typedef struct whisper_token_data {
    whisper_token id;   // id of the token
    whisper_token tid;  // id of the forced timestamp token

    float p;            // token probability
    float pt;           // timestamp-token probability
    float ptsum;        // summed probability over all timestamp tokens

    // Token-level timestamp data.
    // Do not read these unless token-level timestamps have been computed.
    int64_t t0;         // token start time
    int64_t t1;         // token end time

    float vlen;         // voice length of the token
} whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file.
//   path_model - path of the model file to load
// Returns NULL on failure.
// The basic usage example in this header releases the returned context with whisper_free().
WHISPER_API struct whisper_context * whisper_init(const char * path_model);
// Frees all memory allocated by the model.
//   ctx - context obtained from whisper_init()
// NOTE(review): whether a NULL ctx is accepted is not stated here - confirm before relying on it.
WHISPER_API void whisper_free(struct whisper_context * ctx);
// Turns RAW PCM audio into a log mel spectrogram.
// The spectrogram is kept inside the supplied whisper context.
//   samples, n_samples - the PCM input and its length
//   n_threads          - NOTE(review): presumably the number of threads to use - confirm
// Returns 0 on success.
WHISPER_API int whisper_pcm_to_mel(
        struct whisper_context * ctx,
        const float * samples,
        int n_samples,
        int n_threads);
// Installs a caller-supplied log mel spectrogram in the given whisper context.
// Call this in place of whisper_pcm_to_mel() when you compute the spectrogram yourself.
//   n_mel - must be 80
//   data, n_len - NOTE(review): layout of data and the meaning of n_len are not stated
//                 in this header - confirm against the implementation
// Returns 0 on success.
WHISPER_API int whisper_set_mel(
        struct whisper_context * ctx,
        const float * data,
        int n_len,
        int n_mel);
// Runs the Whisper encoder over the log mel spectrogram held by the given context.
// whisper_pcm_to_mel() or whisper_set_mel() has to be called beforehand.
//   offset - offset of the first spectrogram frame to use
// Returns 0 on success.
WHISPER_API int whisper_encode(
        struct whisper_context * ctx,
        int offset,
        int n_threads);
// Runs the Whisper decoder, producing the logits and probabilities of the next token.
// whisper_encode() has to be called beforehand.
//   tokens, n_tokens - the context handed to the decoder
//   n_past           - how many tokens from earlier decoder calls to use
// Returns 0 on success.
WHISPER_API int whisper_decode(
        struct whisper_context * ctx,
        const whisper_token * tokens,
        int n_tokens,
        int n_past,
        int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
// You can also implement your own sampling method using the whisper_get_probs() function.
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
// Both return a whisper_token_data by value.
// NOTE(review): the meaning of is_initial is not documented in this header - confirm against the implementation.
WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
// Return the id of the specified language, returns -1 if not found
// NOTE(review): the accepted spelling of lang (short code vs. full name) is not stated here - confirm.
WHISPER_API int whisper_lang_id(const char * lang);
// Integer properties read from the given context.
// NOTE(review): apart from "mel length" these are undocumented in this header; the names suggest
// vocabulary size, text context size and a multilingual flag (returned as int) - confirm.
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
// The probabilities for the next token
// NOTE(review): array length and ownership are not stated here; presumably one entry per
// vocabulary token, owned by ctx - confirm before indexing or freeing.
WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
// NOTE(review): lifetime of the returned string is not stated here - presumably tied to ctx; confirm.
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
// Special tokens
// Each function returns a whisper_token for the given context.
// NOTE(review): the abbreviations are not expanded in this header (eot/sot presumably mark the
// end/start of a transcript) - confirm against the implementation.
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
// Task tokens
// Unlike the special-token getters, these take no context argument.
WHISPER_API whisper_token whisper_token_translate (void);
WHISPER_API whisper_token whisper_token_transcribe(void);
// Performance information
// whisper_print_timings() and whisper_reset_timings() operate on the given context.
// NOTE(review): where the timings are printed is not stated here - confirm.
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
// Print system information
// Returns a string (const char *).
// NOTE(review): lifetime/ownership of the returned string is not stated here - confirm.
WHISPER_API const char * whisper_print_system_info(void);
////////////////////////////////////////////////////////////////////////////
// Decoding strategies that can be requested via whisper_full_default_params().
enum whisper_sampling_strategy {
    WHISPER_SAMPLING_GREEDY      = 0, // always pick the most probable token
    WHISPER_SAMPLING_BEAM_SEARCH = 1, // TODO: not implemented yet!
};
// Text segment callback
// Called on every newly generated text segment
// Use the whisper_full_...() functions to obtain the text segments
//   n_new     - NOTE(review): presumably the number of segments added since the previous call - confirm
//   user_data - NOTE(review): presumably whisper_full_params::new_segment_callback_user_data - confirm
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
//   user_data - NOTE(review): presumably whisper_full_params::encoder_begin_callback_user_data - confirm
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
// Parameters for the whisper_full() function
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params()
struct whisper_full_params {
    enum whisper_sampling_strategy strategy;

    int n_threads;
    int n_max_text_ctx;
    int offset_ms;          // where to start, in ms
    int duration_ms;        // how much audio to process, in ms

    bool translate;
    bool no_context;
    bool single_segment;    // emit a single segment only (useful for streaming)
    bool print_special;
    bool print_progress;
    bool print_realtime;
    bool print_timestamps;

    // [EXPERIMENTAL] token-level timestamps
    bool  token_timestamps; // turn token-level timestamps on
    float thold_pt;         // probability threshold for a timestamp token (~0.01)
    float thold_ptsum;      // threshold for the summed timestamp-token probability (~0.01)
    int   max_len;          // maximum segment length, in characters
    int   max_tokens;       // maximum tokens per segment (0 = no limit)

    // [EXPERIMENTAL] speed-up techniques
    bool speed_up;          // 2x audio speed-up using Phase Vocoder
    int  audio_ctx;         // override for the audio context size (0 = use default)

    // Initial prompt handed to the whisper model, as tokens.
    // They are prepended to any text context left over from a previous call.
    const whisper_token * prompt_tokens;
    int prompt_n_tokens;

    const char * language;

    // Per-strategy settings; the member names mirror the sampling strategies.
    struct {
        int n_past;
    } greedy;

    struct {
        int n_past;
        int beam_width;
        int n_best;
    } beam_search;

    // Callbacks, each paired with a user_data pointer.
    // NOTE(review): the pointer is presumably forwarded as the callback's user_data argument - confirm.
    whisper_new_segment_callback new_segment_callback;
    void * new_segment_callback_user_data;

    whisper_encoder_begin_callback encoder_begin_callback;
    void * encoder_begin_callback_user_data;
};
// Returns a whisper_full_params value for the given sampling strategy.
// The default values themselves live in whisper.cpp (see the note above struct whisper_full_params).
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
// Runs the whole pipeline: PCM -> log mel spectrogram -> encoder -> decoder -> text.
// The text is obtained with the specified decoding strategy.
// The basic usage example in this header treats a non-zero return value as failure.
WHISPER_API int whisper_full(
        struct whisper_context * ctx,
        struct whisper_full_params params,
        const float * samples,
        int n_samples);
// Splits the input audio into chunks and runs whisper_full() on each chunk separately.
// This can give some speedup in some cases, but the transcription may be less accurate
// at the beginning and end of each chunk.
//   n_processors - NOTE(review): presumably the number of chunks processed in parallel - confirm
WHISPER_API int whisper_full_parallel(
        struct whisper_context * ctx,
        struct whisper_full_params params,
        const float * samples,
        int n_samples,
        int n_processors);
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
// The basic usage example in this header iterates i_segment from 0 up to this value (exclusive).
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
// Get the start and end time of the specified segment.
// NOTE(review): the time unit is not stated in this header - confirm.
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
// Get the text of the specified segment.
// NOTE(review): lifetime of the returned string is not stated here - presumably tied to ctx; confirm.
WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
// Get number of tokens in the specified segment.
WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
// Get the token text of the specified token in the specified segment.
// whisper_full_get_token_id() returns a whisper_token for the same (i_segment, i_token) pair.
WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
// Get token data for the specified token in the specified segment.
// This contains probabilities, timestamps, etc.
// The whisper_token_data is returned by value.
WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
#ifdef __cplusplus
}
#endif
#endif
|