# Bark Memory Profiling
Bark has two ways to reduce GPU memory: 
 - Small models: a smaller version of the model. This can be set by using the environment variable `SUNO_USE_SMALL_MODELS`
 - offloading models to CPU: Holding only one model at a time on the GPU, and shuttling the models to the CPU in between generations. 

## NOTE: this requires a GPU to run

# $ \\ $
## First, we'll use the most memory efficient configuration

In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["SUNO_USE_SMALL_MODELS"] = "1"
os.environ["SUNO_OFFLOAD_CPU"] = "1"

from bark.generation import preload_models
from bark import generate_audio, SAMPLE_RATE

import torch

In [2]:
torch.cuda.reset_peak_memory_stats()
preload_models()
audio_array = generate_audio("madam I'm adam", history_prompt="v2/en_speaker_5")
max_utilization = torch.cuda.max_memory_allocated()
print(f"max memory usage = {max_utilization / 1024 / 1024:.0f}MB")

100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 62.17it/s]
100%|████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.74it/s]


max memory usage = 2396MB


# Memory Profiling:
We can profile the memory consumption of 4 scenarios
 - Small models, offloading to CPU
 - Large models, offloading to CPU
 - Small models, not offloading to CPU
 - Large models, not offloading to CPU

In [1]:
import os
from bark.generation import preload_models
from bark import generate_audio, SAMPLE_RATE
import torch
import time

In [2]:
offload_models = True
use_small_models = True

for offload_models in (True, False):
    for use_small_models in (True, False):
        torch.cuda.reset_peak_memory_stats()
        preload_models(
            text_use_small=use_small_models,
            coarse_use_small=use_small_models,
            fine_use_small=use_small_models,
            force_reload=True,
        )
        t0 = time.time()
        audio_array = generate_audio("madam I'm adam", history_prompt="v2/en_speaker_5", silent=True)
        dur = time.time() - t0
        max_utilization = torch.cuda.max_memory_allocated()
        print(f"Small models {use_small_models}, offloading to CPU: {offload_models}")
        print(f"\tmax memory usage = {max_utilization / 1024 / 1024:.0f}MB, time {dur:.0f}s\n")

Small models True, offloading to CPU: True
	max memory usage = 2949MB, time 3s

Small models False, offloading to CPU: True
	max memory usage = 7826MB, time 4s

Small models True, offloading to CPU: False
	max memory usage = 5504MB, time 2s

Small models False, offloading to CPU: False
	max memory usage = 7825MB, time 5s

