Hervé Bredin committed on
Commit · f47dcce · Parent(s): d80ca5c

feat: initial import

Files changed:
- README.md +110 -0
- config.yaml +93 -0
- hparams.yaml +15 -0
- overrides.yaml +22 -0
- pytorch_model.bin +3 -0
- tfevents.bin +3 -0
- train.log +18 -0
README.md
ADDED
@@ -0,0 +1,110 @@
---
tags:
- pyannote
- audio
- voice
- speech
- speaker
- speaker segmentation
- voice activity detection
- overlapped speech detection
- resegmentation
datasets:
- ami
- dihard
- voxconverse
license: mit
inference: false
---

# Pretrained speaker segmentation model

This model relies on `pyannote.audio` 2.0 (which is still in development):

```bash
$ pip install https://github.com/pyannote/pyannote-audio/archive/develop.zip
```

## Basic inference

```python
>>> from pyannote.audio import Inference
>>> inference = Inference("pyannote/Segmentation")
>>> segmentation = inference("audio.wav")
```
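
The exact type of `segmentation` is not documented on this card; as a hedged sketch, assuming the output is a `pyannote.core.SlidingWindowFeature` (the `data` and `sliding_window` attributes below come from `pyannote.core`, not from this card):

```python
>>> # hedged sketch: assumes `segmentation` is a pyannote.core.SlidingWindowFeature
>>> import numpy as np
>>> scores = np.asarray(segmentation.data)  # raw per-frame speaker activations
>>> scores.shape                            # e.g. (num_frames, num_speakers)
>>> segmentation.sliding_window             # maps frame indices back to seconds
```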

## Advanced pipelines

### Voice activity detection

```python
>>> from pyannote.audio.pipelines import VoiceActivityDetection
>>> HYPER_PARAMETERS = {"onset": 0.5, "offset": 0.5, "min_duration_on": 0.0, "min_duration_off": 0.0}
>>> pipeline = VoiceActivityDetection(segmentation="pyannote/Segmentation").instantiate(HYPER_PARAMETERS)
>>> vad = pipeline("audio.wav")
```
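
The pipeline output is expected to be a `pyannote.core.Annotation`; a usage sketch for extracting speech regions (plain `pyannote.core` API, nothing specific to this model):

```python
>>> # iterate over detected speech regions
>>> for speech in vad.get_timeline().support():
...     print(f"speech from {speech.start:.1f}s to {speech.end:.1f}s")
```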

Dataset | `onset` | `offset` | `min_duration_on` | `min_duration_off`
----------------|---------|----------|-------------------|-------------------
AMI Mix-Headset | TODO | TODO | TODO | TODO
DIHARD3 | TODO | TODO | TODO | TODO
VoxConverse | TODO | TODO | TODO | TODO

### Overlapped speech detection

```python
>>> from pyannote.audio.pipelines import OverlappedSpeechDetection
>>> pipeline = OverlappedSpeechDetection(segmentation="pyannote/Segmentation").instantiate(HYPER_PARAMETERS)
>>> osd = pipeline("audio.wav")
```

Dataset | `onset` | `offset` | `min_duration_on` | `min_duration_off`
----------------|---------|----------|-------------------|-------------------
AMI Mix-Headset | TODO | TODO | TODO | TODO
DIHARD3 | TODO | TODO | TODO | TODO
VoxConverse | TODO | TODO | TODO | TODO

### Segmentation

```python
>>> from pyannote.audio.pipelines import Segmentation
>>> pipeline = Segmentation(segmentation="pyannote/Segmentation").instantiate(HYPER_PARAMETERS)
>>> seg = pipeline("audio.wav")
```

Dataset | `onset` | `offset` | `min_duration_on` | `min_duration_off`
----------------|---------|----------|-------------------|-------------------
AMI Mix-Headset | TODO | TODO | TODO | TODO
DIHARD3 | TODO | TODO | TODO | TODO
VoxConverse | TODO | TODO | TODO | TODO

### Resegmentation

```python
>>> from pyannote.audio.pipelines import Resegmentation
>>> from pyannote.core import Annotation
>>> pipeline = Resegmentation(segmentation="pyannote/Segmentation", diarization="baseline").instantiate(HYPER_PARAMETERS)
>>> # `baseline` is the existing diarization you want to refine
>>> assert isinstance(baseline, Annotation)
>>> resegmented_baseline = pipeline({"audio": "audio.wav", "baseline": baseline})
```

Dataset | `onset` | `offset` | `min_duration_on` | `min_duration_off`
----------------|---------|----------|-------------------|-------------------
AMI Mix-Headset | TODO | TODO | TODO | TODO
DIHARD3 | TODO | TODO | TODO | TODO
VoxConverse | TODO | TODO | TODO | TODO

## Citations

```bibtex
@inproceedings{Bredin2020,
  Title = {{pyannote.audio: neural building blocks for speaker diarization}},
  Author = {{Bredin}, Herv{\'e} and {Yin}, Ruiqing and {Coria}, Juan Manuel and {Gelly}, Gregory and {Korshunov}, Pavel and {Lavechin}, Marvin and {Fustes}, Diego and {Titeux}, Hadrien and {Bouaziz}, Wassim and {Gill}, Marie-Philippe},
  Booktitle = {ICASSP 2020, IEEE International Conference on Acoustics, Speech, and Signal Processing},
  Address = {Barcelona, Spain},
  Month = {May},
  Year = {2020},
}
```
config.yaml
ADDED
@@ -0,0 +1,93 @@
protocol: X.SpeakerDiarization.Custom
patience: 20
task:
  _target_: pyannote.audio.tasks.Segmentation
  duration: 5.0
  warm_up: 0.0
  balance: null
  overlap:
    probability: 0.5
    snr_min: 0.0
    snr_max: 10.0
  weight: null
  batch_size: 32
  num_workers: 10
  pin_memory: false
  loss: bce
  vad_loss: bce
model:
  _target_: pyannote.audio.models.segmentation.PyanNet
  sincnet:
    stride: 10
  lstm:
    num_layers: 4
    monolithic: true
    dropout: 0.5
  linear:
    num_layers: 2
optimizer:
  _target_: torch.optim.Adam
  lr: 0.001
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-08
  weight_decay: 0
  amsgrad: false
trainer:
  _target_: pytorch_lightning.Trainer
  accelerator: ddp
  accumulate_grad_batches: 1
  amp_backend: native
  amp_level: O2
  auto_lr_find: false
  auto_scale_batch_size: false
  auto_select_gpus: true
  benchmark: true
  check_val_every_n_epoch: 1
  checkpoint_callback: true
  deterministic: false
  fast_dev_run: false
  flush_logs_every_n_steps: 100
  gpus: -1
  gradient_clip_val: 0.5
  limit_test_batches: 1.0
  limit_train_batches: 1.0
  limit_val_batches: 1.0
  log_every_n_steps: 50
  log_gpu_memory: null
  max_epochs: 1000
  max_steps: null
  min_epochs: 1
  min_steps: null
  num_nodes: 1
  num_processes: 1
  num_sanity_val_steps: 2
  overfit_batches: 0.0
  precision: 32
  prepare_data_per_node: true
  process_position: 0
  profiler: null
  progress_bar_refresh_rate: 1
  reload_dataloaders_every_epoch: false
  replace_sampler_ddp: true
  sync_batchnorm: false
  terminate_on_nan: false
  tpu_cores: null
  track_grad_norm: -1
  truncated_bptt_steps: null
  val_check_interval: 1.0
  weights_save_path: null
  weights_summary: top
augmentation:
  transform: Compose
  params:
    shuffle: false
    transforms:
    - transform: AddBackgroundNoise
      params:
        background_paths: /gpfswork/rech/eie/commun/data/background/musan
        min_snr_in_db: 5.0
        max_snr_in_db: 15.0
        mode: per_example
        p: 0.9
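
The `_target_` keys follow Hydra's instantiation convention. As a minimal sketch (assuming `hydra-core` and `omegaconf` are available; this snippet is illustrative, not part of the training code), a section such as `optimizer` can be turned into a live object:

```python
# minimal sketch: instantiate a `_target_` config section with Hydra
import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
model = torch.nn.Linear(60, 4)  # hypothetical placeholder, just to get parameters
# instantiate() imports cfg.optimizer._target_ (torch.optim.Adam) and calls it
# with the remaining keys as keyword arguments:
optimizer = instantiate(cfg.optimizer, params=model.parameters())
```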
hparams.yaml
ADDED
@@ -0,0 +1,15 @@
linear:
  hidden_size: 128
  num_layers: 2
lstm:
  batch_first: true
  bidirectional: true
  dropout: 0.5
  hidden_size: 128
  monolithic: true
  num_layers: 4
num_channels: 1
sample_rate: 16000
sincnet:
  sample_rate: 16000
  stride: 10
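
These keys mirror the keyword arguments of `pyannote.audio.models.segmentation.PyanNet`. A hedged sketch of rebuilding the same architecture from this file (assuming the develop-branch `PyanNet` accepts `sincnet`, `lstm` and `linear` dicts, as the key names suggest):

```python
# hedged sketch: rebuild the architecture described by hparams.yaml
import yaml
from pyannote.audio.models.segmentation import PyanNet

with open("hparams.yaml") as f:
    hparams = yaml.safe_load(f)

model = PyanNet(
    sample_rate=hparams["sample_rate"],
    num_channels=hparams["num_channels"],
    sincnet=hparams["sincnet"],
    lstm=hparams["lstm"],
    linear=hparams["linear"],
)
```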
overrides.yaml
ADDED
@@ -0,0 +1,22 @@
- protocol=X.SpeakerDiarization.Custom
- task=Segmentation
- task.batch_size=32
- task.num_workers=10
- task.duration=5.
- task.warm_up=0.
- task.loss=bce
- task.vad_loss=bce
- patience=20
- model=PyanNet
- +model.sincnet.stride=10
- +model.lstm.num_layers=4
- +model.lstm.monolithic=True
- +model.lstm.dropout=0.5
- +model.linear.num_layers=2
- optimizer=Adam
- optimizer.lr=0.001
- trainer.benchmark=True
- trainer.gradient_clip_val=0.5
- trainer.gpus=-1
- trainer.accelerator=ddp
- +augmentation=background
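
These are Hydra command-line overrides (the `+` prefix marks keys added to the base config). A hedged reconstruction of the training command they imply, assuming pyannote.audio's Hydra-based `pyannote-audio-train` entry point:

```bash
# hedged reconstruction; the entry-point name is an assumption
pyannote-audio-train \
    protocol=X.SpeakerDiarization.Custom \
    task=Segmentation task.batch_size=32 task.num_workers=10 \
    task.duration=5. task.warm_up=0. task.loss=bce task.vad_loss=bce \
    patience=20 model=PyanNet \
    +model.sincnet.stride=10 +model.lstm.num_layers=4 \
    +model.lstm.monolithic=True +model.lstm.dropout=0.5 \
    +model.linear.num_layers=2 \
    optimizer=Adam optimizer.lr=0.001 \
    trainer.benchmark=True trainer.gradient_clip_val=0.5 \
    trainer.gpus=-1 trainer.accelerator=ddp \
    +augmentation=background
```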
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7d2e72ce20167e5eb05ce163b7af9762e92ef5fec7313435b676b74b8498afe
size 17739960
tfevents.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2b33b3855ecc446b1913916d8369ede8597b66491541a6c67e5ceafc15bcdb3
size 13357699
train.log
ADDED
@@ -0,0 +1,18 @@
[2021-03-19 18:29:57,529][lightning][INFO] - GPU available: True, used: True
[2021-03-19 18:29:57,531][lightning][INFO] - TPU available: None, using: 0 TPU cores
[2021-03-19 18:29:57,531][lightning][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
[2021-03-19 18:30:08,622][lightning][INFO] - initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4
[2021-03-19 18:32:58,993][lightning][INFO] - Set SLURM handle signals.
[2021-03-19 18:32:59,068][lightning][INFO] -
  | Name       | Type       | Params | In sizes       | Out sizes
------------------------------------------------------------------------------------------------------------
0 | sincnet    | SincNet    | 42.6 K | [32, 1, 80000] | [32, 60, 293]
1 | lstm       | LSTM       | 1.4 M  | [32, 293, 60]  | [[32, 293, 256], [[8, 32, 128], [8, 32, 128]]]
2 | linear     | ModuleList | 49.4 K | ?              | ?
3 | classifier | Linear     | 516    | [32, 293, 128] | [32, 293, 4]
4 | activation | Sigmoid    | 0      | [32, 293, 4]   | [32, 293, 4]
------------------------------------------------------------------------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
[2021-03-23 02:26:47,615][lightning][INFO] - bypassing sigterm