---
# Diarization config
#
# Components below are declared as an `obj` (dotted Python path) plus an
# `args` mapping. Presumably the consumer instantiates `obj` with `args` as
# keyword arguments -- confirm against the config loader.
#
# NOTE(review): several keys below have no value in the file as received
# (n_mels, sample_rate, feat_dim, the nested embedding_size, and the nested
# audio_cluster / vision_cluster). They load as null. They look like
# placeholders whose contents were lost (possibly references to the
# top-level keys of the same name) -- restore them before using this file.

fbank_dim: 80
embedding_size: 192

feature_extractor:
  obj: speakerlab.process.processor.FBank
  args:
    # NOTE(review): value missing; presumably tied to fbank_dim -- confirm.
    n_mels:
    # NOTE(review): value missing; no sample_rate is defined in this file.
    sample_rate:
    mean_nor: true

embedding_model:
  obj: speakerlab.models.campplus.DTDNN.CAMPPlus
  args:
    # NOTE(review): value missing; presumably tied to fbank_dim -- confirm.
    feat_dim:
    # NOTE(review): value missing; presumably tied to the top-level
    # embedding_size -- confirm.
    embedding_size:

# for visual embeddings extraction
min_track: 10
num_failed_det: 10
crop_scale: 0.4
min_face_size: 1
face_det_stride: 5  # run face detection once every 5 frames
shot_stride: 50

# for clustering
audio_cluster:
  obj: speakerlab.process.cluster.CommonClustering
  args:
    cluster_type: spectral
    min_num_spks: 1
    max_num_spks: 15
    min_cluster_size: 1
    oracle_num: null
    pval: 0.032
    mer_cos: 0.8

vision_cluster:
  obj: speakerlab.process.cluster.CommonClustering
  args:
    cluster_type: AHC
    cluster_line: 2
    min_cluster_size: 1
    fix_cos_thr: 0.25

cluster:
  obj: speakerlab.process.cluster.JointClustering
  args:
    # NOTE(review): both values missing; presumably they refer to the
    # top-level audio_cluster / vision_cluster sections -- confirm.
    audio_cluster:
    vision_cluster: