Source code for wespeaker.cli.utils

# Copyright (c) 2023 Binbin Zhang (binbzha@qq.com)
#                    Shuai Wang (wsstriving@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse


def _str2bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw string,
    so every non-empty value (including ``"False"`` and ``"0"``) becomes
    ``True``. This converter interprets the usual textual spellings instead.

    Args:
        value: the raw option value (a ``str``), or an already-parsed ``bool``.

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False because it was the only value that
    # produced False under the previous ``type=bool`` behaviour.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (true/false, yes/no, 1/0), '
        'got {!r}'.format(value))


def get_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            ``argparse`` falls back to ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-t',
                        '--task',
                        choices=[
                            'embedding',
                            'embedding_kaldi',
                            'similarity',
                            'diarization',
                            'diarization_list',
                        ],
                        default='embedding',
                        help='task type')
    parser.add_argument('-l',
                        '--language',
                        choices=[
                            'chinese',
                            'english',
                        ],
                        default='chinese',
                        help='language type')
    parser.add_argument(
        '--campplus',
        action='store_true',
        help='whether to use the damo/speech_campplus_sv_zh-cn_16k-common model'
    )
    parser.add_argument(
        '--eres2net',
        action='store_true',
        help='whether to use the damo/speech_eres2net_sv_zh-cn_16k-common model'
    )
    parser.add_argument(
        '--vblinkp',
        action='store_true',
        help='whether to use the samresnet34 model pretrained on voxblink2'
    )
    # Adjacent string literals are concatenated verbatim, so each fragment
    # below carries its own trailing space.
    parser.add_argument(
        '--vblinkf',
        action='store_true',
        help='whether to use the samresnet34 model pretrained on voxblink2 '
             'and finetuned on voxceleb2'
    )
    parser.add_argument('-p',
                        '--pretrain',
                        type=str,
                        default="",
                        help='model directory')
    parser.add_argument('--device',
                        type=str,
                        default='cpu',
                        help='device type (most commonly cpu or cuda, '
                             'but also potentially mps, xpu, xla or meta) '
                             'and optional device ordinal for the device type.')
    parser.add_argument('--audio_file', help='audio file')
    parser.add_argument('--audio_file2',
                        help='audio file2, specifically for similarity task')
    parser.add_argument('--wav_scp',
                        help='path to wav.scp, for extract and saving '
                        'kaldi-style embeddings')
    parser.add_argument('--resample_rate',
                        type=int,
                        default=16000,
                        help='resampling rate')
    parser.add_argument('--vad',
                        action='store_true',
                        help='whether to do VAD or not')
    parser.add_argument('--output_file',
                        default=None,
                        help='output file to save speaker embedding '
                        'or save diarization result')
    # diarization params
    parser.add_argument('--diar_min_duration',
                        type=float,
                        default=0.255,
                        help='VAD min duration')
    parser.add_argument('--diar_window_secs',
                        type=float,
                        default=1.5,
                        help='the window seconds in embedding extraction')
    parser.add_argument('--diar_period_secs',
                        type=float,
                        default=0.75,
                        help='the shift seconds in embedding extraction')
    parser.add_argument('--diar_frame_shift',
                        type=int,
                        default=10,
                        help='frame shift in fbank extraction (ms)')
    parser.add_argument('--diar_emb_bs',
                        type=int,
                        default=32,
                        help='batch size for embedding extraction')
    # type=bool would turn "False" into True; parse the text explicitly.
    parser.add_argument('--diar_subseg_cmn',
                        type=_str2bool,
                        default=True,
                        help='do cmn after or before fbank sub-segmentation')
    args = parser.parse_args(argv)
    return args