kaldi腳本注釋一


utils/split_data.sh

## 在$data文件夾下,創建split{num_split}文件夾,再在split{num_split}里面創建所有的數字文件夾
# 后面基本上是把$data文件夾下的各個文件都進行split切分,存放到split{num_split}
# 下面的各個數字文件夾下,實現多線程處理
# 為多線程同時獨立處理做准備
#!/bin/bash
# Copyright 2010-2013 Microsoft Corporation
#                     Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Split by speaker by default; a leading --per-utt argument switches to
# splitting by utterance.
split_per_spk=true
case "$1" in
  --per-utt)
    split_per_spk=false
    shift  # consume the flag so that <data-dir> becomes $1
    ;;
esac

# Exactly two positional arguments must remain at this point; otherwise print
# the usage text on stdout and exit with status 1.
if [[ $# -ne 2 ]]; then
  printf '%s\n' \
    "Usage: $0 [--per-utt] <data-dir> <num-to-split>" \
    "E.g.: $0 data/train 50" \
    "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the " \
    "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}." \
    "" \
    "This script will not split the data-dir if it detects that the output is newer than the input." \
    "By default it splits per speaker (so each speaker is in only one split dir)," \
    "but with the --per-utt option it will ignore the speaker information while splitting."
  exit 1
fi

# Positional arguments: the data directory to split and the number of pieces.
data=$1
numsplit=$2

# Reject any value that "[" does not accept as an integer greater than zero.
[ "$numsplit" -gt 0 ] || {
  echo "Invalid num-split argument $numsplit"
  exit 1
}

# Option handed to some of the filter_scps.pl calls later in this script.
# It stays empty when splitting per speaker; otherwise it is "--no-warn", to
# suppress warnings from filter_scps.pl about 'some input lines were output
# to multiple files'.
warning_opt=
$split_per_spk || warning_opt="--no-warn"

# Initialise bookkeeping variables.
n=0;
feats=""
wavs=""
utt2spks=""
texts=""

# Line counts of the utterance-indexed files.  utt2spk is read
# unconditionally; for feats.scp and text, cat's error output is discarded so
# a missing file simply counts as zero lines.
nu=$(cat $data/utt2spk | wc -l)
nf=$(cat $data/feats.scp 2>/dev/null | wc -l)
nt=$(cat $data/text 2>/dev/null | wc -l) # take it as zero if no such file

# If feats.scp exists and its line count differs from utt2spk's, print a
# warning (the script carries on regardless).
if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
  echo "**  use utils/fix_data_dir.sh $data to fix this."
fi
# Same check for text: it exists, but its line count differs from utt2spk's.
if [ -f $data/text ] && [ $nu -ne $nt ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
  echo "** use utils/fix_data_dir.sh to fix this."
fi


# Choose how utt2spk gets split.  Per speaker: pass the utt2spk map to
# split_scp.pl through --utt2spk and use the plain "split<N>" directory name.
# Per utterance: no extra option, and the directory name gets a "utt" suffix.
if $split_per_spk; then
  utt2spk_opt="--utt2spk=$data/utt2spk"
  utt=""
else
  utt2spk_opt=
  utt="utt"
fi

# Decide whether the split has to be (re)generated, using split directory 1
# as the representative of all split directories.
s1=$data/split${numsplit}${utt}/1
if [ ! -d $s1 ]; then
  # No split directory yet: a split is required.
  need_to_split=true
else
  need_to_split=false
  for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp \
    spk2gender vad.scp segments reco2file_and_channel utt2lang; do
    # Files absent from the data directory cannot make the split stale.
    [[ -f $data/$f ]] || continue
    # Stale when the split copy is missing or older than the source file.
    if [[ ! -f $s1/$f || $s1/$f -ot $data/$f ]]; then
      need_to_split=true
    fi
  done
fi

# Existing split is up to date: nothing left to do.
$need_to_split || exit 0

# Newline-separated lists with one entry per split index 1..$numsplit:
# first the utt2spk file of every split directory, then the directories.
utt2spks=$(
  seq $numsplit | while read -r n; do
    echo $data/split${numsplit}${utt}/$n/utt2spk
  done
)

directories=$(
  seq $numsplit | while read -r n; do
    echo $data/split${numsplit}${utt}/$n
  done
)

# Try to create every directory with a single mkdir call, discarding its
# output.  If this mkdir fails due to the argument list being too long,
# iterate and create the directories one at a time.
mkdir -p $directories >&/dev/null || {
  for n in $(seq $numsplit); do
    mkdir -p $data/split${numsplit}${utt}/$n
  done
}

# Locking is best-effort: if lockfile is not installed, just don't lock.
# It's not a big deal.
if which lockfile >&/dev/null; then
  lockfile -l 60 $data/.split_lock
fi
# Remove the lock file when the script exits or receives one of these signals.
trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM

# Split $data/utt2spk into the per-split utt2spk files listed in $utt2spks;
# $utt2spk_opt carries the --utt2spk option when splitting per speaker.
if ! utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks; then
  exit 1
fi

# The remaining steps split the other files of $data into the numbered
# split directories.  First, derive spk2utt from utt2spk in each of them.
for n in $(seq $numsplit); do
  dsn=$data/split${numsplit}${utt}/$n
  if ! utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt; then
    exit 1
  fi
done

# If there is no segments file, then wav.scp is indexed per utterance and is
# split together with the other utterance-indexed files below.
maybe_wav_scp=
[ ! -f $data/segments ] && maybe_wav_scp=wav.scp

# Split the files that are indexed by utterance, filtering each one by the
# utt2spk of every split directory.  Files that do not exist are skipped.
for f in feats.scp text vad.scp utt2lang $maybe_wav_scp utt2dur utt2num_frames; do
  [ -f $data/$f ] || continue
  utils/filter_scps.pl JOB=1:$numsplit \
    $data/split${numsplit}${utt}/JOB/utt2spk $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1
done

# Split the files that are indexed by speaker, filtering each one by the
# spk2utt of every split directory.  Files that do not exist are skipped.
for f in spk2gender spk2warp cmvn.scp; do
  [ -f $data/$f ] || continue
  utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
    $data/split${numsplit}${utt}/JOB/spk2utt $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1
done

# With a segments file present, split it by utterance first, then split the
# recording-indexed files by the recordings each split refers to.
if [ -f $data/segments ]; then
  utils/filter_scps.pl JOB=1:$numsplit \
     $data/split${numsplit}${utt}/JOB/utt2spk $data/segments $data/split${numsplit}${utt}/JOB/segments || exit 1

  # Per split: the sorted, de-duplicated second column of segments
  # (the recording-ids), written to a temporary tmp.reco file.
  for n in $(seq $numsplit); do
    dsn=$data/split${numsplit}${utt}/$n
    awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco # recording-ids.
  done

  # Filter each recording-indexed file that exists by those recording-ids.
  for f in reco2file_and_channel wav.scp; do
    [ -f $data/$f ] || continue
    utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
      $data/split${numsplit}${utt}/JOB/tmp.reco $data/$f \
      $data/split${numsplit}${utt}/JOB/$f || exit 1
  done

  # Remove the temporary recording-id lists again.
  for f in $data/split${numsplit}${utt}/*/tmp.reco; do
    rm $f
  done
fi

exit 0

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM