#!/usr/bin/env bash
set -e

# Binarize a doc -> sum dataset with fairseq's preprocess.py using the mBART
# dictionary. Every command-line argument given to this script is forwarded
# unchanged to preprocess.py (appended after the fixed options below).
#
# Usage: <this script> [extra preprocess.py arguments...]

# Log the forwarded arguments as a single space-joined line on stdout.
# printf is used instead of echo so that a leading "-n"/"-e" argument is
# printed rather than interpreted, and quoting "$*" keeps glob characters
# in the arguments from being expanded against the current directory.
printf '%s\n' "$*"

# Set up the mBART model and dictionary: download and unpack the pretrained
# archive only when the model directory is not already present.
# NOTE(review): the archive is named mbart.CC25 while the directory tested is
# mbart.cc25 -- presumably the tarball unpacks to the lowercase name; confirm.
MBART=/home/tiger/mbart.cc25
if [[ ! -d "$MBART" ]]; then
    hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
    tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
fi
# MODEL is not referenced anywhere else in this script; it is kept so the
# sentencepiece model path stays documented next to the dictionary path.
MODEL=$MBART/sentence.bpe.model
DICT=$MBART/dict_extend_extra2.txt

base_dir=/opt/tiger/sumtest/multilingual

# The same dictionary is used for both the source and the target side.
# "$@" passes each caller-supplied argument through as exactly one word, so
# values containing spaces or glob characters reach preprocess.py intact
# (building a string and expanding it unquoted would re-split and glob them).
python3 "${base_dir}/fairseq/preprocess.py" \
    --source-lang doc \
    --target-lang sum \
    --srcdict "$DICT" \
    --tgtdict "$DICT" \
    --workers 70 \
    "$@"
