#!/usr/bin/env bash

#######################################
# Emit a reproducible pseudo-random byte stream derived from a seed.
#
# The bytes are the output of `openssl enc -aes-256-ctr` encrypting
# /dev/zero with the seed as passphrase and no salt, so the same seed
# (with the same openssl build) yields the same byte sequence. Used below
# as the --random-source for `shuf`.
#
# Arguments: $1 - seed string used as the openssl passphrase
# Outputs:   raw bytes on stdout; the stream does not end on its own
#            (input is /dev/zero), so the consumer must stop reading.
#            openssl's stderr is discarded.
#######################################
get_seeded_random()
{
  # `local` keeps the seed from overwriting a caller's global `seed`
  # when the function is invoked in the current shell.
  local seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# Absolute directory of this script with symlinks resolved. Quoted so a
# path containing spaces survives. Not referenced elsewhere in the visible
# script.
pwd=$(dirname -- "$(readlink -f -- "$0")")
# Input location; the (currently commented-out) tokenization steps read
# ${INDIR}/<lang>.<split>.spm from here.
ccPrefix=/opt/tiger/sumtest/cc100
# Not referenced elsewhere in the visible script.
summDIR=/home/tiger/summ
# Dataset version label: used in the working-directory name and passed to
# binary.sh.
DATAVER=ZhEnFr_Unsup_infillNoise_r0.35_maxlen256
# Languages whose per-language files are concatenated, in this order.
langs=(en zh fr)

# Language code -> language tag (the commented-out sed commands prefix
# each line with "[<tag>]").
declare -A langtoken_dict
langtoken_dict=([en]="en_XX" [fr]="fr_XX" [zh]="zh_CN")

# Working directory under the user's home.
# NOTE(review): TMPDIR is also the conventional environment variable for
# temporary-file locations; if it is already exported, child processes
# will see this value too — confirm that is intended.
TMPDIR=~/tmp_afterTokenize_${DATAVER}
# -p: do not fail when the directory is left over from a previous run.
mkdir -p -- "$TMPDIR"

INDIR=${ccPrefix}
OUTDIR=$TMPDIR/MSPM
mkdir -p -- "$OUTDIR"

# For each split: concatenate the per-language source/target files found in
# $OUTDIR into <split>.noshuffle.spm.{doc,sum}, then shuffle both files with
# the same seeded random source.
for split in dev train
do
    merged_doc="${OUTDIR}/${split}.noshuffle.spm.doc"
    merged_sum="${OUTDIR}/${split}.noshuffle.spm.sum"

    # Remove results of a previous run, because the loop below appends.
    # -f: a first run, where the files do not exist yet, is not an error.
    rm -f -- "$merged_doc" "$merged_sum"

    for lg in "${langs[@]}"
    do
        # Only used by the commented-out commands below.
        LG_TAG="${langtoken_dict[$lg]}"
        # The preprocessing steps below are disabled; the per-language
        # .spm.src/.spm.tgt files they would write are expected to already
        # exist in ${OUTDIR}.
        # echo "[${split} $lg ${LG_TAG}] Tokenizing unsupervised data ..."
        # python3 ../addNoise.py -m removeLongInstance -i ${INDIR}/${lg}.${split}.spm -os ${TMPDIR}/${lg}.${split}.spm.short --max-length 256 -l ${lg} -t
        # python3 ../addNoise.py -m noiseV2 -i ${TMPDIR}/${lg}.${split}.spm.short -os ${TMPDIR}/${lg}.${split}.spm.src -ot ${TMPDIR}/${lg}.${split}.spm.tgt -l ${lg} -t -r 0.35
        # cat ${TMPDIR}/${lg}.${split}.spm.src | sed -e "s/<q>/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] /" \
        #     > ${OUTDIR}/${lg}.${split}.spm.src
        # cat ${TMPDIR}/${lg}.${split}.spm.tgt | sed -e "s/<q>/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] /" \
        #     > ${OUTDIR}/${lg}.${split}.spm.tgt

        src_part="${OUTDIR}/${lg}.${split}.spm.src"
        tgt_part="${OUTDIR}/${lg}.${split}.spm.tgt"

        # Fail fast on a missing input: if only one side were appended, the
        # doc and sum files would have different line counts and the paired
        # shuffle below would silently misalign them.
        if [[ ! -r "$src_part" || ! -r "$tgt_part" ]]; then
            printf 'error: missing input file(s): %s / %s\n' \
                "$src_part" "$tgt_part" >&2
            exit 1
        fi

        cat -- "$src_part" >> "$merged_doc"
        cat -- "$tgt_part" >> "$merged_sum"
    done

    # Both shuffles read an identical random byte stream (same seed), so
    # files with the same number of lines are permuted the same way and
    # line i of .doc still corresponds to line i of .sum.
    shuf --random-source=<(get_seeded_random 66) "$merged_doc" > "${OUTDIR}/${split}.spm.doc"
    shuf --random-source=<(get_seeded_random 66) "$merged_sum" > "${OUTDIR}/${split}.spm.sum"
done

# Hand the prepared working directory, dataset version label and the "MSPM"
# sub-directory name over to binary.sh.
# NOTE(review): binary.sh is given as a bare relative name, so it is looked
# up from the current working directory rather than this script's own
# directory — confirm callers always run from the right place.
echo "Binary data ..."
# Quoted so a home directory containing whitespace is passed as one argument.
bash binary.sh "$TMPDIR" "$DATAVER" MSPM