import argparse
import os
import ipdb
from tqdm import tqdm
import random


def _read_stripped_lines(path):
    """Return every line of the text file at *path* with surrounding whitespace removed."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]


def _write_lines(path, lines):
    """Write *lines* to *path*, one per line, each terminated by a newline.

    An empty *lines* produces an empty file rather than a single blank line,
    so readers never see a spurious empty record.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(line + '\n' for line in lines)


def main(argv=None):
    """Pair two line-aligned files, shuffle the pairs and split them in two.

    Line *i* of ``--fp1`` and line *i* of ``--fp2`` are stripped and joined as
    ``"<fp1 line> ||| <fp2 line>"``. The joined lines are shuffled with a
    seeded generator; the first ``--valid_size`` of them are written to
    ``--out2`` and the remainder to ``--out1``.

    Files are read and written as UTF-8.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        ValueError: If the two input files do not have the same number of
            lines. Neither output file is touched in that case.
    """
    parser = argparse.ArgumentParser('clean input file')
    parser.add_argument('--fp1', type=str, required=True, help='input file')
    parser.add_argument('--fp2', type=str, required=True, help='input file')
    parser.add_argument('--out1', type=str, required=True, help='output file')
    parser.add_argument('--out2', type=str, required=True, help='output file')
    parser.add_argument('--valid_size', type=int, default=1000,
                        help='number of shuffled lines written to --out2')
    parser.add_argument('--seed', type=int, default=0,
                        help='seed for the shuffle')
    args = parser.parse_args(argv)

    if args.valid_size < 0:
        # A negative size would silently slice from the end of the list.
        parser.error('--valid_size must be non-negative')

    # Read and validate both inputs before opening any output file: opening
    # with 'w' truncates, and a failed run must not destroy earlier results.
    first = _read_stripped_lines(args.fp1)
    second = _read_stripped_lines(args.fp2)
    if len(first) != len(second):
        # An explicit raise instead of ``assert`` so the check survives ``-O``.
        raise ValueError(
            'input files differ in length: %s has %d lines, %s has %d lines'
            % (args.fp1, len(first), args.fp2, len(second))
        )

    pairs = [ext + ' ||| ' + sent for ext, sent in zip(first, second)]

    # A private generator yields the same permutation as seeding the module
    # level one, without altering global random state.
    random.Random(args.seed).shuffle(pairs)

    valid = pairs[:args.valid_size]
    train = pairs[args.valid_size:]
    _write_lines(args.out1, train)
    _write_lines(args.out2, valid)


if __name__ == '__main__':
    main()