import argparse
import os
import subprocess

import openpyxl


def _resolve_video_id(row, underline_id_col, anthology_id_col):
    """Return the identifier used to name the downloaded file for *row*.

    The Anthology ID is preferred when ``anthology_id_col`` is given and the
    cell holds a usable value; otherwise the underline ID is returned.
    """
    # Compare against None explicitly: column index 0 is a valid column.
    if anthology_id_col is not None:
        anthology_id = row[anthology_id_col]

        # Only strings can carry a URL; substring-testing a numeric cell
        # (e.g. 0) would raise TypeError.
        if isinstance(anthology_id, str) and "https://" in anthology_id:
            # remove website stuff
            anthology_id = anthology_id.replace("https://aclanthology.org", "").replace("/", "")

        # Empty cells, 0 and "#N/A" all mean "no usable Anthology ID".
        if anthology_id and anthology_id != "#N/A":
            return anthology_id

    # fall back to underline ID iff anthology ID not available
    return row[underline_id_col]


def download_videos(
        filename,
        underline_id_col=0,
        anthology_id_col=None,
        mp4_col=3
    ):
    """Download the video linked in each row of a spreadsheet using ``wget``.

    The active worksheet of ``filename`` is read starting at its second row
    (the first row is presumably a header -- confirm against the input file).
    Each video is saved to the current working directory as
    ``<video_id>.mp4``.

    Args:
        filename: Path to the workbook to load with ``openpyxl``.
        underline_id_col: 0-based index of the column holding the underline
            ID, used as the fallback file name.
        anthology_id_col: 0-based index of the column holding the Anthology
            ID (or an ``https://aclanthology.org/...`` URL), or ``None`` to
            always name files by underline ID.
        mp4_col: 0-based index of the column holding the video URL.

    Rows without a video URL are skipped. A failed download is reported on
    stdout and does not abort the remaining rows.
    """
    wb = openpyxl.load_workbook(filename, data_only=True)
    ws = wb.active

    for row in ws.iter_rows(min_row=2, values_only=True):
        mp4 = row[mp4_col]

        # skip entries for which no video file is available
        if not mp4:
            print(f"Skipping {row[underline_id_col]} (no video available)")
            continue

        video_id = _resolve_video_id(row, underline_id_col, anthology_id_col)

        print(f"Downloading {video_id}...")
        # Pass an argument list instead of a shell string so that quotes,
        # spaces or shell metacharacters in cell values cannot break or
        # hijack the command.
        command = ["wget", "-O", f"{video_id}.mp4", str(mp4)]
        try:
            result = subprocess.run(command, check=False)
        except OSError as err:
            # e.g. wget is not installed; stay best-effort like before.
            print(f"Could not run wget for {video_id}: {err}")
            continue

        if result.returncode != 0:
            print(f"wget exited with status {result.returncode} for {video_id}")


if __name__ == "__main__":
    # Command-line entry point. All column options are 0-based indices into
    # the cells of a worksheet row.
    parser = argparse.ArgumentParser(
        description="Download the videos linked in a spreadsheet."
    )
    # Required: without a path there is nothing to load, and failing here
    # gives a clear usage error instead of an error from the workbook loader.
    parser.add_argument(
        "--filename",
        type=str,
        required=True,
        help="Path to the input file with video links",
    )
    parser.add_argument(
        "--underline_id_col",
        type=int,
        default=0,
        help="Index of the column with the underline ID (default: 0)",
    )
    parser.add_argument(
        "--anthology_id_col",
        type=int,
        default=None,
        help="Index of the column with the Anthology ID; omit to name files by underline ID",
    )
    # NOTE: this CLI default (12) differs from the default of 3 declared on
    # download_videos(); it is kept as-is so existing invocations still work.
    parser.add_argument(
        "--mp4_col",
        type=int,
        default=12,
        help="Index of the column with the video URL (default: 12)",
    )
    args = parser.parse_args()

    download_videos(
        filename=args.filename,
        underline_id_col=args.underline_id_col,
        anthology_id_col=args.anthology_id_col,
        mp4_col=args.mp4_col
    )