import requests
import numpy as np
from urllib import parse
from argparse import ArgumentParser

from processing.base.utils import *


class Table:
    """Record describing one table to crawl.

    Holds the dataset-wide table id, the URL of the page the table is
    referenced from, and the table's number inside that page. The crawled
    HTML is attached afterwards through ``set_html``.
    """

    def __init__(self, table_id, url, table_id_in_page):
        self.table_id, self.url, self.table_id_in_page = table_id, url, table_id_in_page

        # Not known at construction time; stays None until set_html() is called.
        self.html = None

    def set_html(self, html):
        """Attach the table's HTML to this record."""
        self.html = html

    def __repr__(self):
        return f"table_id: {self.table_id}, url: {self.url}, table_id_in_page: {self.table_id_in_page}"


def get_table_html(table, soup, new_page=False):
    """Find the ``<table>`` element whose caption names ``table``.

    A table matches when its ``<caption>`` text contains ``Table <n>``, where
    ``<n>`` is ``table.table_id_in_page`` and is not followed by another digit
    (so ``Table 1`` does not match ``Table 12``).

    Args:
        table: object exposing ``table_id_in_page``.
        soup: parsed page exposing ``find_all`` / ``find``.
        new_page: when True and no caption matches, fall back to the first
            ``<table>`` of the page.

    Returns:
        The matching table element, the fallback element, or None.
    """
    # Compile once instead of once per <table>; the raw f-string keeps
    # \s and \d as regex escapes rather than invalid string escapes.
    pattern = re.compile(rf'Table\s*{table.table_id_in_page}(?!\d)')
    for table_html in soup.find_all('table'):
        caption = table_html.find('caption')
        # Tables without a caption cannot be identified by number; skip them
        # instead of failing on `None.text`.
        if caption is None:
            continue
        if pattern.search(caption.text):
            return table_html
    if new_page:
        first_table = soup.find('table')
        if first_table:
            return first_table
    return None


def get_table_href(table, soup):
    """Find the link to the page that hosts ``table``.

    Scans every ``<a>`` element for link text containing ``Table <n>``, where
    ``<n>`` is ``table.table_id_in_page`` and is not followed by another
    digit, and resolves the link target against ``table.url``.

    Args:
        table: object exposing ``table_id_in_page`` and ``url``.
        soup: parsed page exposing ``find_all``.

    Returns:
        The absolute URL of the first matching link that has an ``href``,
        or None when there is no such link.
    """
    # Compile once instead of once per anchor; the raw f-string keeps
    # \s and \d as regex escapes rather than invalid string escapes.
    pattern = re.compile(rf'Table\s*{table.table_id_in_page}(?!\d)')
    for anchor in soup.find_all('a'):
        if not pattern.search(anchor.text):
            continue
        # Named anchors (<a name=...>) carry no href; indexing them would
        # raise KeyError and abort the search, so keep looking instead.
        href = anchor.get('href')
        if href is not None:
            return parse.urljoin(table.url, href)
    return None


# Length (in characters) of the response body that the crawler treats as a
# bogus page that must be fetched again.
# TODO: replace this length check with a content-based detection.
_PLACEHOLDER_PAGE_LENGTH = 5211


def request_url(url, timeout=30, max_attempts=10, retry_delay=1.0):
    """Download ``url`` and parse it, retrying while a placeholder page is returned.

    Args:
        url: address of the page to fetch.
        timeout: seconds passed to ``requests.get`` as its timeout. The
            previous hard-coded 10000 was effectively "never time out".
        max_attempts: maximum number of downloads before giving up; values
            below 1 are treated as 1.
        retry_delay: seconds to wait between two attempts, so the server is
            not hammered in a tight loop.

    Returns:
        A ``(response, soup)`` tuple: the ``requests`` response and the page
        parsed with ``BeautifulSoup(..., 'html.parser')``.

    Raises:
        RuntimeError: every attempt returned the placeholder page.
        requests.RequestException: propagated from ``requests.get``.
    """
    import time  # local import keeps this block self-contained

    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        response = requests.get(url, timeout=timeout)
        page_html = response.text
        if len(page_html) != _PLACEHOLDER_PAGE_LENGTH:
            return response, BeautifulSoup(page_html, 'html.parser')
        if attempt < attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        "Only placeholder pages received from {} after {} attempts.".format(url, attempts)
    )


def extract_and_save_table_html(table, soup, new_page=False):
    """Locate ``table`` in ``soup`` and, when found, store and persist its HTML.

    The element returned by ``get_table_html`` is attached to ``table`` via
    ``set_html`` and written as ``<table_id>.html`` under
    ``args.root_dir / args.data_dir / args.html_dir``. Nothing happens when
    the table is not found, so ``table.html`` keeps its previous value.

    Args:
        table: the ``Table`` record to fill in.
        soup: parsed page to search.
        new_page: forwarded to ``get_table_html``.
    """
    table_html = get_table_html(table, soup, new_page)
    if table_html is None:
        return

    table.set_html(table_html)
    html_dir = os.path.join(args.root_dir, args.data_dir, args.html_dir)
    # On a fresh checkout the output directory may not exist yet; without
    # this every table would fail to save.
    os.makedirs(html_dir, exist_ok=True)
    with open(os.path.join(html_dir, f"{table.table_id}.html"), "w", encoding='utf-8') as f:
        f.write(str(table_html))


def crawl_tables(table_dict: Dict[int, Table]):
    """Download the HTML of every table in ``table_dict`` that is not saved yet.

    Tables are visited by index ``0 .. len(table_dict) - 1``. For each one:

    1. Skip it when ``<table_id>.html`` already exists in the html directory.
    2. Fetch ``table.url`` and try to extract the table from that page.
    3. If it is not there, follow the page's link to the table and extract it
       from the linked page, falling back to that page's first table.

    Failures are reported on stdout and do not stop the crawl.

    Args:
        table_dict: tables to crawl, indexed by consecutive integers from 0.
    """
    for i in range(len(table_dict)):
        # Resolve the table outside the main try block. Otherwise a failed
        # lookup would reach the handler below with `table` unbound (or still
        # bound to the previous table) and report a wrong URL.
        try:
            table = table_dict[i]
        except (KeyError, IndexError):
            print("Table {} is missing from table_dict, skipped.".format(i))
            continue

        try:
            html_path = os.path.join(args.root_dir, args.data_dir, args.html_dir, f"{table.table_id}.html")
            if os.path.exists(html_path):
                continue
            print("Crawling table {} (Table {} in page) at {}".format(i, table.table_id_in_page, table.url))
            _, soup = request_url(table.url)
            extract_and_save_table_html(table, soup)
            if table.html is not None:
                continue
            # NOTE(review): the pause is only reached when a second request
            # follows; confirm that skipping it on the success path is intended.
            time.sleep(0.3)

            table_href = get_table_href(table, soup)
            if table_href is None:
                raise ValueError("Table href not found in the page.")
            # Announce the redirect only once there is a target to show.
            print("Redirect to {}".format(table_href))
            _, soup = request_url(table_href)
            extract_and_save_table_html(table, soup, True)
            if table.html is None:
                raise ValueError("Table not found in the page.")
            time.sleep(0.3)
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next table.
            print(e)
            print(table.url)


def html2spreadsheet(table_dict: Dict[int, Table], record_dict: Dict[int, List]) -> None:
    """Convert every table's HTML into an .xlsx labeling workbook.

    For each table in ``table_dict`` a new workbook is created whose active
    sheet is titled 'labeling' and contains, top to bottom:

    * row 1: the caption text, merged across up to 10 columns;
    * from row 3: the table cells, styled with the values returned by
      ``get_cell_style`` and merged according to ``colspan``/``rowspan``;
    * two rows below the table: one block of annotation prompts per record
      in ``record_dict[table.table_id]``.

    The sheet is then copied to a second sheet titled 'original' and the
    workbook is saved as ``<table_id>.xlsx`` under
    ``args.root_dir / args.data_dir / 'spreadsheet/'``.

    Any exception raised while handling one table is printed and the loop
    continues with the next table.

    Args:
        table_dict: tables to convert; each ``table.html`` is searched with
            ``find`` / ``find_all``.
        record_dict: maps a ``table_id`` to an iterable of records indexed
            by 'sentence_id' and 'description'.
    """
    for i, table in table_dict.items():
        try:
            wb = Workbook()
            ws = wb.active
            ws.title = 'labeling'
            # NOTE(review): presumably the row/column extent of the table body; confirm in find_ranges.
            max_rows, max_cols = find_ranges(table)
            print("Processing table {}, max_rows: {}; max_cols: {}".format(table.table_id, max_rows, max_cols))

            # Write the caption (title and summary) into A1, merged over at most 10 columns.
            # A table without a <caption> raises here and is reported by the handler below.
            caption_text = prettify_caption(table.html.find('caption').text.strip().split('\n'))
            ws.cell(1, 1).value = caption_text
            ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=min(max_cols, 10))

            # Write the table content.
            # table_mask is indexed with worksheet coordinates; 1 marks cells already
            # covered by a written cell or by its row/column span (see the end of the loop).
            table_mask = np.zeros((max_rows + 3, max_cols + 1))  # table rows start at 3 (the first 2 are for the caption)
            row, col = 3, 1
            for x in table.html.find_all('tr'):
                for y in x.find_all(['th', 'td']):
                    if row > max_rows + 2:  # past the table body (footer rows); the footer is currently not written
                        continue
                    font, fill, border, alignment, number_format = get_cell_style(y)
                    colspan = get_int(y, 'colspan')
                    rowspan = get_int(y, 'rowspan')
                    # NOTE(review): presumably moves to the next cell not yet marked in table_mask; confirm in find_position.
                    row, col = find_position(table_mask, row, col, max_rows, max_cols)

                    # Remove the footer (footnote) part of the cell.
                    # NOTE(review): prev_text is never read afterwards in this function.
                    prev_text = y.text.strip()
                    text = clear_footer(y)
                    ws.cell(row, col).value = text
                    ws.cell(row, col).font = font
                    ws.cell(row, col).fill = fill
                    ws.cell(row, col).border = border
                    ws.cell(row, col).alignment = alignment
                    ws.cell(row, col).number_format = number_format
                    # Merge the worksheet range covered by the cell's column and/or row span.
                    if colspan > 1 and rowspan > 1:
                        ws.merge_cells(start_row=row, start_column=col, end_row=row+rowspan-1, end_column=col+colspan-1)
                    elif colspan > 1:
                        ws.merge_cells(start_row=row, start_column=col, end_row=row, end_column=col+colspan-1)
                    elif rowspan > 1:
                        ws.merge_cells(start_row=row, start_column=col, end_row=row+rowspan-1, end_column=col)
                    # Mark the whole spanned area as occupied.
                    table_mask[row: row + rowspan, col: col + colspan] = 1
                # Next <tr>: back to the first column, one row down.
                col = 1
                row += 1
            auto_fit_column_width(ws)

            # Write the annotation prompts, leaving a gap below the table.
            row += 2
            for record in record_dict[table.table_id]:
                ws.cell(row, 1).value = 'table descriptive sentence id:'
                ws.cell(row, 1).font = Font(bold=True)
                ws.cell(row, 2).value = record['sentence_id']

                ws.cell(row+1, 1).value = 'table descriptive sentence:'
                ws.cell(row+1, 2).value = record['description']
                ws.cell(row+1, 1).font = Font(bold=True)

                # Row row+2 is left empty; the labels below are the fields to be filled in.
                ws.cell(row+3, 1).value = 'sub-sentence (complete & fix grammar):'
                ws.cell(row+4, 1).value = 'sub-sentence after deletion & decontextualization:'
                ws.cell(row+5, 1).value = 'key part to be questioned:'
                ws.cell(row+6, 1).value = 'schema linking phrases:'
                ws.cell(row+7, 1).value = 'schema linking positions:'
                ws.cell(row+8, 1).value = 'question rewrite:'
                ws.cell(row+9, 1).value = 'answer (formula):'
                ws.cell(row+10, 1).value = 'aggregation type:'

                row += 12  # next prompt block
            # Save the file, keeping a copy of the sheet under the title 'original'.
            file_name = '{}.xlsx'.format(table.table_id)
            save_path = os.path.join(args.root_dir, args.data_dir, 'spreadsheet/', file_name)
            source = wb.active
            target = wb.copy_worksheet(source)
            target.title = 'original'
            wb.save(save_path)

        except Exception as e:
            # Best effort: report the failure and continue with the next table.
            print(f"Error when dumping into .xlsx: {e}")



def main():
    """Run the four pipeline stages in order.

    Reads the annotation file, crawls the HTML of the annotated tables,
    reloads the saved HTML from disk, and converts it to spreadsheets.
    All locations come from the module-level ``args``.
    """
    # Stage 1: annotation file -> tables to crawl + records per table.
    print("---------------------Reading annotation----------------")
    annotation_path = os.path.join(args.root_dir, args.data_dir, args.anno_file_name)
    table_dict_raw, record_dict = read_annotated(annotation_path, args.num_descriptions)
    print("Done.")

    # Stage 2: download the HTML of each table.
    print("---------------------Crawling----------------")
    crawl_tables(table_dict_raw)
    print('Done')

    # Stage 3: reload from disk; these Table objects carry only their HTML.
    print("---------------------Loading-----------------")
    html_subdir = os.path.join(args.data_dir, args.html_dir)
    table_dict = load_tables(args.root_dir, html_subdir)
    print("Done.")

    # Stage 4: HTML -> .xlsx workbooks.
    print("---------------------Converting-----------------")
    html2spreadsheet(table_dict, record_dict)
    print("Done.")


if __name__ == '__main__':
    # Command-line options as (flag, add_argument keyword arguments),
    # registered in this order.
    _cli_options = (
        ('--root_dir', dict(type=str, default='/data/home/hdd3000/USER/HMT/')),
        ('--data_dir', dict(type=str, default='qa/data/')),
        ('--html_dir', dict(type=str, default='html/')),
        ('--anno_file_name', dict(type=str, required=True)),
        ('--num_descriptions', dict(type=int, required=True)),
    )

    parser = ArgumentParser()
    for _flag, _kwargs in _cli_options:
        parser.add_argument(_flag, **_kwargs)

    # `args` is read as a module-level global by the functions in this file.
    args = parser.parse_args()

    main()
