#!/usr/bin/python

import sys, os, re

# Download the original ECB 1.0 archive and unpack it into ./ECB1.0.
# These commands run, in this order, every time the module is loaded.
for _setupCommand in ("wget http://faculty.washington.edu/bejan/data/ECB1.0.tar.gz",
                      "tar -xzf ECB1.0.tar.gz"):
  os.system(_setupCommand)

# Tab-separated columns of each mentions.txt line, by index:
#   0 N or V?   1 Topic   2 Doc   3 Sentence Number   4 CorefID
#   5 StartIdx  6 EndIdx  7 StartCharIdx  8 EndCharIdx

# Open handle on the mention list; main() reads its lines.
f = open("mentions.txt")

def main():
  """Read the mention list from the module-level file f and annotate each document.

  Every non-comment line is split on tabs. Columns 3, 5, 6, 7 and 8 are
  converted to int; column 4 (CorefID) is kept as a string. Mentions are
  grouped by sentence number (column 3), and annotateDoc() is called once per
  run of consecutive lines sharing the same topic (column 1) and doc
  (column 2). This relies on all mentions of a document being contiguous in
  the input.
  """
  topic = None
  doc = None
  mentions = {}
  for line in f:
    if line.startswith('#'):
      continue
    # Strip only the line terminator: slicing off the last character would
    # eat a digit of the final column when the last line has no newline.
    line = line.rstrip("\r\n")
    if not line.strip():
      # Blank lines carry no mention; skip them instead of failing on
      # missing columns.
      continue
    tokens = line.split("\t")
    if topic is not None and (topic, doc) != (tokens[1], tokens[2]):
      # A new document starts here: flush the one collected so far.
      annotateDoc(topic, doc, mentions)
      mentions = {}
    for i in (3, 5, 6, 7, 8):
      tokens[i] = int(tokens[i])
    topic = tokens[1]
    doc = tokens[2]
    mentions.setdefault(tokens[3], []).append(tokens)
  # last document (nothing to do when the input held no mentions at all)
  if topic is not None:
    annotateDoc(topic, doc, mentions)

# do annotation for one doc
def annotateDoc(topic, doc, mentions):
  """Write the re-annotated copy of one ECB document.

  Reads ECB1.0/data/<topic>/<doc>.ecb line by line, strips the existing
  markup from each line with getCleanSentence(), and, for lines whose
  0-based index is a key of mentions, inserts new tags with addAnnotation().
  The result is written to EECB1.0/data/<topic>/<doc>.eecb; the output
  directory is created if it does not exist yet.

  topic, doc -- strings used to build the input and output paths
  mentions   -- dict mapping a line index to the list of mention records
                for that line

  Raises IOError/OSError if the input cannot be read or the output cannot
  be created.
  """
  outDir = "EECB1.0/data/"+topic
  # Context managers make sure both files are closed (and the output
  # flushed) even if annotation of a line fails part-way through.
  with open("ECB1.0/data/"+topic+"/"+doc+".ecb") as ecb:
    # Create the directory without going through a shell, so topic names
    # are never interpreted as shell syntax.
    if not os.path.isdir(outDir):
      os.makedirs(outDir)
    with open(outDir+"/"+doc+".eecb", 'w') as eecb:
      for sentIdx, line in enumerate(ecb):
        # remove old annotation
        line2write = getCleanSentence(line)
        if sentIdx in mentions:
          line2write = addAnnotation(line2write, mentions[sentIdx])
        eecb.write(line2write)

def getCleanSentence(line):
  """Return line with every <...> tag removed; all other text is kept as is."""
  # A tag is a '<', any run of characters other than '>', then a '>'.
  return re.sub("<[^>]*>", '', line)

def addAnnotation(clean, mentions):
  """Return the sentence clean with ENTITY/EVENT tags inserted for mentions.

  clean    -- one sentence (tag-free text)
  mentions -- list of mention records, indexable sequences in which
              [0] is 'V' for an event (anything else is tagged as an entity),
              [4] is the coreference id as a string,
              [7] is the start offset and [8] the end offset (exclusive).

  Offsets count only the non-space characters of clean. An opening tag
  <ENTITY COREFID="id"> or <EVENT COREFID="id"> is placed directly before
  the character at the start offset, and the matching closing tag directly
  after the last character of the mention. When several mentions start or
  end at the same offset, the longest one is opened first and closed last,
  so the tags nest.
  """
  startsAt = {}
  endsAt = {}
  for mention in mentions:
    startsAt.setdefault(int(mention[7]), []).append(mention)
    endsAt.setdefault(int(mention[8]), []).append(mention)

  def closingTags(offset):
    # Mentions that started last are closed first to keep nesting valid.
    # Keys are compared as ints so string offsets such as "10" and "9"
    # sort numerically, matching the int() used for the lookup dicts.
    ending = sorted(endsAt.pop(offset, []), key=lambda m: int(m[7]), reverse=True)
    return ["</EVENT>" if m[0] == 'V' else "</ENTITY>" for m in ending]

  pieces = []
  charIdx = 0  # number of non-space characters emitted so far
  for char in clean:
    pieces.extend(closingTags(charIdx))
    # Opening tags wait for the next non-space character so they hug the
    # word instead of landing before the preceding blank.
    if char != ' ' and charIdx in startsAt:
      opening = sorted(startsAt.pop(charIdx), key=lambda m: int(m[8]), reverse=True)
      for m in opening:
        name = "EVENT" if m[0] == 'V' else "ENTITY"
        pieces.append("<" + name + " COREFID=\"" + m[4] + "\">")
    pieces.append(char)
    if char != ' ':
      charIdx = charIdx + 1
  # A mention ending on the very last character (e.g. a final line without
  # a newline) has no following character to trigger its closing tag.
  pieces.extend(closingTags(charIdx))
  return "".join(pieces)

# Call main() only when this file is executed as a script.
if __name__ == '__main__':
  main()

