#!/usr/bin/python

import os
import re
import subprocess
import sys

## The output of this script is a text file of the source language which 
## is derived from the input text file. For each word in the input file,
## if that word is found in the translation model (represented here by the 
## unigrams file output from reduce model.py), then that word is output 
## as is; if the word is not in the model, then the word is fed to omorfi 
## for analysis, and each of the morphemes found in the token by omorfi 
## is written to output. The model vocabulary is held in a hash-based 
## structure rather than a list, so each word lookup takes constant time 
## instead of a linear search.

# Path of the morphological analyser executable, relative to the working
# directory the script is started from.
OMORFI_COMMAND = "./omorfi"


def load_unigrams(path):
    """Return the set of lines (split on newlines) found in the file *path*.

    Each line of the unigrams file is treated as one known word.
    """
    with open(path) as handle:
        return set(handle.read().split('\n'))


def run_omorfi(word):
    """Run omorfi on *word* and return its whitespace-separated output tokens.

    The word is passed as a single argv entry without going through a
    shell, so characters such as ``;``, ``*``, ``$`` or quotes in the input
    text cannot be interpreted, expanded or injected as shell syntax.

    If the analyser cannot be started, a warning is written to stderr and
    an empty list is returned, so the word contributes nothing to the
    output line.
    """
    try:
        process = subprocess.Popen(
            [OMORFI_COMMAND, word],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError as err:
        sys.stderr.write(
            "Could not run %s for %r: %s\n" % (OMORFI_COMMAND, word, err)
        )
        return []
    stdout_text, _ = process.communicate()
    return stdout_text.strip().split()


def memoized(analyze):
    """Wrap *analyze* so that each distinct word is only analysed once.

    NOTE: this assumes the analyser returns the same result for the same
    word every time it is called.
    """
    results = {}

    def lookup(word):
        if word not in results:
            results[word] = analyze(word)
        return results[word]

    return lookup


def segment_line(line, unigrams, analyze):
    """Return *line* with every out-of-model word replaced by its morphemes.

    line     -- one line of input text
    unigrams -- container of known words; these are copied through as is
    analyze  -- callable mapping an unknown word to a list of tokens

    Tokens in the result are separated by single spaces, with no leading
    or trailing whitespace.
    """
    pieces = []
    for word in line.split():
        if word in unigrams:
            pieces.append(word)
        else:
            pieces.extend(analyze(word))
    return " ".join(pieces)


def main(argv=None):
    """Segment the input file and write the result to the output file.

    argv -- argument list laid out like ``sys.argv`` (program name, model
            file, input file, output file); defaults to ``sys.argv``.
            Arguments after the third are ignored.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        print("Specify model, input, and output files")
        sys.exit()
    unigramfile, inputfilename, outputfilename = argv[1:4]

    # Read both inputs before opening the output, so a missing input file
    # does not leave behind an empty or truncated output file.
    with open(inputfilename) as handle:
        content = handle.read()
    unigrams = load_unigrams(unigramfile)

    analyze = memoized(run_omorfi)
    with open(outputfilename, "w") as outputfile:
        for line in content.strip().split('\n'):
            outputfile.write(segment_line(line, unigrams, analyze) + '\n')


if __name__ == "__main__":
    main()
    sys.exit()
