#!/usr/bin/python

import sys
import codecs

# Usage: python [this script] [original CTB5 file] [reannotation dic file] > [reannotated CTB5 file]
def main():
    """Reannotate a CTB5 file and print the result to stdout.

    Command-line arguments:
        sys.argv[1]: path of the original CTB5 file (read as UTF-8).
        sys.argv[2]: path of the reannotation dic file (read as UTF-8).

    The standard streams are wrapped with UTF-8 codecs so that unicode text
    can be read from / written to them. The reannotation entries are loaded
    with loadMap() and then merged into the CTB5 sentences by loadCTB().
    Exits with status 1 and a usage message when an argument is missing.
    """
    sys.stdin = codecs.getreader('utf-8')(sys.stdin)
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.stderr.write(
            'Usage: python %s [original CTB5 file] [reannotation dic file]'
            ' > [reannotated CTB5 file]\n' % sys.argv[0])
        sys.exit(1)
    targetName = sys.argv[1]
    goldName = sys.argv[2]
    reannotatedList = []
    # Both files are opened before any processing (so a missing file is
    # reported before any output is produced) and are closed
    # deterministically, even if loading fails part-way through.
    with codecs.open(targetName, 'rU', 'utf-8') as targetFileHandle:
        with codecs.open(goldName, 'rU', 'utf-8') as goldFileHandle:
            loadMap(reannotatedList, goldFileHandle)
        loadCTB(reannotatedList, targetFileHandle)

def loadMap(reannotatedList, goldFileHandle):
    """Append every non-comment line of the reannotation file to a list.

    Lines whose very first character is '#' are skipped. All other lines
    (including blank ones) are stripped of surrounding whitespace and appended
    to ``reannotatedList`` in file order. The list is modified in place;
    nothing is returned.
    """
    reannotatedList.extend(
        rawLine.strip()
        for rawLine in goldFileHandle
        if not rawLine.startswith('#')
    )

def loadCTB(reannotatedList, targetFileHandle):
    """Merge each CTB sentence with its reannotation and write it to stdout.

    Lines of ``targetFileHandle`` that start with '#' are copied to stdout
    unchanged. Every other line is stripped and split on single spaces into
    ``word_pos`` tokens, then paired, in order, with the next entry of
    ``reannotatedList`` (also split on single spaces). Each token is written
    as ``word_pos_entry``, tokens separated by one space, one sentence per
    line.

    If a sentence and its reannotation entry have a different number of
    tokens, the sentence is reported on stderr and omitted from stdout; its
    reannotation entry is still consumed so later sentences stay aligned.

    Args:
        reannotatedList: list of strings, one per non-comment sentence.
        targetFileHandle: iterable of text lines of the original file.

    Raises:
        AssertionError: if there are more sentences than reannotation
            entries, or if a token does not contain exactly one '_'.
            Raised explicitly (not via ``assert``) so the checks are still
            performed when Python runs with -O.
    """
    sentenceIndex = 0
    for line in targetFileHandle:
        if line.startswith('#'):
            sys.stdout.write(line)
            continue

        if sentenceIndex >= len(reannotatedList):
            raise AssertionError(
                'No reannotation entry left for sentence #%d'
                % (sentenceIndex + 1))

        sentence = line.strip()
        pairs = sentence.split(' ')
        dicPairs = reannotatedList[sentenceIndex].split(' ')
        # The entry is consumed whether or not the sentence can be merged.
        sentenceIndex += 1

        if len(dicPairs) != len(pairs):
            sys.stderr.write('Sentence length does not match its reannotated version, skipped:\n')
            sys.stderr.write(sentence + '\n')
            continue

        # Validate the whole sentence before writing anything, so a malformed
        # token never leaves a half-written line on stdout.
        merged = []
        for pair, dicPair in zip(pairs, dicPairs):
            if len(pair.split('_')) != 2:
                raise AssertionError(
                    'Token %r is not of the form word_pos' % (pair,))
            merged.append(pair + '_' + dicPair)
        sys.stdout.write(' '.join(merged) + '\n')

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
