# oov_checker.py
#!/usr/bin/python
import sys
# Token written in place of every out-of-vocabulary word.
# NOTE(review): the original literal was lost (the write call for the OOV
# branch is truncated in the source); "<unk>" is assumed from the ".unk"
# output suffix -- confirm against downstream consumers.
UNK_TOKEN = "<unk>"


def load_vocab(dic_path):
    """Return the set of whitespace-stripped lines read from *dic_path*.

    Each line of the dictionary file is treated as one vocabulary entry.
    The file is closed before returning.
    """
    with open(dic_path) as dic_file:
        return set(entry.strip() for entry in dic_file)


def mark_oov(line, vocab, unk_token=UNK_TOKEN):
    """Return *line* with every word missing from *vocab* replaced.

    The line is split on whitespace; each word found in *vocab* is kept and
    every other word becomes *unk_token*.  Every word (including the last) is
    followed by a single space and the result ends with a newline, so an
    empty input line yields just a newline.
    """
    pieces = [
        (word if word in vocab else unk_token) + " "
        for word in line.split()
    ]
    return "".join(pieces) + "\n"


def main(argv=None):
    """Rewrite a text file with out-of-vocabulary words masked.

    *argv* is ``[program, txt_path, dic_path]`` and defaults to ``sys.argv``.
    The masked text is written to ``txt_path + ".unk"``, one output line per
    input line.  Progress messages go to standard output.

    Returns 0 on success, or 1 when the argument count is wrong (after
    printing the usage message).
    """
    if argv is None:
        argv = sys.argv

    # Compare by value: ``is not`` on integers tests object identity.
    if len(argv) != 3:
        print("!!USAGE: ./oov_checker.py txt dic")
        return 1

    txt_path, dic_path = argv[1], argv[2]
    unk_path = txt_path + ".unk"

    # Single-argument print() with %-formatting emits the same bytes on
    # Python 2 and Python 3; the doubled spaces match the original output.
    print("TEXT:  %s" % txt_path)
    vocab = load_vocab(dic_path)
    print("Dictionary:  %s  Size:  %d" % (dic_path, len(vocab)))

    line_count = 0
    # Context managers guarantee the output is flushed and both files are
    # closed even if processing fails part-way through.
    with open(txt_path) as txt, open(unk_path, "w") as unk_file:
        for line in txt:
            unk_file.write(mark_oov(line, vocab))
            line_count += 1

    print("Total  %d  lines were processed and saved to  %s"
          % (line_count, unk_path))
    return 0


if __name__ == "__main__":
    sys.exit(main())