This took much longer than expected. The problem is that this script, good-natured but slow, takes far longer to do its work than you would think, haha. Someday it would be good to optimize it, as a way of studying Python.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import math
import os
import pickle
import sys
import re
import operator
def tokenize(input_text):
'''Returns a list of words in the input text.'''
#input_text = input_text.decode('latin1')
# English clitics.
input_text = input_text.replace("'ll", " will")
input_text = input_text.replace("'re", " are")
input_text = input_text.replace("'ve", " have")
input_text = input_text.replace("won't", "will not")
input_text = input_text.replace("n't", " not")
input_text = input_text.replace("'m", " am")
    # Lowercase the text.
input_text = input_text.lower()
# Regex to search for words.
    # IMPORTANT: The search regex and the string to search in have to be
    # unicode, otherwise it will not work properly.
    re_words = re.compile(u"[abcdefghijklmnopqrsšzžtuvwõäöüxy'-]+", re.IGNORECASE | re.MULTILINE | re.DOTALL)
# Now the words can be extracted.
text_word_list = re_words.findall(input_text)
return text_word_list
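# A quick, hypothetical sanity check (input made up, not from the training data):
#   tokenize("he won't buy, she'll sell")
#   -> ['he', 'will', 'not', 'buy', 'she', 'will', 'sell']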
# Calculates the length of a vector.
def calculate_vector_length(vector_1):
vector_length = 0
# Your code goes here
for x in vector_1:
vector_length = vector_length + math.pow(x,2)
vector_length = math.sqrt(vector_length)
return vector_length
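# For example, calculate_vector_length([3, 4]) returns 5.0 (the 3-4-5 triangle),
# i.e. sqrt(3^2 + 4^2).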
# Calculates the Euclidean distance between vector_1 and vector_2.
def calculate_euclidean_distance(vector_1, vector_2):
distance = 0
# Your code goes here
if len(vector_1) != len(vector_2):
return -1
for i in range(len(vector_1)):
distance = distance + math.pow(vector_1[i] - vector_2[i],2)
distance = math.sqrt(distance)
    return distance
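# For example, calculate_euclidean_distance([0, 0], [3, 4]) returns 5.0, and
# vectors of unequal length yield the error marker -1.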
def create_index(docs_dict, document_vector):
"""Create postings list for faster access of Document Frequency."""
"""
index
key: token
value: array with file names
Example
{'shop': ['00042_ham.txt'], 'confirmation': ['00047_spam.txt'], 'very': ['00037_spam.txt', '00049_spam.txt']}
"""
index = {}
# Your code goes here
for doc in docs_dict:
for term in document_vector:
if term in docs_dict[doc][1]:
                if term in index:
                    index[term].append(doc)
                else:
                    index[term] = [doc]
return index
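# With this postings list the document frequency of a term is simply
# len(index[term]), instead of rescanning every training document.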
def get_stop_word_list():
"""Returns the list of stop words."""
in_f = open("stopwords.txt")
    stop_words = in_f.read().strip().split("\n")
in_f.close()
return stop_words
def create_document_vectors(training_folder):
"""
Computes the document vectors for documents from the training data
and return them and the general document vector.
"""
"""
docs_dict
Key: Training file name
Value: Array of length 2 (Lets call this array 'A')
Array 'A'
Length: 3
A[0]: 'ham' or 'spam'
A[1]: Another array&nbs\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0p;with tokens from the file (no stopwords in this array)
A[2]: Array with tf-idf values
Example of docs_dict
{'00043_ham.txt': ['ham', ['thank', 'posting', 'same', 'issue', 'listmaster', 'lists', 'debian', 'org'], [0.0, 0.0, 0.0, 0.0, 0.0, 0.2011798118406769, 0.0, 0.2011798118406769, 0.0, 0.0, 0.0, 0.0, 0.2011798118406769]}
document_vector
Array of all distinct tokens in the training set (no stop words)
"""
docs_dict = {}
document_vector = []
stop_words = get_stop_word_list()
# Your code goes here
# Read in and tokenize each document of the training data. Remember to remove stop words.
term_frequencies={}
docu_frequencies={}
for doc in os.listdir(training_folder):
A=[]
in_f = open(training_folder+"/"+ doc)
        doc_class = in_f.readline().strip()  # class of the training document ('spam' or 'ham')
        if doc_class != "spam" and doc_class != "ham":
            continue  # skip files without a class label instead of aborting the whole loop
        A.append(doc_class)
        doc_tokens = tokenize(in_f.read().strip())  # array of tokens of the training document
#W = [x for x in doc_tokens if x not in stop_words]
W = []
term_frequencies[doc]={}
for x in doc_tokens:
if x not in stop_words:
W.append(x)
if x in term_frequencies[doc]:
term_frequencies[doc][x] = term_frequencies[doc][x] + 1
else:
term_frequencies[doc][x] = 1
if x in docu_frequencies:
docu_frequencies[x].append(doc)
else:
                    docu_frequencies[x] = [doc]
document_vector.extend(W)
A.append(W)
docs_dict[doc]=A
# Create document_vector
document_vector = list(set(document_vector)) # deduplication
# Create tf-idf weighted document vector. You will need to use document frequency values from create_index.
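    # Weighting scheme used below (standard log-scaled tf-idf):
    #   tf(t, d) = 1 + log10(count of t in d)    (0 if t does not occur in d)
    #   idf(t)   = log10(N / df(t)), with N = number of training documents
    #   weight   = tf(t, d) * idf(t)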
term_index = create_index(docs_dict, document_vector)
for doc in docs_dict:
doc_vect = []
for term in document_vector:
# Your code goes here
            # Term frequency: log scale.
            if term not in term_frequencies[doc]:
                doc_vect.append(0)
                continue
            term_frequency = math.log10(term_frequencies[doc][term]) + 1
            if term not in docu_frequencies:
                print "document frequency error"
                doc_vect.append(0)
                continue
            document_frequency = len(set(docu_frequencies[term]))
            # Idf. float() avoids Python 2 integer division, which would
            # floor the ratio before taking the log.
            idf = math.log10(float(len(docs_dict)) / document_frequency)
            tf_idf = term_frequency * idf
            doc_vect.append(tf_idf)
        docs_dict[doc].append(doc_vect)
# Normalize vector
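    # Dividing each component by the vector length yields unit vectors, so the
    # Euclidean distances computed later rank neighbours like cosine similarity.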
for doc in docs_dict:
m=0
for s in docs_dict[doc][2]:
m = m + s*s
m = math.sqrt(m)
if m == 0:
continue
docs_dict[doc][2][:] = [x / m for x in docs_dict[doc][2]]
return docs_dict, document_vector, term_index
def create_test_document_vector(docs_dict, test_document, document_vector, index):
# Process test document.
in_f = open(test_document)
doc_class = in_f.readline().strip() #class of the test document ('spam' or 'ham')
if doc_class != "spam" and doc_class != "ham" :
print "Test file don't have a class type"
return [], []
doc_tokens = tokenize(in_f.read().strip()) #array of tokens of the test document
stop_words = get_stop_word_list()
# Remove stop words from doc_tokens
tmp_doc_tokens = []
for doc_token in doc_tokens:
if doc_token not in stop_words:
tmp_doc_tokens.append(doc_token)
    doc_tokens = tmp_doc_tokens
# Your code goes here
doc_data = [doc_class, doc_tokens] #Add token array to doc_class array
# Create tf-idf weighted document vector using document_vector
doc_vect = []
tf_idf = 0
for term in document_vector:
# Your code goes here
# Term Frequency : Log scale
term_frequency = doc_tokens.count(term)
if term_frequency > 0:
term_frequency = math.log10(term_frequency) + 1
        else:
            doc_vect.append(0)
            continue
        # Document frequency from the postings index built by create_index.
        document_frequency = len(index.get(term, []))
        if document_frequency == 0:
            print "document frequency error"
            doc_vect.append(0)
            continue
        idf = math.log10(float(len(docs_dict)) / document_frequency)
        tf_idf = term_frequency * idf
        doc_vect.append(tf_idf)
# Normalize the vector.
# Your code goes here
m=0
for s in doc_vect:
m = m + s*s
m = math.sqrt(m)
if m > 0:
doc_vect[:] = [x / m for x in doc_vect]
"""
doc_data should now look like this:
Type: Array
Size: 2
doc_data[0]: 'spam' or 'ham'
doc_data[1]: array of tokens (with stop words removed)
doc_vect
Type: Array with tf-idf values
Example
[0.11856790094712363, 0.0, 0.0, 0.0, 0.0, 0.39387404130807196, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.27530614036094825]
"""
return doc_data, doc_vect
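# Note that the test vector uses the same document_vector ordering as the
# training vectors, which is what makes the component-wise distance meaningful.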
# Main program.
if __name__ == "__main__":
if len(sys.argv) < 4:
print "Usage: python knn.py train test_file k"
else:
training_folder = sys.argv[1]
test_document = sys.argv[2]
k = int(sys.argv[3])
docs_dict, document_vector, term_index = create_document_vectors(training_folder)
        test_doc_data, test_doc_vector = create_test_document_vector(docs_dict, test_document, document_vector, term_index)
# Calculate the Euclidean distance of the test document to training documents.
distances = {}
doc_cnt = 0
print "Calculating Euclidean distance between the new document and the training documents ..."
# Your code goes here
for doc in docs_dict:
distances[doc] = calculate_euclidean_distance(test_doc_vector, docs_dict[doc][2])
# Sort nearest docs and their distances to the new document.
# Your code goes here
distances = sorted(distances.items(), key=operator.itemgetter(1))
# Calculate the majority class of the nearest neighbours.
# Your code goes here
idx = 0
ham = 0
spam = 0
for doc in distances:
if idx >= k:
break
if docs_dict[doc[0]][0] == "ham":
ham = ham + 1
if docs_dict[doc[0]][0] == "spam":
spam = spam + 1
idx = idx + 1
if spam > ham:
final_label = "spam"
elif spam < ham:
final_label = "ham"
else:
total_ham = 0
total_spam = 0
for doc in docs_dict:
if docs_dict[doc][0] == "spam":
                    total_spam = total_spam + 1
                if docs_dict[doc][0] == "ham":
                    total_ham = total_ham + 1
if total_spam > total_ham:
final_label = "spam"
elif total_spam < total_ham:
final_label = "ham"
else:
final_label = "spam"
# Show the results.
print "Assigned class: %s" % final_label
if test_doc_data[0] == final_label:
print "Correct class assigned to new document! :-)"
else:
print "Wrong class assigned to new document! :-("
print "Nearest neighbours:"
# Your code goes here
idx = 0
for doc in distances:
if idx >= k:
break
print "DocID: " + training_folder+"/"+ doc[0] + " / Distance: " + str(doc[1]) + " / Class:" + docs_dict[doc[0]][0]
idx = idx + 1
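For reference, the script is invoked exactly as the usage string suggests; a hypothetical run (the file names are made up for illustration) would be

python knn.py train 00043_ham.txt 5

which prints the assigned class, whether it matches the label on the first line of the test file, and the k nearest training documents with their distances.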