coming Soon... beta0.02

What's new?

IIR_Boolean_retrieval: AND, skip pointer - 컴퓨터

#사이트에 올리고 나면 항상 버그가 보이게 마련;;;;
#제대로 도는지 많이 확인 안해봐서 인덱스 처리랑 AND 연산만 실제 set 연산값과 비교해 봤다.
#파이썬 문법은.. 음; 신세계구나;

import os
import tokenizer
import collections
import math

# folder containing text documents should be named 'files'
# the 'files' folder should be in the same directory as this .py file.
def make_inverted_index(input_folder):
    '''
    Opens text files in a specified folder, normalizes them using the tokenizer
    and adds all the tokens found in the documents to the inverted index.
    '''

    files_data = {}
    inverted_index = {}
    # Go through the files subfolder and read in all text files in it.
    for fn in os.listdir(input_folder):

        # Read contents of text files.
        f = open("%s/%s" % (input_folder, fn), encoding='utf-8')
        files_data[fn] = f.read()
        f.close()

    # Dictionary to keep track which document corresponds to which id.
    doc_id_mapping = {}
    for doc in enumerate(files_data):
        doc_id_mapping[doc[0]] = doc[1]

    # The entire text for each document is now saved in 'files_data'.
    # Now tokenize each document using 'tokenizer.tokenize'
    # Don't forget to use .lower() function before tokenizing the text

    # Code to create inverted index goes here
        tokens = tokenizer.tokenize(files_data[doc[1]].lower())
    # postings list for each word must be automatically sorted because docId, that is doc[0], is grown from 0 orderly.
        for token in enumerate(set(tokens)):
            if token[1] in inverted_index:
                inverted_index[token[1]].add(doc[0])
            else:
                inverted_index[token[1]] = set([doc[0]])

    #Sorting the inverted index \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0;by words is explained as one of steps of constructing the invert_index according to page 6 of "02-voc-flat.pdf".
    #however I think this is not a necessary procedure for this assignment.
    inverted_index = collections.OrderedDict(sorted(inverted_index.items()))
#    print(inverted_index)
#    print(doc_id_mapping)
    return inverted_index, doc_id_mapping

#if query doesn't have "AND"
def process_simple_query(query, inverted_index, doc_id_mapping):
    '''Returns the postings list (and the corresponding document names)
    for a one-word query.'''

    # Code to process a one-word query goes here
    # Nothing needs to be returned. Simply print the results.
    if query in inverted_index:
        for doc_id in inverted_index[query]:
            print("Term '" + query +"' found in document "+str(doc_id)+" ("+doc_id_mapping[doc_id] + ")")

    else :
        print("Term '"+query+"' is out of vocabulary.")

# only one AND!!!
# do not use set().intersection pseudocode from PPT.
# This function was created by reference to a page 48 of "02-voc-flat.pdf".
def intersect(p1, p2):
    '''Intersects two postings lists and returns the intersection.'''

    answer = []

    # code to find intersection goes here
    idx_p1=0
    idx_p2=0

    # according to page 50 of "02-voc-flat.pdf", for postings list of length P, use sqrt(P) evenly-spaced skip pointers.
    # If fixed skip distances is used, then additional memory space for keeping skip list would not be needed.
    skip_dist1 = int(math.sqrt(len(p1)))
    skip_dist2 = int(math.sqrt(len(p2)))

    while idx_p1 < len(p1) and idx_p2 < len(p2):
        if p1[idx_p1] == p2[\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0idx_p2]:
            answer.append(p1[idx_p1])
            idx_p1 += 1
            idx_p2 += 1
        elif p1[idx_p1] < p2[idx_p2]:
            if ( idx_p1 < len(p1) - 1 and idx_p1 % skip_dist1 == 0 ) and p1[(len(p1) - 1) if (idx_p1 + skip_dist1) >= len(p1) else (idx_p1 + skip_dist1)] <= p2[idx_p2]:
                while ( idx_p1 < len(p1) - 1 and idx_p1 % skip_dist1 == 0 ) and p1[(len(p1) - 1) if (idx_p1 + skip_dist1) >= len(p1) else (idx_p1 + skip_dist1)] <= p2[idx_p2] :
                    idx_p1 = ((len(p1) - 1) if (idx_p1 + skip_dist1) >= len(p1) else (idx_p1 + skip_dist1))
            else:
                idx_p1 += 1
        elif ( idx_p2 < len(p2) - 1 and idx_p2 % skip_dist2 == 0 ) and p2[(len(p2) - 1) if (idx_p2 + skip_dist2) >= len(p2) else (idx_p2 + skip_dist2)] <= p1[idx_p1]:
            while ( idx_p2 < len(p2) - 1 and idx_p2 % skip_dist2 == 0 ) and p2[(len(p2) - 1) if (idx_p2 + skip_dist2) >= len(p2) else (idx_p2 + skip_dist2)] <= p1[idx_p1]:
                idx_p2 = ((len(p2) - 1) if (idx_p2 + skip_dist2) >= len(p2) else (idx_p2 + skip_dist2))
        else:
            idx_p2 += 1
        if idx_p1 > len(p1) or idx_p2 > len(p2):
            print ( str(len(p1)) + " must be bigger than or equal to " + str(idx_p1) + ", " + str(len(p2)) + " must be bigger than or equal to " + str(idx_p2))

    return answer

def process_conjunctive_query(query, inverted_index, doc_id_mapping):
    '''
    Calls the intersection algorithm and displays the intersection (with
    the corresponding document names) of two posting lists for a query
    connected by an 'AND'.
    '''

    # split user query and remove 'AND'
    query_terms = [t.strip() for t in query.split("AND")]

&nb\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0sp;   # Code to process conjunctive query goes here.
    # You will need to call the intersect() function.
    # No need to return anything. Simply print the results.

    if query_terms[0] not in inverted_index:
        print("Term '"+query_terms[0]+"' is out of vocabulary.")
        return

    if query_terms[1] not in inverted_index:
        print("Term '"+query_terms[1]+"' is out of vocabulary.")
        return

    intersected_postings_list = intersect(list(inverted_index[query_terms[0]]), list(inverted_index[query_terms[1]]))
    reference_postings_list = inverted_index[query_terms[0]].intersection(list(inverted_index[query_terms[1]]))
    if intersected_postings_list == list(reference_postings_list):
        for doc_id in intersected_postings_list:
            print("Term '" + query +"' found in document "+str(doc_id)+" ("+doc_id_mapping[doc_id] + ")")

    else :
        #Additional work to check if the intersection function is working correctly
        isHit = False
        print("Intersected incorrectly!!")
        print("Your answer: A postings list of '" + query_terms[0] +"' AND '"+query_terms[1]+"'" , end='' )
        for doc_id in intersected_postings_list:
            print(" -> " + str(doc_id) + "(" + doc_id_mapping[doc_id] + ")", end ='')
            isHit = True

        if isHit:
            print("
INFO: The document name corresponding to each document id is in a parenthesis.")
        else :
            print(" is empty.")

        isHit = False
        print("Correct answer: A postings list of '" +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 query_terms[0] +"' AND '"+query_terms[1]+"'" , end='' )
        for doc_id in reference_postings_list:
            print(" -> " + str(doc_id) + "(" + doc_id_mapping[doc_id] + ")", end ='')
            isHit = True

        if isHit:
            print("
INFO: The document name corresponding to each document id is in a parenthesis.")
        else :
            print(" is empty.")

# Start of the main program.
if __name__ == "__main__":

    # Dictionary for the inverted index.
    inverted_index, doc_name_mapping = make_inverted_index("files")

    # Get user input.
    query = input("Query: ")

    # A conjunctive query.
    if "AND" in query:
        process_conjunctive_query(query, inverted_index, doc_name_mapping)
    # Simple query with one term.
    else:
        process_simple_query(query, inverted_index, doc_name_mapping)

written time : 2017-03-23 20:09:58.0

챙길 것 - 일상

노트북 케이블
멀티탭?

written time : 2017-03-02 14:12:25.0

Above the night - incognito - 음악

written time : 2017-02-19 15:22:09.0

... 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | ...

status

picture

calender

What's new?

IIR_Boolean_retrieval: AND, skip pointer - 컴퓨터

챙길 것 - 일상

Above the night - incognito - 음악