"""Evaluate dictionary-based brand-name extraction on a labelled CSV file.

Two tries are built by ``build_trie.construct`` from the dictionary file and
the training set.  For every row of the testing CSV, column 1 is read as the
true brand and column 2 as the product name; a brand is predicted from the
product name and compared against the truth.  Mismatches, precision and
recall are printed.

NOTE(review): the script is written for Python 2 (the CSV is opened in
``'rb'`` mode as the Python 2 ``csv`` module expects).  The prints use a
single parenthesised argument so the output is the same on Python 2.
"""
import csv

import build_trie

dict_file_name = 'elec_brand_dic.txt'
training_set_name = 'split_training_set.csv'
tuning_set_name = 'split_tuning_set.csv'
total_training_set_name = 'training_set.csv'
testing_set_name = 'testing_set.csv'

# input file names for tuning
# input_training_set_name = training_set_name
# input_testing_set_name = tuning_set_name

# input file names for final testing
input_training_set_name = total_training_set_name
input_testing_set_name = testing_set_name

# Label used both for "no prediction" and for rows without a true brand.
_MISSING = 'Missing'


def _is_full_match(lookup, text):
    """Return True when *lookup* matches the whole of *text*.

    The test compares the length of ``lookup.longest_prefix`` applied to the
    lower-cased text with the length of *text* itself.
    """
    return len(lookup.longest_prefix(text.lower(), default='')) == len(text)


def _longest_phrase_at(tokens, start, lookup):
    """Return the longest matching phrase beginning at ``tokens[start]``.

    The phrase is grown one token at a time and growth stops at the first
    extension that is not a full match, so a longer phrase is only found if
    every shorter phrase leading up to it matches as well.  Returns ``None``
    when not even the single token matches.
    """
    phrase = tokens[start].strip()
    matched = None
    end = start
    while _is_full_match(lookup, phrase):
        matched = phrase
        if end >= len(tokens) - 1:
            # reached the end of the product name
            break
        end += 1
        phrase += ' ' + tokens[end].strip()
    return matched


def _predict_brand(tokens, brand_lookup, name_lookup):
    """Predict the brand for a product name given as a list of words.

    The first phrase found in *name_lookup* is located.  If the word directly
    after its starting position also starts a phrase found in *name_lookup*,
    that second phrase is returned as the brand, provided it is a full match
    in *brand_lookup* and the first phrase is not exactly one word longer
    than it.  Otherwise the first phrase is returned if it is a full match in
    *brand_lookup*.  In every other case ``'Missing'`` is returned.

    NOTE(review): presumably *name_lookup* holds factory and brand names and
    *brand_lookup* holds brand names only -- confirm against ``build_trie``.
    """
    for position in range(len(tokens)):
        first = _longest_phrase_at(tokens, position, name_lookup)
        if first is None:
            continue

        # Only the word right after the start of the first name can begin
        # the second name (the brand must directly follow the factory name).
        following = position + 1
        if following < len(tokens):
            second = _longest_phrase_at(tokens, following, name_lookup)
            if (second is not None
                    and len(first.split()) - len(second.split()) != 1
                    and _is_full_match(brand_lookup, second)):
                return second

        # Only one usable candidate: accept it if it is a known brand.
        if _is_full_match(brand_lookup, first):
            return first
        return _MISSING

    # no candidate name anywhere in the product name
    return _MISSING


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if it is zero."""
    if not denominator:
        return 0.0
    return numerator * 1.0 / denominator


# build tries
[trie, trie_2] = build_trie.construct(dict_file_name, input_training_set_name)

true_brand_list = []
brand_list = []
with open(input_testing_set_name, 'rb') as csv_file:
    for row in csv.reader(csv_file):
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to score
            continue
        # create the truth brand name list
        true_brand_list.append(row[1].strip())
        brand_list.append(_predict_brand(row[2].split(), trie, trie_2))

# Wrong label list: (#: Prediction -- True brand)
print('Wrong label list: (#: Prediction -- True brand)')
for index, (prediction, true_brand) in enumerate(
        zip(brand_list, true_brand_list)):
    if prediction != true_brand:
        print(str(index) + ': ' + prediction + ' -- ' + true_brand)

# A prediction counts as correct only when it names an actual brand.
correct_name = sum(
    1 for predicted, truth in zip(brand_list, true_brand_list)
    if predicted == truth and predicted != _MISSING)
predict_name = sum(1 for predicted in brand_list if predicted != _MISSING)
true_name = sum(1 for truth in true_brand_list if truth != _MISSING)

# Guarded so an empty or all-'Missing' data set reports 0.0 instead of
# raising ZeroDivisionError.
Recall = _safe_ratio(correct_name, true_name)
precision = _safe_ratio(correct_name, predict_name)

print('')
print('Precision: ' + str(precision))
print('Recall: ' + str(Recall))