import csv
import sys

import numpy as np
import pytrie

# Laptop brand and product-line names (from Wikipedia) that are always indexed.
_LAPTOP_BRANDS = (
    'Lenovo', 'lenovo', 'TravelMate', 'Extensa', 'Aspire', 'Gateway',
    'Packard Bell', 'Acer Chromebook', 'MacBook Air', 'MacBook Pro',
    'MacBook', 'Asus Eee', 'Zenbook', 'ROG Series', 'Asus N', 'Asus X',
    'Asus Chromebook', 'Inspiron', 'Latitude', 'Precision', 'Optiplex',
    'Studio', 'Vostro', 'XPS', 'Studio XPS', 'Alienware', 'HP Probook',
    'Probook', 'ProBook', 'Pavilion', 'Omnibook', 'Elitebook', 'Envy',
    'OMEN', 'HP Mini', 'Mini', 'ThinkPad', 'IdeaPad', '3000 series',
    'Dynabook', 'Portege', 'Tecra', 'Satellite', 'Qosmio', 'Libretto',
)

# Company suffixes appended to every collected name to create extra variants.
# Case variants are unnecessary because every trie key is lower-cased.
_COMPANY_SUFFIXES = (
    ' inc', ', inc', '. inc',
    ' corp', ' corporation', '. corp', '. corporation', ' co.', '. co.',
)

# Label value in the training set that marks a row without a brand.
_MISSING_LABEL = 'Missing'


def _to_text(value):
    """Return *value* as a native ``str``, decoding UTF-8 bytes if needed."""
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        return value.decode('utf-8')
    raise TypeError('expected a string, got %s' % type(value).__name__)


def _open_csv(path):
    """Open *path* in the mode the ``csv`` module expects on this Python."""
    if sys.version_info[0] < 3:
        # Python 2's csv module works on byte strings.
        return open(path, 'rb')
    # Python 3's csv module needs text and does its own newline handling.
    return open(path, newline='', encoding='utf-8')


def _read_dictionary_brands(dict_file_name):
    """Return the first column of the tab-separated brand dictionary."""
    dict_array = np.genfromtxt(fname=dict_file_name, dtype=bytes,
                               delimiter='\t')
    # NOTE(review): assumes the file parses to a 2-D table (several rows,
    # several tab-separated columns) -- TODO confirm. Entries that cannot be
    # indexed or decoded are skipped, keeping the original best-effort intent.
    brands = []
    for row in dict_array:
        try:
            brands.append(_to_text(row[0]))
        except (IndexError, TypeError, UnicodeDecodeError):
            continue
    return brands


def _read_training_brands(training_set_name):
    """Return the second-column labels of the training CSV, minus 'Missing'."""
    brands = []
    with _open_csv(training_set_name) as training_set_csv:
        for row in csv.reader(training_set_csv):
            if len(row) < 2:
                # Blank or truncated row: there is no label column to read.
                continue
            brand = row[1]
            if brand != _MISSING_LABEL:
                brands.append(brand)
    return brands


def _word_prefixes(name):
    """Return every leading run of words of *name*, joined by single spaces.

    Each prefix that is followed by another word is also returned with a
    trailing space, e.g. ``'Packard Bell'`` gives
    ``['Packard', 'Packard ', 'Packard Bell']``.
    """
    words = name.split()
    prefixes = []
    for count in range(1, len(words) + 1):
        prefix = ' '.join(words[:count])
        prefixes.append(prefix)
        if count < len(words):
            prefixes.append(prefix + ' ')
    return prefixes


def construct(dict_file_name=None, training_set_name=None):
    """Build the brand-name tries from the dictionary and the training set.

    Names are gathered from the built-in laptop brand list, the second column
    of the training CSV (rows labelled ``'Missing'`` are ignored) and the
    first column of the tab-separated dictionary file. Every gathered name is
    also combined with each company suffix (``' inc'``, ``' corp'``, ...).

    Args:
        dict_file_name: Path of the tab-separated brand dictionary.
        training_set_name: Path of the training-set CSV file.

    Returns:
        A two-element list ``[trie, trie_2]`` of ``pytrie.Trie`` objects.
        ``trie`` maps each lower-cased name to itself. ``trie_2`` maps each
        lower-cased word prefix of each name (with and without the trailing
        space) to itself.
    """
    # Load the dictionary first, as before, so a bad dictionary path fails
    # before the training set is touched.
    dictionary_brands = _read_dictionary_brands(dict_file_name)

    names = list(_LAPTOP_BRANDS)
    names.extend(_read_training_brands(training_set_name))
    names.extend(dictionary_brands)

    # Suffix variants are derived from the base names only; the list is built
    # completely before it is appended, so variants never get suffixes.
    variants = [name + suffix
                for name in names
                for suffix in _COMPANY_SUFFIXES]
    names.extend(variants)

    trie = pytrie.Trie()
    trie_2 = pytrie.Trie()
    for name in names:
        lowered = name.lower()
        trie[lowered] = lowered
        for prefix in _word_prefixes(name):
            key = prefix.lower()
            trie_2[key] = key

    return [trie, trie_2]