In [1]:
import magellan as mg
DEBUG:Cloud:Log file (/Users/aliHitawala/.picloud/cloud.log) opened
In [2]:
mg.init_jvm()
Out[2]:
True
In [3]:
# read tables
A = mg.read_csv('/Users/aliHitawala/Documents/UW-2015/Fall 2015/784 - Data model/Phase2revisited/bikedekho_clean.csv', key='id')
B = mg.read_csv('/Users/aliHitawala/Documents/UW-2015/Fall 2015/784 - Data model/Phase2revisited/bikewale_clean.csv', key='id')
In [4]:
A.head(2)
Out[4]:
id bike_name city_posted km_driven color fuel_type price model_year owner_type url
0 36082 Royal Enfield Bullet Electra Twinspark Delhi 900 black Petrol 110000 2015 FirstOwner http://www.bikedekho.com/
1 36115 Royal Enfield Thunderbird 350 Mumbai 15000 black Petrol 110000 2013 FirstOwner http://www.bikedekho.com/
In [5]:
B.head(2)
Out[5]:
id bike_name city_posted km_driven color fuel_type price model_year owner_type url
0 12 TVS Apache RTR 160 Rear Drum Brake Mumbai 28500 grey Petrol 40000 2010 First http://www.bikewale.com/used/bikes-in-mumbai/t...
1 20 Bajaj Discover 125 Disc Mumbai 21300 green Petrol 27000 2009 First http://www.bikewale.com/used/bikes-in-mumbai/b...
In [6]:
# Blocking - already done and stored in CC.csv
C = mg.read_csv('C_after_fix.csv', ltable=A, rtable=B)
len(C)
Out[6]:
4932
In [33]:
# Plan

# 1. Sample candidate set --> S
# 2. Label S --> G
# 3. Split G into development set I, and evaluation set J
# 4. Select best learning-based matcher Y, using I 
# 5. Add triggers to Y ---> Z
# 6. Evaluate Z using J
In [34]:
# Sample the candidate set
S = mg.sample_table(C, 450)
In [35]:
# Label S
G = mg.label_table(S, 'gold')
In [854]:
G.to_csv('golden_3.csv')
Out[854]:
True
In [7]:
# use pre-labeled data if present
G = mg.read_csv('golden.csv', ltable=A, rtable=B)
len(G)
Out[7]:
450
In [8]:
# Split G into development (I) and evaluation (J)
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
In [9]:
# checking the number of tuples in I and J
(len(I), len(J))
Out[9]:
(315, 135)
In [10]:
# Selecting the best learning-based matcher using I

# Plan 

# 1. Create a set of ML-matchers
# 2. Generate features --> feature_table
# 3. Extract feature vectors using I and feature_table
# 4. Select best learning-based matcher using CV
# 5. Debug the selected matcher (and repeat the above steps)
In [11]:
# Create a set of ML-matchers
dt = mg.DTMatcher(name='DecisionTree', random_state=0)
svm = mg.SVMMatcher()
rf = mg.RFMatcher(name='RF', random_state=0)
nb = mg.NBMatcher(name='NB')
lg = mg.LogRegMatcher(name='LogReg')
ln = mg.LinRegMatcher(name='LinReg')
In [12]:
# check names of the matchers
(dt.name, svm.name)
Out[12]:
('DecisionTree', 'SVM_61402071609014438441')
In [13]:
# Generate features
feat_table = mg.get_features_for_matching(A, B)
# We faced this problem where the internal type of 'color' from both the table didn't match.

# To fix this, we explicitly initiated the types for 'color' attribute and passed 
# in the get_feature api
WARNING:magellan.feature.autofeaturegen:Magellan types: color type (str_eq_1w) and color type (str_bt_1w_5w) are different.If you want to set them to be same and generate features, update output from get_attr_types and use get_features command.

In [14]:
mg._atypes1['color'] = mg._atypes2['color']
In [15]:
feat_table = mg.get_features(A, B, mg._atypes1, mg._atypes2, mg._match_c,mg._match_t, mg._match_s)
In [16]:
feat_table
# feature generated after the 'color' fix
Out[16]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
0 id_id_exm id id None None exact_match <function id_id_exm at 0x125024938> from magellan.feature.simfunctions import *\nf...
1 id_id_anm id id None None abs_norm <function id_id_anm at 0x125024a28> from magellan.feature.simfunctions import *\nf...
2 id_id_lev id id None None lev <function id_id_lev at 0x125024aa0> from magellan.feature.simfunctions import *\nf...
3 bike_name_bike_name_jac_qgm_3_qgm_3 bike_name bike_name qgm_3 qgm_3 jaccard <function bike_name_bike_name_jac_qgm_3_qgm_3 ... from magellan.feature.simfunctions import *\nf...
4 bike_name_bike_name_cos_dlm_dc0_dlm_dc0 bike_name bike_name dlm_dc0 dlm_dc0 cosine <function bike_name_bike_name_cos_dlm_dc0_dlm_... from magellan.feature.simfunctions import *\nf...
5 bike_name_bike_name_jac_dlm_dc0_dlm_dc0 bike_name bike_name dlm_dc0 dlm_dc0 jaccard <function bike_name_bike_name_jac_dlm_dc0_dlm_... from magellan.feature.simfunctions import *\nf...
6 bike_name_bike_name_mel bike_name bike_name None None monge_elkan <function bike_name_bike_name_mel at 0x125024c80> from magellan.feature.simfunctions import *\nf...
7 bike_name_bike_name_lev bike_name bike_name None None lev <function bike_name_bike_name_lev at 0x125024cf8> from magellan.feature.simfunctions import *\nf...
8 bike_name_bike_name_nmw bike_name bike_name None None needleman_wunsch <function bike_name_bike_name_nmw at 0x125024d70> from magellan.feature.simfunctions import *\nf...
9 bike_name_bike_name_sw bike_name bike_name None None smith_waterman <function bike_name_bike_name_sw at 0x125024de8> from magellan.feature.simfunctions import *\nf...
10 bike_name_bike_name_swg bike_name bike_name None None smith_waterman_gotoh <function bike_name_bike_name_swg at 0x125024e60> from magellan.feature.simfunctions import *\nf...
11 city_posted_city_posted_lev city_posted city_posted None None lev <function city_posted_city_posted_lev at 0x125... from magellan.feature.simfunctions import *\nf...
12 city_posted_city_posted_jar city_posted city_posted None None jaro <function city_posted_city_posted_jar at 0x125... from magellan.feature.simfunctions import *\nf...
13 city_posted_city_posted_jwn city_posted city_posted None None jaro_winkler <function city_posted_city_posted_jwn at 0x125... from magellan.feature.simfunctions import *\nf...
14 city_posted_city_posted_sdx city_posted city_posted None None soundex <function city_posted_city_posted_sdx at 0x125... from magellan.feature.simfunctions import *\nf...
15 city_posted_city_posted_exm city_posted city_posted None None exact_match <function city_posted_city_posted_exm at 0x125... from magellan.feature.simfunctions import *\nf...
16 city_posted_city_posted_jac_qgm_3_qgm_3 city_posted city_posted qgm_3 qgm_3 jaccard <function city_posted_city_posted_jac_qgm_3_qg... from magellan.feature.simfunctions import *\nf...
17 km_driven_km_driven_exm km_driven km_driven None None exact_match <function km_driven_km_driven_exm at 0x12502f230> from magellan.feature.simfunctions import *\nf...
18 km_driven_km_driven_anm km_driven km_driven None None abs_norm <function km_driven_km_driven_anm at 0x12502f2a8> from magellan.feature.simfunctions import *\nf...
19 km_driven_km_driven_lev km_driven km_driven None None lev <function km_driven_km_driven_lev at 0x12502f320> from magellan.feature.simfunctions import *\nf...
20 color_color_jac_qgm_3_qgm_3 color color qgm_3 qgm_3 jaccard <function color_color_jac_qgm_3_qgm_3 at 0x125... from magellan.feature.simfunctions import *\nf...
21 color_color_cos_dlm_dc0_dlm_dc0 color color dlm_dc0 dlm_dc0 cosine <function color_color_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
22 color_color_jac_dlm_dc0_dlm_dc0 color color dlm_dc0 dlm_dc0 jaccard <function color_color_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
23 color_color_mel color color None None monge_elkan <function color_color_mel at 0x12502f500> from magellan.feature.simfunctions import *\nf...
24 color_color_lev color color None None lev <function color_color_lev at 0x12502f578> from magellan.feature.simfunctions import *\nf...
25 color_color_nmw color color None None needleman_wunsch <function color_color_nmw at 0x12502f5f0> from magellan.feature.simfunctions import *\nf...
26 color_color_sw color color None None smith_waterman <function color_color_sw at 0x12502f668> from magellan.feature.simfunctions import *\nf...
27 color_color_swg color color None None smith_waterman_gotoh <function color_color_swg at 0x12502f6e0> from magellan.feature.simfunctions import *\nf...
28 fuel_type_fuel_type_lev fuel_type fuel_type None None lev <function fuel_type_fuel_type_lev at 0x12502f758> from magellan.feature.simfunctions import *\nf...
29 fuel_type_fuel_type_jar fuel_type fuel_type None None jaro <function fuel_type_fuel_type_jar at 0x12502f7d0> from magellan.feature.simfunctions import *\nf...
30 fuel_type_fuel_type_jwn fuel_type fuel_type None None jaro_winkler <function fuel_type_fuel_type_jwn at 0x12502f848> from magellan.feature.simfunctions import *\nf...
31 fuel_type_fuel_type_sdx fuel_type fuel_type None None soundex <function fuel_type_fuel_type_sdx at 0x12502f8c0> from magellan.feature.simfunctions import *\nf...
32 fuel_type_fuel_type_exm fuel_type fuel_type None None exact_match <function fuel_type_fuel_type_exm at 0x12502f938> from magellan.feature.simfunctions import *\nf...
33 fuel_type_fuel_type_jac_qgm_3_qgm_3 fuel_type fuel_type qgm_3 qgm_3 jaccard <function fuel_type_fuel_type_jac_qgm_3_qgm_3 ... from magellan.feature.simfunctions import *\nf...
34 price_price_exm price price None None exact_match <function price_price_exm at 0x12502fa28> from magellan.feature.simfunctions import *\nf...
35 price_price_anm price price None None abs_norm <function price_price_anm at 0x12502faa0> from magellan.feature.simfunctions import *\nf...
36 price_price_lev price price None None lev <function price_price_lev at 0x12502fb18> from magellan.feature.simfunctions import *\nf...
37 model_year_model_year_exm model_year model_year None None exact_match <function model_year_model_year_exm at 0x12502... from magellan.feature.simfunctions import *\nf...
38 model_year_model_year_anm model_year model_year None None abs_norm <function model_year_model_year_anm at 0x12502... from magellan.feature.simfunctions import *\nf...
39 model_year_model_year_lev model_year model_year None None lev <function model_year_model_year_lev at 0x12502... from magellan.feature.simfunctions import *\nf...
40 owner_type_owner_type_lev owner_type owner_type None None lev <function owner_type_owner_type_lev at 0x12502... from magellan.feature.simfunctions import *\nf...
41 owner_type_owner_type_jar owner_type owner_type None None jaro <function owner_type_owner_type_jar at 0x12502... from magellan.feature.simfunctions import *\nf...
42 owner_type_owner_type_jwn owner_type owner_type None None jaro_winkler <function owner_type_owner_type_jwn at 0x12502... from magellan.feature.simfunctions import *\nf...
43 owner_type_owner_type_sdx owner_type owner_type None None soundex <function owner_type_owner_type_sdx at 0x12502... from magellan.feature.simfunctions import *\nf...
44 owner_type_owner_type_exm owner_type owner_type None None exact_match <function owner_type_owner_type_exm at 0x12502... from magellan.feature.simfunctions import *\nf...
45 owner_type_owner_type_jac_qgm_3_qgm_3 owner_type owner_type qgm_3 qgm_3 jaccard <function owner_type_owner_type_jac_qgm_3_qgm_... from magellan.feature.simfunctions import *\nf...
46 url_url_lev url url None None lev <function url_url_lev at 0x12503c050> from magellan.feature.simfunctions import *\nf...
47 url_url_jar url url None None jaro <function url_url_jar at 0x12503c0c8> from magellan.feature.simfunctions import *\nf...
48 url_url_jwn url url None None jaro_winkler <function url_url_jwn at 0x12503c140> from magellan.feature.simfunctions import *\nf...
49 url_url_sdx url url None None soundex <function url_url_sdx at 0x12503c1b8> from magellan.feature.simfunctions import *\nf...
50 url_url_exm url url None None exact_match <function url_url_exm at 0x12503c230> from magellan.feature.simfunctions import *\nf...
51 url_url_jac_qgm_3_qgm_3 url url qgm_3 qgm_3 jaccard <function url_url_jac_qgm_3_qgm_3 at 0x12503c2a8> from magellan.feature.simfunctions import *\nf...
In [17]:
mg._match_s
Out[17]:
{'abs_norm': <function magellan.feature.simfunctions.abs_norm>,
 'cosine': <function magellan.feature.simfunctions.cosine>,
 'exact_match': <function magellan.feature.simfunctions.exact_match>,
 'jaccard': <function magellan.feature.simfunctions.jaccard>,
 'jaro': <function magellan.feature.simfunctions.jaro>,
 'jaro_winkler': <function magellan.feature.simfunctions.jaro_winkler>,
 'lev': <function magellan.feature.simfunctions.lev>,
 'monge_elkan': <function magellan.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function magellan.feature.simfunctions.needleman_wunsch>,
 'rel_diff': <function magellan.feature.simfunctions.rel_diff>,
 'smith_waterman': <function magellan.feature.simfunctions.smith_waterman>,
 'smith_waterman_gotoh': <function magellan.feature.simfunctions.smith_waterman_gotoh>,
 'soundex': <function magellan.feature.simfunctions.soundex>}
In [18]:
mg._match_t
Out[18]:
{'dlm_dc0': <function magellan.feature.tokenizers.tok_delim>,
 'qgm_2': <function magellan.feature.tokenizers.tok_qgram>,
 'qgm_3': <function magellan.feature.tokenizers.tok_qgram>}
In [19]:
mg._match_c['corres']
Out[19]:
[('id', 'id'),
 ('bike_name', 'bike_name'),
 ('city_posted', 'city_posted'),
 ('km_driven', 'km_driven'),
 ('color', 'color'),
 ('fuel_type', 'fuel_type'),
 ('price', 'price'),
 ('model_year', 'model_year'),
 ('owner_type', 'owner_type'),
 ('url', 'url')]
In [20]:
# Features used:
# Adding features related to 'bike_name' and 'color' features to get the best learning-based matcher 
# and using extra triggers on top of it to take care of boundary cases.
In [21]:
feat_subset_iter1 = (feat_table[3:11].append(feat_table[20:28]))
In [22]:
# Get feature vectors
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')
In [23]:
K.head()
Out[23]:
_id ltable.id rtable.id bike_name_bike_name_jac_qgm_3_qgm_3 bike_name_bike_name_cos_dlm_dc0_dlm_dc0 bike_name_bike_name_jac_dlm_dc0_dlm_dc0 bike_name_bike_name_mel bike_name_bike_name_lev bike_name_bike_name_nmw bike_name_bike_name_sw bike_name_bike_name_swg color_color_jac_qgm_3_qgm_3 color_color_cos_dlm_dc0_dlm_dc0 color_color_jac_dlm_dc0_dlm_dc0 color_color_mel color_color_lev color_color_nmw color_color_sw color_color_swg gold
0 0 42500 1873 0.615385 0.800000 0.666667 0.900000 0.769231 0.769231 0.850000 0.900000 0 0 0 0.4 0 0.5 0.333333 0.4 0
1 1 44841 17451 0.378378 0.408248 0.250000 0.817391 0.562500 0.640625 0.739130 0.756522 1 1 1 1.0 1 1.0 1.000000 1.0 1
2 2 43816 7915 0.212121 0.447214 0.285714 0.473684 0.480000 0.620000 0.368421 0.505263 1 1 1 1.0 1 1.0 1.000000 1.0 0
3 3 47734 17347 0.243243 0.471405 0.285714 0.760000 0.400000 0.542857 0.733333 0.760000 1 1 1 1.0 1 1.0 1.000000 1.0 0
4 4 43387 8359 0.761905 0.894427 0.800000 1.000000 0.692308 0.692308 1.000000 1.000000 1 1 1 1.0 1 1.0 1.000000 1.0 1
In [24]:
# select the best ML matcher using CV
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold', metric='f1', random_state=1) 
In [25]:
result['selected_matcher']
Out[25]:
<magellan.matcher.rfmatcher.RFMatcher at 0x124e1e290>
In [26]:
result['cv_stats']
Out[26]:
Name Matcher Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 DecisionTree <magellan.matcher.dtmatcher.DTMatcher object a... 5 0.960000 0.974359 0.962963 0.930233 0.937500 0.953011
1 RF <magellan.matcher.rfmatcher.RFMatcher object a... 5 0.916667 1.000000 0.962963 0.926829 0.969697 0.955231
2 SVM_61402071609014438441 <magellan.matcher.svmmatcher.SVMMatcher object... 5 0.842105 0.844444 0.800000 0.785714 0.809524 0.816358
3 NB <magellan.matcher.nbmatcher.NBMatcher object a... 5 0.842105 0.844444 0.800000 0.785714 0.809524 0.816358
4 LogReg <magellan.matcher.logregmatcher.LogRegMatcher ... 5 0.857143 0.883721 0.823529 0.785714 0.850000 0.840021
5 LinReg <magellan.matcher.linregmatcher.LinRegMatcher ... 5 0.901961 0.904762 0.848485 0.880000 0.864865 0.880014
In [27]:
# Debug decision tree

# Split feature vectors to train and test
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [28]:
mg.vis_debug_dt(dt, U, V, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold')
In [29]:
# Add features for 'bike_name' and 'color' to be used in ML algorithm and triggers
# 1. create two features for triggers - both are blackbox features
#   a. bike_name_dont_match: to label the output as not match if the bikename present in one but not in the other. Eg: Pulsar in left and not in right
#   b. bike_name_do_match: to label the output as match if the bikename present in both left and right, also the color should match
# 2. create another feature on jaro-winkler for ML algorithm on bike_name
# 3. Add the features create above to the feature table.
In [30]:
# define blackbox features

BIKENAME = ['pulsar', "avenger", 'passion', 'hunk', 'cbr', 'fazer', 'xtreme', 'karizma zmr',
           'yamaha fz16', 'stunner', 'splendor plus', 'splendor pro', 'discover', 'enfield thunderbird',
           'enfield electra', 'unicorn', 'dazzler', 'activa', 'enfield bullet']

def bike_name_dont_match(x, y):
    x_bike_name = x['bike_name'].lower()
    y_bike_name = y['bike_name'].lower()
    for bike in BIKENAME:
        if (bike in x_bike_name) != (bike in y_bike_name):
            return 0
    return 1

def bike_name_do_match(x, y):
    x_bike_name = x['bike_name'].lower()
    y_bike_name = y['bike_name'].lower()
    x_color = x['color'].lower()
    y_color = y['color'].lower()
    for bike in BIKENAME:
        if (bike in x_bike_name and bike in y_bike_name) and (x_color == y_color):
            return 1
    return 0

# Create a feature on bike_name to be used in the ML algorithm 
feature = mg.get_feature_fn("jaro_winkler(ltuple['bike_name'], rtuple['bike_name'])", mg._match_t, mg._match_s)
In [31]:
# Add features to feature table
mg.add_feature(feat_table, 'bike_name_bike_name_jw', feature)
mg.add_blackbox_feature(feat_table, 'bike_name_dont_match', bike_name_dont_match)
mg.add_blackbox_feature(feat_table, 'bike_name_do_match', bike_name_do_match)
Out[31]:
True
In [32]:
feat_table
Out[32]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
0 id_id_exm id id None None exact_match <function id_id_exm at 0x125024938> from magellan.feature.simfunctions import *\nf...
1 id_id_anm id id None None abs_norm <function id_id_anm at 0x125024a28> from magellan.feature.simfunctions import *\nf...
2 id_id_lev id id None None lev <function id_id_lev at 0x125024aa0> from magellan.feature.simfunctions import *\nf...
3 bike_name_bike_name_jac_qgm_3_qgm_3 bike_name bike_name qgm_3 qgm_3 jaccard <function bike_name_bike_name_jac_qgm_3_qgm_3 ... from magellan.feature.simfunctions import *\nf...
4 bike_name_bike_name_cos_dlm_dc0_dlm_dc0 bike_name bike_name dlm_dc0 dlm_dc0 cosine <function bike_name_bike_name_cos_dlm_dc0_dlm_... from magellan.feature.simfunctions import *\nf...
5 bike_name_bike_name_jac_dlm_dc0_dlm_dc0 bike_name bike_name dlm_dc0 dlm_dc0 jaccard <function bike_name_bike_name_jac_dlm_dc0_dlm_... from magellan.feature.simfunctions import *\nf...
6 bike_name_bike_name_mel bike_name bike_name None None monge_elkan <function bike_name_bike_name_mel at 0x125024c80> from magellan.feature.simfunctions import *\nf...
7 bike_name_bike_name_lev bike_name bike_name None None lev <function bike_name_bike_name_lev at 0x125024cf8> from magellan.feature.simfunctions import *\nf...
8 bike_name_bike_name_nmw bike_name bike_name None None needleman_wunsch <function bike_name_bike_name_nmw at 0x125024d70> from magellan.feature.simfunctions import *\nf...
9 bike_name_bike_name_sw bike_name bike_name None None smith_waterman <function bike_name_bike_name_sw at 0x125024de8> from magellan.feature.simfunctions import *\nf...
10 bike_name_bike_name_swg bike_name bike_name None None smith_waterman_gotoh <function bike_name_bike_name_swg at 0x125024e60> from magellan.feature.simfunctions import *\nf...
11 city_posted_city_posted_lev city_posted city_posted None None lev <function city_posted_city_posted_lev at 0x125... from magellan.feature.simfunctions import *\nf...
12 city_posted_city_posted_jar city_posted city_posted None None jaro <function city_posted_city_posted_jar at 0x125... from magellan.feature.simfunctions import *\nf...
13 city_posted_city_posted_jwn city_posted city_posted None None jaro_winkler <function city_posted_city_posted_jwn at 0x125... from magellan.feature.simfunctions import *\nf...
14 city_posted_city_posted_sdx city_posted city_posted None None soundex <function city_posted_city_posted_sdx at 0x125... from magellan.feature.simfunctions import *\nf...
15 city_posted_city_posted_exm city_posted city_posted None None exact_match <function city_posted_city_posted_exm at 0x125... from magellan.feature.simfunctions import *\nf...
16 city_posted_city_posted_jac_qgm_3_qgm_3 city_posted city_posted qgm_3 qgm_3 jaccard <function city_posted_city_posted_jac_qgm_3_qg... from magellan.feature.simfunctions import *\nf...
17 km_driven_km_driven_exm km_driven km_driven None None exact_match <function km_driven_km_driven_exm at 0x12502f230> from magellan.feature.simfunctions import *\nf...
18 km_driven_km_driven_anm km_driven km_driven None None abs_norm <function km_driven_km_driven_anm at 0x12502f2a8> from magellan.feature.simfunctions import *\nf...
19 km_driven_km_driven_lev km_driven km_driven None None lev <function km_driven_km_driven_lev at 0x12502f320> from magellan.feature.simfunctions import *\nf...
20 color_color_jac_qgm_3_qgm_3 color color qgm_3 qgm_3 jaccard <function color_color_jac_qgm_3_qgm_3 at 0x125... from magellan.feature.simfunctions import *\nf...
21 color_color_cos_dlm_dc0_dlm_dc0 color color dlm_dc0 dlm_dc0 cosine <function color_color_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
22 color_color_jac_dlm_dc0_dlm_dc0 color color dlm_dc0 dlm_dc0 jaccard <function color_color_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
23 color_color_mel color color None None monge_elkan <function color_color_mel at 0x12502f500> from magellan.feature.simfunctions import *\nf...
24 color_color_lev color color None None lev <function color_color_lev at 0x12502f578> from magellan.feature.simfunctions import *\nf...
25 color_color_nmw color color None None needleman_wunsch <function color_color_nmw at 0x12502f5f0> from magellan.feature.simfunctions import *\nf...
26 color_color_sw color color None None smith_waterman <function color_color_sw at 0x12502f668> from magellan.feature.simfunctions import *\nf...
27 color_color_swg color color None None smith_waterman_gotoh <function color_color_swg at 0x12502f6e0> from magellan.feature.simfunctions import *\nf...
28 fuel_type_fuel_type_lev fuel_type fuel_type None None lev <function fuel_type_fuel_type_lev at 0x12502f758> from magellan.feature.simfunctions import *\nf...
29 fuel_type_fuel_type_jar fuel_type fuel_type None None jaro <function fuel_type_fuel_type_jar at 0x12502f7d0> from magellan.feature.simfunctions import *\nf...
30 fuel_type_fuel_type_jwn fuel_type fuel_type None None jaro_winkler <function fuel_type_fuel_type_jwn at 0x12502f848> from magellan.feature.simfunctions import *\nf...
31 fuel_type_fuel_type_sdx fuel_type fuel_type None None soundex <function fuel_type_fuel_type_sdx at 0x12502f8c0> from magellan.feature.simfunctions import *\nf...
32 fuel_type_fuel_type_exm fuel_type fuel_type None None exact_match <function fuel_type_fuel_type_exm at 0x12502f938> from magellan.feature.simfunctions import *\nf...
33 fuel_type_fuel_type_jac_qgm_3_qgm_3 fuel_type fuel_type qgm_3 qgm_3 jaccard <function fuel_type_fuel_type_jac_qgm_3_qgm_3 ... from magellan.feature.simfunctions import *\nf...
34 price_price_exm price price None None exact_match <function price_price_exm at 0x12502fa28> from magellan.feature.simfunctions import *\nf...
35 price_price_anm price price None None abs_norm <function price_price_anm at 0x12502faa0> from magellan.feature.simfunctions import *\nf...
36 price_price_lev price price None None lev <function price_price_lev at 0x12502fb18> from magellan.feature.simfunctions import *\nf...
37 model_year_model_year_exm model_year model_year None None exact_match <function model_year_model_year_exm at 0x12502... from magellan.feature.simfunctions import *\nf...
38 model_year_model_year_anm model_year model_year None None abs_norm <function model_year_model_year_anm at 0x12502... from magellan.feature.simfunctions import *\nf...
39 model_year_model_year_lev model_year model_year None None lev <function model_year_model_year_lev at 0x12502... from magellan.feature.simfunctions import *\nf...
40 owner_type_owner_type_lev owner_type owner_type None None lev <function owner_type_owner_type_lev at 0x12502... from magellan.feature.simfunctions import *\nf...
41 owner_type_owner_type_jar owner_type owner_type None None jaro <function owner_type_owner_type_jar at 0x12502... from magellan.feature.simfunctions import *\nf...
42 owner_type_owner_type_jwn owner_type owner_type None None jaro_winkler <function owner_type_owner_type_jwn at 0x12502... from magellan.feature.simfunctions import *\nf...
43 owner_type_owner_type_sdx owner_type owner_type None None soundex <function owner_type_owner_type_sdx at 0x12502... from magellan.feature.simfunctions import *\nf...
44 owner_type_owner_type_exm owner_type owner_type None None exact_match <function owner_type_owner_type_exm at 0x12502... from magellan.feature.simfunctions import *\nf...
45 owner_type_owner_type_jac_qgm_3_qgm_3 owner_type owner_type qgm_3 qgm_3 jaccard <function owner_type_owner_type_jac_qgm_3_qgm_... from magellan.feature.simfunctions import *\nf...
46 url_url_lev url url None None lev <function url_url_lev at 0x12503c050> from magellan.feature.simfunctions import *\nf...
47 url_url_jar url url None None jaro <function url_url_jar at 0x12503c0c8> from magellan.feature.simfunctions import *\nf...
48 url_url_jwn url url None None jaro_winkler <function url_url_jwn at 0x12503c140> from magellan.feature.simfunctions import *\nf...
49 url_url_sdx url url None None soundex <function url_url_sdx at 0x12503c1b8> from magellan.feature.simfunctions import *\nf...
50 url_url_exm url url None None exact_match <function url_url_exm at 0x12503c230> from magellan.feature.simfunctions import *\nf...
51 url_url_jac_qgm_3_qgm_3 url url qgm_3 qgm_3 jaccard <function url_url_jac_qgm_3_qgm_3 at 0x12503c2a8> from magellan.feature.simfunctions import *\nf...
52 bike_name_bike_name_jw bike_name bike_name PARSE_EXP PARSE_EXP jaro_winkler <function fn at 0x1250245f0> def fn(ltuple, rtuple):\n return jaro_winkl...
53 bike_name_dont_match None None None None None <function bike_name_dont_match at 0x125024320> None
54 bike_name_do_match None None None None None <function bike_name_do_match at 0x125024140> None
In [33]:
# Select all bike_name feature but not the jac0 and all the color features. Apart from those add
# newly created feature on bike_name (jaro-winkler)
feat_subset_iter2 = (feat_table[3:5].append(feat_table[6:10]).append(feat_table[20:28]).append(feat_table[52:53]))
In [34]:
# printing the selected features
feat_subset_iter2
Out[34]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
3 bike_name_bike_name_jac_qgm_3_qgm_3 bike_name bike_name qgm_3 qgm_3 jaccard <function bike_name_bike_name_jac_qgm_3_qgm_3 ... from magellan.feature.simfunctions import *\nf...
4 bike_name_bike_name_cos_dlm_dc0_dlm_dc0 bike_name bike_name dlm_dc0 dlm_dc0 cosine <function bike_name_bike_name_cos_dlm_dc0_dlm_... from magellan.feature.simfunctions import *\nf...
6 bike_name_bike_name_mel bike_name bike_name None None monge_elkan <function bike_name_bike_name_mel at 0x125024c80> from magellan.feature.simfunctions import *\nf...
7 bike_name_bike_name_lev bike_name bike_name None None lev <function bike_name_bike_name_lev at 0x125024cf8> from magellan.feature.simfunctions import *\nf...
8 bike_name_bike_name_nmw bike_name bike_name None None needleman_wunsch <function bike_name_bike_name_nmw at 0x125024d70> from magellan.feature.simfunctions import *\nf...
9 bike_name_bike_name_sw bike_name bike_name None None smith_waterman <function bike_name_bike_name_sw at 0x125024de8> from magellan.feature.simfunctions import *\nf...
20 color_color_jac_qgm_3_qgm_3 color color qgm_3 qgm_3 jaccard <function color_color_jac_qgm_3_qgm_3 at 0x125... from magellan.feature.simfunctions import *\nf...
21 color_color_cos_dlm_dc0_dlm_dc0 color color dlm_dc0 dlm_dc0 cosine <function color_color_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
22 color_color_jac_dlm_dc0_dlm_dc0 color color dlm_dc0 dlm_dc0 jaccard <function color_color_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
23 color_color_mel color color None None monge_elkan <function color_color_mel at 0x12502f500> from magellan.feature.simfunctions import *\nf...
24 color_color_lev color color None None lev <function color_color_lev at 0x12502f578> from magellan.feature.simfunctions import *\nf...
25 color_color_nmw color color None None needleman_wunsch <function color_color_nmw at 0x12502f5f0> from magellan.feature.simfunctions import *\nf...
26 color_color_sw color color None None smith_waterman <function color_color_sw at 0x12502f668> from magellan.feature.simfunctions import *\nf...
27 color_color_swg color color None None smith_waterman_gotoh <function color_color_swg at 0x12502f6e0> from magellan.feature.simfunctions import *\nf...
52 bike_name_bike_name_jw bike_name bike_name PARSE_EXP PARSE_EXP jaro_winkler <function fn at 0x1250245f0> def fn(ltuple, rtuple):\n return jaro_winkl...
In [35]:
# Get new set of feature vectors
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter2, attrs_after='gold')
In [36]:
# Split feature vectors into U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [37]:
# Check whether the added features improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Evaluate predictions
In [138]:
# Train dt using U (use other algorithms also to get the predictions on them)
dt.fit(table=U, 
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
       target_attr='gold')
In [139]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)
In [140]:
# Writing it to the csv file to check the output
P.to_csv('test0.csv')
Out[140]:
True
In [141]:
# Evaluate the predictions
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
Precision : 88.24% (45/51)
Recall : 97.83% (45/46)
F1 : 92.78%
False positives : 6 (out of 51 positive predictions)
False negatives : 1 (out of 107 negative predictions)
In [136]:
# Apply cross validation to find if there is a better matcher
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold', metric='f1') 
In [137]:
result['cv_stats']
Out[137]:
Name Matcher Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 DecisionTree <magellan.matcher.dtmatcher.DTMatcher object a... 5 0.914286 1.000000 1.000000 0.938776 0.933333 0.957279
1 RF <magellan.matcher.rfmatcher.RFMatcher object a... 5 0.938776 1.000000 0.857143 0.875000 1.000000 0.934184
2 SVM_61402071609014438441 <magellan.matcher.svmmatcher.SVMMatcher object... 5 0.790698 0.865672 0.808511 0.750000 0.842105 0.811397
3 NB <magellan.matcher.nbmatcher.NBMatcher object a... 5 0.821429 0.785714 0.810811 0.880000 0.777778 0.815146
4 LogReg <magellan.matcher.logregmatcher.LogRegMatcher ... 5 0.792453 0.875000 0.850000 0.900000 0.844444 0.852379
5 LinReg <magellan.matcher.linregmatcher.LinRegMatcher ... 5 0.830189 0.862745 0.833333 0.871795 0.736842 0.826981
In [337]:
# Select DT as the best matcher -- Y
# Use bike_name + color related features
In [338]:
# Add triggers on top of Y

# 1. Split K into U and V
# 2. Use U,V + Y to write triggers (examine fp, fn).
In [70]:
# Split feature vectors to U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [71]:
# Invoke debug interface to check FP and FN
mg.vis_debug_dt(dt, U, V, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold')
In [72]:
# Add trigger - target false positives: using the bike_name blackbox feature created to check non matching bike_name in left and right. 
no_match_trigger = mg.MatchTrigger()
no_match_trigger.add_cond_rule('bike_name_dont_match(ltuple, rtuple) < 0.5', feat_table)
no_match_trigger.add_cond_status(True)
no_match_trigger.add_action(0)

# Add trigger - target false negatives: using the bike_name blackbox feature created to check matching bike_name in left and right. 
match_trigger = mg.MatchTrigger()
match_trigger.add_cond_rule('bike_name_do_match(ltuple, rtuple) > 0.5', feat_table)
match_trigger.add_cond_status(True)
match_trigger.add_action(1)
Out[72]:
True
In [73]:
# Check whether the added trigger improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Apply trigger
# 4. Evaluate the result
In [130]:
# Train dt using U
dt.fit(table=U, 
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
       target_attr='gold')
In [131]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)
In [132]:
P.to_csv('before_trigger.csv')
Out[132]:
True
In [133]:
# Apply trigger
Q = no_match_trigger.execute(P, 'predicted', inplace=False)
R = match_trigger.execute(Q, 'predicted', inplace=False)
In [134]:
R.to_csv('after_trigger.csv')
Out[134]:
True
In [135]:
# Evaluate the result
eval_result = mg.eval_matches(R, 'predicted', 'gold')
mg.print_eval_summary(eval_result)
Precision : 97.83% (45/46)
Recall : 100.0% (45/45)
F1 : 98.9%
False positives : 1 (out of 46 positive predictions)
False negatives : 0 (out of 112 negative predictions)
In [98]:
# Do cross-validation for matcher + triggers using I (K)
result = mg.cv_matcher_and_trigger(dt, [no_match_trigger, match_trigger], table = K, 
                                   exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                                  target_attr='gold')
0%  100%
[#####] | ETA[sec]: 0.000 
Total time elapsed: 0.972 sec
In [100]:
result['cv_stats']
Out[100]:
Metric Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 precision 5 1 1 1.000000 1 1 1.000000
1 recall 5 1 1 0.950000 1 1 0.990000
2 f1 5 1 1 0.974359 1 1 0.994872
In [101]:
# Recall the CV for just the matcher (without triggers) was
result = mg.cv_matcher_and_trigger(dt, [], table = K, 
                                   exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                                  target_attr='gold')
0%  100%
[#####] | ETA[sec]: 0.000 
Total time elapsed: 0.205 sec
In [102]:
result['cv_stats']
Out[102]:
Metric Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 precision 5 0.944444 0.909091 1.000000 1.000000 0.9375 0.958207
1 recall 5 0.944444 0.952381 0.952381 0.950000 0.9375 0.947341
2 f1 5 0.944444 0.930233 0.975610 0.974359 0.9375 0.952429
In [353]:
# Now Z is DT (features: feat_subset_iter2) + no_match_trigger, match_trigger
# Validate Z using J
# Steps
# 1. Extract feature vectors (using feat_subset_iter2) -- > M
# 2. Train DT using H (feature vectors generated using I)
# 3. Predict M using DT
# 4. Apply triggers
# 5. Evaluate the result
In [143]:
# Extract feature vectors
M = mg.extract_feature_vecs(J, feature_table=feat_subset_iter2, attrs_after='gold')
In [144]:
# Train using feature vectors from I 
dt.fit(table=K, 
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
       target_attr='gold')
In [145]:
# Predict M 
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)
In [146]:
# Apply trigger
T = no_match_trigger.execute(N, 'predicted', inplace=False)
T = match_trigger.execute(T, 'predicted', inplace=False)
In [147]:
# Evaluate the result
eval_result = mg.eval_matches(T, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
Precision : 100.0% (34/34)
Recall : 100.0% (34/34)
F1 : 100.0%
False positives : 0 (out of 34 positive predictions)
False negatives : 0 (out of 101 negative predictions)
In [148]:
T.to_csv('final_matches.csv')
Out[148]:
True
In [ ]: