In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.impute
import sklearn.linear_model
import sklearn.metrics
import pandas
In [2]:
# Read the Adult dataset
data = pandas.read_csv("adult.data", header = None, index_col = None, names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "label"])
data.head(5)
Out[2]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country label
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [3]:
# Split features and labels
x = data.iloc[:, 0:14]
y = data.iloc[:, 14]
print(x.shape, y.shape)
(32561, 14) (32561,)
In [4]:
# Use column transformer on different columns
continuous = ["age", "education-num"]
discrete = ["workclass", "occupation"]
imputer_c = sklearn.impute.SimpleImputer(strategy = "median")
imputer_d = sklearn.impute.SimpleImputer(strategy = "constant", fill_value = "None")
transformer_c = sklearn.preprocessing.StandardScaler()
transformer_d = sklearn.preprocessing.OneHotEncoder()
steps_c = sklearn.pipeline.Pipeline(steps = [("ic", imputer_c), ("tc", transformer_c)])
steps_d = sklearn.pipeline.Pipeline(steps = [("id", imputer_d), ("td", transformer_d)])
pre = sklearn.compose.ColumnTransformer(transformers = [("c", steps_c, continuous), ("d", steps_d, discrete)])
model = sklearn.pipeline.Pipeline(steps = [("pre", pre), ("clf", sklearn.linear_model.LogisticRegression())])
model
Out[4]:
Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('c',
                                                  Pipeline(steps=[('ic',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('tc',
                                                                   StandardScaler())]),
                                                  ['age', 'education-num']),
                                                 ('d',
                                                  Pipeline(steps=[('id',
                                                                   SimpleImputer(fill_value='None',
                                                                                 strategy='constant')),
                                                                  ('td',
                                                                   OneHotEncoder())]),
                                                  ['workclass',
                                                   'occupation'])])),
                ('clf', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('c',
                                                  Pipeline(steps=[('ic',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('tc',
                                                                   StandardScaler())]),
                                                  ['age', 'education-num']),
                                                 ('d',
                                                  Pipeline(steps=[('id',
                                                                   SimpleImputer(fill_value='None',
                                                                                 strategy='constant')),
                                                                  ('td',
                                                                   OneHotEncoder())]),
                                                  ['workclass',
                                                   'occupation'])])),
                ('clf', LogisticRegression())])
ColumnTransformer(transformers=[('c',
                                 Pipeline(steps=[('ic',
                                                  SimpleImputer(strategy='median')),
                                                 ('tc', StandardScaler())]),
                                 ['age', 'education-num']),
                                ('d',
                                 Pipeline(steps=[('id',
                                                  SimpleImputer(fill_value='None',
                                                                strategy='constant')),
                                                 ('td', OneHotEncoder())]),
                                 ['workclass', 'occupation'])])
['age', 'education-num']
SimpleImputer(strategy='median')
StandardScaler()
['workclass', 'occupation']
SimpleImputer(fill_value='None', strategy='constant')
OneHotEncoder()
LogisticRegression()
In [5]:
# Estimate the model and compute the confusion matrix on the training set
model.fit(x, y)
y_pred = model.predict(x)
sklearn.metrics.confusion_matrix(y, y_pred)
C:\Users\young\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[5]:
array([[23111,  1609],
       [ 5077,  2764]], dtype=int64)