In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.impute
import sklearn.linear_model
import sklearn.metrics
import pandas
In [2]:
# Read the Adult dataset
data = pandas.read_csv("adult.data", header = None, index_col = None, names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "label"])
data.head(5)
Out[2]:
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
In [3]:
# Split features and labels
x = data.iloc[:, 0:14]
y = data.iloc[:, 14]
print(x.shape, y.shape)
(32561, 14) (32561,)
In [4]:
# Use column transformer on different columns
continuous = ["age", "education-num"]
discrete = ["workclass", "occupation"]
imputer_c = sklearn.impute.SimpleImputer(strategy = "median")
imputer_d = sklearn.impute.SimpleImputer(strategy = "constant", fill_value = "None")
transformer_c = sklearn.preprocessing.StandardScaler()
transformer_d = sklearn.preprocessing.OneHotEncoder()
steps_c = sklearn.pipeline.Pipeline(steps = [("ic", imputer_c), ("tc", transformer_c)])
steps_d = sklearn.pipeline.Pipeline(steps = [("id", imputer_d), ("td", transformer_d)])
pre = sklearn.compose.ColumnTransformer(transformers = [("c", steps_c, continuous), ("d", steps_d, discrete)])
model = sklearn.pipeline.Pipeline(steps = [("pre", pre), ("clf", sklearn.linear_model.LogisticRegression())])
model
Out[4]:
Pipeline(steps=[('pre', ColumnTransformer(transformers=[('c', Pipeline(steps=[('ic', SimpleImputer(strategy='median')), ('tc', StandardScaler())]), ['age', 'education-num']), ('d', Pipeline(steps=[('id', SimpleImputer(fill_value='None', strategy='constant')), ('td', OneHotEncoder())]), ['workclass', 'occupation'])])), ('clf', LogisticRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('pre', ColumnTransformer(transformers=[('c', Pipeline(steps=[('ic', SimpleImputer(strategy='median')), ('tc', StandardScaler())]), ['age', 'education-num']), ('d', Pipeline(steps=[('id', SimpleImputer(fill_value='None', strategy='constant')), ('td', OneHotEncoder())]), ['workclass', 'occupation'])])), ('clf', LogisticRegression())])
ColumnTransformer(transformers=[('c', Pipeline(steps=[('ic', SimpleImputer(strategy='median')), ('tc', StandardScaler())]), ['age', 'education-num']), ('d', Pipeline(steps=[('id', SimpleImputer(fill_value='None', strategy='constant')), ('td', OneHotEncoder())]), ['workclass', 'occupation'])])
['age', 'education-num']
SimpleImputer(strategy='median')
StandardScaler()
['workclass', 'occupation']
SimpleImputer(fill_value='None', strategy='constant')
OneHotEncoder()
LogisticRegression()
In [5]:
# Estimate the model and compute the confusion matrix on the training set
model.fit(x, y)
y_pred = model.predict(x)
sklearn.metrics.confusion_matrix(y, y_pred)
C:\Users\young\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
Out[5]:
array([[23111, 1609], [ 5077, 2764]], dtype=int64)