# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.impute
import sklearn.linear_model
import sklearn.metrics
import pandas

# Read the Adult dataset
# The file has no header row, so the column names are supplied explicitly.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "label",
]
# NOTE(review): in the UCI distribution of adult.data, fields are separated by
# ", " and missing entries are written as "?" -- confirm for the local copy.
# skipinitialspace strips the blank after each comma so the strings carry no
# leading space, and na_values turns "?" into NaN so that the imputers applied
# later actually see the missing entries. Both options are no-ops on a file
# that is already clean.
data = pandas.read_csv(
    "adult.data",
    header = None,
    index_col = None,
    names = column_names,
    skipinitialspace = True,
    na_values = "?",
)
data.head(5)

# Separate the predictors from the target: the last column ("label") is the
# target, every column before it is a feature
x = data.iloc[:, :-1]
y = data.iloc[:, -1]
print(x.shape, y.shape)

(32561, 14) (32561,)

# Use column transformer on different columns
# - continuous columns: fill missing values with the column median, then
#   standardize
# - discrete columns: fill missing values with the constant string "None",
#   then one-hot encode
# Columns not listed here are dropped (ColumnTransformer's default remainder).
continuous = ["age", "education-num"]
discrete = ["workclass", "occupation"]
imputer_c = sklearn.impute.SimpleImputer(strategy = "median")
imputer_d = sklearn.impute.SimpleImputer(strategy = "constant", fill_value = "None")
transformer_c = sklearn.preprocessing.StandardScaler()
transformer_d = sklearn.preprocessing.OneHotEncoder()
steps_c = sklearn.pipeline.Pipeline(steps = [("ic", imputer_c), ("tc", transformer_c)])
steps_d = sklearn.pipeline.Pipeline(steps = [("id", imputer_d), ("td", transformer_d)])
pre = sklearn.compose.ColumnTransformer(transformers = [("c", steps_c, continuous), ("d", steps_d, discrete)])
# The default budget of 100 lbfgs iterations is not enough on this data
# (fitting stops with a ConvergenceWarning), so raise the iteration limit.
classifier = sklearn.linear_model.LogisticRegression(max_iter = 1000)
model = sklearn.pipeline.Pipeline(steps = [("pre", pre), ("clf", classifier)])
model

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('c',
                                                  Pipeline(steps=[('ic',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('tc',
                                                                   StandardScaler())]),
                                                  ['age', 'education-num']),
                                                 ('d',
                                                  Pipeline(steps=[('id',
                                                                   SimpleImputer(fill_value='None',
                                                                                 strategy='constant')),
                                                                  ('td',
                                                                   OneHotEncoder())]),
                                                  ['workclass',
                                                   'occupation'])])),
                ('clf', LogisticRegression())])

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('c',
                                                  Pipeline(steps=[('ic',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('tc',
                                                                   StandardScaler())]),
                                                  ['age', 'education-num']),
                                                 ('d',
                                                  Pipeline(steps=[('id',
                                                                   SimpleImputer(fill_value='None',
                                                                                 strategy='constant')),
                                                                  ('td',
                                                                   OneHotEncoder())]),
                                                  ['workclass',
                                                   'occupation'])])),
                ('clf', LogisticRegression())])

ColumnTransformer(transformers=[('c',
                                 Pipeline(steps=[('ic',
                                                  SimpleImputer(strategy='median')),
                                                 ('tc', StandardScaler())]),
                                 ['age', 'education-num']),
                                ('d',
                                 Pipeline(steps=[('id',
                                                  SimpleImputer(fill_value='None',
                                                                strategy='constant')),
                                                 ('td', OneHotEncoder())]),
                                 ['workclass', 'occupation'])])

['age', 'education-num']

SimpleImputer(strategy='median')

StandardScaler()

['workclass', 'occupation']

SimpleImputer(fill_value='None', strategy='constant')

OneHotEncoder()

LogisticRegression()

# Fit the pipeline on the full dataset and predict on that same data, so the
# confusion matrix below describes training-set performance only
y_pred = model.fit(x, y).predict(x)
sklearn.metrics.confusion_matrix(y_true = y, y_pred = y_pred)

C:\Users\young\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

array([[23111,  1609],
       [ 5077,  2764]], dtype=int64)

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	label
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K