# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.model_selection
import sklearn.svm
import sklearn.metrics
import numpy
import pandas

# Load the spam dataset. header=None tells pandas the file has no header
# row, so the columns get integer labels instead of names.
data = pandas.read_csv(
    'spambase.data',
    header=None,
)
# Preview the first few rows (shown as the cell result in a notebook).
data.head()

# Separate the table into the feature matrix x and the label vector y.
# .loc slices by label and includes the end label, so 0:56 selects
# columns 0 through 56; column 57 holds the labels.
x = data.loc[:, 0:56]
y = data[57]
print(data.shape, x.shape, y.shape)

(4601, 58) (4601, 57) (4601,)

# Hold out 20% of the rows for testing and train on the remaining 80%.
# No random_state is passed here, so the rows chosen can differ between runs.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3680, 57) (921, 57) (3680,) (921,)

# Redo the 80/20 split with a fixed seed (random_state=42) so that every run
# of this script produces exactly the same train and test sets.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    random_state=42,
    train_size=0.8,
    test_size=0.2,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3680, 57) (921, 57) (3680,) (921,)

# Train a classifier on the training split; a linear support vector machine
# is used here as an example. fit() returns the fitted estimator itself, so
# construction and training can be chained.
model = sklearn.svm.LinearSVC(dual="auto").fit(x_train, y_train)

# Predict labels for the held-out test rows.
y_pred = model.predict(x_test)

# Show the first 25 predictions (top) next to the first 25 true labels (bottom).
[
    list(y_pred[0:25]),
    list(y_test[0:25]),
]

[[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1],
 [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]]

# Confusion matrix of the model on the test set: each row is a true label,
# each column is a predicted label, and each entry counts the test rows
# that fall into that (true, predicted) pair.
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[506,  25],
       [ 50, 340]], dtype=int64)

# Reference point: the confusion matrix of a perfect predictor. Passing the
# true labels as the predictions puts every count on the diagonal.
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_test)

array([[531,   0],
       [  0, 390]], dtype=int64)

# Reference point: the confusion matrix of random guessing. randint(2, ...)
# draws 0 or 1 with equal probability, one guess per test row. The generator
# is not seeded, so the counts change from run to run.
sklearn.metrics.confusion_matrix(
    y_test,
    numpy.random.randint(2, size=len(y_test)),
)

array([[278, 253],
       [192, 198]], dtype=int64)

	0	1	2	4	5	6	7	8	9	...	48	49	51	52	53	54	55	56	57
0	0.00	0.64	0.64	0.32	0.00	0.00	0.00	0.00	0.00	...	0.00	0.000	0.778	0.000	0.000	3.756	61	278	1
1	0.21	0.28	0.50	0.14	0.28	0.21	0.07	0.00	0.94	...	0.00	0.132	0.372	0.180	0.048	5.114	101	1028	1
2	0.06	0.00	0.71	1.23	0.19	0.19	0.12	0.64	0.25	...	0.01	0.143	0.276	0.184	0.010	9.821	485	2259	1
3	0.00	0.00	0.00	0.63	0.00	0.31	0.63	0.31	0.63	...	0.00	0.137	0.137	0.000	0.000	3.537	40	191	1
4	0.00	0.00	0.00	0.63	0.00	0.31	0.63	0.31	0.63	...	0.00	0.135	0.135	0.000	0.000	3.537	40	191	1