In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.model_selection
import sklearn.svm
import sklearn.metrics
import numpy
import pandas
In [2]:
# Load the spam dataset; the file has no header row, so pandas numbers the columns itself
data = pandas.read_csv(filepath_or_buffer="spambase.data", header=None)
data.head(5)
Out[2]:
0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57
0 0.00 0.64 0.64 0.0 0.32 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.000 0.0 0.778 0.000 0.000 3.756 61 278 1
1 0.21 0.28 0.50 0.0 0.14 0.28 0.21 0.07 0.00 0.94 ... 0.00 0.132 0.0 0.372 0.180 0.048 5.114 101 1028 1
2 0.06 0.00 0.71 0.0 1.23 0.19 0.19 0.12 0.64 0.25 ... 0.01 0.143 0.0 0.276 0.184 0.010 9.821 485 2259 1
3 0.00 0.00 0.00 0.0 0.63 0.00 0.31 0.63 0.31 0.63 ... 0.00 0.137 0.0 0.137 0.000 0.000 3.537 40 191 1
4 0.00 0.00 0.00 0.0 0.63 0.00 0.31 0.63 0.31 0.63 ... 0.00 0.135 0.0 0.135 0.000 0.000 3.537 40 191 1

5 rows × 58 columns

In [3]:
# Columns 0 through 56 are the features (x); column 57 is the label (y)
x = data.loc[:, :56]
y = data.loc[:, 57]
print(*(table.shape for table in (data, x, y)))
(4601, 58) (4601, 57) (4601,)
In [4]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# No random_state is passed, so the rows land in different sets on each run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    test_size=0.2,
    train_size=0.8,
)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))
(3680, 57) (921, 57) (3680,) (921,)
In [5]:
# Repeat the 80/20 split with a fixed seed (random_state = 42) so that
# every run of this notebook produces the same train and test sets.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))
(3680, 57) (921, 57) (3680,) (921,)
In [6]:
# Fit a model (here a linear support vector machine) on the training set,
# then predict labels for the held-out test set.
# dual = "auto" lets scikit-learn pick the primal or dual formulation itself.
model = sklearn.svm.LinearSVC(dual = "auto")
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Preview the first 25 predictions next to the first 25 true labels.
# .iloc slices y_test by position (it keeps its original row labels after the split),
# and .tolist() converts NumPy integers to plain Python ints so the lists
# display as 0/1 instead of np.int64(...) on newer NumPy versions.
[y_pred[:25].tolist(), y_test.iloc[:25].tolist()]
Out[6]:
[[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1],
 [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]]
In [7]:
# Confusion matrix of the model on the test set: true labels against predicted labels
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
Out[7]:
array([[506,  25],
       [ 50, 340]], dtype=int64)
In [8]:
# Reference point: scoring the true labels against themselves shows what a perfect prediction looks like
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_test)
Out[8]:
array([[531,   0],
       [  0, 390]], dtype=int64)
In [9]:
# Baseline: compare with a random guess (0 or 1, each with probability one half).
# A seeded generator is used so this baseline matrix is the same on every run,
# matching the fixed seed used for the train-test split.
rng = numpy.random.default_rng(42)
random_pred = rng.integers(2, size = len(y_test))
sklearn.metrics.confusion_matrix(y_test, random_pred)
Out[9]:
array([[278, 253],
       [192, 198]], dtype=int64)