In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.model_selection
import sklearn.svm
import sklearn.metrics
import numpy
import pandas
In [2]:
# Read the spambase dataset (the file has no header row)
data = pandas.read_csv('spambase.data', header = None)
data.head()
Out[2]:
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00 | 0.64 | 0.64 | 0.0 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.000 | 0.0 | 0.778 | 0.000 | 0.000 | 3.756 | 61 | 278 | 1 |
| 1 | 0.21 | 0.28 | 0.50 | 0.0 | 0.14 | 0.28 | 0.21 | 0.07 | 0.00 | 0.94 | ... | 0.00 | 0.132 | 0.0 | 0.372 | 0.180 | 0.048 | 5.114 | 101 | 1028 | 1 |
| 2 | 0.06 | 0.00 | 0.71 | 0.0 | 1.23 | 0.19 | 0.19 | 0.12 | 0.64 | 0.25 | ... | 0.01 | 0.143 | 0.0 | 0.276 | 0.184 | 0.010 | 9.821 | 485 | 2259 | 1 |
| 3 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.137 | 0.0 | 0.137 | 0.000 | 0.000 | 3.537 | 40 | 191 | 1 |
| 4 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.135 | 0.0 | 0.135 | 0.000 | 0.000 | 3.537 | 40 | 191 | 1 |

5 rows × 58 columns
In [3]:
# Split into features x (columns 0-56) and labels y (column 57: 1 = spam, 0 = not spam)
x, y = data.loc[:, 0:56], data[57]
print(data.shape, x.shape, y.shape)
(4601, 58) (4601, 57) (4601,)
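Before splitting, it can help to see how many spam and non-spam emails the dataset contains, since the class balance affects how the confusion matrices below should be read. A minimal check (output not shown here):

# Count labels: in spambase, 1 marks spam and 0 marks non-spam
y.value_counts()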
In [4]:
# Split into train and test sets (80% train, 20% test)
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, train_size = 0.8, test_size = 0.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
(3680, 57) (921, 57) (3680,) (921,)
In [5]:
# Make the train-test split reproducible across runs by setting random_state = 42
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, random_state = 42, train_size = 0.8, test_size = 0.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
(3680, 57) (921, 57) (3680,) (921,)
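If the spam/non-spam ratio should be preserved in both splits, train_test_split also accepts a stratify argument. A possible variant, not used in the rest of this notebook (the _s variable names are just illustrative):

# Stratify on y so train and test keep the same spam proportion
x_train_s, x_test_s, y_train_s, y_test_s = sklearn.model_selection.train_test_split(
    x, y, random_state = 42, train_size = 0.8, test_size = 0.2, stratify = y)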
In [6]:
# Fit a model, for example a linear support vector machine (LinearSVC)
model = sklearn.svm.LinearSVC(dual = "auto")
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
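# Compare the first 25 predictions with the true test labels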
[list(y_pred[0:25]), list(y_test[0:25])]
Out[6]:
[[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]]
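Eyeballing the first 25 predictions only goes so far; the overall test accuracy summarizes the whole test set in one number. A short sketch (output not reproduced here):

# Fraction of test emails classified correctly
sklearn.metrics.accuracy_score(y_test, y_pred)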
In [7]:
# Compute the confusion matrix
sklearn.metrics.confusion_matrix(y_test, y_pred)
Out[7]:
array([[506,  25],
       [ 50, 340]], dtype=int64)
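In scikit-learn's convention the rows of the confusion matrix are the true labels and the columns are the predicted labels, so the four entries can be unpacked into true/false negatives and positives. A sketch deriving precision and recall from them (the variable names are illustrative):

# Unpack the 2x2 matrix: row = true label, column = predicted label
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_test, y_pred).ravel()
precision = tp / (tp + fp)  # of the emails flagged as spam, how many really are spam
recall = tp / (tp + fn)     # of the real spam, how much was caught
print(tn, fp, fn, tp, precision, recall)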
In [8]:
# Compare with a perfect prediction (the true labels against themselves)
sklearn.metrics.confusion_matrix(y_test, y_test)
Out[8]:
array([[531,   0],
       [  0, 390]], dtype=int64)
In [9]:
# Compare with a random prediction (0 or 1, each with probability 0.5)
sklearn.metrics.confusion_matrix(y_test, numpy.random.randint(2, size = (len(y_test))))
Out[9]:
array([[278, 253],
       [192, 198]], dtype=int64)
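Rather than comparing raw matrices by eye, sklearn.metrics.classification_report gives per-class precision, recall, and F1 for the fitted model in one call. A sketch (output not reproduced here):

# Per-class precision, recall, and F1 for the linear SVM predictions
print(sklearn.metrics.classification_report(y_test, y_pred))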