In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.linear_model
import numpy
import pandas
import statsmodels.api
import matplotlib.pyplot as plt
In [2]:
# Read the data and plot the regression coefficients
data = pandas.read_csv("NewGrades.csv")
features = ["Exam1", "Project", "Lecture", "Quiz", "Lab"]
x = data[features]
y = data["Exam2"]
reg = sklearn.linear_model.LinearRegression()
reg.fit(x, y)
plt.bar(features, reg.coef_)
Out[2]:
<BarContainer object of 5 artists>
No description has been provided for this image
In [3]:
# Compute the score R^2 and adj R^2 of the regression
r2 = reg.score(x, y)
[r2, 1 - (1 - r2) * (len(y) - 1) / (len(y) - 5 - 1)]
Out[3]:
[0.42000633317727665, 0.4119508655825166]
In [4]:
# Manually compute the score using the formula
y_pred = reg.predict(x)
y_bar = numpy.mean(y)
(sum((y_pred - y_bar) ** 2)) / (sum((y - y_bar) ** 2))
Out[4]:
0.42000633317727676
In [5]:
# Remove one column and run the regression and compare the scores
scores = []
for col in x:
    xi = x.drop(col, axis = 1)
    reg.fit(xi, y)
    scores.append(reg.score(xi, y))
plt.bar(features, scores)
Out[5]:
<BarContainer object of 5 artists>
No description has been provided for this image
In [6]:
# Use statsmodel for the comparison (not required for P6)
model = statsmodels.api.OLS(y, statsmodels.api.add_constant(x)).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Exam2   R-squared:                       0.420
Model:                            OLS   Adj. R-squared:                  0.412
Method:                 Least Squares   F-statistic:                     52.14
Date:                Tue, 21 Nov 2023   Prob (F-statistic):           1.32e-40
Time:                        22:08:41   Log-Likelihood:                -847.96
No. Observations:                 366   AIC:                             1708.
Df Residuals:                     360   BIC:                             1731.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.6815      2.833     -1.653      0.099     -10.252       0.889
Exam1          0.3986      0.038     10.371      0.000       0.323       0.474
Project       -0.1215      0.258     -0.471      0.638      -0.629       0.386
Lecture        0.5225      0.156      3.357      0.001       0.216       0.829
Quiz           1.4005      0.262      5.340      0.000       0.885       1.916
Lab           -0.5823      0.995     -0.586      0.559      -2.538       1.373
==============================================================================
Omnibus:                       10.077   Durbin-Watson:                   0.861
Prob(Omnibus):                  0.006   Jarque-Bera (JB):               10.103
Skew:                          -0.387   Prob(JB):                      0.00640
Kurtosis:                       3.253   Cond. No.                         455.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.