# Introduction

Implementing ordinary least squares regression from scratch for 1D case. This homework is assigned on 8 February 2023.

# Setup

knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw01-ols") knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)
library(lubridate)
library(ggplot2)
data <- data %>%
na_if("..") %>%
rename(c_name = "Country Name", c_code = "Country Code", "2017" = "2017 [YR2017]", "2018" = "2018 [YR2018]") %>%
pivot_longer(cols = paste(c(1990, 2000, c(2010:2019))), names_to = "year", values_to = "gdp") %>%
filter(Series Name == "GDP (current US$)", Series Code == "NY.GDP.MKTP.CD") %>% select(-Series Name, -Series Code) %>% drop_na() %>% mutate(gdp = as.numeric(gdp), year = as.numeric(year)) %>% group_by(c_code) # Definition of OLS Regression Function on 1D • input: vectors $$\vec{x}, \vec{y}$$ • output: a list with two elements: $$m$$ and $$b$$, the slope and intercept of the regression line $$y=mx+b$$ ols_regression <- function(x, y) { x <- as.matrix(x) y <- as.matrix(y) m <- (mean(x * y) - mean(x) * mean(y)) / (mean(x^2) - mean(x)^2) b <- mean(y) - m * mean(x) return(list(m = m, b = b)) } # Demonstration of OLS Regression on USA GDP usa <- data %>% filter(c_code == "USA") %>% ungroup() %>% select(year, gdp) usa %>% ggplot(aes(x = year, y = gdp)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "GDP of the United States (with library)", x = "Year", y = "GDP", color = "blue") + scale_y_continuous(labels = scales::dollar) l_ols <- ols_regression(usa$year, usa$gdp) usa %>% ggplot(aes(x = year, y = gdp)) + geom_point() + geom_abline(intercept = l_ols$b, slope = l_ols$m, color = "red") + labs(title = "GDP of the United States (without library)", x = "Year", y = "GDP") + scale_y_continuous(labels = scales::dollar) The learned coefficients for $$y=mx+b$$ l_ols ##$m
## [1] 507558820458
##
## $b ## [1] -1.004633e+15 Training set error: $$E_{\text{train}} = \sum_{i=1}^n (y_i - \hat{y}_i)^2$$ error <- usa %>% mutate(y_hat = l_ols$m * year + l_ols\$b) %>%
mutate(error = (gdp - y_hat)^2) %>%
summarise(error = sum(error)) %>%
pull()
error
## [1] 2.213066e+24

MSE: $$MSE = \frac{1}{n} E_{\text{train}}$$

MSE <- error / nrow(usa)
MSE
## [1] 2.011879e+23