Load packages and set working directory:
knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw03-machine-teaching-ols")
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(readxl)
library(tidyverse)
library(lubridate)
library(ggplot2)
set.seed(27021)
Functions adapted from homework 02:
gen_datasets <- function(n_datasets, len_dataset, a, b, sigma_sq, fixed) {
  dfs <- vector("list", n_datasets)
  for (i in 1:n_datasets) {
    dfs[[i]] <- gen_dataset(len_dataset, a, b, sigma_sq, fixed)
  }
  return(dfs)
}
ols_regression_single <- function(x, y) {
  model <- lm(y ~ x)
  return(list(m = model$coefficients[2], b = model$coefficients[1]))
}
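A quick sanity check of ols_regression_single on a hand-picked noiseless line (the three points below are illustrative only, not part of the assignment data):
# three points lying exactly on y = 2x - 3
x_check <- c(-1, 0, 1)
y_check <- 2 * x_check - 3
ols_regression_single(x_check, y_check) # expect m = 2, b = -3 (up to floating point)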
ols_regression_batch <- function(ds) {
  n_datasets <- length(ds)
  ols_results <- vector("list", n_datasets)
  for (i in 1:n_datasets) {
    ols_results[[i]] <- ols_regression_single(ds[[i]]$x, ds[[i]]$y)
  }
  ols_results_df <- tibble(
    m = map_dbl(ols_results, "m"),
    b = map_dbl(ols_results, "b")
  )
  return(ols_results_df)
}
\[ \begin{aligned} \hat{\alpha}-\alpha &= \bar{\varepsilon}-(\hat{\beta}-\beta)\bar{X} \\ \hat{\beta}-\beta &= \frac{\sum_{i=1}^{n} \varepsilon_i (X_i - \bar{X})}{n \hat{\sigma}_X^2} \end{aligned} \]
(from https://medium.com/analytics-vidhya/expectation-variance-of-ols-estimates-9acd2b48a635)
Here $\alpha$ is the intercept and $\beta$ the slope. These two error terms are exactly the intercept and slope obtained by regressing the noise $\varepsilon$ on $X$, so subtracting them from the truth-line coefficients (as gen_dataset does below when fixed = TRUE) makes the OLS fit coincide with the truth line.
gen_dataset <- function(n, a, b, sigma_sq, fixed) {
  x <- runif(n, -1, 1)
  eps <- rnorm(n, mean = 0, sd = sqrt(sigma_sq)) # sigma_sq is a variance, so pass its square root as sd
  if (fixed) {
    # fix the regression line to be the truth line:
    # regress the noise on x to get the estimation errors derived above
    eps_model <- ols_regression_single(x, eps)
    delta_a <- eps_model$m # slope error (beta-hat minus beta)
    delta_b <- eps_model$b # intercept error (alpha-hat minus alpha)
    # subtract the errors so the OLS fit of y on x is exactly y = ax + b
    y <- (a - delta_a) * x + b - delta_b + eps
  } else {
    y <- a * x + b + eps
  }
  return(tibble(x = x, y = y))
}
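A small check of the cancellation (the parameters mirror the a = 2, b = -3 truth line used throughout; the dataset sizes here are arbitrary): with fixed = TRUE the fitted slope and intercept should equal the truth up to floating-point error, regardless of the noise draw.
d_check <- gen_dataset(10, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
ols_regression_single(d_check$x, d_check$y) # expect m = 2, b = -3
# the same holds dataset-by-dataset for a batch
ds_check <- gen_datasets(5, 10, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
ols_regression_batch(ds_check) # every row should be (2, -3) up to rounding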
gen_dataset_shifted <- function(n, a, b, sigma_sq, fixed) {
  # regenerate until the dataset contains at least one pair of points whose
  # two-point regression line differs from the truth line y = ax + b
  while (TRUE) {
    ds <- gen_dataset(n, a, b, sigma_sq, fixed)
    n_pts <- ds %>% nrow()
    for (i in 1:(n_pts - 1)) { # select the first point
      for (j in (i + 1):n_pts) { # select the second point
        # get the regression line through the two points
        x_2pts <- c(ds$x[i], ds$x[j])
        y_2pts <- c(ds$y[i], ds$y[j])
        reg <- ols_regression_single(x_2pts, y_2pts)
        if (reg$m != a || reg$b != b) {
          # this pair is not on the y = ax + b line
          return(ds)
        }
      }
    }
  }
}
run_q1 <- function(n) {
  d <- gen_dataset(n, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
  model <- ols_regression_single(d$x, d$y)
  plot <- ggplot(data = d) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Synthetic Teacher: exact linear regression with n =" ~ .(n)))
  print(plot)
  ggsave(filename = paste0("q1_n", n, ".svg"))
  summary_model(model)
}
summary_model <- function(model) {
  cat("Estimated slope:", model$m, "\n")
  cat("Estimated intercept:", model$b, "\n")
  cat("True slope:", 2, "\n")
  cat("True intercept:", -3, "\n")
}
run_q1()
## Estimated slope: 2
## Estimated intercept: -3
## True slope: 2
## True intercept: -3
The pool of 30 candidate points from which the pool-based teacher selects (used as pool in Question 2 below):
## x y
## 1 2.374862 1.0802890
## 2 2.946939 1.8986054
## 3 1.650485 1.8640327
## 4 3.270571 2.6910741
## 5 1.168345 -3.0322151
## 6 1.443327 -1.1253668
## 7 2.869272 3.3632392
## 8 2.695933 1.9037783
## 9 1.450013 2.0780165
## 10 1.638038 -0.1211598
## 11 1.978955 1.3904179
## 12 1.186760 -0.9483188
## 13 2.001471 -0.3276148
## 14 2.585486 3.1464462
## 15 2.231690 1.4246791
## 16 2.851257 3.6954195
## 17 1.659208 0.0867588
## 18 3.076964 2.5382524
## 19 3.447796 2.6038317
## 20 1.500665 -0.1878728
## 21 3.912858 3.0603483
## 22 2.474814 2.8410390
## 23 1.767409 -0.4052952
## 24 3.094247 3.5516174
## 25 2.574593 0.3965521
## 26 1.002182 1.2032982
## 27 1.828825 0.2724988
## 28 3.779303 3.9852923
## 29 3.747501 3.7052840
## 30 2.607060 2.1907988
# closeness: squared Euclidean distance between the estimated (slope, intercept)
# and the truth (m, b); smaller is better
closeness <- function(m, b, mhat, bhat) {
  (mhat - m)^2 + (bhat - b)^2
}
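A worked example with illustrative numbers: an estimate of slope 2.1 and intercept -2.8 against the truth (2, -3) has closeness 0.1^2 + 0.2^2 = 0.05.
closeness(2, -3, 2.1, -2.8) # 0.05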
# generate all multi-subsets of size m from a set of size n, indexed 1:n
# equivalent problem: generate all tuples of n non-negative integers that sum to m
gen_tuples <- function(n, m) {
  if (n == 1) {
    return(list(c(m)))
  } else {
    v <- list()
    for (i in 0:m) {
      v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
    }
    return(v)
  }
}
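For illustration, all tuples of 3 non-negative integers summing to 2, i.e. all multi-subsets of size 2 drawn from 3 points; there are choose(3 + 2 - 1, 2) = 6 of them.
gen_tuples(3, 2)
# (0,0,2) (0,1,1) (0,2,0) (1,0,1) (1,1,0) (2,0,0)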
# select the best multi-subset of size n from the dataset df (columns x and y),
# i.e. the one whose OLS fit is closest to the truth line
select_data <- function(df, n) {
  # generate all multi-subsets of size n
  tuples <- gen_tuples(nrow(df), n)
  # materialize the multi-subsets:
  # element v_i of a tuple v is the number of times the i-th point is selected
  multisets <- map(tuples, function(v) {
    multiset <- list(x = c(), y = c())
    for (i in seq_along(v)) {
      multiset$x <- c(multiset$x, rep(df$x[i], v[i]))
      multiset$y <- c(multiset$y, rep(df$y[i], v[i]))
    }
    return(multiset)
  })
  cat("Number of multi-subsets:", length(multisets), "\n")
  # calculate the closeness of each multi-subset
  closenesses <- map_dbl(multisets, function(s) {
    reg <- ols_regression_single(s$x, s$y)
    return(closeness(2, -3, reg$m, reg$b))
  })
  # a multi-subset made of n copies of a single point has an undefined slope,
  # so its closeness is NA; ignore NAs when taking the minimum
  if (all(is.na(closenesses))) {
    return(list(tuple = NULL, multiset = NULL, closeness = NA))
  }
  min_closeness <- min(closenesses, na.rm = TRUE)
  # return the best tuple(s) and the minimal closeness
  best_idx <- which(closenesses == min_closeness)
  return(list(
    tuple = tuples[best_idx],
    multiset = multisets[best_idx],
    closeness = min_closeness
  ))
}
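run_q2_part1 below uses a data frame named pool that is not constructed in this section; it is presumably read from the provided pool of candidate points (readxl is loaded at the top for this purpose). A minimal sketch, assuming a spreadsheet named pool.xlsx with columns x and y (the file name is a placeholder, not the actual assignment file):
# hypothetical loading step for the candidate pool
pool <- read_excel("pool.xlsx")
stopifnot(all(c("x", "y") %in% names(pool)))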
run_q2_part1 <- function() {
  start_time <- Sys.time()
  # pool is the candidate pool loaded above
  model <- ols_regression_single(pool$x, pool$y)
  plot <- ggplot(data = pool) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Pool-based Teacher: linear regression on all" ~ .(nrow(pool)) ~ "points"))
  print(plot)
  ggsave(filename = "q2_all.svg")
  summary_model(model)
  cat("Closeness:", closeness(2, -3, model$m, model$b), "\n")
  end_time <- Sys.time()
  cat("Time elapsed (seconds):", as.numeric(difftime(end_time, start_time, units = "secs")), "\n")
}
run_q2_part1()