hw3: machine teaching for OLS

Ruixuan Tu ()

23 February 2023


Setup

Load packages and set working directory:

knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw03-machine-teaching-ols")
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(readxl)
library(tidyverse)
library(lubridate)
library(ggplot2)
set.seed(27021)

Functions adapted from homework 02:

gen_datasets <- function(n_datasets, len_dataset, a, b, sigma_sq, fixed) {
  dfs <- vector("list", n_datasets)
  for (i in 1:n_datasets) {
    dfs[[i]] <- gen_dataset(len_dataset, a, b, sigma_sq, fixed)
  }
  return(dfs)
}

# fit a simple OLS line y = m * x + b and return the slope m and intercept b
ols_regression_single <- function(x, y) {
  model <- lm(y ~ x)
  return(list(m = model$coefficients[2], b = model$coefficients[1]))
}

# fit an OLS line to every dataset in the list ds and collect the coefficients
ols_regression_batch <- function(ds) {
  ols_results <- vector("list", length(ds))
  for (i in seq_along(ds)) {
    ols_results[[i]] <- ols_regression_single(ds[[i]]$x, ds[[i]]$y)
  }
  ols_results_df <- tibble(
    m = map_dbl(ols_results, "m"),
    b = map_dbl(ols_results, "b")
  )
  return(ols_results_df)
}
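
As a quick sanity check of how these helpers fit together (a minimal sketch; gen_dataset is defined below in Q1, and the parameter values here are arbitrary):

# generate 10 noisy datasets of 50 points from y = 2x - 3, then fit each one
ds_demo <- gen_datasets(n_datasets = 10, len_dataset = 50,
                        a = 2, b = -3, sigma_sq = 1, fixed = FALSE)
fits_demo <- ols_regression_batch(ds_demo)
head(fits_demo)  # one (m, b) row per dataset, scattered around (2, -3)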

Q1: Synthetic Teacher

Formulas

\[ \begin{aligned} \hat{\alpha}-\alpha &= \bar{\varepsilon}-(\hat{\beta}-\beta)\bar{X} \\ \hat{\beta}-\beta &= \frac{\sum_{i=1}^{n} \varepsilon_i (X_i - \bar{X})}{n \hat{\sigma}_X^2} \end{aligned} \]

(from https://medium.com/analytics-vidhya/expectation-variance-of-ols-estimates-9acd2b48a635)
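
These identities explain the fixed branch of gen_dataset below. Let \(\delta_a\) and \(\delta_b\) denote the slope and intercept of the OLS fit of \(\varepsilon\) on \(X\) (my notation here, matching delta_a and delta_b in the code). If we generate \(y_i = (a - \delta_a) X_i + (b - \delta_b) + \varepsilon_i\) and apply the identities with \(\beta = a - \delta_a\) and \(\alpha = b - \delta_b\), then

\[ \begin{aligned} \hat{\beta} &= (a - \delta_a) + \frac{\sum_{i=1}^{n} \varepsilon_i (X_i - \bar{X})}{n \hat{\sigma}_X^2} = (a - \delta_a) + \delta_a = a, \\ \hat{\alpha} &= (b - \delta_b) + \bar{\varepsilon} - \delta_a \bar{X} = (b - \delta_b) + \delta_b = b, \end{aligned} \]

so the OLS estimate recovers the truth line exactly, regardless of the realized noise.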

Dataset Generators

gen_dataset <- function(n, a, b, sigma_sq, fixed) {
  x <- runif(n, -1, 1)
  eps <- rnorm(n, mean = 0, sd = sqrt(sigma_sq))  # sigma_sq is the noise variance
  if (fixed) {
    # fix the regression line to be the truth line:
    # regress the noise on x to find how much it would tilt and shift the OLS fit
    eps_model <- ols_regression_single(x, eps)
    delta_a <- eps_model$m  # slope induced by the noise
    delta_b <- eps_model$b  # intercept induced by the noise
    # shift the generating line by the same amounts to cancel the effect of eps
    y <- (a - delta_a) * x + (b - delta_b) + eps
  } else {
    y <- a * x + b + eps
  }
  return(tibble(x = x, y = y))
}
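
A quick check of the fixed branch (a minimal sketch with arbitrary parameter values): the OLS fit on such a dataset should recover the truth line up to floating-point error.

d_check <- gen_dataset(n = 10, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
fit_check <- ols_regression_single(d_check$x, d_check$y)
all.equal(unname(fit_check$m), 2)   # TRUE: slope matches the truth exactly
all.equal(unname(fit_check$b), -3)  # TRUE: intercept matches the truth exactly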

gen_dataset_shifted <- function(n, a, b, sigma_sq, fixed) {
  # regenerate until at least one pair of points does not lie exactly on y = ax + b
  while (TRUE) {
    ds <- gen_dataset(n, a, b, sigma_sq, fixed)
    n_pts <- ds %>% nrow()
    for (i in 1:(n_pts - 1)) {  # select the first point
      for (j in (i + 1):n_pts) {  # select the second point
        # get the regression line through the two points
        x_2pts <- c(ds$x[i], ds$x[j])
        y_2pts <- c(ds$y[i], ds$y[j])
        reg <- ols_regression_single(x_2pts, y_2pts)
        if (reg$m != a || reg$b != b) {
          # at least one pair is not on the y = ax + b line
          return(ds)
        }
      }
    }
  }
}

Experiment Function run_q1()

run_q1 <- function(n) {
  d <- gen_dataset(n, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
  model <- ols_regression_single(d$x, d$y)
  plot <- ggplot(data = d) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Synthetic Teacher: exact linear regression with n =" ~ .(n)))
  print(plot)
  ggsave(filename = paste0("q1_n", n, ".svg"))
  summary_model(model)
}

summary_model <- function(model) {
  cat("Estimated slope:", model$m, "\n")
  cat("Estimated intercept:", model$b, "\n")
  cat("True slope:", 2, "\n")
  cat("True intercept:", -3, "\n")
}

Dataset \(D_1\)

run_q1(2)

## Estimated slope: 2 
## Estimated intercept: -3 
## True slope: 2 
## True intercept: -3

Dataset \(D_2\)

run_q1(3)

## Estimated slope: 2 
## Estimated intercept: -3 
## True slope: 2 
## True intercept: -3

Q2: Pool-based Teacher

Read Data

pool <- read.table("hw3pool.txt", col.names = c("x", "y"))
pool
##           x          y
## 1  2.374862  1.0802890
## 2  2.946939  1.8986054
## 3  1.650485  1.8640327
## 4  3.270571  2.6910741
## 5  1.168345 -3.0322151
## 6  1.443327 -1.1253668
## 7  2.869272  3.3632392
## 8  2.695933  1.9037783
## 9  1.450013  2.0780165
## 10 1.638038 -0.1211598
## 11 1.978955  1.3904179
## 12 1.186760 -0.9483188
## 13 2.001471 -0.3276148
## 14 2.585486  3.1464462
## 15 2.231690  1.4246791
## 16 2.851257  3.6954195
## 17 1.659208  0.0867588
## 18 3.076964  2.5382524
## 19 3.447796  2.6038317
## 20 1.500665 -0.1878728
## 21 3.912858  3.0603483
## 22 2.474814  2.8410390
## 23 1.767409 -0.4052952
## 24 3.094247  3.5516174
## 25 2.574593  0.3965521
## 26 1.002182  1.2032982
## 27 1.828825  0.2724988
## 28 3.779303  3.9852923
## 29 3.747501  3.7052840
## 30 2.607060  2.1907988

Algorithm Implementation

# closeness: squared Euclidean distance in (slope, intercept) space between the
# target line (m, b) and the estimate (mhat, bhat)
closeness <- function(m, b, mhat, bhat) {
  (mhat - m)^2 + (bhat - b)^2
}

# generate all multi-subsets of size m from the set indexed by 1:n;
# equivalently, generate all tuples of n non-negative integers that sum to m
gen_tuples <- function(n, m) {
  if (n == 1) {
    return(list(c(m)))
  } else {
    v <- list()
    for (i in 0:m) {
      v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
    }
    return(v)
  }
}
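
For example (an illustration only, not part of the experiment), gen_tuples(3, 2) lists the multiplicity tuple of every multi-subset of size 2 drawn from 3 items, and the count agrees with the stars-and-bars formula \(\binom{n + m - 1}{m}\):

gen_tuples(3, 2)
# 6 tuples: (0,0,2), (0,1,1), (0,2,0), (1,0,1), (1,1,0), (2,0,0)
length(gen_tuples(3, 2)) == choose(3 + 2 - 1, 2)  # TRUE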

# select the best multi-subset of size n from the dataset df with variables x and y
select_data <- function(df, n) {
  # generate all multi-subsets of size n
  tuples <- gen_tuples(nrow(df), n)
  # generate the multi-subsets
  # an element v_i in tuple v means the number of times the i-th point is selected
  multisets <- map(tuples, function(v) {
    multiset <- list(x = c(), y = c())
    for (i in seq_along(v)) {
      multiset$x <- c(multiset$x, rep(df$x[i], v[i]))
      multiset$y <- c(multiset$y, rep(df$y[i], v[i]))
    }
    return(multiset)
  })
  cat("Number of multi-subsets:", length(multisets), "\n")
  # calculate the closeness of each multi-subset
  closenesses <- map_dbl(multisets, function(s) {
    reg <- ols_regression_single(s$x, s$y)
    return(closeness(2, -3, reg$m, reg$b))
  })
  # drop NA values: if all n selected points coincide, the regression slope is
  # undefined and the closeness is NA
  min_closeness <- min(closenesses, na.rm = TRUE)
  if (is.infinite(min_closeness)) {
    # every multi-subset produced NA, so there is no valid selection
    return(list(
      tuple = NULL,
      multiset = NULL,
      closeness = NA
    ))
  }
  # return the best tuple(s) and the corresponding closeness
  best_idx <- which(closenesses == min_closeness)
  return(list(
    tuple = tuples[best_idx],
    multiset = multisets[best_idx],
    closeness = min_closeness
  ))
}
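
A minimal usage sketch (hypothetical call, not part of the assignment output): restricting the pool to its first five points keeps the enumeration small while still exercising the full search.

best_demo <- select_data(head(pool, 5), n = 2)
best_demo$closeness      # squared distance of the best two-point fit from (2, -3)
best_demo$multiset[[1]]  # the x and y values of the selected points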

Part 1

run_q2_part1 <- function() {
  start_time <- Sys.time()
  model <- ols_regression_single(pool$x, pool$y)
  plot <- ggplot(data = pool) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Pool-based Teacher: linear regression on all" ~ .(nrow(pool)) ~ "points"))
  print(plot)
  ggsave(filename = "q2_all.svg")
  summary_model(model)
  cat("Closeness:", closeness(2, -3, model$m, model$b), "\n")
  end_time <- Sys.time()
  cat("Time elapsed (second):", end_time - start_time, "\n")
}

run_q2_part1()