hw3: machine teaching for OLS

Ruixuan Tu ()

23 February 2023


Setup

Load packages and set working directory:

knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw03-machine-teaching-ols")
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(readxl)
library(tidyverse)
library(lubridate)
library(ggplot2)
set.seed(27021)

Functions adapted from homework 02:

gen_datasets <- function(n_datasets, len_dataset, a, b, sigma_sq, fixed) {
  # pre-allocate a list with one slot per dataset, then fill each slot
  dfs <- vector("list", n_datasets)
  for (i in 1:n_datasets) {
    dfs[[i]] <- gen_dataset(len_dataset, a, b, sigma_sq, fixed)
  }
  return(dfs)
}

ols_regression_single <- function(x, y) {
  # fit y ~ x by OLS and return the slope (m) and intercept (b)
  model <- lm(y ~ x)
  return(list(m = model$coefficients[2], b = model$coefficients[1]))
}

ols_regression_batch <- function(ds) {
  # fit OLS to every dataset in the list ds and collect the coefficients
  n <- length(ds)
  ols_results <- vector("list", n)
  for (i in 1:n) {
    ols_results[[i]] <- ols_regression_single(ds[[i]]$x, ds[[i]]$y)
  }
  ols_results_df <- tibble(
    m = map_dbl(ols_results, "m"),
    b = map_dbl(ols_results, "b")
  )
  return(ols_results_df)
}

Q1: Synthetic Teacher

Formulas

\[ \begin{aligned} \hat{\alpha}-\alpha &= \bar{\varepsilon}-(\hat{\beta}-\beta)\bar{X} \\ \hat{\beta}-\beta &= \frac{\sum_{i=1}^{n} \varepsilon_i (X_i - \bar{X})}{n \hat{\sigma}_X^2} \end{aligned} \]

(from https://medium.com/analytics-vidhya/expectation-variance-of-ols-estimates-9acd2b48a635)
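
These identities can be verified numerically; below is a minimal sketch (not part of the homework code), assuming \(\hat{\sigma}_X^2\) denotes the biased sample variance \(\frac{1}{n}\sum_{i=1}^{n}(X_i-\bar{X})^2\):

# numerical check of the two identities on one simulated dataset (illustrative only)
n <- 50
alpha <- -3; beta <- 2
x <- runif(n, -1, 1)
eps <- rnorm(n)
y <- alpha + beta * x + eps
fit <- lm(y ~ x)
alpha_hat <- unname(coef(fit)[1])
beta_hat <- unname(coef(fit)[2])
# right-hand sides of the identities above
rhs_beta <- sum(eps * (x - mean(x))) / (n * mean((x - mean(x))^2))
rhs_alpha <- mean(eps) - (beta_hat - beta) * mean(x)
all.equal(beta_hat - beta, rhs_beta)     # TRUE up to floating-point error
all.equal(alpha_hat - alpha, rhs_alpha)  # TRUE up to floating-point error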

Dataset Generators

gen_dataset <- function(n, a, b, sigma_sq, fixed) {
  x <- runif(n, -1, 1)
  eps <- rnorm(n, mean = 0, sd = sqrt(sigma_sq))  # sigma_sq is a variance; rnorm() takes a standard deviation
  if (fixed) {
    # fix the regression line to be the truth line: regress the noise on x,
    # then subtract that fitted component so the OLS fit of y on x is exactly y = a * x + b
    eps_model <- ols_regression_single(x, eps)
    delta_a <- eps_model$m  # slope of the noise's own OLS fit
    delta_b <- eps_model$b  # intercept of the noise's own OLS fit
    # eliminate the effect of eps
    y <- (a - delta_a) * x + b - delta_b + eps
  } else {
    y <- a * x + b + eps
  }
  return(tibble(x = x, y = y))
}

gen_dataset_shifted <- function(n, a, b, sigma_sq, fixed) {
  # regenerate until at least one pair of points does not lie exactly on y = a * x + b
  while (TRUE) {
    ds <- gen_dataset(n, a, b, sigma_sq, fixed)
    n_pts <- ds %>% nrow()
    for (i in 1:(n_pts - 1)) {  # select the first point
      for (j in (i + 1):n_pts) {  # select the second point
        # get the regression line through the two points
        x_2pts <- c(ds$x[i], ds$x[j])
        y_2pts <- c(ds$y[i], ds$y[j])
        reg <- ols_regression_single(x_2pts, y_2pts)
        if (reg$m != a || reg$b != b) {
          # this pair is not on the y = a * x + b line
          return(ds)
        }
      }
    }
  }
}
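
As a quick illustration (not part of the graded output), the generators can be exercised directly: with fixed = TRUE the OLS estimate recovers the truth exactly, since the component of the noise explained by \(x\) has been subtracted out.

# illustrative sketch: with fixed = TRUE the OLS fit recovers (a, b) exactly
d_check <- gen_dataset(10, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
fit_check <- ols_regression_single(d_check$x, d_check$y)
c(slope = unname(fit_check$m), intercept = unname(fit_check$b))  # 2 and -3, up to floating-point error

# the homework-02 helpers compose the same way over many datasets
fits <- ols_regression_batch(gen_datasets(100, 20, a = 2, b = -3, sigma_sq = 1, fixed = FALSE))
summary(fits$m)  # slopes scatter around the true value 2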

Experiment Function run_q1()

run_q1 <- function(n) {
  d <- gen_dataset(n, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
  model <- ols_regression_single(d$x, d$y)
  plot <- ggplot(data = d) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Synthetic Teacher: exact linear regression with n =" ~ .(n)))
  print(plot)
  ggsave(filename = paste0("q1_n", n, ".svg"))
  summary_model(model)
}

summary_model <- function(model) {
  cat("Estimated slope:", model$m, "\n")
  cat("Estimated intercept:", model$b, "\n")
  cat("True slope:", 2, "\n")
  cat("True intercept:", -3, "\n")
}

Dataset \(D_1\)

run_q1(2)

## Estimated slope: 2 
## Estimated intercept: -3 
## True slope: 2 
## True intercept: -3

Dataset \(D_2\)

run_q1(3)

## Estimated slope: 2 
## Estimated intercept: -3 
## True slope: 2 
## True intercept: -3

Q2: Pool-based Teacher

Read Data

pool <- read.table("hw3pool.txt", col.names = c("x", "y"))
pool
##           x          y
## 1  2.374862  1.0802890
## 2  2.946939  1.8986054
## 3  1.650485  1.8640327
## 4  3.270571  2.6910741
## 5  1.168345 -3.0322151
## 6  1.443327 -1.1253668
## 7  2.869272  3.3632392
## 8  2.695933  1.9037783
## 9  1.450013  2.0780165
## 10 1.638038 -0.1211598
## 11 1.978955  1.3904179
## 12 1.186760 -0.9483188
## 13 2.001471 -0.3276148
## 14 2.585486  3.1464462
## 15 2.231690  1.4246791
## 16 2.851257  3.6954195
## 17 1.659208  0.0867588
## 18 3.076964  2.5382524
## 19 3.447796  2.6038317
## 20 1.500665 -0.1878728
## 21 3.912858  3.0603483
## 22 2.474814  2.8410390
## 23 1.767409 -0.4052952
## 24 3.094247  3.5516174
## 25 2.574593  0.3965521
## 26 1.002182  1.2032982
## 27 1.828825  0.2724988
## 28 3.779303  3.9852923
## 29 3.747501  3.7052840
## 30 2.607060  2.1907988

Algorithm Implementation

# closeness: squared Euclidean distance between the estimated coefficients
# (mhat, bhat) and the true coefficients (m, b)
closeness <- function(m, b, mhat, bhat) {
  (mhat - m)^2 + (bhat - b)^2
}

# generate the set of all multi-subsets of size m from a set of size n given by index 1:n
# an equivalent problem: generate the set of all tuples of n non-negative integers with sum m
gen_tuples <- function(n, m) {
  if (n == 1) {
    return(list(c(m)))
  } else {
    v <- list()
    for (i in 0:m) {
      v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
    }
    return(v)
  }
}

# select the best multi-subset of size n from the dataset df with variables x and y
select_data <- function(df, n) {
  # generate all multi-subsets of size n
  tuples <- gen_tuples(nrow(df), n)
  # generate the multi-subsets
  # an element v_i in tuple v means the number of times the i-th point is selected
  multisets <- map(tuples, function(v) {
    multiset <- list(x = c(), y = c())
    for (i in seq_along(v)) {
      multiset$x <- c(multiset$x, rep(df$x[i], v[i]))
      multiset$y <- c(multiset$y, rep(df$y[i], v[i]))
    }
    return(multiset)
  })
  cat("Number of multi-subsets:", length(multisets), "\n")
  # calculate the closeness of each multi-subset
  closenesses <- map_dbl(multisets, function(s) {
    reg <- ols_regression_single(s$x, s$y)
    return(closeness(2, -3, reg$m, reg$b))
  })
  # drop NA values: if the n selected points are all copies of the same point,
  # the regression slope is undefined and the closeness is NA
  min_closeness <- min(closenesses, na.rm = TRUE)
  if (!is.finite(min_closeness)) {  # every closeness was NA, so min() returned Inf
    return(list(
      tuple = NULL,
      multiset = NULL,
      closeness = NA
    ))
  }
  # return the best tuple and its closeness
  best_idx <- which(closenesses == min_closeness)
  return(list(
    tuple = tuples[best_idx],
    multiset = multisets[best_idx],
    closeness = min_closeness
  ))
}
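
As a sanity check (illustrative only), the number of tuples produced by gen_tuples matches the stars-and-bars count \({m+n-1\choose n-1}\) derived in the Discussion below:

# illustrative check: gen_tuples(n, m) enumerates all n-tuples of
# non-negative integers summing to m, so its length equals choose(m + n - 1, n - 1)
length(gen_tuples(3, 2))    # 6
choose(2 + 3 - 1, 3 - 1)    # 6
length(gen_tuples(30, 2))   # 465, matching the "Number of multi-subsets" printed below
choose(2 + 30 - 1, 30 - 1)  # 465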

Part 1

run_q2_part1 <- function() {
  start_time <- Sys.time()
  model <- ols_regression_single(pool$x, pool$y)
  plot <- ggplot(data = pool) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Pool-based Teacher: linear regression on all" ~ .(nrow(pool)) ~ "points"))
  print(plot)
  ggsave(filename = "q2_all.svg")
  summary_model(model)
  cat("Closeness:", closeness(2, -3, model$m, model$b), "\n")
  end_time <- Sys.time()
  cat("Time elapsed (second):", end_time - start_time, "\n")
}

run_q2_part1()

## Estimated slope: 1.637053 
## Estimated intercept: -2.37024 
## True slope: 2 
## True intercept: -3 
## Closeness: 0.5283286 
## Time elapsed (second): 0.3014269

Experiment Function run_q2()

run_q2 <- function(n) {
  start_time <- Sys.time()
  d <- select_data(pool, n)
  cat("\nBest dataset (use the first one):\n")
  print(d)
  df_selected <- tibble(
    x = d$multiset[[1]]$x,
    y = d$multiset[[1]]$y
  )
  # pool points that were not selected (matched by their coordinates)
  df_non_selected <- pool %>% filter(!(x %in% df_selected$x & y %in% df_selected$y))
  model <- ols_regression_single(d$multiset[[1]]$x, d$multiset[[1]]$y)
  cat("\nModel of linear regression on the best dataset:\n")
  print(model)
  plot <- ggplot() +
    geom_point(data = df_selected, aes(x = x, y = y, color = "selected points")) +
    geom_point(data = df_non_selected, aes(x = x, y = y, color = "non-selected points")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Pool-based Teacher: linear regression on best" ~ .(n) ~ "points"))
  print(plot)
  ggsave(filename = paste0("q2_n", n, ".svg"))
  end_time <- Sys.time()
  cat("Time elapsed (second):", end_time - start_time, "\n")
}

Part 2

run_q2(2)
## Number of multi-subsets: 465 
## 
## Best dataset (use the first one):
## $tuple
## $tuple[[1]]
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 
## 
## $multiset
## $multiset[[1]]
## $multiset[[1]]$x
## [1] 2.23169 2.60706
## 
## $multiset[[1]]$y
## [1] 1.424679 2.190799
## 
## 
## 
## $closeness
## [1] 0.01861329
## 
## 
## Model of linear regression on the best dataset:
## $m
##       x 
## 2.04097 
## 
## $b
## (Intercept) 
##   -3.130133

## Time elapsed (second): 0.516551

Part 3

run_q2(3)
## Number of multi-subsets: 4960 
## 
## Best dataset (use the first one):
## $tuple
## $tuple[[1]]
##  [1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 
## 
## $multiset
## $multiset[[1]]
## $multiset[[1]]$x
## [1] 1.186760 2.851257 3.779303
## 
## $multiset[[1]]$y
## [1] -0.9483188  3.6954195  3.9852923
## 
## 
## 
## $closeness
## [1] 0.0002014267
## 
## 
## Model of linear regression on the best dataset:
## $m
##        x 
## 2.008008 
## 
## $b
## (Intercept) 
##   -2.988283

## Time elapsed (second): 1.864729

Discussion

\[ \begin{aligned} \ell(D) &\coloneqq (\hat{a}-a)^2 + (\hat{b}-b)^2 \\ \ell(P) &= 0.5283286 \\ \ell(D_3) &= 0.01861329 \\ \ell(D_4) &= 0.0002014267 \end{aligned} \]

where \(P\) is the whole 30-point pool (Part 1) and \(D_3\), \(D_4\) are the best multi-subsets of sizes 2 and 3 selected in Parts 2 and 3, respectively.

I find that, compared with fitting on all 30 points of \(P\), we can pick a multi-subset with a smaller closeness, which means that the linear regression model fitted on the multi-subset is closer to the truth.

However, the closeness \(\ell\), i.e., the loss, tends to be smaller when we are allowed to pick more points, although this is not guaranteed. Suppose some \(n\) points fit the regression line perfectly; extending those same \(n\) points by one extra point might break the perfect fit, because no remaining point may lie exactly on the line. The selection of \(n+1\) points, however, is free to use other points that were not originally picked rather than merely extending the original \(n\), so picking more points usually helps.

This is reflected in the observed relation \(\ell(P)>\ell(D_3)>\ell(D_4)\).

\[ \begin{aligned} T(D) &\coloneqq \text{time to select and model on a dataset $D$} \\ T(P) &= 0.3 \\ T(D_3) &= 0.5 \\ T(D_4) &= 1.8 \end{aligned} \]

The measured times also include plotting, printing, and saving, but this overhead can be ignored since we only compare the elapsed times relative to each other.
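
To exclude that overhead entirely, one could time the selection step alone, e.g. with system.time() (a minimal sketch, not rerun here):

# time only the multi-subset selection, excluding plotting/printing/saving
system.time(select_data(pool, 3))["elapsed"]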

I find that the multi-subset selection takes the most time, as we need to compute the closeness of every multi-subset. Picking a multi-subset of \(m\) points from all \(n\) points is equivalent to generating the set of all tuples of \(n\) non-negative integers with sum \(m\), where element \(v_i\) of a tuple is the number of times the \(i\)-th point is selected. Applying combinatorics (stars and bars), there are \({m+n-1\choose n-1}\) such tuples.

To illustrate the combinatorial result, suppose we have \(m+n-1\) elements as depicted below:

\[ e_1, \dots, e_m, e_{m+1}, \dots, e_{m+n-1} \]

Then we choose \(n-1\) of the \(e_i\)’s to be separators and set the remaining \(m\) of them to be \(1\); reading off the group sizes between separators gives \(n\) non-negative integers with sum \(m\). One instance is depicted below for \(n=4, m=3\):

\[ \begin{aligned} \text{Pattern: } & e_1, & e_2, & e_3, & e_4, & e_5, & e_6 \\ \text{Instance: } & 1, & |, & 1, & 1, & |, & | \\ \end{aligned} \]

From this instance, we read off the tuple \(v=(1, 2, 0, 0)\), meaning the generated multi-subset has one copy of the \(1\)-st point, two copies of the \(2\)-nd point, and zero copies of the \(3\)-rd and \(4\)-th points.

Expanding the binomial coefficient, \({m+n-1\choose n-1}=\frac{(m+n-1)!}{m!\,(n-1)!}\). This count grows combinatorially with both the pool size and the number of points selected, so the exhaustive multi-subset selection quickly becomes slow.
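
To see this growth concretely, the count can be evaluated for the 30-point pool (a small sketch; only the \(m=2\) and \(m=3\) values appear in the runs above):

# number of size-m multi-subsets drawn from the nrow(pool) = 30 pool points
sapply(2:5, function(m) choose(m + nrow(pool) - 1, nrow(pool) - 1))  # 465 4960 40920 278256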

This is consistent with the observed relation \(T(P)<T(D_3)<T(D_4)\).