hw3: machine teaching for OLS

Ruixuan Tu ()

23 February 2023


Setup

Load packages and set working directory:

knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw03-machine-teaching-ols")
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(readxl)
library(tidyverse)
library(lubridate)
library(ggplot2)
set.seed(27021)

Functions adapted from homework 02:

gen_datasets <- function(n_datasets, len_dataset, a, b, sigma_sq, fixed) {
  # pre-allocate a list with one slot per dataset, then fill each slot
  dfs <- vector("list", n_datasets)
  for (i in 1:n_datasets) {
    dfs[[i]] <- gen_dataset(len_dataset, a, b, sigma_sq, fixed)
  }
  return(dfs)
}

ols_regression_single <- function(x, y) {
  # fit y ~ x by OLS and return the slope (m) and intercept (b)
  model <- lm(y ~ x)
  return(list(m = model$coefficients[2], b = model$coefficients[1]))
}

ols_regression_batch <- function(ds) {
  # fit OLS to every dataset in the list ds and collect the coefficients
  n <- length(ds)
  ols_results <- vector("list", n)
  for (i in 1:n) {
    ols_results[[i]] <- ols_regression_single(ds[[i]]$x, ds[[i]]$y)
  }
  ols_results_df <- tibble(
    m = map_dbl(ols_results, "m"),
    b = map_dbl(ols_results, "b")
  )
  return(ols_results_df)
}

Q1: Synthetic Teacher

Formulas

\[ \begin{aligned} \hat{\alpha}-\alpha &= \bar{\varepsilon}-(\hat{\beta}-\beta)\bar{X} \\ \hat{\beta}-\beta &= \frac{\sum_{i=1}^{n} \varepsilon_i (X_i - \bar{X})}{n \hat{\sigma}_X^2} \end{aligned} \]

(from https://medium.com/analytics-vidhya/expectation-variance-of-ols-estimates-9acd2b48a635)
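
These identities can be verified numerically; below is a minimal sketch (not part of the homework code), assuming \(\hat{\sigma}_X^2\) denotes the biased sample variance \(\frac{1}{n}\sum_{i=1}^{n}(X_i-\bar{X})^2\):

# numerical check of the two identities on one simulated dataset (illustrative only)
n <- 50
alpha <- -3; beta <- 2
x <- runif(n, -1, 1)
eps <- rnorm(n)
y <- alpha + beta * x + eps
fit <- lm(y ~ x)
alpha_hat <- unname(coef(fit)[1])
beta_hat <- unname(coef(fit)[2])
# right-hand sides of the identities above
rhs_beta <- sum(eps * (x - mean(x))) / (n * mean((x - mean(x))^2))
rhs_alpha <- mean(eps) - (beta_hat - beta) * mean(x)
all.equal(beta_hat - beta, rhs_beta)     # TRUE up to floating-point error
all.equal(alpha_hat - alpha, rhs_alpha)  # TRUE up to floating-point error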

Dataset Generators

gen_dataset <- function(n, a, b, sigma_sq, fixed) {
  x <- runif(n, -1, 1)
  eps <- rnorm(n, mean = 0, sd = sqrt(sigma_sq))  # sigma_sq is a variance; rnorm() takes a standard deviation
  if (fixed) {
    # fix the regression line to be the truth line: regress the noise on x,
    # then subtract that fitted component so the OLS fit of y on x is exactly y = a * x + b
    eps_model <- ols_regression_single(x, eps)
    delta_a <- eps_model$m  # slope of the noise's own OLS fit
    delta_b <- eps_model$b  # intercept of the noise's own OLS fit
    # eliminate the effect of eps
    y <- (a - delta_a) * x + b - delta_b + eps
  } else {
    y <- a * x + b + eps
  }
  return(tibble(x = x, y = y))
}

gen_dataset_shifted <- function(n, a, b, sigma_sq, fixed) {
  # regenerate until at least one pair of points does not lie exactly on y = a * x + b
  while (TRUE) {
    ds <- gen_dataset(n, a, b, sigma_sq, fixed)
    n_pts <- ds %>% nrow()
    for (i in 1:(n_pts - 1)) {  # select the first point
      for (j in (i + 1):n_pts) {  # select the second point
        # get the regression line through the two points
        x_2pts <- c(ds$x[i], ds$x[j])
        y_2pts <- c(ds$y[i], ds$y[j])
        reg <- ols_regression_single(x_2pts, y_2pts)
        if (reg$m != a || reg$b != b) {
          # this pair is not on the y = a * x + b line
          return(ds)
        }
      }
    }
  }
}
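
As a quick illustration (not part of the graded output), the generators can be exercised directly: with fixed = TRUE the OLS estimate recovers the truth exactly, since the component of the noise explained by \(x\) has been subtracted out.

# illustrative sketch: with fixed = TRUE the OLS fit recovers (a, b) exactly
d_check <- gen_dataset(10, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
fit_check <- ols_regression_single(d_check$x, d_check$y)
c(slope = unname(fit_check$m), intercept = unname(fit_check$b))  # 2 and -3, up to floating-point error

# the homework-02 helpers compose the same way over many datasets
fits <- ols_regression_batch(gen_datasets(100, 20, a = 2, b = -3, sigma_sq = 1, fixed = FALSE))
summary(fits$m)  # slopes scatter around the true value 2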

Experiment Function run_q1()

run_q1 <- function(n) {
  d <- gen_dataset(n, a = 2, b = -3, sigma_sq = 1, fixed = TRUE)
  model <- ols_regression_single(d$x, d$y)
  plot <- ggplot(data = d) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Synthetic Teacher: exact linear regression with n =" ~ .(n)))
  print(plot)
  ggsave(filename = paste0("q1_n", n, ".svg"))
  summary_model(model)
}

summary_model <- function(model) {
  cat("Estimated slope:", model$m, "\n")
  cat("Estimated intercept:", model$b, "\n")
  cat("True slope:", 2, "\n")
  cat("True intercept:", -3, "\n")
}

Dataset \(D_1\)

run_q1(2)

## Estimated slope: 2 
## Estimated intercept: -3 
## True slope: 2 
## True intercept: -3

Dataset \(D_2\)

run_q1(3)

## Estimated slope: 2 
## Estimated intercept: -3 
## True slope: 2 
## True intercept: -3

Q2: Pool-based Teacher

Read Data

pool <- read.table("hw3pool.txt", col.names = c("x", "y"))
pool
##           x          y
## 1  2.374862  1.0802890
## 2  2.946939  1.8986054
## 3  1.650485  1.8640327
## 4  3.270571  2.6910741
## 5  1.168345 -3.0322151
## 6  1.443327 -1.1253668
## 7  2.869272  3.3632392
## 8  2.695933  1.9037783
## 9  1.450013  2.0780165
## 10 1.638038 -0.1211598
## 11 1.978955  1.3904179
## 12 1.186760 -0.9483188
## 13 2.001471 -0.3276148
## 14 2.585486  3.1464462
## 15 2.231690  1.4246791
## 16 2.851257  3.6954195
## 17 1.659208  0.0867588
## 18 3.076964  2.5382524
## 19 3.447796  2.6038317
## 20 1.500665 -0.1878728
## 21 3.912858  3.0603483
## 22 2.474814  2.8410390
## 23 1.767409 -0.4052952
## 24 3.094247  3.5516174
## 25 2.574593  0.3965521
## 26 1.002182  1.2032982
## 27 1.828825  0.2724988
## 28 3.779303  3.9852923
## 29 3.747501  3.7052840
## 30 2.607060  2.1907988

Algorithm Implementation

# closeness: squared Euclidean distance between the estimated coefficients
# (mhat, bhat) and the true coefficients (m, b)
closeness <- function(m, b, mhat, bhat) {
  (mhat - m)^2 + (bhat - b)^2
}

# generate the set of all multi-subsets of size m from a set of size n given by index 1:n
# an equivalent problem: generate the set of all tuples of n non-negative integers with sum m
gen_tuples <- function(n, m) {
  if (n == 1) {
    return(list(c(m)))
  } else {
    v <- list()
    for (i in 0:m) {
      v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
    }
    return(v)
  }
}

# select the best multi-subset of size n from the dataset df with variables x and y
select_data <- function(df, n) {
  # generate all multi-subsets of size n
  tuples <- gen_tuples(nrow(df), n)
  # generate the multi-subsets
  # an element v_i in tuple v means the number of times the i-th point is selected
  multisets <- map(tuples, function(v) {
    multiset <- list(x = c(), y = c())
    for (i in seq_along(v)) {
      multiset$x <- c(multiset$x, rep(df$x[i], v[i]))
      multiset$y <- c(multiset$y, rep(df$y[i], v[i]))
    }
    return(multiset)
  })
  cat("Number of multi-subsets:", length(multisets), "\n")
  # calculate the closeness of each multi-subset
  closenesses <- map_dbl(multisets, function(s) {
    reg <- ols_regression_single(s$x, s$y)
    return(closeness(2, -3, reg$m, reg$b))
  })
  # drop NA values: if the n selected points are all copies of the same point,
  # the regression slope is undefined and the closeness is NA
  min_closeness <- min(closenesses, na.rm = TRUE)
  if (!is.finite(min_closeness)) {  # every closeness was NA, so min() returned Inf
    return(list(
      tuple = NULL,
      multiset = NULL,
      closeness = NA
    ))
  }
  # return the best tuple and its closeness
  best_idx <- which(closenesses == min_closeness)
  return(list(
    tuple = tuples[best_idx],
    multiset = multisets[best_idx],
    closeness = min_closeness
  ))
}
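
As a sanity check (illustrative only), the number of tuples produced by gen_tuples matches the stars-and-bars count \({m+n-1\choose n-1}\) derived in the Discussion below:

# illustrative check: gen_tuples(n, m) enumerates all n-tuples of
# non-negative integers summing to m, so its length equals choose(m + n - 1, n - 1)
length(gen_tuples(3, 2))    # 6
choose(2 + 3 - 1, 3 - 1)    # 6
length(gen_tuples(30, 2))   # 465, matching the "Number of multi-subsets" printed below
choose(2 + 30 - 1, 30 - 1)  # 465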

Part 1

run_q2_part1 <- function() {
  start_time <- Sys.time()
  model <- ols_regression_single(pool$x, pool$y)
  plot <- ggplot(data = pool) +
    geom_point(aes(x = x, y = y, color = "data points")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Pool-based Teacher: linear regression on all" ~ .(nrow(pool)) ~ "points"))
  print(plot)
  ggsave(filename = "q2_all.svg")
  summary_model(model)
  cat("Closeness:", closeness(2, -3, model$m, model$b), "\n")
  end_time <- Sys.time()
  cat("Time elapsed (second):", end_time - start_time, "\n")
}

run_q2_part1()

## Estimated slope: 1.637053 
## Estimated intercept: -2.37024 
## True slope: 2 
## True intercept: -3 
## Closeness: 0.5283286 
## Time elapsed (second): 0.3014269

Experiment Function run_q2()

run_q2 <- function(n) {
  start_time <- Sys.time()
  d <- select_data(pool, n)
  cat("\nBest dataset (use the first one):\n")
  print(d)
  df_selected <- tibble(
    x = d$multiset[[1]]$x,
    y = d$multiset[[1]]$y
  )
  # pool points that were not selected (matched by their coordinates)
  df_non_selected <- pool %>% filter(!(x %in% df_selected$x & y %in% df_selected$y))
  model <- ols_regression_single(d$multiset[[1]]$x, d$multiset[[1]]$y)
  cat("\nModel of linear regression on the best dataset:\n")
  print(model)
  plot <- ggplot() +
    geom_point(data = df_selected, aes(x = x, y = y, color = "selected points")) +
    geom_point(data = df_non_selected, aes(x = x, y = y, color = "non-selected points")) +
    geom_abline(aes(intercept = -3, slope = 2, color = "truth")) +
    geom_abline(aes(intercept = model$b, slope = model$m, color = "estimate")) +
    xlim(-5, 5) +
    ylim(-5, 5) +
    ggtitle(bquote("Pool-based Teacher: linear regression on best" ~ .(n) ~ "points"))
  print(plot)
  ggsave(filename = paste0("q2_n", n, ".svg"))
  end_time <- Sys.time()
  cat("Time elapsed (second):", end_time - start_time, "\n")
}

Part 2

run_q2(2)
## Number of multi-subsets: 465 
## 
## Best dataset (use the first one):
## $tuple
## $tuple[[1]]
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 
## 
## $multiset
## $multiset[[1]]
## $multiset[[1]]$x
## [1] 2.23169 2.60706
## 
## $multiset[[1]]$y
## [1] 1.424679 2.190799
## 
## 
## 
## $closeness
## [1] 0.01861329
## 
## 
## Model of linear regression on the best dataset:
## $m
##       x 
## 2.04097 
## 
## $b
## (Intercept) 
##   -3.130133

## Time elapsed (second): 0.516551

Part 3

run_q2(3)
## Number of multi-subsets: 4960 
## 
## Best dataset (use the first one):
## $tuple
## $tuple[[1]]
##  [1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 
## 
## $multiset
## $multiset[[1]]
## $multiset[[1]]$x
## [1] 1.186760 2.851257 3.779303
## 
## $multiset[[1]]$y
## [1] -0.9483188  3.6954195  3.9852923
## 
## 
## 
## $closeness
## [1] 0.0002014267
## 
## 
## Model of linear regression on the best dataset:
## $m
##        x 
## 2.008008 
## 
## $b
## (Intercept) 
##   -2.988283

## Time elapsed (second): 1.864729

Discussion

\[ \begin{aligned} \ell(D) &\coloneqq (\hat{a}-a)^2 + (\hat{b}-b)^2 \\ \ell(P) &= 0.5283286 \\ \ell(D_3) &= 0.01861329 \\ \ell(D_4) &= 0.0002014267 \end{aligned} \]

where \(P\) is the whole 30-point pool (Part 1) and \(D_3\), \(D_4\) are the best multi-subsets of sizes 2 and 3 selected in Parts 2 and 3, respectively.

I find that, compared with fitting on all 30 points of \(P\), we can pick a multi-subset with a smaller closeness, which means that the linear regression model fitted on the multi-subset is closer to the truth.

However, the closeness \(\ell\), i.e., the loss, tends to be smaller when we are allowed to pick more points, although this is not guaranteed. Suppose some \(n\) points fit the regression line perfectly; extending those same \(n\) points by one extra point might break the perfect fit, because no remaining point may lie exactly on the line. The selection of \(n+1\) points, however, is free to use other points that were not originally picked rather than merely extending the original \(n\), so picking more points usually helps.

This is reflected in the observed relation \(\ell(P)>\ell(D_3)>\ell(D_4)\).

\[ \begin{aligned} T(D) &\coloneqq \text{time to select and model on a dataset $D$} \\ T(P) &= 0.3 \\ T(D_3) &= 0.5 \\ T(D_4) &= 1.8 \end{aligned} \]

The measured times also include plotting, printing, and saving, but this overhead can be ignored since we only compare the elapsed times relative to each other.
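
To exclude that overhead entirely, one could time the selection step alone, e.g. with system.time() (a minimal sketch, not rerun here):

# time only the multi-subset selection, excluding plotting/printing/saving
system.time(select_data(pool, 3))["elapsed"]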

I find that the multi-subset selection takes the most time, as we need to compute the closeness of every multi-subset. Picking a multi-subset of \(m\) points from all \(n\) points is equivalent to generating the set of all tuples of \(n\) non-negative integers with sum \(m\), where element \(v_i\) of a tuple is the number of times the \(i\)-th point is selected. Applying combinatorics (stars and bars), there are \({m+n-1\choose n-1}\) such tuples.

To illustrate the combinatorial result, suppose we have \(m+n-1\) elements as depicted below:

\[ e_1, \dots, e_m, e_{m+1}, \dots, e_{m+n-1} \]

Then we choose \(n-1\) of the \(e_i\)’s to be separators and set the remaining \(m\) of them to be \(1\); reading off the group sizes between separators gives \(n\) non-negative integers with sum \(m\). One instance is depicted below for \(n=4, m=3\):

\[ \begin{aligned} \text{Pattern: } & e_1, & e_2, & e_3, & e_4, & e_5, & e_6 \\ \text{Instance: } & 1, & |, & 1, & 1, & |, & | \\ \end{aligned} \]

From this instance, we read off the tuple \(v=(1, 2, 0, 0)\), meaning the generated multi-subset has one copy of the \(1\)-st point, two copies of the \(2\)-nd point, and zero copies of the \(3\)-rd and \(4\)-th points.

Expanding the binomial coefficient, \({m+n-1\choose n-1}=\frac{(m+n-1)!}{m!\,(n-1)!}\). This count grows combinatorially with both the pool size and the number of points selected, so the exhaustive multi-subset selection quickly becomes slow.
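
To see this growth concretely, the count can be evaluated for the 30-point pool (a small sketch; only the \(m=2\) and \(m=3\) values appear in the runs above):

# number of size-m multi-subsets drawn from the nrow(pool) = 30 pool points
sapply(2:5, function(m) choose(m + nrow(pool) - 1, nrow(pool) - 1))  # 465 4960 40920 278256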

This is consistent with the observed relation \(T(P)<T(D_3)<T(D_4)\).