Homework 4: greedy algorithm for teaching OLS; 1NN classifier

Ruixuan Tu ()

4 March 2023


Setup

Load packages and set working directory:

knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw04-greedy-1nn")
knitr::opts_chunk$set(warning = FALSE, message = FALSE, error = FALSE)
library(readxl)
library(tidyverse)
library(lubridate)
library(ggplot2)
set.seed(27021)

Q1: When does enumeration stop working?

Read data (from Homework 3)

pool <- read.table("../hw03-machine-teaching-ols/hw3pool.txt", col.names = c("x", "y"))
pool
##           x          y
## 1  2.374862  1.0802890
## 2  2.946939  1.8986054
## 3  1.650485  1.8640327
## 4  3.270571  2.6910741
## 5  1.168345 -3.0322151
## 6  1.443327 -1.1253668
## 7  2.869272  3.3632392
## 8  2.695933  1.9037783
## 9  1.450013  2.0780165
## 10 1.638038 -0.1211598
## 11 1.978955  1.3904179
## 12 1.186760 -0.9483188
## 13 2.001471 -0.3276148
## 14 2.585486  3.1464462
## 15 2.231690  1.4246791
## 16 2.851257  3.6954195
## 17 1.659208  0.0867588
## 18 3.076964  2.5382524
## 19 3.447796  2.6038317
## 20 1.500665 -0.1878728
## 21 3.912858  3.0603483
## 22 2.474814  2.8410390
## 23 1.767409 -0.4052952
## 24 3.094247  3.5516174
## 25 2.574593  0.3965521
## 26 1.002182  1.2032982
## 27 1.828825  0.2724988
## 28 3.779303  3.9852923
## 29 3.747501  3.7052840
## 30 2.607060  2.1907988

Implementation of OLS, combinatorial selector, and fitter (modified from Homework 3)

ols_regression_single <- function(x, y) {
  model <- lm(y ~ x)
  return(list(m = model$coefficients[2], b = model$coefficients[1]))
}

# define closeness
closeness <- function(m, b, mhat, bhat) (mhat - m)^2 + (bhat - b)^2

# generate the set of all multi-subsets of size m from a set of size n given by index 1:n
# an equivalent problem: generate the set of all tuples of n non-negative integers with sum m
gen_tuples <- function(n, m) {
  if (n == 1) {
    return(list(c(m)))
  } else {
    v <- list()
    for (i in 0:m) {
      v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
    }
    return(v)
  }
}

# select the best multi-subset of size n from the dataset df with variables x and y
combinatorial_selector <- function(df, n, verbose = FALSE, history = NA) {
  if (n < 2) stop("n must be greater than 1")
  # generate all multi-subsets of size n
  tuples <- gen_tuples(nrow(df), n)
  # generate the multi-subsets
  # an element v_i in tuple v means the number of times the i-th point is selected
  multisets <- map(tuples, function(v) {
    multiset <- list(x = c(), y = c())
    for (i in seq_along(v)) {
      multiset$x <- c(multiset$x, rep(df$x[i], v[i]))
      multiset$y <- c(multiset$y, rep(df$y[i], v[i]))
    }
    return(multiset)
  })
  if (verbose) cat("Number of multi-subsets:", length(multisets), "\n")
  # calculate the closeness of each multi-subset
  closenesses <- map_dbl(multisets, function(s) {
    reg <- ols_regression_single(s$x, s$y)
    return(closeness(2, -3, reg$m, reg$b))
  })
  # drop NA values: if all n selected points are identical, the regression is degenerate
  # and the closeness is NA
  min_closeness <- min(closenesses, na.rm = TRUE)
  if (!is.finite(min_closeness)) {
    return(list(
      tuple = NULL,
      multiset = NULL,
      closeness = NA
    ))
  }
  # return the best multi-subset (the first one if tied), its closeness,
  # and the number of OLS fits performed
  best_idx <- which.min(closenesses)
  return(list(
    multiset = multisets[[best_idx]],
    closeness = min_closeness,
    n_ols = length(tuples)
  ))
}

pool_fit <- function(n, selector, verbose = FALSE, history = NA) {
  start_time <- Sys.time()
  d <- selector(df = pool, n = n, history = history)
  df_selected <- tibble(
    x = d$multiset$x,
    y = d$multiset$y
  )
  model <- ols_regression_single(df_selected$x, df_selected$y)
  end_time <- Sys.time()
  if (verbose) {
    cat("\nBest dataset (use the first one):\n")
    print(d)
    cat("\nModel of linear regression on the best dataset:\n")
    print(model)
    cat("Time elapsed (second):", end_time - start_time, "\n")
    cat("[pool_fit] x =", d$multiset$x, " y =", d$multiset$y, "\n")
  }
  return(list(
    x = d$multiset$x,
    y = d$multiset$y,
    loss = d$closeness,
    n_ols = d$n_ols,
    time = end_time - start_time
  ))
}

Test when enumeration stops working

pool_fit_batch <- function(upper, selector, file) {
  if (!file.exists(file)) {
    pool_fit_results <- tibble(x = list(), y = list(), n_pts = integer(), n_ols = integer(), loss = double(), time = double())
    for (n in 2:upper) {
      history_row_df <- pool_fit_results[nrow(pool_fit_results), ]
      history <- list(x = unlist(history_row_df$x), y = unlist(history_row_df$y))
      result <- pool_fit(n, selector, history = history)
      pool_fit_results <- rbind(pool_fit_results, tibble(x = list(result$x), y = list(result$y), n_pts = n, n_ols = result$n_ols, loss = result$loss, time = result$time))
    }
    pool_fit_results <- pool_fit_results %>% select(-x, -y)
  } else {
    pool_fit_results <- read_csv(file, col_types = cols(n_pts = col_integer(), n_ols = col_integer(), loss = col_double(), time = col_double()))
  }
  pool_fit_results <- pool_fit_results %>% mutate(loss = num(loss, notation = "sci"))
  if (!file.exists(file))
    write_csv(pool_fit_results, file)
  return(pool_fit_results)
}

# plot n_pts vs. log(loss) curve
plot_fit_result <- function(pool_fit_results, title) {
  p <- ggplot(pool_fit_results, aes(x = n_pts, y = loss)) +
    geom_point() +
    geom_line() +
    scale_y_log10() +
    labs(x = "Number of points", y = "Loss (scaled log10)") +
    ggtitle("Log of loss vs. number of points", subtitle = title)
  ggsave(paste0(title, "_plot.svg"), p)
  p
}
pool_fit_comb_results <- pool_fit_batch(
  upper = 6,
  selector = combinatorial_selector,
  file = "q1_pool_fit_comb_results.csv"
)
pool_fit_comb_results %>% print(n = Inf)
## # A tibble: 5 × 3
##   n_pts    loss    time
##   <int>   <sci>   <dbl>
## 1     2 1.86e-2   0.147
## 2     3 2.01e-4   1.35 
## 3     4 1.65e-7  10.8  
## 4     5 1.77e-6  74.7  
## 5     6 6.54e-8 417.
plot_fit_result(pool_fit_comb_results, "q1_pool_fit_comb")

Interpretation

Because enumeration with the combinatorial selector already takes about 417 seconds at size 6 and becomes prohibitively slow for larger sizes (size 7 did not finish after more than 10 minutes), we stop the enumeration at size 6. This blow-up is expected: the number of multi-subsets of size \(m\) drawn from the 30-point pool is \(\binom{30 + m - 1}{m}\), i.e. 1,623,160 OLS fits at \(m = 6\) and 8,347,680 at \(m = 7\), roughly a fivefold increase with each added point.
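
As a quick check on this growth (a sketch, not part of the assignment; count_multisets is a helper name introduced here), the counts follow directly from choose(), since gen_tuples(n, m) enumerates all tuples of n non-negative integers summing to m:

# number of multi-subsets of size m drawn (with repetition) from the 30-point pool
n_pool <- nrow(pool)
count_multisets <- function(m) choose(n_pool + m - 1, m)
tibble(m = 2:7, n_multisets = count_multisets(2:7))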

Q2: A greedy algorithm

Implementation of the greedy algorithm

# greedily grow the teaching set one point at a time, always adding the pool point that
# brings the OLS fit closest to the target (m, b) = (2, -3)
greedy_selector <- function(df, n, history) {
  if (length(history$x) >= 2 && length(history$x) != n - 1) stop("history must contain exactly n - 1 points to add one more")
  n_ols <- 0
  if (length(history$x) >= 2) {
    df_selected <- tibble(x = history$x, y = history$y)
  } else {
    # select the first point
    initial_idx <- sample(nrow(df), 1)
    initial_point <- df[initial_idx, ]
    df_selected <- tibble(x = initial_point$x, y = initial_point$y)
  }
  min_global_closeness <- Inf
  lower <- 2
  if (length(history$x) >= 2)
    lower <- length(history$x) + 1
  for (i in lower:n) {
    min_closeness <- Inf
    min_idx <- NA
    for (j in seq_len(nrow(df))) {
      pending_df_selected <- rbind(df_selected, df[j, ])
      theta_hat <- ols_regression_single(pending_df_selected$x, pending_df_selected$y)
      n_ols <- n_ols + 1
      pending_closeness <- closeness(2, -3, theta_hat$m, theta_hat$b)
      if (is.na(pending_closeness)) next
      if (pending_closeness < min_closeness) {
        min_closeness <- pending_closeness
        min_idx <- j
      }
    }
    min_global_closeness <- min_closeness
    df_selected <- rbind(df_selected, df[min_idx, ])
  }
  return(list(
    multiset = list(x = df_selected$x, y = df_selected$y),
    closeness = min_global_closeness,
    n_ols = n_ols
  ))
}

Use the greedy algorithm

pool_fit_greedy_results <- pool_fit_batch(
  upper = nrow(pool),
  greedy_selector,
  file = "q2_pool_fit_greedy_results.csv"
)
pool_fit_greedy_results %>% print(n = Inf)
## # A tibble: 29 × 4
##    n_pts n_ols    loss time            
##    <int> <dbl>   <sci> <drtn>          
##  1     2    30 4.33e-1 0.021356821 secs
##  2     3    30 1.71e-1 0.007766008 secs
##  3     4    30 1.47e-1 0.010340929 secs
##  4     5    30 7.94e-2 0.007749081 secs
##  5     6    30 7.77e-2 0.007658005 secs
##  6     7    30 7.17e-2 0.007671118 secs
##  7     8    30 5.46e-2 0.010077000 secs
##  8     9    30 5.39e-2 0.007758141 secs
##  9    10    30 5.39e-2 0.007751942 secs
## 10    11    30 4.02e-2 0.007688046 secs
## 11    12    30 2.51e-2 0.010122061 secs
## 12    13    30 2.57e-2 0.007826090 secs
## 13    14    30 2.18e-2 0.007699013 secs
## 14    15    30 1.91e-2 0.007743835 secs
## 15    16    30 1.71e-2 0.010215044 secs
## 16    17    30 1.57e-2 0.007791042 secs
## 17    18    30 1.48e-2 0.007806063 secs
## 18    19    30 1.30e-2 0.007740021 secs
## 19    20    30 1.15e-2 0.010284901 secs
## 20    21    30 1.04e-2 0.007874012 secs
## 21    22    30 9.61e-3 0.007771015 secs
## 22    23    30 9.03e-3 0.007818937 secs
## 23    24    30 8.62e-3 0.010452986 secs
## 24    25    30 8.19e-3 0.007900000 secs
## 25    26    30 7.42e-3 0.007948875 secs
## 26    27    30 6.82e-3 0.007759094 secs
## 27    28    30 6.37e-3 0.010304928 secs
## 28    29    30 6.03e-3 0.007889032 secs
## 29    30    30 5.78e-3 0.007838964 secs
plot_fit_result(pool_fit_greedy_results, "q2_pool_fit_greedy")

Interpretation

pool_fit_greedy_results %>% filter(loss == min(pool_fit_greedy_results$loss))
## # A tibble: 1 × 4
##   n_pts n_ols    loss time            
##   <int> <dbl>   <sci> <drtn>          
## 1    30    30 5.78e-3 0.007838964 secs
pool_fit_comb_results %>% filter(loss == min(pool_fit_comb_results$loss))
## # A tibble: 1 × 3
##   n_pts    loss  time
##   <int>   <sci> <dbl>
## 1     6 6.54e-8  417.

The greedy algorithm is worse than the combinatorial algorithm in terms of teaching-set quality: even with up to 30 points its best loss (5.78e-3) is several orders of magnitude higher than the loss the combinatorial selector reaches with only 6 points (6.54e-8). The stopping criterion I might use is to stop once the minimum loss has not improved over 5 consecutive added points, since selecting a multiset does not require the number of selected points to match the size of the pool \(P\). In terms of running time, the greedy algorithm is much faster: the cost of adding one point is constant (one OLS fit per pool point, i.e. \(|P|\) fits), whereas the combinatorial algorithm's cost grows combinatorially with the teaching-set size.
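
A minimal sketch of that stopping rule (not part of the assignment; greedy_with_early_stop and patience are names introduced here), reusing pool_fit and greedy_selector from above:

# keep adding points greedily; stop once the best loss has not improved
# for `patience` consecutive added points
greedy_with_early_stop <- function(patience = 5, max_pts = nrow(pool)) {
  best_loss <- Inf
  stale <- 0
  history <- list(x = c(), y = c())
  for (n in 2:max_pts) {
    result <- pool_fit(n, greedy_selector, history = history)
    history <- list(x = result$x, y = result$y)
    if (!is.na(result$loss) && result$loss < best_loss) {
      best_loss <- result$loss
      stale <- 0
    } else {
      stale <- stale + 1
    }
    if (stale >= patience) break
  }
  return(list(n_pts = n, best_loss = best_loss))
}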

Q3: Implement kNN classifier

Implementation of dataset generators

# generate a dataset with n vectors of dimension d, with n_categories categories of labels
generator_unif <- function(n_vec, d, n_categories) {
  x <- matrix(runif(n_vec * d), n_vec, d)
  y <- sample.int(n_categories, n_vec, replace = TRUE) - 1
  return(list(x = x, y = y))
}

# generate a dataset on a grid with n_categories categories of labels
generator_grid <- function(x_min, x_max, y_min, y_max, len, n_categories) {
  x <- seq(x_min, x_max, length.out = len)
  y <- seq(y_min, y_max, length.out = len)
  x <- expand.grid(x, y)
  x <- as.matrix(x)
  y <- sample.int(n_categories, len * len, replace = TRUE) - 1
  return(list(x = x, y = y))
}

# plot the training and testing datasets; the testing dataset may carry predicted labels or be unlabeled (not yet predicted)
plot_dataset <- function(ds_train, ds_test, n_categories, k = 0) {
  plot <- ggplot() +
    geom_point(data = tibble(x = ds_train$x[, 1], y = ds_train$x[, 2], label = ds_train$y), aes(x, y, color = factor(label), size = "train", alpha = "train")) +
    scale_color_discrete(name = "label", labels = paste((1:n_categories) - 1)) +
    scale_size_manual(values = c("train" = 3, "test" = 1)) +
    scale_alpha_manual(values = c("train" = 1, "test" = 0.5)) +
    xlim(0, 1) +
    ylim(0, 1)
  if (length(table(ds_test$y)) > 1) {
    plot <- plot +
      geom_point(data = tibble(x = ds_test$x[, 1], y = ds_test$x[, 2], label = ds_test$y), aes(x, y, color = factor(label), size = "test", alpha = "test")) +
      ggtitle(
        "Training and Testing Datasets with Predicted Labels",
        subtitle = bquote(n[categories] == .(n_categories) ~ n[train] == .(nrow(ds_train$x)) ~ n[test] == .(nrow(ds_test$x)) ~ k == .(k))
      )
    ggsave(paste0("q3_plot_cat", n_categories, "_train", nrow(ds_train$x), "_test", nrow(ds_test$x), "_k", k, ".svg"), plot)
  } else {
    plot <- plot +
      geom_point(data = tibble(x = ds_test$x[, 1], y = ds_test$x[, 2]), aes(x, y, size = "test", alpha = "test")) +
      ggtitle(
        "Training and Testing Datasets without Predicted Labels",
        subtitle = bquote(n[categories] == .(n_categories) ~ n[train] == .(nrow(ds_train$x)) ~ n[test] == .(nrow(ds_test$x)))
      )
    ggsave(paste0("q3_cat", n_categories, "_train", nrow(ds_train$x), "_test", nrow(ds_test$x), "_np.svg"), plot)
  }
  plot
}

ds_train <- generator_unif(n_vec = 16, d = 2, n_categories = 2)
ds_test <- generator_grid(x_min = 0, x_max = 1, y_min = 0, y_max = 1, len = 100, n_categories = 1)
plot_dataset(ds_train, ds_test, n_categories = 2)

Implementation of kNN classifier

knn_classifier <- function(train_x, train_y, test_x, k) {
  test_y <- integer(nrow(test_x))
  for (i in seq_len(nrow(test_x))) {
    # compute the distance between the test point and all training points
    distances <- map_dbl(seq_len(nrow(train_x)), function(j) sqrt(sum((train_x[j, ] - test_x[i, ]) ^ 2)))
    # find the k nearest neighbors
    nearest_neighbors <- train_y[order(distances)[1:k]]
    # predict the label of the test point
    test_y_single <- as.integer(names(which.max(table(nearest_neighbors))))
    # store the prediction
    test_y[i] <- test_y_single
  }
  return(test_y)
}
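
A tiny sanity check (not required by the assignment; the toy_* objects are made-up data): with k = 1, each test point should take the label of its single nearest training point.

toy_train_x <- matrix(c(0, 0,
                        1, 1), ncol = 2, byrow = TRUE)
toy_train_y <- c(0, 1)
toy_test_x <- matrix(c(0.1, 0.1,
                       0.9, 0.9), ncol = 2, byrow = TRUE)
knn_classifier(toy_train_x, toy_train_y, toy_test_x, k = 1)  # expected: 0 1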

Test the kNN classifier on the dataset generated above

ds_test_pred <- tibble(x = ds_test$x, y = knn_classifier(ds_train$x, ds_train$y, ds_test$x, k = 1))
plot_dataset(ds_train, ds_test_pred, n_categories = 2, k = 1)

ds_test_pred <- tibble(x = ds_test$x, y = knn_classifier(ds_train$x, ds_train$y, ds_test$x, k = 3))
plot_dataset(ds_train, ds_test_pred, n_categories = 2, k = 3)

A more complicated dataset with 4 categories

ds_train <- generator_unif(n_vec = 32, d = 2, n_categories = 4)
ds_test <- generator_grid(x_min = 0, x_max = 1, y_min = 0, y_max = 1, len = 100, n_categories = 1)
ds_test_pred <- tibble(x = ds_test$x, y = knn_classifier(ds_train$x, ds_train$y, ds_test$x, k = 3))
plot_dataset(ds_train, ds_test_pred, n_categories = 4, k = 3)