Homework 5: machine teaching for kNN

Ruixuan Tu

29 March 2023


Preparation

Setup

Load packages and set working directory:

knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw05-machine-teaching-knn")
knitr::opts_chunk$set(warning = FALSE, message = FALSE, error = TRUE)
setwd("/Users/turx/Projects/machine-teaching-23sp/hw05-machine-teaching-knn")
library(tidyverse)
library(lubridate)
library(ggplot2)
library(purrr)
library(rjson)
library(digest)
set.seed(digest2int("machine teaching"))

Read data

pool <- tibble(read.table("hw5data.txt", col.names = c("x1", "x2", "y")))
# pool_idx <- sample(nrow(pool), size = 50, replace = FALSE)
# pool <- pool[pool_idx, ]
pool
## # A tibble: 807 × 3
##        x1     x2     y
##     <dbl>  <dbl> <int>
##  1 0.776  0.0528     1
##  2 0.992  0.0874     1
##  3 0.264  0.131      1
##  4 0.0795 0.109      1
##  5 0.689  0.983      1
##  6 0.117  0.434      1
##  7 0.453  0.721      1
##  8 0.179  0.928      1
##  9 0.0787 0.245      1
## 10 0.480  0.980      1
## # … with 797 more rows

Plot data

ggplot(pool, aes(x = x1, y = x2, color = factor(y))) +
  geom_point() +
  lims(x = c(0, 1), y = c(0, 1))

Constants for the Problem Setting in Homework Section 4

k <- 1  # number of nearest neighbors
Z <- pool %>% rename(x1p = x1, x2p = x2, yp = y)  # superset of all teaching sets
enum_upper <- 2  # n_star for enumeration: upper bound on the number of teaching examples
greedy_upper <- 20  # n_star for greedy: upper bound on the number of teaching examples
# saved_run_debug <- TRUE  # uncomment to ignore saved csv files

Shared functions

kNN

# kNN classifier (from Homework 4)
knn_classifier <- function(train_x, train_y, test_x, k) {
  test_y <- integer(nrow(test_x))
  for (i in seq_len(nrow(test_x))) {
    # compute the distance between the test point and all training points
    distances <- map_dbl(seq_len(nrow(train_x)), function(j) sqrt(sum((train_x[j, ] - test_x[i, ]) ^ 2)))
    # find the labels of the k nearest neighbors
    nearest_neighbors <- train_y[order(distances)[1:k]]
    # predict the label of the test point by majority vote
    # (ties are broken in favor of the label which.max encounters first)
    test_y_single <- as.integer(names(which.max(table(nearest_neighbors))))
    # store the prediction
    test_y[i] <- test_y_single
  }
  return(test_y)
}

# wrapper around knn_classifier which classifies a single test point instead of a test set
knn_classifier_single <- function(x1, x2, y, x1p, x2p, k) {
  # bind the x1, x2 training vectors into a matrix
  train_x <- cbind(x1, x2)
  # wrap the single test point (x1p, x2p) as a one-row matrix
  test_x <- matrix(c(x1p, x2p), nrow = 1)
  knn_classifier(train_x, y, test_x, k)[1]
}
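
As a quick sanity check (illustrative only, not part of the assignment): with k = 1, a point already in the training set is its own nearest neighbor, so classifying the first pool point against the full pool should recover its own label.

# illustrative sanity check: a training point is its own 1-nearest neighbor
knn_classifier_single(pool$x1, pool$x2, pool$y, pool$x1[1], pool$x2[1], k = 1) == pool$y[1]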

Approximation and Sampling

# 0-1 loss function l (returns a logical, coerced to 1/0 when summed)
loss_bin <- function(y, yp) y != yp

# generate the set Z of (x', y') with x' drawn from distribution p and predicted label y' = g(x')
gen_Z <- function(g, x1p, x2p) {
  Z <- tibble(x1p = x1p, x2p = x2p)
  # map each x' = (x1p, x2p) to y' = g(x')
  yp <- map_dbl(seq_len(nrow(Z)), function(i) g(Z$x1p[i], Z$x2p[i]))
  Z %>% mutate(yp = yp)
}
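
For illustration, gen_Z can be paired with any deterministic labeler g and a sample of x' drawn uniformly from the unit square; the names g_example, x1_sample, and x2_sample below are hypothetical, not from the assignment.

# illustrative only: a hypothetical labeler that thresholds on x1 + x2
g_example <- function(x1p, x2p) as.numeric(x1p + x2p > 1)
x1_sample <- runif(5)
x2_sample <- runif(5)
gen_Z(g_example, x1_sample, x2_sample)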

# metric between functions f and g defined by equation (3):
# the average loss of f against the labels y' = g(x') over Z
# (note: this masks base::dist)
dist <- function(Z, f, l) {
  m <- nrow(Z)
  pred <- Z %>%
    rowwise() %>%
    mutate(pred = f(x1p, x2p)) %>%
    pull(pred)
  sum(l(pred, Z$yp)) / m
}
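
Assuming equation (3) of the homework is the usual empirical disagreement (which is exactly what the code above computes), the metric is

d(f, g) = (1/m) * sum_{i = 1}^{m} l(f(x'_i), y'_i), where y'_i = g(x'_i),

i.e. the fraction of the m points in Z on which f disagrees with g under the 0-1 loss. One immediate consequence, checked below with the hypothetical g_example from above: since Z is labeled by g itself, g is at distance 0 from itself.

# illustrative only: a function is at distance 0 from itself
Z_example <- gen_Z(g_example, runif(10), runif(10))
dist(Z_example, g_example, loss_bin)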

OS

# time the execution of a function
time_run <- function(f, ...) {
  start <- Sys.time()
  f(...)
  end <- Sys.time()