Load packages and set working directory:
knitr::opts_knit$set(root.dir = "/Users/turx/Projects/machine-teaching-23sp/hw05-machine-teaching-knn")
knitr::opts_chunk$set(warning = FALSE, message = FALSE, error = TRUE) # error = TRUE
setwd("/Users/turx/Projects/machine-teaching-23sp/hw05-machine-teaching-knn")
library(tidyverse)
library(lubridate)
library(ggplot2)
library(purrr)
library(rjson)
library(digest)
set.seed(digest2int("machine teaching"))
pool <- tibble(read.table("hw5data.txt", col.names = c("x1", "x2", "y")))
# pool_idx <- sample(nrow(pool), size = 50, replace = FALSE)
# pool <- pool[pool_idx, ]
pool
## # A tibble: 807 × 3
## x1 x2 y
## <dbl> <dbl> <int>
## 1 0.776 0.0528 1
## 2 0.992 0.0874 1
## 3 0.264 0.131 1
## 4 0.0795 0.109 1
## 5 0.689 0.983 1
## 6 0.117 0.434 1
## 7 0.453 0.721 1
## 8 0.179 0.928 1
## 9 0.0787 0.245 1
## 10 0.480 0.980 1
## # … with 797 more rows
k <- 1 # number of nearest neighbors
Z <- pool %>% rename(x1p = x1, x2p = x2, yp = y) # superset of all teaching sets
enum_upper <- 2 # n_star for enum, threshold for the number of teaching examples
greedy_upper <- 20 # n_star for greedy, threshold for the number of teaching examples
# saved_run_debug <- TRUE # uncomment to ignore saved csv files
# generate the set of all multi-subsets of size m from a set of size n given by index 1:n
# an equivalent problem: generate the set of all tuples of n non-negative integers with sum m
# (from Homework 3)
gen_tuples <- function(n, m) {
if (n == 1) {
return(list(c(m)))
} else {
v <- list()
for (i in 0:m) {
v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
}
return(v)
}
}
# Reporting:
# 1. the number of teaching sets of that size that you have to search through;
# 2. number of seconds it takes for that size;
# 3. d(kNN(D), g) of the best teaching set of that size;
# 4. plot the best teaching set D-hat in relation to P (i.e. plot both, but use different symbols for D-hat)
enum_run <- function() {
results <- tibble(
subset_sz = integer(),
subset_n = integer(),
time = duration(units = "secs"),
d = double(),
tuple = character()
)
for (m in 1:enum_upper) {
timed <- function() {
# generate tuples for all multi-subsets of pool of size m
tuples <- gen_tuples(nrow(pool), m)
best_d <- Inf
best_tuple <- NULL
for (tuple in tuples) {
# convert the tuple to a multi-subset of pool
subset <- pool[which(tuple > 0), ]
# define the kNN classifier g on the multi-subset
g <- function(x1p, x2p)
knn_classifier_single(subset$x1, subset$x2, subset$y, x1p, x2p, k)
# compute the distance d(kNN(D), g)
d <- dist(Z, g, loss_bin)
# update the best distance
if (d < best_d) {
best_d <- d
best_tuple <- tuple
}
}
list(n = length(tuples), d = best_d, tuple = best_tuple)
}
r <- time_run(timed)
results <- results %>%
add_row(
subset_sz = m,
subset_n = r$result$n,
time = r$time,
d = r$result$d,
tuple = toJSON(r$result$tuple)
)
}
results
}
enum_results <- saved_run(enum_run, "enum.csv")
for (m in 1:enum_upper) {
best_tuple <- fromJSON(enum_results$tuple[[m]])
best_Dhat <- pool[which(best_tuple > 0), ]
print(
ggplot() +
geom_point(data = pool, aes(x = x1, y = x2, color = factor(y), shape = "P")) +
geom_point(data = best_Dhat, aes(x = x1, y = x2, shape = "D"), alpha = 0.5) +
ggtitle("Enumeration: Best Teaching Set vs Pool", subtitle = paste0("m = ", m))
)
ggsave(paste0("enum_m", m, ".svg"))
}
## # A tibble: 2 × 4
## subset_sz subset_n time d
## <dbl> <dbl> <chr> <dbl>
## 1 1 807 41.3479979038239s 0.444
## 2 2 326028 16841.8223998547s (~4.68 hours) 0.289
# Reporting:
# 1. the number of teaching sets of that size that you have to search through;
# 2. number of seconds it takes for that size;
# 3. d(kNN(D), g) of the best teaching set of that size;
# 4. plot one figure for the last teaching set D-hat with size n* in relation to P
greedy_run <- function() {
results <- tibble(
subset_sz = integer(),
subset_n = integer(),
time = duration(units = "secs"),
d = double(),
index = integer()
)
for (m in 1:greedy_upper) {
prev_D <- NULL
if (nrow(results) == 0)
prev_D <- c()
else {
prev_D <- unlist(results %>% pull(index))
if (is.null(prev_D))
stop("prev_D is NULL")
}
timed <- function(prev_D) {
best_d <- Inf
idx_l <- c()
for (i in 1:nrow(pool)) {
D <- c(prev_D, i)
subset <- pool[D, ]
g <- function(x1p, x2p)
knn_classifier_single(subset$x1, subset$x2, subset$y, x1p, x2p, k)
d <- dist(Z, g, loss_bin)
if (d < best_d) {
best_d <- d
idx_l <- c(i)
} else if (d == best_d)
idx_l <- c(idx_l, i)
}
# find the index with smallest d
index_to_add <- sample(idx_l, 1)
list(n = nrow(pool), d = best_d, index_to_add = index_to_add)
}
r <- time_run(timed, prev_D)
results <- results %>%
add_row(
subset_sz = m,
subset_n = r$result$n,
time = r$time,
d = r$result$d,
index = r$result$index_to_add
)
}
results
}
greedy_results <- saved_run(greedy_run, "greedy.csv")
indices <- greedy_results %>% pull(index)
print(
ggplot() +
geom_point(data = pool, aes(x = x1, y = x2, color = factor(y), shape = "P")) +
geom_point(data = pool[indices, ], aes(x = x1, y = x2, shape = "D"), alpha = 0.5) +
ggtitle("Greedy: Last Teaching Set vs Pool", subtitle = paste0("m = ", greedy_upper))
)
## # A tibble: 20 × 5
## subset_sz subset_n time d index
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 807 39.5 0.444 703
## 2 2 807 39.7 0.299 319
## 3 3 807 40.9 0.280 162
## 4 4 807 42.1 0.279 271
## 5 5 807 44.3 0.273 44
## 6 6 807 45.6 0.273 370
## 7 7 807 44.9 0.299 104
## 8 8 807 46.0 0.297 23
## 9 9 807 46.8 0.318 380
## 10 10 807 47.9 0.312 554
## 11 11 807 48.8 0.299 47
## 12 12 807 49.5 0.367 495
## 13 13 807 50.6 0.321 98
## 14 14 807 52.2 0.309 67
## 15 15 807 52.8 0.309 415
## 16 16 807 53.4 0.286 131
## 17 17 807 54.6 0.321 2
## 18 18 807 55.5 0.317 226
## 19 19 807 56.5 0.317 604
## 20 20 807 57.8 0.305 425