Load packages, set the working directory, and print the pool of candidate points:
## x y
## 1 2.374862 1.0802890
## 2 2.946939 1.8986054
## 3 1.650485 1.8640327
## 4 3.270571 2.6910741
## 5 1.168345 -3.0322151
## 6 1.443327 -1.1253668
## 7 2.869272 3.3632392
## 8 2.695933 1.9037783
## 9 1.450013 2.0780165
## 10 1.638038 -0.1211598
## 11 1.978955 1.3904179
## 12 1.186760 -0.9483188
## 13 2.001471 -0.3276148
## 14 2.585486 3.1464462
## 15 2.231690 1.4246791
## 16 2.851257 3.6954195
## 17 1.659208 0.0867588
## 18 3.076964 2.5382524
## 19 3.447796 2.6038317
## 20 1.500665 -0.1878728
## 21 3.912858 3.0603483
## 22 2.474814 2.8410390
## 23 1.767409 -0.4052952
## 24 3.094247 3.5516174
## 25 2.574593 0.3965521
## 26 1.002182 1.2032982
## 27 1.828825 0.2724988
## 28 3.779303 3.9852923
## 29 3.747501 3.7052840
## 30 2.607060 2.1907988
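The chunk that loads the packages and constructs the 30-point pool is not shown; below is a minimal sketch of what it might look like. The tidyverse load is implied by the later use of tibble, map, read_csv, and ggplot; the seed, the range of x, and the noise level are assumptions only, chosen so that the pool resembles the printed data.
library(tidyverse) # tibble, purrr (map), readr (read_csv), ggplot2 are all used below
# setwd("...")     # working directory path omitted here
set.seed(1)                      # assumed: the actual seed is not shown
x <- runif(30, min = 1, max = 4) # assumed range, matching the printed pool
pool <- data.frame(x = x, y = 2 * x - 3 + rnorm(30)) # assumed noise around the target line y = 2x - 3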
# fit a simple OLS regression y ~ x and return the slope m and intercept b
ols_regression_single <- function(x, y) {
model <- lm(y ~ x)
return(list(m = model$coefficients[2], b = model$coefficients[1]))
}
# define closeness
closeness <- function(m, b, mhat, bhat) (mhat - m)^2 + (bhat - b)^2
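Throughout, the target model is the line \(y = 2x - 3\) (see the closeness(2, -3, ...) calls below), so for an estimated slope \(\hat{m}\) and intercept \(\hat{b}\) the quantity being minimized is \((\hat{m} - 2)^2 + (\hat{b} + 3)^2\).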
# generate the set of all multi-subsets of size m from a set of size n given by index 1:n
# an equivalent problem: generate the set of all tuples of n non-negative integers with sum m
gen_tuples <- function(n, m) {
if (n == 1) {
return(list(c(m)))
} else {
v <- list()
for (i in 0:m) {
v <- c(v, lapply(gen_tuples(n - 1, m - i), function(x) c(i, x)))
}
return(v)
}
}
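As a quick sanity check (hypothetical, not part of the original code), gen_tuples(3, 2) enumerates the \(\binom{3+2-1}{2} = 6\) ways to distribute 2 selections over 3 points:
gen_tuples(3, 2)
# returns, in order: (0,0,2), (0,1,1), (0,2,0), (1,0,1), (1,1,0), (2,0,0)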
# select the best multi-subset of size n from the dataset df with variables x and y
combinatorial_selector <- function(df, n, verbose = FALSE, history = NA) {
if (n < 2) stop("n must be greater than 1")
# generate all multi-subsets of size n
tuples <- gen_tuples(nrow(df), n)
# generate the multi-subsets
# an element v_i in tuple v means the number of times the i-th point is selected
multisets <- map(tuples, function(v) {
multiset <- list(x = c(), y = c())
for (i in seq_along(v)) {
multiset$x <- c(multiset$x, rep(df$x[i], v[i]))
multiset$y <- c(multiset$y, rep(df$y[i], v[i]))
}
return(multiset)
})
if (verbose) cat("Number of multi-subsets:", length(multisets), "\n")
# calculate the closeness of each multi-subset
closenesses <- map_dbl(multisets, function(s) {
reg <- ols_regression_single(s$x, s$y)
return(closeness(2, -3, reg$m, reg$b))
})
# drop NA values: if all n selected points are identical, the regression is degenerate and its closeness is NA
min_closeness <- min(closenesses, na.rm = TRUE)
if (!is.finite(min_closeness)) {
return(list(
tuple = NULL,
multiset = NULL,
closeness = NA
))
}
# return the best tuple and its closeness
best_idx <- which(closenesses == min_closeness)[1] # ties: take the first
return(list(
multiset = multisets[[best_idx]],
closeness = min_closeness,
n_ols = length(tuples) # one OLS fit per enumerated multi-subset
))
}
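A hypothetical standalone call (the selector is normally invoked through pool_fit below); with the 30-point pool and n = 2 the verbose output should report \(\binom{31}{2} = 465\) multi-subsets:
best2 <- combinatorial_selector(pool, n = 2, verbose = TRUE)
best2$closeness # loss of the best pair (repetition allowed)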
# select the best size-n multiset from the global pool with the given selector, then fit OLS on it
pool_fit <- function(n, selector, verbose = FALSE, history = NA) {
start_time <- Sys.time()
d <- selector(df = pool, n = n, history = history)
df_selected <- tibble(
x = d$multiset$x,
y = d$multiset$y
)
model <- ols_regression_single(df_selected$x, df_selected$y)
end_time <- Sys.time()
if (verbose) {
cat("\nBest dataset (use the first one):\n")
print(d)
cat("\nModel of linear regression on the best dataset:\n")
print(model)
cat("Time elapsed (second):", end_time - start_time, "\n")
cat("[pool_fit] x =", d$multiset$x, " y =", d$multiset$y, "\n")
}
return(list(
x = d$multiset$x,
y = d$multiset$y,
loss = d$closeness,
n_ols = d$n_ols,
time = end_time - start_time
))
}
# run pool_fit for sizes 2..upper, caching the results in a CSV file
pool_fit_batch <- function(upper, selector, file) {
if (!file.exists(file)) {
pool_fit_results <- tibble(x = list(), y = list(), n_pts = integer(), n_ols = integer(), loss = double(), time = double())
for (n in 2:upper) {
history_row_df <- pool_fit_results[nrow(pool_fit_results), ]
history <- list(x = unlist(history_row_df$x), y = unlist(history_row_df$y))
result <- pool_fit(n, selector, history = history)
pool_fit_results <- rbind(pool_fit_results, tibble(x = list(result$x), y = list(result$y), n_pts = n, n_ols = result$n_ols, loss = result$loss, time = result$time))
}
pool_fit_results <- pool_fit_results %>% select(-x, -y)
} else {
pool_fit_results <- read_csv(file, col_types = cols(n_pts = col_integer(), n_ols = col_integer(), loss = col_double(), time = col_double()))
}
pool_fit_results <- pool_fit_results %>% mutate(loss = num(loss, notation = "sci"))
if (!file.exists(file))
write_csv(pool_fit_results, file)
return(pool_fit_results)
}
# plot n_pts vs. log(loss) curve
plot_fit_result <- function(pool_fit_results, title) {
p <- ggplot(pool_fit_results, aes(x = n_pts, y = loss)) +
geom_point() +
geom_line() +
scale_y_log10() +
labs(x = "Number of points", y = "Loss (scaled log10)") +
ggtitle("Log of loss vs. number of points", subtitle = title)
ggsave(paste0(title, "_plot.svg"), p)
p
}
pool_fit_comb_results <- pool_fit_batch(
upper = 6,
selector = combinatorial_selector,
file = "q1_pool_fit_comb_results.csv"
)
pool_fit_comb_results %>% print(n = Inf)
## # A tibble: 5 × 3
## n_pts loss time
## <int> <sci> <dbl>
## 1 2 1.86e-2 0.147
## 2 3 2.01e-4 1.35
## 3 4 1.65e-7 10.8
## 4 5 1.77e-6 74.7
## 5 6 6.54e-8 417.
Enumerating all multi-subsets of size \(m\) from the 30-point pool requires \(\binom{30+m-1}{m}\) OLS fits, so the cost grows combinatorially: size 6 already takes about 417 seconds, and size 7 did not finish after more than 10 minutes of waiting, so we stop the enumeration at size 6.
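These counts can be checked directly; the snippet below (not part of the original analysis) computes the number of OLS fits the enumeration performs for sizes 2 through 7:
sapply(2:7, function(m) choose(30 + m - 1, m))
# 465, 4960, 40920, 278256, 1623160, 8347680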
# greedy selector: starting from history (or a random initial point), repeatedly add the pool point whose addition minimizes the closeness
greedy_selector <- function(df, n, history) {
if (length(history$x) >= 2 && length(history$x) != n - 1) stop("history must contain exactly n - 1 points; the greedy selector adds one point per call")
n_ols <- 0
if (length(history$x) >= 2) {
df_selected <- tibble(x = history$x, y = history$y)
} else {
# select the first point
initial_idx <- sample(nrow(df), 1)
initial_point <- df[initial_idx, ]
df_selected <- tibble(x = initial_point$x, y = initial_point$y)
}
min_global_closeness <- Inf
lower <- 2
if (length(history$x) >= 2)
lower <- length(history$x) + 1
for (i in lower:n) {
min_closeness <- Inf
min_idx <- NA
for (j in seq_len(nrow(df))) {
pending_df_selected <- rbind(df_selected, df[j, ])
theta_hat <- ols_regression_single(pending_df_selected$x, pending_df_selected$y)
n_ols <- n_ols + 1
pending_closeness <- closeness(2, -3, theta_hat$m, theta_hat$b)
if (is.na(pending_closeness)) next
if (pending_closeness < min_closeness) {
min_closeness <- pending_closeness
min_idx <- j
}
}
min_global_closeness <- min_closeness
df_selected <- rbind(df_selected, df[min_idx, ])
}
return(list(
multiset = list(x = df_selected$x, y = df_selected$y),
closeness = min_global_closeness,
n_ols = n_ols
))
}
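A hypothetical standalone use of the greedy selector (outside pool_fit_batch), showing its incremental contract: each call extends a history of n - 1 points by one point.
sel2 <- greedy_selector(pool, n = 2, history = list(x = NULL, y = NULL)) # random start, then add 1 point
sel3 <- greedy_selector(pool, n = 3, history = sel2$multiset)            # extend the previous 2 points by 1
sel3$n_ols # 30: one OLS fit per candidate pool point for the single added point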
pool_fit_greedy_results <- pool_fit_batch(
upper = nrow(pool),
greedy_selector,
file = "q2_pool_fit_greedy_results.csv"
)
pool_fit_greedy_results %>% print(n = Inf)
## # A tibble: 29 × 4
## n_pts n_ols loss time
## <int> <dbl> <sci> <drtn>
## 1 2 30 4.33e-1 0.021356821 secs
## 2 3 30 1.71e-1 0.007766008 secs
## 3 4 30 1.47e-1 0.010340929 secs
## 4 5 30 7.94e-2 0.007749081 secs
## 5 6 30 7.77e-2 0.007658005 secs
## 6 7 30 7.17e-2 0.007671118 secs
## 7 8 30 5.46e-2 0.010077000 secs
## 8 9 30 5.39e-2 0.007758141 secs
## 9 10 30 5.39e-2 0.007751942 secs
## 10 11 30 4.02e-2 0.007688046 secs
## 11 12 30 2.51e-2 0.010122061 secs
## 12 13 30 2.57e-2 0.007826090 secs
## 13 14 30 2.18e-2 0.007699013 secs
## 14 15 30 1.91e-2 0.007743835 secs
## 15 16 30 1.71e-2 0.010215044 secs
## 16 17 30 1.57e-2 0.007791042 secs
## 17 18 30 1.48e-2 0.007806063 secs
## 18 19 30 1.30e-2 0.007740021 secs
## 19 20 30 1.15e-2 0.010284901 secs
## 20 21 30 1.04e-2 0.007874012 secs
## 21 22 30 9.61e-3 0.007771015 secs
## 22 23 30 9.03e-3 0.007818937 secs
## 23 24 30 8.62e-3 0.010452986 secs
## 24 25 30 8.19e-3 0.007900000 secs
## 25 26 30 7.42e-3 0.007948875 secs
## 26 27 30 6.82e-3 0.007759094 secs
## 27 28 30 6.37e-3 0.010304928 secs
## 28 29 30 6.03e-3 0.007889032 secs
## 29 30 30 5.78e-3 0.007838964 secs
For comparison, the minimum-loss row of the greedy results and of the combinatorial results:
## # A tibble: 1 × 4
## n_pts n_ols loss time
## <int> <dbl> <sci> <drtn>
## 1 30 30 5.78e-3 0.007838964 secs
## # A tibble: 1 × 3
## n_pts loss time
## <int> <sci> <dbl>
## 1 6 6.54e-8 417.
The greedy algorithm is worse than the combinatorial algorithm in terms of loss: even with up to 30 points its best loss (5.78e-3) is several orders of magnitude higher than the combinatorial selector's loss with only 6 points (6.54e-8), so the greedy teaching set is of lower quality. A stopping criterion I might use is to stop once the minimum loss has not improved over 5 consecutive added points, since selecting a multiset does not require the number of selected points to equal the size of the pool \(P\). In terms of running time, the greedy algorithm is much faster: the cost of adding one point is constant (one OLS fit per candidate point in the pool), rather than growing combinatorially as in the exhaustive enumeration.
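A minimal sketch of this stopping rule (hypothetical, not part of the assignment code): scan the losses in the order the points were added and stop once the running minimum has not improved for 5 consecutive additions.
# losses[i] is the loss after the i-th recorded fit; patience is the number of
# consecutive non-improving additions tolerated before stopping
stop_index <- function(losses, patience = 5) {
  best <- Inf
  stale <- 0
  for (i in seq_along(losses)) {
    if (losses[i] < best) {
      best <- losses[i]
      stale <- 0
    } else {
      stale <- stale + 1
      if (stale >= patience) return(i)
    }
  }
  length(losses) # the rule never triggered: keep all points
}
# e.g. stop_index(as.numeric(pool_fit_greedy_results$loss)); the returned index i
# corresponds to n_pts = i + 1, since the greedy table starts at 2 points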
# generate a dataset of n_vec random vectors of dimension d, with labels from n_categories categories
generator_unif <- function(n_vec, d, n_categories) {
x <- matrix(runif(n_vec * d), n_vec, d)
# labels are 0-based: 0, 1, ..., n_categories - 1
y <- sample.int(n_categories, n_vec, replace = TRUE) - 1
return(list(x = x, y = y))
}
# generate a dataset on a len x len grid, with labels from n_categories categories
generator_grid <- function(x_min, x_max, y_min, y_max, len, n_categories) {
x_axis <- seq(x_min, x_max, length.out = len)
y_axis <- seq(y_min, y_max, length.out = len)
x <- as.matrix(expand.grid(x_axis, y_axis))
y <- sample.int(n_categories, len * len, replace = TRUE) - 1
return(list(x = x, y = y))
}
# plot the training and testing datasets; the testing dataset may be shown with or without predicted labels
plot_dataset <- function(ds_train, ds_test, n_categories, k = 0) {
plot <- ggplot() +
geom_point(data = tibble(x = ds_train$x[, 1], y = ds_train$x[, 2], label = ds_train$y), aes(x, y, color = factor(label), size = "train", alpha = "train")) +
scale_color_discrete(name = "label", labels = paste((1:n_categories) - 1)) +
scale_size_manual(values = c("train" = 3, "test" = 1)) +
scale_alpha_manual(values = c("train" = 1, "test" = 0.5)) +
xlim(0, 1) +
ylim(0, 1)
# treat the test set as predicted when its labels take more than one distinct value
if (length(table(ds_test$y)) > 1) {
plot <- plot +
geom_point(data = tibble(x = ds_test$x[, 1], y = ds_test$x[, 2], label = ds_test$y), aes(x, y, color = factor(label), size = "test", alpha = "test")) +
ggtitle(
"Training and Testing Datasets with Predicted Labels",
subtitle = bquote(n[categories] == .(n_categories) ~ n[train] == .(nrow(ds_train$x)) ~ n[test] == .(nrow(ds_test$x)) ~ k == .(k))
)
ggsave(paste0("q3_plot_cat", n_categories, "_train", nrow(ds_train$x), "_test", nrow(ds_test$x), "_k", k, ".svg"), plot)
} else {
plot <- plot +
geom_point(data = tibble(x = ds_test$x[, 1], y = ds_test$x[, 2]), aes(x, y, size = "test", alpha = "test")) +
ggtitle(
"Training and Testing Datasets without Predicted Labels",
subtitle = bquote(n[categories] == .(n_categories) ~ n[train] == .(nrow(ds_train$x)) ~ n[test] == .(nrow(ds_test$x)))
)
ggsave(paste0("q3_cat", n_categories, "_train", nrow(ds_train$x), "_test", nrow(ds_test$x), "_np.svg"), plot)
}
plot
}
ds_train <- generator_unif(n_vec = 16, d = 2, n_categories = 2)
ds_test <- generator_grid(x_min = 0, x_max = 1, y_min = 0, y_max = 1, len = 100, n_categories = 1)
plot_dataset(ds_train, ds_test, n_categories = 2)
knn_classifier <- function(train_x, train_y, test_x, k) {
test_y <- integer(nrow(test_x))
for (i in seq_len(nrow(test_x))) {
# compute the distance between the test point and all training points
distances <- map_dbl(seq_len(nrow(train_x)), function(j) sqrt(sum((train_x[j, ] - test_x[i, ]) ^ 2)))
# find the k nearest neighbors
nearest_neighbors <- train_y[order(distances)[1:k]]
# predict the label of the test point
test_y_single <- as.integer(names(which.max(table(nearest_neighbors))))
# store the prediction
test_y[i] <- test_y_single
}
return(test_y)
}
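A tiny hypothetical sanity check of the classifier: four labeled corners and one query point near the top-right corner, classified with k = 3 (majority vote among the three nearest corners).
train_x <- rbind(c(0, 0), c(0, 1), c(1, 0), c(1, 1))
train_y <- c(0, 0, 1, 1)
knn_classifier(train_x, train_y, rbind(c(0.9, 0.9)), k = 3)
# expected: 1 (the three nearest corners are (1,1), (0,1), (1,0), with labels 1, 0, 1)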
ds_train <- generator_unif(n_vec = 32, d = 2, n_categories = 4)
ds_test <- generator_grid(x_min = 0, x_max = 1, y_min = 0, y_max = 1, len = 100, n_categories = 1)
ds_test_pred <- tibble(x = ds_test$x, y = knn_classifier(ds_train$x, ds_train$y, ds_test$x, k = 3))
plot_dataset(ds_train, ds_test_pred, n_categories = 4, k = 3)