Fit a neural network to the Default data. Specifically
create a network that predicts default using
student, balance and income.
Default data into training and
testing datasets. Next create the predictor matrices
x_train and x_test as well as the outcomes
y_train and y_test. Consider standardizing
balance and income using
scale().# Load required libraries
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
library(keras)
## Warning: package 'keras' was built under R version 4.4.2
set.seed(123) # For reproducibility
# Load the data
data("Default")
# Split into training and testing sets (70-30 split)
train_indices <- sample(1:nrow(Default), size = 0.7 * nrow(Default))
train_data <- Default[train_indices, ]
test_data <- Default[-train_indices, ]
# Create predictor matrices and outcomes
x_train <- data.frame(
student = as.numeric(train_data$student == "Yes"),
balance = scale(train_data$balance),
income = scale(train_data$income)
)
x_test <- data.frame(
student = as.numeric(test_data$student == "Yes"),
balance = scale(test_data$balance),
income = scale(test_data$income)
)
y_train <- as.numeric(train_data$default == "Yes")
y_test <- as.numeric(test_data$default == "Yes")
modelnn with a single hidden
layer with 10 units, dropout regularization, and an output layer with a
single unit and sigmoidal activation function. Have a look at Labs
10.9.1–10.9.2 and Homework 10 for guidance.# Initialize the model
#modelnn <- keras_model_sequential()
#layer_dense(units = 10, activation = "relu", input_shape = ncol(x_train)) %>%
#layer_dropout(rate = 0.2) %>%
#layer_dense(units = 1, activation = "sigmoid")
modelnn to the training data.#modelnn %>% compile(
# loss = "binary_crossentropy",
# optimizer = "adam",
#metrics = c("accuracy")
#)
#history <- modelnn %>% fit(
# as.matrix(x_train), y_train,
#epochs = 100,
#batch_size = 32,
#validation_split = 0.2,
#verbose = 1
#)
Compute the error rate using the testing data (hint: use the
function predict() to evaluate the output layer of the
network for the testing data and then use the decision boundary 0.5 to
classify as TRUE or FALSE).
Compare the classification performance of your model with that of linear logistic regression.
In this problem, you will perform K-means clustering manually with \(K=2\) clusters on a small example dataset with \(n=6\) observations and \(p=2\) features. The clustering can be implemented either by hand or in R. The observations are as follows:
m=matrix(c(1,1,0,5,6,4,4,3,4,1.5,2,0), nrow=6)
m
## [,1] [,2]
## [1,] 1 4.0
## [2,] 1 3.0
## [3,] 0 4.0
## [4,] 5 1.5
## [5,] 6 2.0
## [6,] 4 0.0
# Create the dataset
m <- matrix(c(1, 1, 0, 5, 6, 4, 4, 3, 4, 1.5, 2, 0), nrow = 6, byrow = FALSE)
# Scatter plot
plot(m, main = "Scatter Plot of Observations", xlab = "Feature 1", ylab = "Feature 2", pch = 19, cex = 1.5)
sample() command in R to do this. Report the cluster labels
for each observation.set.seed(123) # For reproducibility
initial_clusters <- sample(1:2, nrow(m), replace = TRUE)
# Report cluster labels
print(initial_clusters)
## [1] 1 1 1 2 1 2
colMeans() may be useful).# Compute centroids
centroid1 <- colMeans(m[initial_clusters == 1, ])
centroid2 <- colMeans(m[initial_clusters == 2, ])
# Report centroids
print(centroid1)
## [1] 2.00 3.25
print(centroid2)
## [1] 4.50 0.75
# Function to compute distances and reassign clusters
compute_clusters <- function(data, centroids) {
distances <- sapply(1:2, function(k) rowSums((data - centroids[k, ])^2))
apply(distances, 1, which.min)
}
# Reassign clusters
centroids <- rbind(centroid1, centroid2)
new_clusters <- compute_clusters(m, centroids)
# Report new cluster labels
print(new_clusters)
## [1] 1 1 1 1 2 1
repeat {
# Update centroids for the current clusters
centroids <- rbind(
colMeans(m[new_clusters == 1, , drop = FALSE]),
colMeans(m[new_clusters == 2, , drop = FALSE])
)
# Reassign clusters to the nearest centroid
old_clusters <- new_clusters
new_clusters <- compute_clusters(m, centroids)
# Break if clusters no longer change
if (all(old_clusters == new_clusters)) break
}
col in
plot()). Also show the centroids.# Plot with final clusters
plot(m, col = new_clusters, pch = 19, cex = 1.5,
main = "K-means Clustering", xlab = "Feature 1", ylab = "Feature 2")
points(centroids, col = 1:2, pch = 4, cex = 2, lwd = 2) # Add centroids
Consider the USArrests data. We will now perform
hierarchical clustering on the states.
hclust() and plot(). Look at lab section
12.5.4)# Load USArrests data
data("USArrests")
# Scale the data
scaled_data <- scale(USArrests)
# Perform hierarchical clustering
hc_complete <- hclust(dist(scaled_data), method = "complete")
# Plot the dendrogram
plot(hc_complete, main = "Dendrogram: Complete Linkage", xlab = "", sub = "", cex = 0.9)
cutree())# Cut the dendrogram into 3 clusters
clusters <- cutree(hc_complete, k = 3)
# Display clusters
print(clusters)
## Alabama Alaska Arizona Arkansas California
## 1 1 2 3 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 3 2 3 3
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 3 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 3 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 2 3 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 3 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 3 1 2 3 3
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 3 3 3
plot() with empty x and
y vectors to create an empty plot and then use the function
text() to add the state names at the desired coordinates.
When running plot(), specify the plot range using
xlim and ylim. Also look at# Perform PCA
pca_result <- prcomp(scaled_data, scale. = TRUE)
# Plot PCA with empty plot and text
plot(pca_result$x[, 1], pca_result$x[, 2], type = "n",
xlab = "PC1", ylab = "PC2", main = "PCA of USArrests", xlim = c(-5, 5), ylim = c(-5, 5))
text(pca_result$x[, 1], pca_result$x[, 2], labels = rownames(USArrests), cex = 0.7)
(d) Repeat the calculation in (c) but this time coloring the state name
using the cluster assignment from part (b). Did the clustering work as
expected?
# Add colors based on cluster assignment
colors <- c("red", "blue", "green")
plot(pca_result$x[, 1], pca_result$x[, 2], type = "n",
xlab = "PC1", ylab = "PC2", main = "PCA Colored by Clusters", xlim = c(-5, 5), ylim = c(-5, 5))
text(pca_result$x[, 1], pca_result$x[, 2], labels = rownames(USArrests),
col = colors[clusters], cex = 0.7)
kmeans(), which is demonstrated in section 12.5.3)# Perform K-means clustering
set.seed(123) # For reproducibility
kmeans_result <- kmeans(scaled_data, centers = 3)
# Display clusters
print(kmeans_result$cluster)
## Alabama Alaska Arizona Arkansas California
## 1 1 1 3 1
## Colorado Connecticut Delaware Florida Georgia
## 1 3 3 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 1 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
# Plot PCA colored by K-means clusters
plot(pca_result$x[, 1], pca_result$x[, 2], type = "n",
xlab = "PC1", ylab = "PC2", main = "PCA Colored by K-means Clusters", xlim = c(-5, 5), ylim = c(-5, 5))
text(pca_result$x[, 1], pca_result$x[, 2], labels = rownames(USArrests),
col = colors[kmeans_result$cluster], cex = 0.7)