Question 1:

A)

set.seed(123)

# Define caffeine levels and corresponding number of students and A grades
caffeine_levels <- c(0, 50, 100, 150, 200)  # Caffeine levels
n_students <- rep(300, 5)  # Number of students per group
a_grades <- c(109, 155, 175, 158, 103)  # Number of A grades

# Create the initial data frame
Caffeine2.df <- data.frame(caffeine = caffeine_levels, n = n_students, Agrade = a_grades)

# Ungroup the data
new_caffeine <- rep(Caffeine2.df$caffeine, Caffeine2.df$n)
length(new_caffeine)  # Verify the length matches the total number of observations
## [1] 1500
tmp3 <- rep(c(t(cbind(Caffeine2.df$Agrade, Caffeine2.df$n - Caffeine2.df$Agrade))))
new_y <- rep(rep(c(1, 0), nrow(Caffeine2.df)), tmp3)
caffeine_long.df <- data.frame(caffeine = new_caffeine, y = new_y)

# Fit the initial model
initial_model <- glm(cbind(Agrade, n - Agrade) ~ caffeine + I(caffeine^2), family = binomial, data = Caffeine2.df)
print(coef(initial_model))
##   (Intercept)      caffeine I(caffeine^2) 
## -5.805004e-01  1.839666e-02 -9.327319e-05
# Bootstrap resampling
set.seed(123)
n <- nrow(caffeine_long.df)
Nsim <- 1000
betasBS <- matrix(0, Nsim, 3)

for (i in 1:Nsim) {
  # Draw a random sample (with replacement) from these data
  id <- sample(1:n, n, replace = TRUE)
  BS.df <- caffeine_long.df[id, ]  # Bootstrap sample

  mod_i <- glm(y ~ caffeine + I(caffeine^2), family = binomial, data = BS.df)

  betasBS[i, ] <- coef(mod_i)
}

# Calculate xpeak values
xpeaks <- -betasBS[, 2] / (2 * betasBS[, 3])

# Summarize xpeak values
summary(xpeaks)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   85.64   95.60   98.55   98.57  101.34  112.62
# Plot histogram of xpeak values
hist(xpeaks, main = "Bootstrap Estimates of xpeak", xlab = "xpeak")

The histogram above shows a 1000 bootstrap estimated of the x peak from the caffeine data set, with the mean value of caffeine being associated and the most optimal amount of caffeine associated with success is between 95 to a 100mgs of caffeine. The bootstrap seems to have a normal distribution as the bulk of estimations are are centered in the middle and around the original estimate value and it is in a symmetric bell shaped meaning that the original estimate is reliable measure of the central tendency and the variability of the histogram seems to be a realistic measure of uncertainty associated with the estimate. overall the consistency and the normal distribution shape as well as the adequate sample size of a 1000 suggests that the data is valid and the original estimate is reliable.

B)

alpha <- 0.05
lower_bound <- quantile(xpeaks, alpha / 2)
upper_bound <- quantile(xpeaks, 1 - (alpha / 2))

# Print the CI without using cat
print("Bootstrap 95% Confidence Interval for xpeak:")
## [1] "Bootstrap 95% Confidence Interval for xpeak:"
print(paste("Lower bound:", lower_bound))
## [1] "Lower bound: 90.2889135817063"
print(paste("Upper bound:", upper_bound))
## [1] "Upper bound: 107.243361938917"

C)

The CIs above [90.28 , 107.24] are very similar to the CIs that were given by the simulation in the assignment 3 [90.93 , 106.70]. Both models have very similar confidence interval values with very insignificant differences suggesting that both models xpeaks values robust and reliable, and the minor differences in these values is due to the bootstrapping variability.

Question 2

A)

train.df <- read.table("train.txt")

names(train.df) <- c("D", paste("V", 1:256, sep=""))

train.df <- na.omit(train.df)

train.df <- train.df[train.df$D %in% c(3, 7), ]

for (i in 2:257) {
  train.df <- train.df[train.df[, i] >= -1 & train.df[, i] <= 1, ]
}

B)

par(mfrow = c(5, 5), mar = c(1, 1, 1, 1))
for (k in 1:25) {
  z = matrix(unlist(train.df[k, -1]), 16, 16)
  zz = z
  for (j in 16:1) zz[, j] = z[, 17 - j]
  image(zz, col = gray((32:0) / 32))
  box()
  text(0.1, 0.9, train.df$D[k], cex = 1.5)
}

The cells that would be good for differentiating 3s from 7s would be mainly cells in the bottom left, followed by bottom right and middle right as there are the main cells that 3 may stick out and there is no overlaps between 3 and 7 and the worst parts for differentiating 7 and 3 would be the top as both 3 and 7 share similar cells and there is a lot of overlapping.

C)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) 


correlations <- train.df %>%
  select(-D) %>%
  summarise_all(~ cor(., train.df$D)) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "correlation")

correlations <- correlations %>%
  mutate(abs_correlation = abs(correlation))

top_20_variables <- correlations %>%
  arrange(desc(abs_correlation)) %>%
  slice(1:20)

print(top_20_variables)
## # A tibble: 20 × 3
##    variable correlation abs_correlation
##    <chr>          <dbl>           <dbl>
##  1 V185           0.871           0.871
##  2 V170           0.833           0.833
##  3 V105          -0.821           0.821
##  4 V220          -0.814           0.814
##  5 V235          -0.805           0.805
##  6 V201           0.799           0.799
##  7 V229          -0.782           0.782
##  8 V120          -0.780           0.780
##  9 V219          -0.777           0.777
## 10 V230          -0.765           0.765
## 11 V104          -0.758           0.758
## 12 V189          -0.755           0.755
## 13 V205          -0.755           0.755
## 14 V169           0.750           0.750
## 15 V234          -0.743           0.743
## 16 V121          -0.739           0.739
## 17 V204          -0.738           0.738
## 18 V186           0.733           0.733
## 19 V173          -0.725           0.725
## 20 V221          -0.693           0.693

D)

train.df$Y <- ifelse(train.df$D == 7, 1, 0)

correlations <- sapply(train.df[, paste("V", 1:256, sep="")], function(x) cor(x, train.df$D, use = "complete.obs"))

abs_correlations <- abs(correlations)

top_20_indices <- order(abs_correlations, decreasing = TRUE)[1:20]
top_20_variables <- names(abs_correlations)[top_20_indices]

selected_vars <- c("Y", top_20_variables)
train_selected.df <- train.df[, selected_vars]

formula <- as.formula(paste("Y ~", paste(top_20_variables, collapse = " + ")))
Full.mod <- glm(formula, data = train_selected.df, family = binomial)

fitted_logits <- predict(Full.mod, type = "link")

deviance_residuals <- residuals(Full.mod, type = "deviance")

train_selected.df$fitted_logits <- fitted_logits
train_selected.df$deviance_residuals <- deviance_residuals

head(train_selected.df)
##   Y   V185   V170   V105   V220   V235   V201   V229 V120   V219   V230   V104
## 1 1  1.000  0.590 -1.000 -1.000 -1.000  0.958 -1.000   -1 -1.000 -1.000 -1.000
## 2 0 -1.000 -1.000  1.000 -0.433  1.000 -1.000  1.000    1 -0.733  1.000  1.000
## 3 0 -1.000 -1.000  0.936  0.214  0.056 -1.000  0.957    1  0.997  0.293  0.183
## 4 1 -0.978 -0.727 -1.000 -1.000 -0.835 -0.265 -1.000   -1 -0.706 -1.000 -1.000
## 5 1 -0.114  0.999 -1.000 -1.000 -1.000  0.360 -1.000   -1 -1.000 -1.000 -1.000
## 6 1  0.808 -0.353 -1.000 -1.000 -1.000  0.095 -1.000   -1 -1.000 -0.394 -1.000
##     V189   V205   V169   V234   V121 V204   V186   V173   V221 fitted_logits
## 1 -1.000 -1.000  0.963 -0.935 -1.000   -1 -0.079 -1.000 -1.000      9.250851
## 2  0.418 -0.077 -1.000  1.000  0.319   -1 -1.000  0.918  0.649    -14.077921
## 3  0.281 -0.537 -1.000  0.975  1.000    1 -1.000  0.740 -0.996    -10.735400
## 4 -1.000 -1.000 -1.000  0.959 -1.000   -1  0.425 -0.834 -1.000      2.836090
## 5 -1.000 -1.000 -0.522  0.385 -1.000   -1  0.995 -1.000 -1.000      5.742092
## 6 -1.000 -1.000  1.000 -1.000 -0.649   -1 -0.962 -1.000 -1.000      7.653596
##   deviance_residuals
## 1        0.013858231
## 2       -0.001240318
## 3       -0.006597054
## 4        0.337635316
## 5        0.080036420
## 6        0.030796852
summary(Full.mod)
## 
## Call:
## glm(formula = formula, family = binomial, data = train_selected.df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.7213     1.1139  -3.341 0.000835 ***
## V185          0.5552     0.9111   0.609 0.542296    
## V170          0.5178     0.7462   0.694 0.487756    
## V105         -2.2946     0.7941  -2.890 0.003857 ** 
## V220          1.3797     1.0624   1.299 0.194065    
## V235         -0.6921     0.6548  -1.057 0.290544    
## V201         -0.2385     0.7841  -0.304 0.761023    
## V229         -0.5488     0.8635  -0.636 0.525081    
## V120         -0.5567     0.9129  -0.610 0.541987    
## V219         -0.6417     0.7202  -0.891 0.372942    
## V230         -1.2361     0.6692  -1.847 0.064719 .  
## V104          0.2161     0.7724   0.280 0.779626    
## V189         -2.6311     1.6250  -1.619 0.105417    
## V205          1.2512     1.2990   0.963 0.335416    
## V169          2.0252     0.9125   2.219 0.026461 *  
## V234         -0.5477     0.6404  -0.855 0.392423    
## V121         -0.4854     0.8082  -0.601 0.548059    
## V204         -1.3438     1.0016  -1.342 0.179706    
## V186          0.4521     0.7666   0.590 0.555333    
## V173          0.9718     0.8536   1.138 0.254918    
## V221         -3.3018     1.3970  -2.364 0.018102 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1806.212  on 1302  degrees of freedom
## Residual deviance:   87.875  on 1282  degrees of freedom
## AIC: 129.87
## 
## Number of Fisher Scoring iterations: 10

E)

selected_variables <- top_20_variables

Full.mod <- glm(Y ~ ., data = train.df[, c(selected_variables, "Y")], family = binomial)

predicted_probabilities <- predict(Full.mod, type = "response")

predicted_classes <- ifelse(predicted_probabilities > 0.5, 7, 3)

in_sample_error <- mean(predicted_classes != train.df$D)

print(paste("In-sample prediction error:", in_sample_error))
## [1] "In-sample prediction error: 0.0107444359171144"

The prediction error suggests that 1.07% of the predictions made by the logistic regression model were incorrect when evaluated on the training data, this may call for cross validation in the future as there might be a potential over fitting as the sample error is low.

F)

null.model = glm(Y~1, data=train.df, family=binomial)
#step(Full.mod, direction = "backward")
step(null.model, formula(Full.mod),direction="both", trace=0)
## 
## Call:  glm(formula = Y ~ V185 + V105 + V220 + V230 + V235 + V189 + V169 + 
##     V221 + V219 + V121, family = binomial, data = train.df)
## 
## Coefficients:
## (Intercept)         V185         V105         V220         V230         V235  
##     -3.5699       1.2121      -2.1221       1.1322      -1.5567      -0.8855  
##        V189         V169         V221         V219         V121  
##     -1.9760       1.7562      -2.5699      -1.3135      -1.0119  
## 
## Degrees of Freedom: 1302 Total (i.e. Null);  1292 Residual
## Null Deviance:       1806 
## Residual Deviance: 93.44     AIC: 115.4
stepwise.mod <- step(null.model, scope = list(lower = null.model, upper = Full.mod), direction = "both", trace = 0)

full.mod_predictions <- predict(Full.mod, type = "response")
full.mod_predicted_classes <- ifelse(full.mod_predictions > 0.5, 1, 0)
full.mod_error_rate <- mean(full.mod_predicted_classes != train_selected.df$Y)
full.mod_error_rate_percent <- full.mod_error_rate * 100

stepwise_predictions <- predict(stepwise.mod, type = "response")
stepwise_predicted_classes <- ifelse(stepwise_predictions > 0.5, 1, 0)
stepwise_error_rate <- mean(stepwise_predicted_classes != train_selected.df$Y)
stepwise_error_rate_percent <- stepwise_error_rate * 100

print(paste("In-sample Prediction Error full.mod: ", round(full.mod_error_rate_percent, 2), "%", sep = ""))
## [1] "In-sample Prediction Error full.mod: 1.07%"
print(paste("In-sample Prediction Error Stepwise Model: ", round(stepwise_error_rate_percent, 2), "%", sep = ""))
## [1] "In-sample Prediction Error Stepwise Model: 1.3%"
print(summary(stepwise.mod))
## 
## Call:
## glm(formula = Y ~ V185 + V105 + V220 + V230 + V235 + V189 + V169 + 
##     V221 + V219 + V121, family = binomial, data = train.df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.5699     0.9517  -3.751 0.000176 ***
## V185          1.2121     0.6125   1.979 0.047837 *  
## V105         -2.1221     0.5235  -4.054 5.04e-05 ***
## V220          1.1322     0.7696   1.471 0.141245    
## V230         -1.5567     0.3972  -3.919 8.88e-05 ***
## V235         -0.8855     0.4731  -1.872 0.061273 .  
## V189         -1.9760     0.8926  -2.214 0.026844 *  
## V169          1.7562     0.6911   2.541 0.011050 *  
## V221         -2.5699     0.9014  -2.851 0.004359 ** 
## V219         -1.3135     0.4700  -2.795 0.005196 ** 
## V121         -1.0119     0.4293  -2.357 0.018429 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1806.21  on 1302  degrees of freedom
## Residual deviance:   93.44  on 1292  degrees of freedom
## AIC: 115.44
## 
## Number of Fisher Scoring iterations: 10

As indicated above the in sample prediction error rate of null model is 1.07% indicating that there is 1.07% of incorrect predictions in this model. The prediction error in the step wise model is 1.3% and it is slightly higher than the null model. The step wise model is simpler than the full model as it includes less predictor variables, however the cost of this is a slightly higher prediction accuracy. The full model with a lower prediction error may be over fitting (failing to fit additional data) the training data slightly compared to step wise model which despite having a slightly higher in sample error variable could generlise better to the new data because of low level of complexity.

G)

PE.fun <- function(train_data, test_data, formula) {
  mod <- glm(formula, data = train_data, family = binomial)
  
  predictions <- predict(mod, newdata = test_data, type = "response")
  predicted_classes <- ifelse(predictions > 0.5, 1, 0)
  
  error_rate <- mean(predicted_classes != test_data$Y)
  return(error_rate)
}

set.seed(123)
k <- 5
folds <- sample(1:k, nrow(train_selected.df), replace = TRUE)
cv_errors <- numeric(k)

for (i in 1:k) {
  train_data <- train_selected.df[folds != i, ]
  test_data <- train_selected.df[folds == i, ]
  
  cv_errors[i] <- PE.fun(train_data, test_data, formula)
}

mean_cv_error <- mean(cv_errors)
mean_cv_error_percent <- mean_cv_error * 100

print(paste("Average Prediction Error Rate: ", round(mean_cv_error_percent, 2), "%", sep=""))
## [1] "Average Prediction Error Rate: 1.93%"

The cross validation process provides an average prediction error of 1.93% for the reduced logistic regression model demonstrates strong generlise ability meaning it is accurate and simple for the variables.