set.seed(123)
# Define caffeine levels and corresponding number of students and A grades
caffeine_levels <- c(0, 50, 100, 150, 200) # Caffeine levels
n_students <- rep(300, 5) # Number of students per group
a_grades <- c(109, 155, 175, 158, 103) # Number of A grades
# Create the initial data frame
Caffeine2.df <- data.frame(caffeine = caffeine_levels, n = n_students, Agrade = a_grades)
# Ungroup the data
new_caffeine <- rep(Caffeine2.df$caffeine, Caffeine2.df$n)
length(new_caffeine) # Verify the length matches the total number of observations
## [1] 1500
tmp3 <- rep(c(t(cbind(Caffeine2.df$Agrade, Caffeine2.df$n - Caffeine2.df$Agrade))))
new_y <- rep(rep(c(1, 0), nrow(Caffeine2.df)), tmp3)
caffeine_long.df <- data.frame(caffeine = new_caffeine, y = new_y)
# Fit the initial model
initial_model <- glm(cbind(Agrade, n - Agrade) ~ caffeine + I(caffeine^2), family = binomial, data = Caffeine2.df)
print(coef(initial_model))
## (Intercept) caffeine I(caffeine^2)
## -5.805004e-01 1.839666e-02 -9.327319e-05
# Bootstrap resampling
set.seed(123)
n <- nrow(caffeine_long.df)
Nsim <- 1000
betasBS <- matrix(0, Nsim, 3)
for (i in 1:Nsim) {
# Draw a random sample (with replacement) from these data
id <- sample(1:n, n, replace = TRUE)
BS.df <- caffeine_long.df[id, ] # Bootstrap sample
mod_i <- glm(y ~ caffeine + I(caffeine^2), family = binomial, data = BS.df)
betasBS[i, ] <- coef(mod_i)
}
# Calculate xpeak values
xpeaks <- -betasBS[, 2] / (2 * betasBS[, 3])
# Summarize xpeak values
summary(xpeaks)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85.64 95.60 98.55 98.57 101.34 112.62
# Plot histogram of xpeak values
hist(xpeaks, main = "Bootstrap Estimates of xpeak", xlab = "xpeak")
The histogram above shows a 1000 bootstrap estimated of the x peak from
the caffeine data set, with the mean value of caffeine being associated
and the most optimal amount of caffeine associated with success is
between 95 to a 100mgs of caffeine. The bootstrap seems to have a normal
distribution as the bulk of estimations are are centered in the middle
and around the original estimate value and it is in a symmetric bell
shaped meaning that the original estimate is reliable measure of the
central tendency and the variability of the histogram seems to be a
realistic measure of uncertainty associated with the estimate. overall
the consistency and the normal distribution shape as well as the
adequate sample size of a 1000 suggests that the data is valid and the
original estimate is reliable.
alpha <- 0.05
lower_bound <- quantile(xpeaks, alpha / 2)
upper_bound <- quantile(xpeaks, 1 - (alpha / 2))
# Print the CI without using cat
print("Bootstrap 95% Confidence Interval for xpeak:")
## [1] "Bootstrap 95% Confidence Interval for xpeak:"
print(paste("Lower bound:", lower_bound))
## [1] "Lower bound: 90.2889135817063"
print(paste("Upper bound:", upper_bound))
## [1] "Upper bound: 107.243361938917"
The CIs above [90.28 , 107.24] are very similar to the CIs that were given by the simulation in the assignment 3 [90.93 , 106.70]. Both models have very similar confidence interval values with very insignificant differences suggesting that both models xpeaks values robust and reliable, and the minor differences in these values is due to the bootstrapping variability.
train.df <- read.table("train.txt")
names(train.df) <- c("D", paste("V", 1:256, sep=""))
train.df <- na.omit(train.df)
train.df <- train.df[train.df$D %in% c(3, 7), ]
for (i in 2:257) {
train.df <- train.df[train.df[, i] >= -1 & train.df[, i] <= 1, ]
}
par(mfrow = c(5, 5), mar = c(1, 1, 1, 1))
for (k in 1:25) {
z = matrix(unlist(train.df[k, -1]), 16, 16)
zz = z
for (j in 16:1) zz[, j] = z[, 17 - j]
image(zz, col = gray((32:0) / 32))
box()
text(0.1, 0.9, train.df$D[k], cex = 1.5)
}
The cells that would be good for differentiating 3s from 7s would be mainly cells in the bottom left, followed by bottom right and middle right as there are the main cells that 3 may stick out and there is no overlaps between 3 and 7 and the worst parts for differentiating 7 and 3 would be the top as both 3 and 7 share similar cells and there is a lot of overlapping.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
correlations <- train.df %>%
select(-D) %>%
summarise_all(~ cor(., train.df$D)) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "correlation")
correlations <- correlations %>%
mutate(abs_correlation = abs(correlation))
top_20_variables <- correlations %>%
arrange(desc(abs_correlation)) %>%
slice(1:20)
print(top_20_variables)
## # A tibble: 20 × 3
## variable correlation abs_correlation
## <chr> <dbl> <dbl>
## 1 V185 0.871 0.871
## 2 V170 0.833 0.833
## 3 V105 -0.821 0.821
## 4 V220 -0.814 0.814
## 5 V235 -0.805 0.805
## 6 V201 0.799 0.799
## 7 V229 -0.782 0.782
## 8 V120 -0.780 0.780
## 9 V219 -0.777 0.777
## 10 V230 -0.765 0.765
## 11 V104 -0.758 0.758
## 12 V189 -0.755 0.755
## 13 V205 -0.755 0.755
## 14 V169 0.750 0.750
## 15 V234 -0.743 0.743
## 16 V121 -0.739 0.739
## 17 V204 -0.738 0.738
## 18 V186 0.733 0.733
## 19 V173 -0.725 0.725
## 20 V221 -0.693 0.693
train.df$Y <- ifelse(train.df$D == 7, 1, 0)
correlations <- sapply(train.df[, paste("V", 1:256, sep="")], function(x) cor(x, train.df$D, use = "complete.obs"))
abs_correlations <- abs(correlations)
top_20_indices <- order(abs_correlations, decreasing = TRUE)[1:20]
top_20_variables <- names(abs_correlations)[top_20_indices]
selected_vars <- c("Y", top_20_variables)
train_selected.df <- train.df[, selected_vars]
formula <- as.formula(paste("Y ~", paste(top_20_variables, collapse = " + ")))
Full.mod <- glm(formula, data = train_selected.df, family = binomial)
fitted_logits <- predict(Full.mod, type = "link")
deviance_residuals <- residuals(Full.mod, type = "deviance")
train_selected.df$fitted_logits <- fitted_logits
train_selected.df$deviance_residuals <- deviance_residuals
head(train_selected.df)
## Y V185 V170 V105 V220 V235 V201 V229 V120 V219 V230 V104
## 1 1 1.000 0.590 -1.000 -1.000 -1.000 0.958 -1.000 -1 -1.000 -1.000 -1.000
## 2 0 -1.000 -1.000 1.000 -0.433 1.000 -1.000 1.000 1 -0.733 1.000 1.000
## 3 0 -1.000 -1.000 0.936 0.214 0.056 -1.000 0.957 1 0.997 0.293 0.183
## 4 1 -0.978 -0.727 -1.000 -1.000 -0.835 -0.265 -1.000 -1 -0.706 -1.000 -1.000
## 5 1 -0.114 0.999 -1.000 -1.000 -1.000 0.360 -1.000 -1 -1.000 -1.000 -1.000
## 6 1 0.808 -0.353 -1.000 -1.000 -1.000 0.095 -1.000 -1 -1.000 -0.394 -1.000
## V189 V205 V169 V234 V121 V204 V186 V173 V221 fitted_logits
## 1 -1.000 -1.000 0.963 -0.935 -1.000 -1 -0.079 -1.000 -1.000 9.250851
## 2 0.418 -0.077 -1.000 1.000 0.319 -1 -1.000 0.918 0.649 -14.077921
## 3 0.281 -0.537 -1.000 0.975 1.000 1 -1.000 0.740 -0.996 -10.735400
## 4 -1.000 -1.000 -1.000 0.959 -1.000 -1 0.425 -0.834 -1.000 2.836090
## 5 -1.000 -1.000 -0.522 0.385 -1.000 -1 0.995 -1.000 -1.000 5.742092
## 6 -1.000 -1.000 1.000 -1.000 -0.649 -1 -0.962 -1.000 -1.000 7.653596
## deviance_residuals
## 1 0.013858231
## 2 -0.001240318
## 3 -0.006597054
## 4 0.337635316
## 5 0.080036420
## 6 0.030796852
summary(Full.mod)
##
## Call:
## glm(formula = formula, family = binomial, data = train_selected.df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.7213 1.1139 -3.341 0.000835 ***
## V185 0.5552 0.9111 0.609 0.542296
## V170 0.5178 0.7462 0.694 0.487756
## V105 -2.2946 0.7941 -2.890 0.003857 **
## V220 1.3797 1.0624 1.299 0.194065
## V235 -0.6921 0.6548 -1.057 0.290544
## V201 -0.2385 0.7841 -0.304 0.761023
## V229 -0.5488 0.8635 -0.636 0.525081
## V120 -0.5567 0.9129 -0.610 0.541987
## V219 -0.6417 0.7202 -0.891 0.372942
## V230 -1.2361 0.6692 -1.847 0.064719 .
## V104 0.2161 0.7724 0.280 0.779626
## V189 -2.6311 1.6250 -1.619 0.105417
## V205 1.2512 1.2990 0.963 0.335416
## V169 2.0252 0.9125 2.219 0.026461 *
## V234 -0.5477 0.6404 -0.855 0.392423
## V121 -0.4854 0.8082 -0.601 0.548059
## V204 -1.3438 1.0016 -1.342 0.179706
## V186 0.4521 0.7666 0.590 0.555333
## V173 0.9718 0.8536 1.138 0.254918
## V221 -3.3018 1.3970 -2.364 0.018102 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1806.212 on 1302 degrees of freedom
## Residual deviance: 87.875 on 1282 degrees of freedom
## AIC: 129.87
##
## Number of Fisher Scoring iterations: 10
selected_variables <- top_20_variables
Full.mod <- glm(Y ~ ., data = train.df[, c(selected_variables, "Y")], family = binomial)
predicted_probabilities <- predict(Full.mod, type = "response")
predicted_classes <- ifelse(predicted_probabilities > 0.5, 7, 3)
in_sample_error <- mean(predicted_classes != train.df$D)
print(paste("In-sample prediction error:", in_sample_error))
## [1] "In-sample prediction error: 0.0107444359171144"
The prediction error suggests that 1.07% of the predictions made by the logistic regression model were incorrect when evaluated on the training data, this may call for cross validation in the future as there might be a potential over fitting as the sample error is low.
null.model = glm(Y~1, data=train.df, family=binomial)
#step(Full.mod, direction = "backward")
step(null.model, formula(Full.mod),direction="both", trace=0)
##
## Call: glm(formula = Y ~ V185 + V105 + V220 + V230 + V235 + V189 + V169 +
## V221 + V219 + V121, family = binomial, data = train.df)
##
## Coefficients:
## (Intercept) V185 V105 V220 V230 V235
## -3.5699 1.2121 -2.1221 1.1322 -1.5567 -0.8855
## V189 V169 V221 V219 V121
## -1.9760 1.7562 -2.5699 -1.3135 -1.0119
##
## Degrees of Freedom: 1302 Total (i.e. Null); 1292 Residual
## Null Deviance: 1806
## Residual Deviance: 93.44 AIC: 115.4
stepwise.mod <- step(null.model, scope = list(lower = null.model, upper = Full.mod), direction = "both", trace = 0)
full.mod_predictions <- predict(Full.mod, type = "response")
full.mod_predicted_classes <- ifelse(full.mod_predictions > 0.5, 1, 0)
full.mod_error_rate <- mean(full.mod_predicted_classes != train_selected.df$Y)
full.mod_error_rate_percent <- full.mod_error_rate * 100
stepwise_predictions <- predict(stepwise.mod, type = "response")
stepwise_predicted_classes <- ifelse(stepwise_predictions > 0.5, 1, 0)
stepwise_error_rate <- mean(stepwise_predicted_classes != train_selected.df$Y)
stepwise_error_rate_percent <- stepwise_error_rate * 100
print(paste("In-sample Prediction Error full.mod: ", round(full.mod_error_rate_percent, 2), "%", sep = ""))
## [1] "In-sample Prediction Error full.mod: 1.07%"
print(paste("In-sample Prediction Error Stepwise Model: ", round(stepwise_error_rate_percent, 2), "%", sep = ""))
## [1] "In-sample Prediction Error Stepwise Model: 1.3%"
print(summary(stepwise.mod))
##
## Call:
## glm(formula = Y ~ V185 + V105 + V220 + V230 + V235 + V189 + V169 +
## V221 + V219 + V121, family = binomial, data = train.df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5699 0.9517 -3.751 0.000176 ***
## V185 1.2121 0.6125 1.979 0.047837 *
## V105 -2.1221 0.5235 -4.054 5.04e-05 ***
## V220 1.1322 0.7696 1.471 0.141245
## V230 -1.5567 0.3972 -3.919 8.88e-05 ***
## V235 -0.8855 0.4731 -1.872 0.061273 .
## V189 -1.9760 0.8926 -2.214 0.026844 *
## V169 1.7562 0.6911 2.541 0.011050 *
## V221 -2.5699 0.9014 -2.851 0.004359 **
## V219 -1.3135 0.4700 -2.795 0.005196 **
## V121 -1.0119 0.4293 -2.357 0.018429 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1806.21 on 1302 degrees of freedom
## Residual deviance: 93.44 on 1292 degrees of freedom
## AIC: 115.44
##
## Number of Fisher Scoring iterations: 10
As indicated above the in sample prediction error rate of null model is 1.07% indicating that there is 1.07% of incorrect predictions in this model. The prediction error in the step wise model is 1.3% and it is slightly higher than the null model. The step wise model is simpler than the full model as it includes less predictor variables, however the cost of this is a slightly higher prediction accuracy. The full model with a lower prediction error may be over fitting (failing to fit additional data) the training data slightly compared to step wise model which despite having a slightly higher in sample error variable could generlise better to the new data because of low level of complexity.
PE.fun <- function(train_data, test_data, formula) {
mod <- glm(formula, data = train_data, family = binomial)
predictions <- predict(mod, newdata = test_data, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
error_rate <- mean(predicted_classes != test_data$Y)
return(error_rate)
}
set.seed(123)
k <- 5
folds <- sample(1:k, nrow(train_selected.df), replace = TRUE)
cv_errors <- numeric(k)
for (i in 1:k) {
train_data <- train_selected.df[folds != i, ]
test_data <- train_selected.df[folds == i, ]
cv_errors[i] <- PE.fun(train_data, test_data, formula)
}
mean_cv_error <- mean(cv_errors)
mean_cv_error_percent <- mean_cv_error * 100
print(paste("Average Prediction Error Rate: ", round(mean_cv_error_percent, 2), "%", sep=""))
## [1] "Average Prediction Error Rate: 1.93%"
The cross validation process provides an average prediction error of 1.93% for the reduced logistic regression model demonstrates strong generlise ability meaning it is accurate and simple for the variables.