lab 6

#5 a)

# Load necessary libraries
library(ISLR)  # Contains the Default dataset
library(MASS)  # For data manipulation

# Set random seed for reproducibility
set.seed(1)

# Split the data into training (50%) and validation (50%) sets
train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
validation_data <- Default[-train_indices, ]

# Fit logistic regression model on training data
logit_model <- glm(default ~ income + balance, data = train_data, family = binomial)

# Display model summary
summary(logit_model)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.178e-01 -19.333  < 2e-16 ***
## income       3.262e-05  7.024e-06   4.644 3.41e-06 ***
## balance      5.689e-03  3.158e-04  18.014  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1523.8  on 4999  degrees of freedom
## Residual deviance:  803.3  on 4997  degrees of freedom
## AIC: 809.3
## 
## Number of Fisher Scoring iterations: 8

Balance and Income are significant predictors of default, with very small p-values (< 0.001). Balance has a stronger effect than income, as its coefficient (5.689e-03) is much larger than that of income (3.262e-05). The model fits well, as indicated by the drop in deviance (from 1523.8 to 803.3). AIC (809.3) is relatively low, suggesting a good model fit.

# i. Split the data into training (50%) and validation (50%) sets
train_indices <- sample(nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
validation_data <- Default[-train_indices, ]

# ii. Fit logistic regression model on the training data
logit_model <- glm(default ~ income + balance, data = train_data, family = binomial)

# iii. Predict probabilities of default on the validation set
pred_probs <- predict(logit_model, newdata = validation_data, type = "response")

# Convert probabilities to class labels (threshold = 0.5)
pred_classes <- ifelse(pred_probs > 0.5, "Yes", "No")

# iv. Compute the validation set error (misclassification rate)
misclassification_rate <- mean(pred_classes != validation_data$default)

# Print test error rate
print(paste("Test Error Rate:", round(misclassification_rate, 4)))

## [1] "Test Error Rate: 0.0274"

The test error rate is 2.24%, meaning the model misclassified 2.24% of the validation set observations. This low error rate suggests that the model performs well in predicting default. Since both income and balance were significant predictors, the model captures key patterns in the data.

# Function to compute test error for a random split
compute_test_error <- function(seed_value) {
  set.seed(seed_value)  # Set different random seeds for each iteration
  
  # Split the data into training (50%) and validation (50%) sets
  train_indices <- sample(nrow(Default), nrow(Default) / 2)
  train_data <- Default[train_indices, ]
  validation_data <- Default[-train_indices, ]
  
  # Fit logistic regression model on the training data
  logit_model <- glm(default ~ income + balance, data = train_data, family = binomial)
  
  # Predict probabilities of default on the validation set
  pred_probs <- predict(logit_model, newdata = validation_data, type = "response")
  
  # Convert probabilities to class labels (threshold = 0.5)
  pred_classes <- ifelse(pred_probs > 0.5, "Yes", "No")
  
  # Compute the validation set error (misclassification rate)
  misclassification_rate <- mean(pred_classes != validation_data$default)
  
  return(misclassification_rate)
}

# Run the process 3 times with different random seeds
set.seed(1)  # Ensuring reproducibility of seed selection
seeds <- sample(100:999, 3)  # Generate 3 different random seeds

# Compute test errors for three different splits
test_errors <- sapply(seeds, compute_test_error)

# Print results
print(paste("Test Error Rates for three different splits:", 
            round(test_errors[1], 4), 
            round(test_errors[2], 4), 
            round(test_errors[3], 4)))

## [1] "Test Error Rates for three different splits: 0.0246 0.025 0.0278"

# Compute average test error
avg_test_error <- mean(test_errors)
print(paste("Average Test Error Rate:", round(avg_test_error, 4)))

## [1] "Average Test Error Rate: 0.0258"

The small variation in test error rates suggests that the model is stable and not overly sensitive to how the data is split.

# Split the data into training (50%) and validation (50%) sets
train_indices <- sample(nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
validation_data <- Default[-train_indices, ]

# Fit logistic regression model with income, balance, and student as predictors
logit_model_student <- glm(default ~ income + balance + student, data = train_data, family = binomial)

# Display model summary
summary(logit_model_student)

## 
## Call:
## glm(formula = default ~ income + balance + student, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.105e+01  7.204e-01 -15.345   <2e-16 ***
## income       5.834e-06  1.201e-05   0.486   0.6271    
## balance      5.711e-03  3.390e-04  16.845   <2e-16 ***
## studentYes  -5.994e-01  3.481e-01  -1.722   0.0851 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1340.46  on 4999  degrees of freedom
## Residual deviance:  730.63  on 4996  degrees of freedom
## AIC: 738.63
## 
## Number of Fisher Scoring iterations: 8

# Predict probabilities of default on the validation set
pred_probs_student <- predict(logit_model_student, newdata = validation_data, type = "response")

# Convert probabilities to class labels (threshold = 0.5)
pred_classes_student <- ifelse(pred_probs_student > 0.5, "Yes", "No")

# Compute the validation set error (misclassification rate)
misclassification_rate_student <- mean(pred_classes_student != validation_data$default)

# Print test error rate
print(paste("Test Error Rate (with student variable):", round(misclassification_rate_student, 4)))

## [1] "Test Error Rate (with student variable): 0.0288"

Student is statistically significant (p = 0.0111), but it reduces default probability. Income is not significant (p = 0.5233), meaning it doesn’t impact default when balance and student status are considered. Balance remains the strongest predictor of default. Test error increased after adding student, meaning it did not improve prediction accuracy. The simpler model (without student) is preferred.

#6 a

# Load necessary libraries
library(ISLR)   # For Default dataset

# Set seed for reproducibility
set.seed(1)

# Fit logistic regression model using income and balance
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)

# Display summary of the model
summary(logit_model)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# Extract standard errors of income and balance
std_errors <- summary(logit_model)$coefficients[, "Std. Error"]

# Print standard errors for income and balance
print(paste("Standard Error for Income:", round(std_errors["income"], 6)))

## [1] "Standard Error for Income: 5e-06"

print(paste("Standard Error for Balance:", round(std_errors["balance"], 6)))

## [1] "Standard Error for Balance: 0.000227"

Income SE: 5e-0.6 Balance SE: 0.000227 Balance has a much stronger effect on default than income. Both predictors are statistically significant.

# Define the boot.fn() function
boot.fn <- function(data, index) {
  # Fit logistic regression model using only the selected observations (bootstrap sample)
  model <- glm(default ~ income + balance, data = data, family = binomial, subset = index)
  
  # Return the coefficients for income and balance
  return(coef(model)[c("income", "balance")])
}

# Test the function with a random sample of indices
set.seed(1)
test_indices <- sample(1:nrow(Default), nrow(Default), replace = TRUE)
boot.fn(Default, test_indices)  # Should return estimated coefficients

##       income      balance 
## 0.0000281529 0.0058371168

These values are slightly different from the original logistic regression model because bootstrapping resamples data with replacement, leading to variation in coefficient estimates.

# Load necessary libraries
library(boot)  # For bootstrapping

# Define the boot.fn() function (if not already defined)
boot.fn <- function(data, index) {
  model <- glm(default ~ income + balance, data = data, family = binomial, subset = index)
  return(coef(model)[c("income", "balance")])  # Return coefficients for income and balance
}

# Set seed for reproducibility
set.seed(1)

# Apply bootstrapping with 1000 resamples
boot_results <- boot(Default, boot.fn, R = 1000)

# Print bootstrapped standard errors
print(boot_results)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original       bias     std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04

# Extract and print standard errors for income and balance
boot_se <- apply(boot_results$t, 2, sd)
print(paste("Bootstrapped SE for Income:", round(boot_se[1], 6)))

## [1] "Bootstrapped SE for Income: 5e-06"

print(paste("Bootstrapped SE for Balance:", round(boot_se[2], 6)))

## [1] "Bootstrapped SE for Balance: 0.00023"

Income SE (Bootstrapped): 5.0e-06 Balance SE (Bootstrapped): 0.00023 These values closely match the glm() standard errors, confirming the reliability of both methods. The small bootstrap bias suggests stable coefficient estimates.

Both methods produce similar standard errors, confirming the accuracy of the standard formula in glm(). Bootstrap estimates have slight variation due to resampling randomness, but the values remain close. Bootstrap bias is very small, suggesting stable coefficient estimates. he GLM standard errors are reliable for this dataset. Bootstrapping provides additional validation by estimating variability without relying on model assumptions. Since both methods yield almost identical results, either can be used confidently.

#7 a

library(ISLR)  # Contains Weekly dataset

# Fit logistic regression model
logit_model <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)

# Display model summary
summary(logit_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22122    0.06147   3.599 0.000319 ***
## Lag1        -0.03872    0.02622  -1.477 0.139672    
## Lag2         0.06025    0.02655   2.270 0.023232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1488.2  on 1086  degrees of freedom
## AIC: 1494.2
## 
## Number of Fisher Scoring iterations: 4

Intercept (0.22122, p < 0.001): The baseline log-odds of predicting Up when Lag1 and Lag2 are zero. Lag1 (-0.03872, p = 0.1397): Not statistically significant (p > 0.05), meaning it has little impact on predicting Direction. Lag2 (0.06025, p = 0.0232): Statistically significant (p < 0.05), suggesting it has a meaningful effect on predicting Direction.

# logistic regression model using all but the first observation
logit_model_exclude_first <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = binomial)

# Display model summary
summary(logit_model_exclude_first)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly[-1, 
##     ])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22324    0.06150   3.630 0.000283 ***
## Lag1        -0.03843    0.02622  -1.466 0.142683    
## Lag2         0.06085    0.02656   2.291 0.021971 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1494.6  on 1087  degrees of freedom
## Residual deviance: 1486.5  on 1085  degrees of freedom
## AIC: 1492.5
## 
## Number of Fisher Scoring iterations: 4

Intercept (0.22324, p < 0.001): Similar to the previous model, indicating a baseline log-odds of predicting Up. Lag1 (-0.03843, p = 0.1427): Still not significant (p > 0.05), suggesting it does not strongly influence Direction. Lag2 (0.06085, p = 0.0220): Statistically significant (p < 0.05), confirming it affects Direction. The results are very similar, meaning removing one observation did not significantly affect the model. The AIC slightly increased (1494.2 → 1492.5), but the model fit remains nearly the same. #c

# Compute the probability of "Up" for the first observation
first_obs_prob <- predict(logit_model_exclude_first, newdata = Weekly[1, ], type = "response")

# Convert probability into a class prediction
first_obs_pred <- ifelse(first_obs_prob > 0.5, "Up", "Down")

# Compare predicted vs actual value
actual_direction <- Weekly$Direction[1]

# Print results
print(paste("Predicted Direction:", first_obs_pred))

## [1] "Predicted Direction: Up"

print(paste("Actual Direction:", actual_direction))

## [1] "Actual Direction: Down"

# Check if correctly classified
correct_classification <- first_obs_pred == actual_direction
print(paste("Was the first observation correctly classified?", correct_classification))

## [1] "Was the first observation correctly classified? FALSE"

Predicted Direction: “Up” Actual Direction: “Down” Correct Classification? No, the prediction was incorrect. This means that our logistic regression model (excluding the first observation) misclassified the first observation.

# Get the number of observations
n <- nrow(Weekly)

# Initialize a vector to store errors (1 for misclassification, 0 for correct)
errors <- rep(0, n)

# Loop over each observation for LOOCV
for (i in 1:n) {
  # Fit logistic regression excluding the i-th observation
  logit_model_loo <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = binomial)
  
  # Predict the probability for the i-th observation
  prob_i <- predict(logit_model_loo, newdata = Weekly[i, ], type = "response")
  
  # Convert probability to class prediction
  pred_i <- ifelse(prob_i > 0.5, "Up", "Down")
  
  # Check if the prediction was correct
  if (pred_i != Weekly$Direction[i]) {
    errors[i] <- 1  # Misclassification
  }
}

# Compute LOOCV test error rate
loocv_error <- mean(errors)

# Print the test error rate
print(paste("LOOCV Test Error Rate:", round(loocv_error, 4)))

## [1] "LOOCV Test Error Rate: 0.45"

This means that the logistic regression model misclassified 45% of the observations when using Lag1 and Lag2 to predict Direction. A high error rate suggests that the model is not very effective at predicting market movement.

#e From part (d), we already computed the LOOCV test error as the mean of the misclassification errors recorded across all iterations. LOOCV Test Error Rate: 0.45 (or 45%) Interpretation: The model misclassifies 45% of the observations, indicating poor predictive performance.

High Error Rate (45%) A random guess (Up/Down) would have ~50% error, meaning the model is barely better than guessing. This suggests that Lag1 and Lag2 alone are weak predictors of Direction. Possible Improvements Include more predictors: Add Lag3, Lag4, Volume, or interaction terms to improve prediction accuracy. Use non-linear models: Logistic regression assumes a linear relationship, but market movements are often non-linear. Decision trees, SVMs, or neural networks may perform better. Apply feature engineering: Transform lag variables (e.g., squared terms, moving averages) to capture hidden patterns.

#9 a

# Load necessary library
library(ISLR2)

## 
## Attaching package: 'ISLR2'

## The following object is masked from 'package:MASS':
## 
##     Boston

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit

# View structure of the Boston dataset (optional)
# str(Boston)

# Estimate the population mean of medv
mu_hat <- mean(Boston$medv)

# Print the estimate
print(paste("Estimated population mean (μ̂) of medv:", round(mu_hat, 4)))

## [1] "Estimated population mean (μ̂) of medv: 22.5328"

This means that the average median home value in the Boston dataset is approximately $22,532.80 (since medv is in $1000s).

# Sample standard deviation of medv
s <- sd(Boston$medv)

# Sample size
n <- length(Boston$medv)

# Standard error of the mean
se_mu_hat <- s / sqrt(n)

# Print the result
print(paste("Standard Error of μ̂:", round(se_mu_hat, 4)))

## [1] "Standard Error of μ̂: 0.4089"

Sample mean of medv (22.5328) is expected to vary by about $409 (since medv is in $1000s) from sample to sample. The smaller the standard error, the more precise our estimate of the population mean.

library(boot)

# Define bootstrap function for mean of medv
boot.fn <- function(data, index) {
  return(mean(data[index]))
}

# Set seed for reproducibility
set.seed(1)

# Perform bootstrap with 1000 resamples
boot_result <- boot(Boston$medv, boot.fn, R = 1000)

# Print bootstrap result
print(boot_result)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 0.007650791   0.4106622

The bootstrap estimate is very close to the theoretical standard error. This confirms that the sample mean is stable, and the formula-based SE is accurate in this case. Since the bias is minimal, the sample mean is an unbiased and reliable estimator for the population mean of medv.

# Values from bootstrap
mu_hat <- mean(Boston$medv)
boot_se <- 0.4107

# Approximate 95% CI using 2*SE
ci_lower <- mu_hat - 2 * boot_se
ci_upper <- mu_hat + 2 * boot_se

print(paste("Bootstrap 95% CI: [", round(ci_lower, 4), ",", round(ci_upper, 4), "]"))

## [1] "Bootstrap 95% CI: [ 21.7114 , 23.3542 ]"

#The t.test() function computes a confidence interval for the mean assuming normality.
t.test(Boston$medv)

## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

Both intervals are very similar, suggesting: The sampling distribution of the mean is approximately normal. The bootstrap estimate confirms the accuracy of the formula-based interval. The sample mean (μ̂ = 22.53) is well-supported by both methods.

# Estimate the population median of medv
mu_med_hat <- median(Boston$medv)

# Print the result
print(paste("Estimated population median (μ̂med) of medv:", mu_med_hat))

## [1] "Estimated population median (μ̂med) of medv: 21.2"

The median home value in the Boston dataset is $21,200. Since the median < mean (22.53), this confirms the distribution is right-skewed, with a few high-value homes increasing the mean.

# Define bootstrap function to return the median of medv
boot.fn.median <- function(data, index) {
  return(median(data[index]))
}

# Set seed for reproducibility
set.seed(1)

# Perform bootstrap with 1000 resamples
boot_median_result <- boot(Boston$medv, statistic = boot.fn.median, R = 1000)

# View results
print(boot_median_result)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn.median, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 0.02295   0.3778075

The standard error of the median is approximately 0.378, meaning the sample median tends to vary by about $378 across different samples (since medv is in $1000s). This value is slightly smaller than the standard error of the mean (≈ 0.41), which is typical for right-skewed data like housing prices. The bootstrap bias is small, so the median is a reliable and stable estimate.

# Estimate the 10th percentile of medv
mu_hat_0.1 <- quantile(Boston$medv, probs = 0.1)

# Print the result
print(paste("Estimated 10th percentile (μ̂₀.₁) of medv:", mu_hat_0.1))

## [1] "Estimated 10th percentile (μ̂₀.₁) of medv: 12.75"

10% of Boston census tracts have a median home value below $12,750. This gives insight into the lower end of the housing market in the area.

# Define bootstrap function to compute the 10th percentile
boot.fn.p10 <- function(data, index) {
  return(quantile(data[index], probs = 0.1))
}

# Set seed for reproducibility
set.seed(1)

# Perform bootstrap with 1000 resamples
boot_p10_result <- boot(Boston$medv, statistic = boot.fn.p10, R = 1000)

# Print the result
print(boot_p10_result)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn.p10, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0339   0.4767526

The standard error of 0.4768 tells us that the 10th percentile estimate of medv varies by approximately $477 across samples. The small bias (~0.03) indicates that the estimate is stable and reliable. As expected, the standard error for a lower percentile is larger than for the mean or median, since extreme values are typically more variable.

lab 6

2025-03-20