hw2-mariahh3

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#EXCERCISE 1
 library(MASS)

#(a) Suppose we would like to understand the size of a cat’s heart based on the body weight of a cat. Fit
#a simple linear model in R that accomplishes this task. Store the results in a variable called cat_model.
#Output the result of calling summary() on cat_model.

cats_model = lm(Bwt ~ Hwt , data = cats)
cats_model

## 
## Call:
## lm(formula = Bwt ~ Hwt, data = cats)
## 
## Coefficients:
## (Intercept)          Hwt  
##      1.0196       0.1603

summary(cats_model)

## 
## Call:
## lm(formula = Bwt ~ Hwt, data = cats)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58283 -0.22140 -0.00879  0.20825  0.91717 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.019637   0.108428   9.404   <2e-16 ***
## Hwt         0.160290   0.009944  16.119   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2895 on 142 degrees of freedom
## Multiple R-squared:  0.6466, Adjusted R-squared:  0.6441 
## F-statistic: 259.8 on 1 and 142 DF,  p-value: < 2.2e-16

#(b) Output only the estimated regression coefficients. Interpret βˆ0 and β1 in the context of the problem. Be
#aware that only one of those is an estimate.

summary(cats_model)

## 
## Call:
## lm(formula = Bwt ~ Hwt, data = cats)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58283 -0.22140 -0.00879  0.20825  0.91717 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.019637   0.108428   9.404   <2e-16 ***
## Hwt         0.160290   0.009944  16.119   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2895 on 142 degrees of freedom
## Multiple R-squared:  0.6466, Adjusted R-squared:  0.6441 
## F-statistic: 259.8 on 1 and 142 DF,  p-value: < 2.2e-16

coef(cats_model)

## (Intercept)         Hwt 
##   1.0196367   0.1602902

#(c) Use your model to predict the heart weight of a cat that weights 3.1 kg. Do you feel confident in this
#prediction? Briefly explain. 

print("I am confident in this predict becuase the heartweight of a couple cats in the datat set with a bodyweight of 6kg -7k is around 2-2.3kg which is close to half the predicted value")

## [1] "I am confident in this predict becuase the heartweight of a couple cats in the datat set with a bodyweight of 6kg -7k is around 2-2.3kg which is close to half the predicted value"

#points estimates 

new_hwt = data.frame(Hwt = c(3.1))
predict(cats_model, newdata = new_hwt)

##        1 
## 1.516536

predict(cats_model ,data.frame(Hwt = 3.1))

##        1 
## 1.516536

#(d) Use your model to predict the heart weight of a cat that weights 1.5 kg. Do you feel confident in this
#prediction? Briefly explain. 

print("I do not confidence because 1.5 kg cats may represent a different population ( baby kittens , mal nourished cats wtc) where the body weight-heart weight relationship might differ from the adult cats in our sample.")

## [1] "I do not confidence because 1.5 kg cats may represent a different population ( baby kittens , mal nourished cats wtc) where the body weight-heart weight relationship might differ from the adult cats in our sample."

predict(cats_model ,data.frame(Hwt = 1.5))

##        1 
## 1.260072

#(e) Create a scatterplot of the data and add the fitted regression line. Make sure your plot is well labeled
#and is somewhat visually appealing.
 
 
 plot(Bwt ~ Hwt, data = cats,
      xlab = "Bodyweight (kg) ",
      ylab = "Heart Weight (kg) ",
      main = "Bodyweight vs Heart weight",
      pch  = 18,
      cex  = 1,
      col  = "orange")
 abline(cats_model, lwd = 5, col = "yellow")

#(f) Report the value of R2 for the model. Do so directly. Do not simply copy and paste the value from the


summary(cats_model)$r.squared

## [1] 0.6466209

# EXCERCISE 2

 
# (a) Use R to simulate n = 25 observations from the above model. For the remainder of this exercise, use
#the following “known” values of x.

 birthday = 18760613
 set.seed(birthday)
 x = runif(n = 25, 0, 10)
 
 sim_slr = function(x, beta_0 = 10, beta_1 = 5, sigma = 1) {
   n = length(x)
   epsilon = rnorm(n, mean = 0, sd = sigma)
   y = beta_0 + beta_1 * x + epsilon
   
   data.frame(predictor = x, response = y)
 }
 sim_data = sim_slr(x = x, beta_0 = 5, beta_1 = -3, sigma = sqrt(10.24))
 
 
# (b) Fit a model to your simulated data. Report the estimated coefficients. Are they close to what you would
# expect? Briefly explain. 

  sim_model = lm(sim_data$response ~ sim_data$predictor, data = sim_data)
   Estimated_coefs = coef(sim_model)
   Estimated_coefs

##        (Intercept) sim_data$predictor 
##           4.954205          -2.915525

print("Yes, these estimates are close to the expected true values (β₀=5, β₁=-3). The small differences are normal due to random sampling variation.")

## [1] "Yes, these estimates are close to the expected true values (β₀=5, β₁=-3). The small differences are normal due to random sampling variation."

# (c) Plot the data you simulated in part (a). Add the regression line from part (b) as well as the line for


  plot(response ~ predictor,
       data = sim_data,
       xlab = "Predictor",
       ylab = "Response",
       main = "Regression Data",
       pch  = 10,
       cex  = 1,
       col  = "orange")
  abline(sim_model, lwd = 2, lty = 1, col = "gold")
  abline(5, -3, lwd = 2, lty = 2, col = "red")
  legend("topright", c("Estimate", "Actual"), lty = c(1, 2), lwd = 2,
         col = c("gold", "darkred"))

#(d) Use R to repeat the process of simulating n = 25 observations from the above model 1000 times. Each#time fit a SLR model to the data and store the value of βˆ1 in a variable called beta_hat_1. Some hints
  
  beta_hat_1 = rep(0,1000)
  
  for( i in 1:length(beta_hat_1)){
    sim_data = sim_slr(x = x, beta_0 = 5, beta_1 = -3, sigma = sqrt(10.24))
    sim_fit = lm(response ~ predictor, data = sim_data)
    beta_hat_1[i] =coef(sim_fit)[2] 
  } 
  
#(e) Report the mean and standard deviation of beta_hat_1. Do either of these look familiar? these values look very close and familer to the coefficents of the simulated data
    
    mean(beta_hat_1)

## [1] -2.991164

    sd(beta_hat_1)

## [1] 0.2511701

print("Yes, these values look familiar! The mean of beta_hat_1 is very close to the true β₁ = -3, confirming that β̂₁ is an unbiased estimator. The standard deviations demonstrates that the sampling distribution of β̂₁ is centered at the true parameter with predictable variability.")

## [1] "Yes, these values look familiar! The mean of beta_hat_1 is very close to the true β₁ = -3, confirming that β̂₁ is an unbiased estimator. The standard deviations demonstrates that the sampling distribution of β̂₁ is centered at the true parameter with predictable variability."

#(f) Plot a histogram of beta_hat_1. Comment on the shape of this histogram. (What distribution and
# parameter values would you estimate that it looks like? ) 

print ( "The shape of the histogram follows normal distribution displaying that the beta_hat_1 values are centered around -3, the actual mean.")

## [1] "The shape of the histogram follows normal distribution displaying that the beta_hat_1 values are centered around -3, the actual mean."

   hist(beta_hat_1 , 
        xlab="beta_hat_1 from simulated data",
        main= "Histogram of beta_hat_1",
        col="orange",
        border = "red")

library(MASS)
#Exercise 3 (Using lm for Inference)
#(a) Fit the following simple linear regression model in R. Use heart weight as the response and body weight
#as the predictor.
   
    cats_fit_reg_model = lm(Hwt ~ Bwt , data = cats)
  summ = summary(cats_fit_reg_model)
    slope_coeff <- summ$coefficients["Bwt",]
    slope_coeff

##     Estimate   Std. Error      t value     Pr(>|t|) 
## 4.034063e+00 2.502615e-01 1.611939e+01 6.969045e-34

#(b) Calculate a 95% confidence interval for β1. Give an interpretation of the interval in the context of the
#problem.
   
    confint( cats_fit_reg_model,level = 0.95)[2,]

##    2.5 %   97.5 % 
## 3.539343 4.528782

  print(" We are 95% confident that for each additional kilogram of body weight, the heart weight of cats increases between 3.539343  and 4.528782")

## [1] " We are 95% confident that for each additional kilogram of body weight, the heart weight of cats increases between 3.539343  and 4.528782"

#(c) Calculate a 95% confidence interval for β0. Give an interpretation of the interval in the context of the
#problem.
     confint(cats_fit_reg_model, level = 0.95)[1,]

##     2.5 %    97.5 % 
## -1.725163  1.011838

   print("We are 95% confident that the predicted heart weight for a hypothetical cat with 0 kg body weight would be between -1.725163  and  1.011838 grams. However, this interpretation is not practically meaningful since cats cannot have zero body weight - this represents extrapolation far beyond our data range.")

## [1] "We are 95% confident that the predicted heart weight for a hypothetical cat with 0 kg body weight would be between -1.725163  and  1.011838 grams. However, this interpretation is not practically meaningful since cats cannot have zero body weight - this represents extrapolation far beyond our data range."

#(d) Use a 95% confidence interval to estimate the mean heart weight for body weights of 2.1 and 2.8
#kilograms. Which of the two intervals is wider? Why?
  
print("the body wight 2.8kg is a little wider because the is comparatively closer to mean body weight. and we know that as you move away from mean value of response your confidence interval increases ")

## [1] "the body wight 2.8kg is a little wider because the is comparatively closer to mean body weight. and we know that as you move away from mean value of response your confidence interval increases "

    mean(cats$Hwt)

## [1] 10.63056

    mean(cats$Bwt)

## [1] 2.723611

  new_weights <- data.frame(Bwt = c(2.1, 2.8))
  predict(cats_fit_reg_model,newdata=new_weights, interval = c("confidence"), level=0.95)

##         fit       lwr       upr
## 1  8.114869  7.724455  8.505284
## 2 10.938713 10.696491 11.180935

#(e) Use a 95% prediction interval to predict the heart weight for body weights of 2.8 and 4.2 kilograms.
   new_weights <- data.frame(Bwt = c(2.8, 4.2))
  predict(cats_fit_reg_model,newdata=new_weights, interval = c("confidence"), level=0.95)

##        fit      lwr      upr
## 1 10.93871 10.69649 11.18093
## 2 16.58640 15.81781 17.35499

#(f) Create a scatterplot of the data. Add the regression line, 95% confidence bands, and 95% prediction
#bands. If this is done correctly, you should notice that the vast majority of the data points are within the
#prediction bands, but very few points are within the confidence bands.
  
  grid <- seq(min(cats$Bwt), max(cats$Bwt), by = 0.01)
  cat_ci_band <- predict(cats_fit_reg_model, newdata = data.frame(Bwt = grid), interval = "confidence", level = 0.95)
  cat_pi_band <- predict(cats_fit_reg_model, newdata = data.frame(Bwt = grid), interval = "prediction", level = 0.95)
  
  
  plot( Hwt ~ Bwt , data =cats , 
      xlab = " Bodyweight (kg)" ,
      ylab = " Heartweight (kg)" ,
      main = " Body weight vs Heart Weight",
      pch  = 20,
      cex  = 1,
      col  = "grey",
      ylim = c(min(cat_pi_band), max(cat_pi_band)))
abline(cats_fit_reg_model, lwd = 3, col = "darkorange")

lines(grid, cat_ci_band[,"lwr"], col = "dodgerblue", lwd = 3, lty = 2)
lines(grid, cat_ci_band[,"upr"], col = "dodgerblue", lwd = 3, lty = 2)
lines(grid, cat_pi_band[,"lwr"], col = "dodgerblue", lwd = 3, lty = 3)
lines(grid, cat_pi_band[,"upr"], col = "dodgerblue", lwd = 3, lty = 3)
points(mean(cats$Bwt), mean(cats$Hwt), pch = "+", cex = 3)

#(g) Since the confidence interval in part (f) did not actually contain most of the data, in your own words,
#what is the point of the confidence interval? (i.e. what does it represent)


print("Confidence intervals tell us about our certainty in predicting what the typical or expected heart weight should be for cats of a particular size, not what any single cat's heart weight will be. The narrow bands show we're quite accurate at estimating these population averages. Individual cats naturally deviate from this average due to genetic differences, health status, and other factors, which is why most data points lie outside these confidence bands. Prediction intervals serve a different purpose - they account for this individual variation when forecasting a specific cat's heart weight.")

## [1] "Confidence intervals tell us about our certainty in predicting what the typical or expected heart weight should be for cats of a particular size, not what any single cat's heart weight will be. The narrow bands show we're quite accurate at estimating these population averages. Individual cats naturally deviate from this average due to genetic differences, health status, and other factors, which is why most data points lie outside these confidence bands. Prediction intervals serve a different purpose - they account for this individual variation when forecasting a specific cat's heart weight."

#(h) Perform a t test to test the following null hypothesis by extracting values from summary(cat_model)
#(do not type the values directly):

X <- summary(cats_fit_reg_model)$coefficients["Bwt", "Estimate"]  # Observed slope
SE <- summary(cats_fit_reg_model)$coefficients["Bwt", "Std. Error"]  # Standard error
null_value <- 3.5
TS <- (X - null_value) / SE  # Test statistic
n <- nrow(cats)  # Sample size
df <- n - 2  # Degrees of freedom
p_val <- 2 * pt(abs(TS), df, lower.tail = FALSE)  # Two-tailed p-value
alpha <- 0.05

decision_h <- ifelse(p_val < alpha, "Reject H0", "Fail to reject H0")
print(paste("Statistical decision at α = 0.05:", decision_h))

## [1] "Statistical decision at α = 0.05: Reject H0"

if (p_val < alpha) {
  print("Conclusion: There is enough evidence to conclude that β1 is significantly different from 3.5.")
} else {
  print("Conclusion: There is not enough evidence to conclude that β1 is significantly different from 3.5.")
}

## [1] "Conclusion: There is enough evidence to conclude that β1 is significantly different from 3.5."

# EXCERCISE 4 

#(a) Fit the following simple linear regression model in R. Use the ozone measurement as the response and
#wind speed as the predictor.
#Yi = β0 + β1xi + ϵi
#Store the results in a variable called ozone_wind_model. Use a t test to test the significance of the regression.
# Part (a): Ozone vs Wind Speed
library(mlbench)
install.packages("mlbench")

## Warning: package 'mlbench' is in use and will not be installed

data(Ozone, package = "mlbench")



Ozone = Ozone[, c(4, 6, 7, 8)]
colnames(Ozone) = c("ozone", "wind", "humidity", "temp")
Ozone = Ozone[complete.cases(Ozone), ]

# Fit the model
 ozone_wind_model <- lm(ozone ~ wind, data = Ozone)

wind_summary <- summary(ozone_wind_model)

# store values  for hypothesis test

wind_coeffs <- wind_summary$coefficients
slope_info <- wind_coeffs["wind", ]
t_stat_wind <- slope_info["t value"]
p_value_wind <- slope_info["Pr(>|t|)"]


print("Null Hypothesis (H0): β1 = 0 , There is no linear relationship between wind speed and ozone levels
Alternative Hypothesis (H1): β1 ≠ 0 
There is a linear relationship between wind speed and ozone levels")

## [1] "Null Hypothesis (H0): β1 = 0 , There is no linear relationship between wind speed and ozone levels\nAlternative Hypothesis (H1): β1 ≠ 0 \nThere is a linear relationship between wind speed and ozone levels"

print(paste("Test statistic (t-value):", round(t_stat_wind, 4)))

## [1] "Test statistic (t-value): -0.219"

print(paste("P-value:", format(p_value_wind, scientific = TRUE, digits = 4)))

## [1] "P-value: 8.268e-01"

# Statistical decision at α = 0.01

alpha <- 0.01
decision_wind <- ifelse(p_value_wind < alpha, "Reject H0", "Fail to reject H0")
print(paste("Statistical decision at α = 0.01:", decision_wind))

## [1] "Statistical decision at α = 0.01: Fail to reject H0"

# Conclusion in context


if (p_value_wind < alpha) {
  conclusion_wind <- "There is enough evidence at the α = 0.01 significance level to conclude that there is a significant linear relationship between wind speed and ozone levels."
} else {
  conclusion_wind <- "There is not enough evidence at the α = 0.01 significance level to conclude that there is a significant linear relationship between wind speed and ozone levels."
}
print(paste("Conclusion:", conclusion_wind))

## [1] "Conclusion: There is not enough evidence at the α = 0.01 significance level to conclude that there is a significant linear relationship between wind speed and ozone levels."

#When reporting these, you should explicitly state them in your document, not assume that a reader will find
#and interpret them from a large block of R output.
#(b) Fit the following simple linear regression model in R. Use the ozone measurement as the response and
#**temperature as the predictor.

ozone_temp_model <- lm(ozone ~ temp, data = Ozone)


# Extract test statistics

temp_summary <- summary(ozone_temp_model)
t_stat <- temp_summary$coefficients["temp", "t value"]
p_value <- temp_summary$coefficients["temp", "Pr(>|t|)"]

# Report results

print("Part (b): Ozone vs Temperature Regression
Null Hypothesis (H0): β1 = 0 (no relationship between temperature and ozone
Alternative Hypothesis (H1): β1 ≠ 0 (there is a relationship)")

## [1] "Part (b): Ozone vs Temperature Regression\nNull Hypothesis (H0): β1 = 0 (no relationship between temperature and ozone\nAlternative Hypothesis (H1): β1 ≠ 0 (there is a relationship)"

print(paste("Test statistic:", round(t_stat, 4)))

## [1] "Test statistic: 22.849"

print(paste("P-value:", format(p_value, scientific = TRUE)))

## [1] "P-value: 8.153764e-71"

alpha <- 0.01
decision <- ifelse(p_value < alpha, "Reject H0", "Fail to reject H0")
print(paste("Decision at α = 0.01:", decision))

## [1] "Decision at α = 0.01: Reject H0"

if (p_value < alpha) {
  print("Conclusion: Significant linear relationship between temperature and ozone at α = 0.01")
} else {
  print("Conclusion: No significant linear relationship between temperature and ozone at α = 0.01")
}

## [1] "Conclusion: Significant linear relationship between temperature and ozone at α = 0.01"

# Exercise 5 (Simulating Confidence Intervals)
#a)

birthday = 18760613
set.seed(birthday)
n = 25
beta_0 = 5
beta_1 = 2
sigma_s2 = 9
sigma = sqrt(sigma_s2)
x = seq(0, 2.5, length = n)

beta_hat_0 <- rep(0,2500)
beta_hat_1 <- rep(0,2500)
beta_hat_1_se <- rep(0,2500)

Sxx <- sum((x - mean(x)) ^ 2)
for(i in 1:2500) {
  eps <- rnorm(n,mean=0,sd=sigma)
  y <- beta_0 + beta_1 * x + eps
  
  model <- lm(y~x)
  
  beta_hat_0[i] <- coef(model)[1]
  beta_hat_1[i] <- coef(model)[2]
  beta_hat_1_se[i] <- summary(model)$coefficient[2,"Std. Error"]
  
  mean(beta_hat_1)
  mean(beta_hat_1_se[i])
}

#(b) For each of the ˆ β1 that you simulated, calculate a 95% confidence interval. Store the lower limits in a
#vector lower_95 and the upper limits in a vector upper_95. Some hints:

crit_95 <- qt(0.950, df = length(resid(model)) - 2)

lower_95 <- c(beta_hat_1 - crit_95 * beta_hat_1_se)
upper_95 <- c(beta_hat_1 + crit_95 * beta_hat_1_se)

result <- data.frame(beta1hat = beta_hat_1, CI_low = lower_95, CI_High = upper_95)



#(c) What proportion of these intervals contains the true value of β1?

interval_95_contains_beta_1 <- subset(result, CI_low < beta_1 & beta_1 < CI_High)
proportion_95 <- nrow(interval_95_contains_beta_1 ) / nrow(result)
proportion_95

## [1] 0.8952

#(d) Based on these intervals, what proportion of the simulations would reject the test H0 : β1 = 0 vs H1 : β1 ̸= 0 at α = 0.05?

interval_95_contains_1 <- subset(result, CI_low < 0 & 0 < CI_High)
proportion_for_beta_1_test_95 <- nrow(interval_95_contains_1 ) / nrow(result)
1 - proportion_for_beta_1_test_95

## [1] 0.7756

#(e) For each of the ˆ β1 that you simulated, calculate a 99% confidence interval. Store the lower limits in a
#vector lower_99 and the upper limits in a vector upper_99.

crit_99 <- qt(0.995, df = length(resid(model)) - 2)
crit_99

## [1] 2.807336

lower_99 <- c(beta_hat_1 - crit_99 * beta_hat_1_se)
upper_99 <- c(beta_hat_1 + crit_99 * beta_hat_1_se)

#(f) What proportion of these intervals contains the true value of β1?

interval_99_contains_beta_1 <- subset(result, lower_99 < beta_1 & beta_1 < upper_99)
proportion_99 <- nrow(interval_99_contains_beta_1 ) / nrow(result)
proportion_99

## [1] 0.994

#(g) Based on these intervals, what proportion of the simulations would reject the test H0 : β1 = 0 vs H1 : β1 ̸= 0 at α = 0.01?

interval_99_contains_beta_1 <- subset(result, lower_99 < 0 & 0 < upper_99)
proportion_for_beta_1_test_99 <- nrow(interval_99_contains_beta_1 ) / nrow(result)
1 - proportion_for_beta_1_test_99

## [1] 0.3952

#Exercise 6 (Recreating lm())


#(a) Create a function named my_lm1 using SXX and SXY as defined in the textbook. You will need to
#calculate β1 first, then use that to find β0. Once your function is created, use the simulated data to estimate
#β0 and β1 with your function.
#Recommended: Check your answer with the results from lm(), but you don’t need to show/report that part

set.seed(2025)
x <- 1:10
y <- 2 + 3 * x + rnorm(10, 0, 1)

my_lm1 = function(x, y){
  xbar = mean(x)
  ybar = mean(y)
  
  sxx = sum(x^2) - sum(x)^2 / n
  sxy = sum(x * y) - (sum(x) * sum(y)) / n  
  
  
  beta_1 = sxy / sxx
  beta_0 =ybar - beta_1 * xbar
  coefficients <- c(beta_0, beta_1)
  names(coefficients) <- c("(Intercept)", "x")
  return(coefficients)
  
  } 

my_lm1(x, y)

## (Intercept)           x 
##   0.8344874   3.2770836

#(b) Repeat part(a), and create a function named my_lm2. This time, instead of using SXX and SXY, use
#matrix multiplication to find β0 and β1. First, create the appropriate model matrix, and call it X. After
#performing the matrix multiplication, use as.vector() to convert the output before naming. Check to make
#sure your answers are still the same.


my_lm2 <- function(x, y) {
  X <- cbind(1, x)
  
  # Calculate coefficients using matrix formula: β = (X'X)^(-1)X'y
  # where X' is the transpose of X
  
  # Method: β = (X'X)^(-1)X'y
  XtX <- t(X) %*% X           # transpose
  XtX_inv <- solve(XtX)       # Inverse 
  Xty <- t(X) %*% y           # X transpose times y
  beta <- XtX_inv %*% Xty     # Final calculation
  
  # Convert matrix to vector and add names
  coefficients <- as.vector(beta)
  names(coefficients) <- c("(Intercept)", "x")
  return(coefficients)
}
 my_lm2(x, y)

## (Intercept)           x 
##    2.670360    2.943289

hw2-mariahh3

MariahHolloway

2025-09-21

R Markdown