library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(ggplot2)
library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(cpus)
str(cpus)
## 'data.frame': 209 obs. of 9 variables:
## $ name : Factor w/ 209 levels "ADVISOR 32/60",..: 1 3 2 4 5 6 8 9 10 7 ...
## $ syct : int 125 29 29 29 29 26 23 23 23 23 ...
## $ mmin : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ mmax : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ cach : int 256 32 32 32 32 64 64 64 64 128 ...
## $ chmin : int 16 8 8 8 8 8 16 16 16 32 ...
## $ chmax : int 128 32 32 32 16 32 32 32 32 64 ...
## $ perf : int 198 269 220 172 132 318 367 489 636 1144 ...
## $ estperf: int 199 253 253 253 132 290 381 381 749 1238 ...
head(cpus)
a) Investigate the relationship between variables in the dataset, both numerically and visually. Comment on the relationships you observe.
# Numerical Summary
summary(cpus)
## name syct mmin mmax
## ADVISOR 32/60 : 1 Min. : 17.0 Min. : 64 Min. : 64
## AMDAHL 470/7A : 1 1st Qu.: 50.0 1st Qu.: 768 1st Qu.: 4000
## AMDAHL 470V/7 : 1 Median : 110.0 Median : 2000 Median : 8000
## AMDAHL 470V/7B: 1 Mean : 203.8 Mean : 2868 Mean :11796
## AMDAHL 470V/7C: 1 3rd Qu.: 225.0 3rd Qu.: 4000 3rd Qu.:16000
## AMDAHL 470V/8 : 1 Max. :1500.0 Max. :32000 Max. :64000
## (Other) :203
## cach chmin chmax perf
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 6.0
## 1st Qu.: 0.00 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 27.0
## Median : 8.00 Median : 2.000 Median : 8.00 Median : 50.0
## Mean : 25.21 Mean : 4.699 Mean : 18.27 Mean : 105.6
## 3rd Qu.: 32.00 3rd Qu.: 6.000 3rd Qu.: 24.00 3rd Qu.: 113.0
## Max. :256.00 Max. :52.000 Max. :176.00 Max. :1150.0
##
## estperf
## Min. : 15.00
## 1st Qu.: 28.00
## Median : 45.00
## Mean : 99.33
## 3rd Qu.: 101.00
## Max. :1238.00
##
# Correlation Heatmap
library(corrplot)
## corrplot 0.95 loaded
corrplot(cor(cpus[,-1]), method = "circle")
# Pairwise Scatterplots
pairs(cpus[, -1], main = "Pairwise Scatterplots", pch = 19, col = "blue")
# Distribution of `estperf`
ggplot(cpus, aes(x = estperf)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
theme_minimal()
c) Create a residual plot using this model and comment on it’s features. Do any of the assumptions of linear regression seem to be violated? What might be done to adjust our model? Adjust the model if necessary by considering various residual plots, updating the model, and assessing residual plots using the updated model.
# Residual vs Fitted Plot
plot(model1$fitted.values, residuals(model1),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted Values")
abline(h = 0, col = "red")
# Q-Q Plot for Normality Check
qqnorm(residuals(model1))
qqline(residuals(model1), col = "red")
Residuals vs. Fitted Values Plot: - The linear model may not be capturing all patterns in the data. - Variance of residuals may change with fitted values. Q-Q Plot: - Presence of outliers in the data.
model adjustments2
model2 <- lm(log(estperf) ~ syct + I(syct^2) + mmin + I(mmin^2) + mmax + cach + chmin + chmax, data = cpus)
# Summary of updated model
summary(model2)
##
## Call:
## lm(formula = log(estperf) ~ syct + I(syct^2) + mmin + I(mmin^2) +
## mmax + cach + chmin + chmax, data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7606 -0.1022 0.0266 0.1127 0.4133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.242e+00 3.640e-02 89.042 < 2e-16 ***
## syct -7.532e-04 1.608e-04 -4.686 5.15e-06 ***
## I(syct^2) 3.664e-07 1.311e-07 2.794 0.00571 **
## mmin 8.077e-05 9.978e-06 8.094 5.52e-14 ***
## I(mmin^2) -2.984e-09 3.513e-10 -8.493 4.56e-15 ***
## mmax 4.923e-05 1.984e-06 24.811 < 2e-16 ***
## cach 6.391e-03 4.243e-04 15.062 < 2e-16 ***
## chmin 5.973e-03 2.587e-03 2.309 0.02199 *
## chmax -8.919e-04 6.652e-04 -1.341 0.18151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1802 on 200 degrees of freedom
## Multiple R-squared: 0.9644, Adjusted R-squared: 0.963
## F-statistic: 678 on 8 and 200 DF, p-value: < 2.2e-16
# Residuals vs. Fitted Plot for Updated Model
plot(model2$fitted.values, residuals(model2),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted Values (Model 2)")
abline(h = 0, col = "red")
# Q-Q Plot for Normality Check for Updated Model
qqnorm(residuals(model2))
qqline(residuals(model2), col = "red")
# Check for influential points in updated model using Cook's Distance
plot(model2, which = 4) # Influence plot to check Cook's distance
model adjustments3
model3 <- lm(log(estperf) ~ syct + mmin + mmax + cach + chmin + chmax, data = cpus, weights = 1/fitted(model2))
# Summary of weighted least squares model
summary(model3)
##
## Call:
## lm(formula = log(estperf) ~ syct + mmin + mmax + cach + chmin +
## chmax, data = cpus, weights = 1/fitted(model2))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.46222 -0.05678 0.01625 0.06911 0.22142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.219e+00 2.760e-02 116.637 < 2e-16 ***
## syct -3.704e-04 5.484e-05 -6.754 1.5e-10 ***
## mmin 1.642e-05 7.179e-06 2.287 0.0232 *
## mmax 5.674e-05 2.402e-06 23.617 < 2e-16 ***
## cach 7.255e-03 5.319e-04 13.640 < 2e-16 ***
## chmin 2.808e-03 3.402e-03 0.825 0.4101
## chmax -8.538e-04 7.951e-04 -1.074 0.2842
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1029 on 202 degrees of freedom
## Multiple R-squared: 0.9443, Adjusted R-squared: 0.9426
## F-statistic: 570.5 on 6 and 202 DF, p-value: < 2.2e-16
# Residuals vs. Fitted Plot for Model with Weighted Least Squares
plot(model3$fitted.values, residuals(model3),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted Values (Weighted Least Squares)")
abline(h = 0, col = "red")
# Q-Q Plot for Normality Check for Weighted Least Squares Model
qqnorm(residuals(model3))
qqline(residuals(model3), col = "red")
# Checking for outliers or influential points again with the new model
plot(model3, which = 4) # Influence plot to check Cook's distance
d) How well does the final model fit the data? Comment on some model fit criteria from the model built in c)
# Summary of the final model (Model 3 - Weighted Least Squares)
summary(model3)
##
## Call:
## lm(formula = log(estperf) ~ syct + mmin + mmax + cach + chmin +
## chmax, data = cpus, weights = 1/fitted(model2))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.46222 -0.05678 0.01625 0.06911 0.22142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.219e+00 2.760e-02 116.637 < 2e-16 ***
## syct -3.704e-04 5.484e-05 -6.754 1.5e-10 ***
## mmin 1.642e-05 7.179e-06 2.287 0.0232 *
## mmax 5.674e-05 2.402e-06 23.617 < 2e-16 ***
## cach 7.255e-03 5.319e-04 13.640 < 2e-16 ***
## chmin 2.808e-03 3.402e-03 0.825 0.4101
## chmax -8.538e-04 7.951e-04 -1.074 0.2842
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1029 on 202 degrees of freedom
## Multiple R-squared: 0.9443, Adjusted R-squared: 0.9426
## F-statistic: 570.5 on 6 and 202 DF, p-value: < 2.2e-16
# Residual Standard Error (from the summary output)
residual_standard_error <- summary(model3)$sigma
cat("Residual Standard Error: ", residual_standard_error, "\n")
## Residual Standard Error: 0.1029496
# R-squared and Adjusted R-squared (from the summary output)
r_squared <- summary(model3)$r.squared
adj_r_squared <- summary(model3)$adj.r.squared
cat("R-squared: ", r_squared, "\n")
## R-squared: 0.9442728
cat("Adjusted R-squared: ", adj_r_squared, "\n")
## Adjusted R-squared: 0.9426176
# F-statistic and its p-value (from the summary output)
f_statistic <- summary(model3)$fstatistic[1]
f_p_value <- pf(f_statistic, df1 = summary(model3)$fstatistic[2], df2 = summary(model3)$fstatistic[3], lower.tail = FALSE)
cat("F-statistic: ", f_statistic, "\n")
## F-statistic: 570.4672
cat("F-statistic p-value: ", f_p_value, "\n")
## F-statistic p-value: 1.056485e-123
e) Interpret all variables in your final model using complete sentences, making sure to account for the fact that this may be a multivariable model. Give interpretations in terms of as meaningful of units as possible (it may not be possible to use seconds for cycle time - the answer is too large, but you may use MB instead of kB, for instance). Adjust interpretations as needed, both for units, and the fact that our outcome has been log transformed (how do we get to the raw data values from a log transformation? Start by thinking: what is the inverse of the log function???)
# Coefficients of the final model (Model 3)
coef(model3)
## (Intercept) syct mmin mmax cach
## 3.219373e+00 -3.704125e-04 1.642075e-05 5.674055e-05 7.254669e-03
## chmin chmax
## 2.807945e-03 -8.537759e-04
# Exponentiate the coefficients to interpret in terms of original units
exp(coef(model3))
## (Intercept) syct mmin mmax cach chmin
## 25.0124445 0.9996297 1.0000164 1.0000567 1.0072810 1.0028119
## chmax
## 0.9991466
f) Calculate indices that help assess multicollinearity between predictors in your final model. Is there evidence of multicollinearity? What does this imply, and should you take action? Take action if appropriate.
# Load the car package for VIF calculation
library(car)
# Calculate VIF for the final model
vif(model3)
## syct mmin mmax cach chmin chmax
## 1.197671 2.710813 2.904269 1.786732 1.972578 1.690111
g) Are there any outliers or influential observations in this data? Calculate relevant indices or provide visualizations to justify your answer. Make sure to use rules of thumb discussed in class if necessary for interpretations.
Cook’s Distance: - Rule of Thumb: A Cook’s Distance greater than 1 may indicate an influential point. A more conservative threshold is 4/n, where n is the number of data points. If a point exceeds this threshold, it’s considered influential.
# Cook's Distance Plot
plot(model3, which = 4) # Influence plot to check Cook's distance
Leverage: - Rule of Thumb: A leverage value higher than 2(k+1)/n where k is the number of predictors and n is the number of observations, may indicate a high-leverage point.
# Leverage values
lev <- hatvalues(model3)
plot(lev, type = "h", main = "Leverage Values", ylab = "Leverage", col = "blue")
abline(h = (2*(length(coef(model3))+1))/nrow(cpus), col = "red") # Threshold line
Standardized Residuals: -Rule of Thumb: A studentized residual larger than 2 or smaller than -2 is typically considered an outlier.
# Standardized Residuals
std_resid <- rstandard(model3)
plot(std_resid, type = "h", main = "Standardized Residuals", ylab = "Standardized Residuals", col = "blue")
abline(h = c(-2, 2), col = "red") # Threshold for outliers
library(MASS)
data("birthwt")
head(birthwt)
# Convert categorical variables to factors
birthwt$low <- factor(birthwt$low, labels = c("Not Low", "Low"))
birthwt$race <- factor(birthwt$race, labels = c("White", "Black", "Other"))
birthwt$smoke <- factor(birthwt$smoke, labels = c("Non-Smoker", "Smoker"))
birthwt$ht <- factor(birthwt$ht, labels = c("No", "Yes"))
birthwt$ui <- factor(birthwt$ui, labels = c("No", "Yes"))
summary(birthwt)
## low age lwt race smoke
## Not Low:130 Min. :14.00 Min. : 80.0 White:96 Non-Smoker:115
## Low : 59 1st Qu.:19.00 1st Qu.:110.0 Black:26 Smoker : 74
## Median :23.00 Median :121.0 Other:67
## Mean :23.24 Mean :129.8
## 3rd Qu.:26.00 3rd Qu.:140.0
## Max. :45.00 Max. :250.0
## ptl ht ui ftv bwt
## Min. :0.0000 No :177 No :161 Min. :0.0000 Min. : 709
## 1st Qu.:0.0000 Yes: 12 Yes: 28 1st Qu.:0.0000 1st Qu.:2414
## Median :0.0000 Median :0.0000 Median :2977
## Mean :0.1958 Mean :0.7937 Mean :2945
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3487
## Max. :3.0000 Max. :6.0000 Max. :4990
library(ggplot2)
# Bar plot for 'smoke' vs 'low'
ggplot(birthwt, aes(x = smoke, fill = low)) +
geom_bar(position = "fill") +
labs(y = "Proportion", title = "Proportion of Low Birthweight by Smoking Status")
# Bar plot for 'race' vs 'low'
ggplot(birthwt, aes(x = race, fill = low)) +
geom_bar(position = "fill") +
labs(y = "Proportion", title = "Proportion of Low Birthweight by Race")
# Boxplot for 'lwt' (Mother's weight) vs 'low'
ggplot(birthwt, aes(x = low, y = lwt, fill = low)) +
geom_boxplot() +
labs(title = "Distribution of Mother's Weight by Birthweight")
# Scatterplot for 'age' vs 'lwt' colored by 'low'
ggplot(birthwt, aes(x = age, y = lwt, color = low)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Mother's Age vs Weight by Birthweight Status")
## `geom_smooth()` using formula = 'y ~ x'
log_model <- glm(low ~ age + lwt + race + smoke + ht + ui + ptl + ftv,
data = birthwt,
family = binomial)
# Display model summary
summary(log_model)
##
## Call:
## glm(formula = low ~ age + lwt + race + smoke + ht + ui + ptl +
## ftv, family = binomial, data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.480623 1.196888 0.402 0.68801
## age -0.029549 0.037031 -0.798 0.42489
## lwt -0.015424 0.006919 -2.229 0.02580 *
## raceBlack 1.272260 0.527357 2.413 0.01584 *
## raceOther 0.880496 0.440778 1.998 0.04576 *
## smokeSmoker 0.938846 0.402147 2.335 0.01957 *
## htYes 1.863303 0.697533 2.671 0.00756 **
## uiYes 0.767648 0.459318 1.671 0.09467 .
## ptl 0.543337 0.345403 1.573 0.11571
## ftv 0.065302 0.172394 0.379 0.70484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 201.28 on 179 degrees of freedom
## AIC: 221.28
##
## Number of Fisher Scoring iterations: 4
# Fit the logistic regression model with only significant predictors
model_refined <- glm(low ~ lwt + race + smoke + ht + ui,
data = birthwt,
family = binomial())
# Summary of the refined model
summary(model_refined)
##
## Call:
## glm(formula = low ~ lwt + race + smoke + ht + ui, family = binomial(),
## data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.056276 0.937853 0.060 0.95215
## lwt -0.016732 0.006803 -2.459 0.01392 *
## raceBlack 1.324562 0.521464 2.540 0.01108 *
## raceOther 0.926197 0.430386 2.152 0.03140 *
## smokeSmoker 1.035831 0.392558 2.639 0.00832 **
## htYes 1.871416 0.690902 2.709 0.00676 **
## uiYes 0.904974 0.447553 2.022 0.04317 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 204.22 on 182 degrees of freedom
## AIC: 218.22
##
## Number of Fisher Scoring iterations: 4
ptl (number of previous premature labors): In the first logistic regression model, this variable was not statistically significant. The p-value for ptl was greater than 0.1, which suggests that it does not have a strong relationship with the outcome variable.
ftv (number of physician visits during the first trimester): Similarly, ftv was also not statistically significant in the first model. The p-value for ftv was also high, indicating that it doesn’t have a meaningful effect on predicting low birthweight in this dataset.
model assumptions: - No relationship between ptl and ftv with low birthweight, Since these two variables were excluded from the final model, the model assumes that neither ptl nor ftv contribute significantly to predicting low birthweight.
Are These Assumptions Realistic? - ptl is not significant in this dataset suggests that the history of premature labors doesn’t have a strong relationship with low birthweight in this specific sample.The non-significance of ftv could indicate that the number of physician visits in the first trimester isn’t a strong predictor of low birthweight in this sample.
The model assumes a linear relationship between each continuous predictor and the log odds of the outcome. This is often a limitation because real-world relationships between predictors and the log odds might be non-linear.
The logistic regression model in part b) implicitly assumes that ptl and ftv have no effect on low birthweight in this specific dataset.
Steps for Creating ptl2: Understand the distribution of ptl: - ptl has values ranging from 0 to 3, and based on the summary, we might have few observations in some categories. Collapse ptl into broader groups: - Given the small sample size for higher values of ptl, we can group them into broader categories. For instance: - 0: No previous premature labor. - 1: One previous premature labor. - 2 or more: Two or more previous premature labors. This could be combined into one category since there are likely fewer than 10 observations in each of these categories. Create the new variable ptl2: - We can use dplyr or basic ifelse conditions in R to create the new variable.
Creating ptl2:
# Create a new variable ptl2
birthwt$ptl2 <- ifelse(birthwt$ptl == 0, "None",
ifelse(birthwt$ptl == 1, "One",
ifelse(birthwt$ptl >= 2, "Two or more", NA)))
# Convert ptl2 into a factor to be used in analysis
birthwt$ptl2 <- factor(birthwt$ptl2, levels = c("None", "One", "Two or more"))
# View the updated data
table(birthwt$ptl2) # Check the distribution of the new variable
##
## None One Two or more
## 159 24 6
# Fit a new logistic regression model using the new variable 'ptl2'
model_updated <- glm(low ~ lwt + race + smoke + ht + ui + ptl2,
family = binomial(link = "logit"), data = birthwt)
# Display the summary of the updated model
summary(model_updated)
##
## Call:
## glm(formula = low ~ lwt + race + smoke + ht + ui + ptl2, family = binomial(link = "logit"),
## data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.015463 0.982536 -0.016 0.98744
## lwt -0.016773 0.007081 -2.369 0.01786 *
## raceBlack 1.250947 0.534975 2.338 0.01937 *
## raceOther 0.827222 0.446817 1.851 0.06412 .
## smokeSmoker 0.875302 0.410139 2.134 0.03283 *
## htYes 1.875815 0.716529 2.618 0.00885 **
## uiYes 0.828436 0.466987 1.774 0.07606 .
## ptl2One 1.463051 0.506699 2.887 0.00388 **
## ptl2Two or more -0.163819 0.945560 -0.173 0.86245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 195.16 on 180 degrees of freedom
## AIC: 213.16
##
## Number of Fisher Scoring iterations: 4
Step 1: checking the ftv variable
table(birthwt$ftv)
##
## 0 1 2 3 4 6
## 100 47 30 7 4 1
birthwt$ftv2 <- ifelse(birthwt$ftv == 0, "No visits",
ifelse(birthwt$ftv <= 2, "Few visits", "Many visits"))
table(birthwt$ftv2)
##
## Few visits Many visits No visits
## 77 12 100
Step 2: Summarize the relationship between low birthweight and ftv2
table(birthwt$ftv2, birthwt$low)
##
## Not Low Low
## Few visits 59 18
## Many visits 7 5
## No visits 64 36
Step 3: Calculate the probabilities of low birthweight for each level of ftv2
prop.table(table(birthwt$ftv2, birthwt$low), margin = 1)
##
## Not Low Low
## Few visits 0.7662338 0.2337662
## Many visits 0.5833333 0.4166667
## No visits 0.6400000 0.3600000
Step 1: Fit the Logistic Regression Model with ftv2 and ptl2
# Fit the logistic regression model using the new variables 'ptl2' and 'ftv2'
model_updated2 <- glm(low ~ lwt + race + smoke + ht + ui + ptl2 + ftv2,
family = binomial(link = "logit"), data = birthwt)
# Display the summary of the updated model
summary(model_updated2)
##
## Call:
## glm(formula = low ~ lwt + race + smoke + ht + ui + ptl2 + ftv2,
## family = binomial(link = "logit"), data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.212146 0.995993 -0.213 0.83133
## lwt -0.016647 0.007029 -2.368 0.01787 *
## raceBlack 1.202742 0.541229 2.222 0.02627 *
## raceOther 0.737588 0.462355 1.595 0.11065
## smokeSmoker 0.768965 0.426988 1.801 0.07172 .
## htYes 1.819791 0.723424 2.516 0.01189 *
## uiYes 0.823097 0.469368 1.754 0.07949 .
## ptl2One 1.518520 0.514282 2.953 0.00315 **
## ptl2Two or more -0.149223 0.957761 -0.156 0.87619
## ftv2Many visits 0.764709 0.720009 1.062 0.28820
## ftv2No visits 0.379836 0.398844 0.952 0.34092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 193.62 on 178 degrees of freedom
## AIC: 215.62
##
## Number of Fisher Scoring iterations: 4
Key Takeaways: - ptl2One (One previous premature labor) is a significant predictor. This implies that mothers with one previous premature labor have higher odds of delivering a low birthweight baby. However, ptl2Two or more is not significant, meaning two or more premature labors does not have a clear effect in this dataset.
ftv2 (Physician visits) does not show any significant relationship with low birthweight, which suggests that the number of visits in the first trimester doesn’t play a major role after adjusting for other variables in this dataset.
Other significant predictors include mother’s weight (lwt), race (raceBlack), and history of hypertension (htYes), all of which contribute significantly to predicting low birthweight.
Step 1: Install and Load Necessary Packages
# Install required packages with a specified CRAN mirror
install.packages(c("TeachingDemos", "MASS", "caret"), repos = "https://cloud.r-project.org/")
## Warning: package 'MASS' is in use and will not be installed
## Installing packages into 'C:/Users/saisr/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'TeachingDemos' successfully unpacked and MD5 sums checked
## package 'caret' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'caret'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\saisr\AppData\Local\R\win-library\4.4\00LOCK\caret\libs\x64\caret.dll
## to C:\Users\saisr\AppData\Local\R\win-library\4.4\caret\libs\x64\caret.dll:
## Permission denied
## Warning: restored 'caret'
##
## The downloaded binary packages are in
## C:\Users\saisr\AppData\Local\Temp\RtmpCuoL6B\downloaded_packages
# Load the necessary libraries
library(TeachingDemos)
## Warning: package 'TeachingDemos' was built under R version 4.4.3
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
# Load the birthwt dataset
data("birthwt")
Step 2: Set the Seed for Reproducibility
# Create a binary variable for race (Black = 1, not Black = 0)
birthwt$raceBlack <- ifelse(birthwt$race == 1, 1, 0)
# Load the caret package
library(caret)
# Set a seed for reproducibility
set.seed(123)
# Split the dataset into training and test sets
trainIndex <- createDataPartition(birthwt$low, p = 0.8, list = FALSE)
trainSet <- birthwt[trainIndex, ]
testSet <- birthwt[-trainIndex, ]
step 3: Fit LDA and QDA Models
# Fit LDA and QDA models
lda_model <- lda(low ~ lwt + raceBlack + ht, data = trainSet)
qda_model <- qda(low ~ lwt + raceBlack + ht, data = trainSet)
# Make predictions for LDA and QDA models
lda_pred <- predict(lda_model, newdata = testSet)$class
qda_pred <- predict(qda_model, newdata = testSet)$class
# Convert 'low' to a factor (if it's not already)
testSet$low <- factor(testSet$low, levels = c(0, 1)) # Assuming 0 = "not low birth weight", 1 = "low birth weight"
# Convert predictions to factors with the same levels as 'low'
lda_pred <- factor(lda_pred, levels = c(0, 1))
qda_pred <- factor(qda_pred, levels = c(0, 1))
# Now you can proceed with confusion matrix and other evaluations
Step 4: Step 4: Evaluate Model Performance
# Make predictions for LDA and QDA models
lda_pred <- predict(lda_model, newdata = testSet)$class
qda_pred <- predict(qda_model, newdata = testSet)$class
# Confusion matrices
lda_cm <- confusionMatrix(lda_pred, testSet$low)
qda_cm <- confusionMatrix(qda_pred, testSet$low)
# Print confusion matrices and performance metrics
lda_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 26 10
## 1 0 1
##
## Accuracy : 0.7297
## 95% CI : (0.5588, 0.8621)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.438235
##
## Kappa : 0.1232
##
## Mcnemar's Test P-Value : 0.004427
##
## Sensitivity : 1.00000
## Specificity : 0.09091
## Pos Pred Value : 0.72222
## Neg Pred Value : 1.00000
## Prevalence : 0.70270
## Detection Rate : 0.70270
## Detection Prevalence : 0.97297
## Balanced Accuracy : 0.54545
##
## 'Positive' Class : 0
##
qda_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 24 9
## 1 2 2
##
## Accuracy : 0.7027
## 95% CI : (0.5302, 0.8413)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.58051
##
## Kappa : 0.1285
##
## Mcnemar's Test P-Value : 0.07044
##
## Sensitivity : 0.9231
## Specificity : 0.1818
## Pos Pred Value : 0.7273
## Neg Pred Value : 0.5000
## Prevalence : 0.7027
## Detection Rate : 0.6486
## Detection Prevalence : 0.8919
## Balanced Accuracy : 0.5524
##
## 'Positive' Class : 0
##
Accuracy: 72.97% Sensitivity: 100% (perfect at identifying “not low birth weight”) Specificity: 9.09% (very low at identifying “low birth weight”) Positive Predictive Value: 72.22% Negative Predictive Value: 100% Balanced Accuracy: 54.55% Kappa: 0.1232 (low agreement between predicted and actual values) McNemar’s Test P-Value: 0.004427 (indicates a significant difference in the misclassification of the two classes) ### QDA Model: Accuracy: 70.27% Sensitivity: 92.31% (strong at identifying “not low birth weight”) Specificity: 18.18% (still low, but higher than LDA for identifying “low birth weight”) Positive Predictive Value: 72.73% Negative Predictive Value: 50% Balanced Accuracy: 55.24% Kappa: 0.1285 (still low agreement) McNemar’s Test P-Value: 0.07044 (indicates a borderline significance) ### Interpretation: LDA has higher sensitivity but much lower specificity compared to QDA. Both models perform similarly in terms of accuracy and balanced accuracy. Both models suffer from poor specificity, indicating they struggle to accurately identify “low birth weight” cases.
Step 5: Evaluate Logistic Regression Models
# Fit Logistic Regression Models
logit_model_f <- glm(low ~ lwt + raceBlack + ht, data = trainSet, family = binomial)
logit_model_b <- glm(low ~ lwt + raceBlack + smoke + ptl + ht, data = trainSet, family = binomial)
# Make predictions for the logistic regression models
logit_pred_f <- predict(logit_model_f, newdata = testSet, type = "response")
logit_pred_b <- predict(logit_model_b, newdata = testSet, type = "response")
# Convert probabilities to binary predictions using a threshold of 0.5
logit_pred_f_class <- ifelse(logit_pred_f > 0.5, 1, 0)
logit_pred_b_class <- ifelse(logit_pred_b > 0.5, 1, 0)
# Convert predictions to factors
logit_pred_f_class <- factor(logit_pred_f_class, levels = c(0, 1))
logit_pred_b_class <- factor(logit_pred_b_class, levels = c(0, 1))
# Confusion matrices for logistic regression models
logit_cm_f <- confusionMatrix(logit_pred_f_class, testSet$low)
logit_cm_b <- confusionMatrix(logit_pred_b_class, testSet$low)
# Print confusion matrices and performance metrics for logistic regression models
logit_cm_f
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 26 10
## 1 0 1
##
## Accuracy : 0.7297
## 95% CI : (0.5588, 0.8621)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.438235
##
## Kappa : 0.1232
##
## Mcnemar's Test P-Value : 0.004427
##
## Sensitivity : 1.00000
## Specificity : 0.09091
## Pos Pred Value : 0.72222
## Neg Pred Value : 1.00000
## Prevalence : 0.70270
## Detection Rate : 0.70270
## Detection Prevalence : 0.97297
## Balanced Accuracy : 0.54545
##
## 'Positive' Class : 0
##
logit_cm_b
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 23 7
## 1 3 4
##
## Accuracy : 0.7297
## 95% CI : (0.5588, 0.8621)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.4382
##
## Kappa : 0.2773
##
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.8846
## Specificity : 0.3636
## Pos Pred Value : 0.7667
## Neg Pred Value : 0.5714
## Prevalence : 0.7027
## Detection Rate : 0.6216
## Detection Prevalence : 0.8108
## Balanced Accuracy : 0.6241
##
## 'Positive' Class : 0
##
Step 6: Comparison of Performance Metrics
# Print metrics for LDA
cat("LDA Accuracy: ", lda_cm$overall['Accuracy'], "\n")
## LDA Accuracy: 0.7297297
cat("LDA Sensitivity: ", lda_cm$byClass['Sensitivity'], "\n")
## LDA Sensitivity: 1
cat("LDA Specificity: ", lda_cm$byClass['Specificity'], "\n")
## LDA Specificity: 0.09090909
# Print metrics for QDA
cat("QDA Accuracy: ", qda_cm$overall['Accuracy'], "\n")
## QDA Accuracy: 0.7027027
cat("QDA Sensitivity: ", qda_cm$byClass['Sensitivity'], "\n")
## QDA Sensitivity: 0.9230769
cat("QDA Specificity: ", qda_cm$byClass['Specificity'], "\n")
## QDA Specificity: 0.1818182
# Print metrics for Logistic Regression (f)
cat("Logit Model F Accuracy: ", logit_cm_f$overall['Accuracy'], "\n")
## Logit Model F Accuracy: 0.7297297
cat("Logit Model F Sensitivity: ", logit_cm_f$byClass['Sensitivity'], "\n")
## Logit Model F Sensitivity: 1
cat("Logit Model F Specificity: ", logit_cm_f$byClass['Specificity'], "\n")
## Logit Model F Specificity: 0.09090909
# Print metrics for Logistic Regression (b)
cat("Logit Model B Accuracy: ", logit_cm_b$overall['Accuracy'], "\n")
## Logit Model B Accuracy: 0.7297297
cat("Logit Model B Sensitivity: ", logit_cm_b$byClass['Sensitivity'], "\n")
## Logit Model B Sensitivity: 0.8846154
cat("Logit Model B Specificity: ", logit_cm_b$byClass['Specificity'], "\n")
## Logit Model B Specificity: 0.3636364
LDA and Logit Model F both have the highest accuracy (0.73) and sensitivity (1.00), meaning they perfectly identify the cases with low birth weight. However, their specificity is low (0.09), meaning they struggle with correctly identifying non-low birth weight cases (high false positive rate).
QDA has a slightly lower accuracy (0.70) and sensitivity (0.92) than LDA and Logit Model F, but it performs better than them in terms of specificity (0.18), though it’s still not ideal.
Logit Model B has the highest specificity (0.36) but lower sensitivity (0.88), indicating that it does a better job at identifying non-low birth weight cases, though it misses more of the low birth weight cases.
logit_model <- glm(low ~ lwt + raceBlack + ht + smoke, family = binomial, data = trainSet)
comment on above
b) Use either methods commonly used in the book/lecture notes to build a linear regression model predicting estimated performance from predictors in the dataset. Do not consider in this modeling approach. Explain the process used to arrive at your final model.