Please hand this result in on Canvas no later than 11:59pm on Wednesday, March 12! Do not work in groups!
# libraries
library(MASS) # For cpus dataset
library(ggplot2) # For visualization
library(corrplot) # For correlation heatmap
## corrplot 0.95 loaded
library(GGally) # For pairwise scatterplots
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# cpus dataset
data("cpus")
cpus <- cpus %>% select(-perf, -name)
# Convert non-numeric columns to numeric safely
cpus[] <- lapply(cpus, function(x) if(is.factor(x)) as.numeric(as.character(x)) else x)
cpus <- cpus %>% mutate(across(everything(), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))
# NUMERICAL ANALYSIS - Compute Correlation Matrix
cor_matrix <- cor(cpus, use = "complete.obs") # Use complete observations only
print(cor_matrix["estperf", ])
## syct mmin mmax cach chmin chmax estperf
## -0.2883956 0.8192915 0.9012024 0.6486203 0.6105802 0.5921556 1.0000000
# VISUAL ANALYSIS - Correlation Heatmap
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.7,
title = "Correlation Heatmap of cpus Dataset", mar = c(0, 0, 2, 0))
# VISUAL ANALYSIS - Pairwise Scatterplots (Avoiding ggplot text issue)
ggpairs(cpus, upper = list(continuous = wrap("points", alpha = 0.6)),
lower = list(continuous = wrap("smooth", method = "lm", se = FALSE)))
# VISUAL ANALYSIS - Scatter Plots of `estperf` vs Other Variables
plot_list <- list()
for (var in names(cpus)[names(cpus) != "estperf"]) {
p <- ggplot(cpus, aes(x = !!sym(var), y = estperf)) + # !!sym() injects var immediately (aes_string() is deprecated)
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
ggtitle(paste("estperf vs", var)) +
theme_minimal()
plot_list[[var]] <- p
}
# Print all scatter plots
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
do.call(grid.arrange, c(plot_list, ncol = 3))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Memory (mmin, mmax) has the strongest association with estperf: machines with more memory tend to have much higher estimated performance. Cache size (cach) and channel counts (chmin, chmax) show moderate positive effects; more cache and channels improve performance, but less strongly than memory. Cycle time (syct) is weakly negative, meaning longer cycle times slightly reduce estimated performance.
The correlation heatmap of the cpus dataset visualizes the relationships between variables on a color scale. mmin (minimum memory) and mmax (maximum memory) have strong positive correlations with estperf, meaning more memory improves estimated performance. cach (cache size), chmin, and chmax have moderate positive correlations with estperf, suggesting larger caches and more channels contribute to performance. syct (cycle time) has a negative correlation with estperf, meaning a longer cycle time reduces performance.
Multicollinearity: mmin and mmax are highly correlated with each other, and chmin and chmax are closely related. Because multicollinearity can destabilize regression models, checking the Variance Inflation Factor (VIF) will be useful.
Scatterplots with linear trendlines: mmin and mmax are the strongest predictors of estperf; syct has a non-linear negative effect, possibly needing a transformation; cach, chmin, and chmax show weaker positive relationships with more scatter; and multicollinearity should be checked, since mmin and mmax are highly correlated.
library(car) # For VIF (multicollinearity check)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load dataset and remove `name` and `perf` (we do not use published performance)
data("cpus")
cpus <- cpus %>% select(-name, -perf)
# Check missing values
print(colSums(is.na(cpus)))
## syct mmin mmax cach chmin chmax estperf
## 0 0 0 0 0 0 0
# Convert non-numeric columns to numeric (if needed)
cpus[] <- lapply(cpus, function(x) if(is.factor(x)) as.numeric(as.character(x)) else x)
# Handle missing values by replacing NAs with column medians
cpus <- cpus %>% mutate(across(everything(), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Compute correlation matrix
cor_matrix <- cor(cpus, use = "complete.obs")
# Plot correlation heatmap
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.7,
title = "Correlation Heatmap of cpus Dataset", mar = c(0, 0, 2, 0))
# Scatterplots to visualize relationships
plot_list <- list()
for (var in names(cpus)[names(cpus) != "estperf"]) {
p <- ggplot(cpus, aes(x = !!sym(var), y = estperf)) + # !!sym() injects var immediately (aes_string() is deprecated)
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
ggtitle(paste("estperf vs", var)) +
theme_minimal()
plot_list[[var]] <- p
}
library(gridExtra)
do.call(grid.arrange, c(plot_list, ncol = 3))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Fit initial full multiple linear regression model
full_model <- lm(estperf ~ ., data = cpus)
# Print summary of full model
summary(full_model)
##
## Call:
## lm(formula = estperf ~ ., data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138.198 -24.064 1.429 23.136 288.718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.648e+01 6.288e+00 -10.574 < 2e-16 ***
## syct 6.596e-02 1.369e-02 4.818 2.85e-06 ***
## mmin 1.431e-02 1.428e-03 10.021 < 2e-16 ***
## mmax 6.590e-03 5.016e-04 13.138 < 2e-16 ***
## cach 4.945e-01 1.091e-01 4.533 9.94e-06 ***
## chmin -1.723e-01 6.687e-01 -0.258 0.797
## chmax 1.201e+00 1.720e-01 6.985 4.04e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.88 on 202 degrees of freedom
## Multiple R-squared: 0.9109, Adjusted R-squared: 0.9082
## F-statistic: 344.1 on 6 and 202 DF, p-value: < 2.2e-16
# Check for multicollinearity using Variance Inflation Factor (VIF)
print(vif(full_model))
## syct mmin mmax cach chmin chmax
## 1.201644 2.902002 3.274002 1.858349 1.966234 1.891392
# Stepwise selection to find the best model
best_model <- step(full_model, direction = "both")
## Start: AIC=1615.2
## estperf ~ syct + mmin + mmax + cach + chmin + chmax
##
## Df Sum of Sq RSS AIC
## - chmin 1 146 444155 1613.3
## <none> 444009 1615.2
## - cach 1 45173 489182 1633.5
## - syct 1 51014 495022 1635.9
## - chmax 1 107238 551246 1658.4
## - mmin 1 220724 664732 1697.5
## - mmax 1 379414 823423 1742.3
##
## Step: AIC=1613.27
## estperf ~ syct + mmin + mmax + cach + chmax
##
## Df Sum of Sq RSS AIC
## <none> 444155 1613.3
## + chmin 1 146 444009 1615.2
## - cach 1 47080 491235 1632.3
## - syct 1 51380 495535 1634.2
## - chmax 1 117044 561199 1660.2
## - mmin 1 227105 671260 1697.6
## - mmax 1 379580 823735 1740.4
# Display summary of the final optimized model
summary(best_model)
##
## Call:
## lm(formula = estperf ~ syct + mmin + mmax + cach + chmax, data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -136.633 -23.149 1.574 22.957 287.869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.660e+01 6.257e+00 -10.643 < 2e-16 ***
## syct 6.613e-02 1.365e-02 4.846 2.50e-06 ***
## mmin 1.424e-02 1.397e-03 10.188 < 2e-16 ***
## mmax 6.584e-03 4.999e-04 13.171 < 2e-16 ***
## cach 4.871e-01 1.050e-01 4.639 6.28e-06 ***
## chmax 1.187e+00 1.623e-01 7.314 5.88e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.78 on 203 degrees of freedom
## Multiple R-squared: 0.9108, Adjusted R-squared: 0.9086
## F-statistic: 414.8 on 5 and 203 DF, p-value: < 2.2e-16
# Check multicollinearity again for the final model
print(vif(best_model))
## syct mmin mmax cach chmax
## 1.199030 2.792328 3.266378 1.730247 1.691608
# Diagnostic plots to check model assumptions
par(mfrow = c(2, 2))
plot(best_model)
We followed a systematic approach to build, evaluate, and refine a
multiple linear regression model for predicting estperf (estimated
performance) from the other numerical variables in the dataset.
Stepwise selection removed the redundant predictor chmin; the log
transformations applied in the next part further improve the model's
assumptions. The final model predicts estperf with high accuracy.
Multiple R² = 0.9108: the model explains 91.08% of the variation in estperf. Adjusted R² = 0.9086: still very strong after adjusting for the number of predictors. Residual standard error (RSE) = 46.78: the typical deviation of actual performance from the predicted values. F-statistic = 414.8, p < 2.2e-16: the model is highly significant. All remaining predictors have p-values < 0.001 (***), confirming that memory (mmin, mmax), cache, and channels are crucial for CPU performance.
The model explains 91% of the variance (strong predictive power), all predictors are statistically significant (p < 0.001), and there are no major multicollinearity issues (all VIFs < 5).
Correlation heatmap interpretation: mmin and mmax show strong positive correlations with estperf (darker blue); cach, chmin, and chmax have moderate positive correlations; and syct has a negative correlation, which aligns with expectations (longer cycle times slow performance). Multicollinearity concern: mmin and mmax are highly correlated with each other, meaning one might be redundant in the model.
Linear relationships: mmin, mmax, cach, and chmax show strong positive linear relationships with estperf and are good predictors. syct shows a non-linear relationship: the trend flattens at higher syct values, suggesting a log or polynomial transformation might improve the model. Heteroscedasticity: estperf values spread out more at larger predictor values, which may indicate unequal variance.
Residual analysis (model diagnostics): the residuals-vs-fitted plot shows a visible pattern, meaning some non-linearity exists and transformations (log or polynomial terms) may be needed. The Q-Q plot shows deviation in the tails, so the residuals are not perfectly normal. The scale-location plot trends upward, indicating heteroscedasticity; a log transformation of predictors could stabilize the variance. The residuals-vs-leverage plot flags points beyond Cook's distance as highly influential outliers; some high-leverage points disproportionately affect the model's predictions.
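Beyond the plots, these impressions can be checked numerically. A minimal sketch, assuming the lmtest package is installed (it is not loaded elsewhere in this document):
# Numeric companions to the diagnostic plots (assumes lmtest is installed)
library(lmtest)
bptest(best_model)                  # Breusch-Pagan: a small p-value suggests heteroscedasticity
shapiro.test(residuals(best_model)) # Shapiro-Wilk: a small p-value suggests non-normal residuals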
# Generate Residual Plots for Initial Model
par(mfrow = c(2, 2))
plot(best_model) # Check linearity, normality, and homoscedasticity
# Identify non-linearity and heteroscedasticity issues
# Apply log transformations to correct issues
cpus$log_syct <- log(cpus$syct + 1) # Add 1 to avoid log(0) issue
cpus$log_mmin <- log(cpus$mmin + 1)
cpus$log_mmax <- log(cpus$mmax + 1)
# 3. Fit updated model with log-transformed variables
updated_model <- lm(estperf ~ log_syct + log_mmin + log_mmax + cach + chmax, data = cpus)
# 4. Print summary of updated model
summary(updated_model)
##
## Call:
## lm(formula = estperf ~ log_syct + log_mmin + log_mmax + cach +
## chmax, data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -283.82 -40.14 -8.29 30.73 764.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -581.2760 114.5582 -5.074 8.77e-07 ***
## log_syct 10.7815 9.5140 1.133 0.25846
## log_mmin 42.6847 10.0952 4.228 3.56e-05 ***
## log_mmax 28.2607 9.7309 2.904 0.00409 **
## cach 1.1143 0.2035 5.475 1.28e-07 ***
## chmax 1.9061 0.2914 6.540 4.89e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93 on 203 degrees of freedom
## Multiple R-squared: 0.6476, Adjusted R-squared: 0.6389
## F-statistic: 74.6 on 5 and 203 DF, p-value: < 2.2e-16
# 5. Generate residual plots for updated model
par(mfrow = c(2, 2))
plot(updated_model)
# 6. Identify and remove influential points using Cook’s Distance
influential_points <- which(cooks.distance(updated_model) > (4 / nrow(cpus)))
cpus_clean <- cpus[-influential_points, ]
# 7. Refit model after removing influential outliers
final_model <- lm(estperf ~ log_syct + log_mmin + log_mmax + cach + chmax, data = cpus_clean)
# 8. Print summary of final improved model
summary(final_model)
##
## Call:
## lm(formula = estperf ~ log_syct + log_mmin + log_mmax + cach +
## chmax, data = cpus_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.069 -23.572 -7.711 19.720 149.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -426.6915 55.6696 -7.665 8.67e-13 ***
## log_syct 5.6826 4.3019 1.321 0.188
## log_mmin 23.6342 4.6678 5.063 9.62e-07 ***
## log_mmax 30.3072 4.6040 6.583 4.30e-10 ***
## cach 1.0903 0.1204 9.057 < 2e-16 ***
## chmax 0.8566 0.2068 4.142 5.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.6 on 192 degrees of freedom
## Multiple R-squared: 0.7833, Adjusted R-squared: 0.7777
## F-statistic: 138.8 on 5 and 192 DF, p-value: < 2.2e-16
# 9. Generate residual plots for the final improved model
par(mfrow = c(2, 2))
plot(final_model)
We applied log transformations to syct, mmin, and mmax to correct
non-linearity and variance issues, removed influential outliers flagged
by Cook's distance, and refit the model with the transformed variables.
In the log-transformed model, log_mmin, log_mmax, cach, and chmax are
statistically significant (p < 0.05), meaning they strongly influence
estperf, while log_syct is not significant (p = 0.258), suggesting
cycle time has a weaker effect after transformation. Model fit before
outlier removal: multiple R² = 0.6476 (the model explains 64.76% of the
variance in estperf), adjusted R² = 0.6389 (a moderate fit after
accounting for the number of predictors), and residual standard error =
93, which is higher than before and suggests some remaining variance is
not captured on the original scale.
The log-transformed model fits the data moderately well but still has limitations.
1. Model fit metrics: multiple R² = 0.6476 and adjusted R² = 0.6389 before outlier removal indicate a moderate fit; after removing influential points, the refit model reaches R² = 0.7833 (adjusted R² = 0.7777) with a residual standard error of 40.6.
2. Residual analysis: the residuals-vs-fitted plot shows some curvature, suggesting possible remaining non-linearity; the Q-Q plot deviates from the diagonal at higher values, indicating non-normal tails; the scale-location plot shows residual spread increasing with fitted values, suggesting heteroscedasticity; and the residuals-vs-leverage plot flags some influential outliers that may affect predictions.
The log transformations improved normality and linearity, but the residuals still show mild heteroscedasticity. Removing insignificant predictors (e.g., log_syct) could further refine the model (see the sketch below), and handling the remaining high-leverage points may improve stability. The model is reasonably good but not perfect.
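As a sketch of that refinement (a hypothetical follow-up, not run above), the insignificant log_syct term could be dropped and the fits compared:
# Sketch: drop the insignificant log_syct term and compare fits
reduced_model <- lm(estperf ~ log_mmin + log_mmax + cach + chmax, data = cpus_clean)
summary(reduced_model)$adj.r.squared # adjusted R² without log_syct
AIC(final_model, reduced_model)      # lower AIC indicates the better trade-off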
The interpretations below use the log-transformed model fit above (updated_model, before outlier removal), which predicts estimated performance (estperf) from a mix of log-transformed and untransformed predictors. Because some predictors are log-transformed, their coefficients describe proportional rather than direct linear increases. Variable interpretations: 1. log_syct (log of cycle time): estimate = 10.78, so holding all else constant, a 1% increase in cycle time (syct) is associated with an increase in estperf of approximately 0.108 units. This positive sign is unexpected: since cycle time represents processing delay, we would expect a negative impact; it may indicate confounding effects or a complex interaction. 2. log_mmin (log of minimum main memory in KB): estimate = 42.68, so a 1% increase in minimum memory (mmin) is associated with roughly a 0.427-unit increase in estimated performance. 3. log_mmax (log of maximum main memory in KB): estimate = 28.26, so a 1% increase in maximum memory (mmax) is associated with roughly a 0.283-unit increase in estperf. 4. cach (cache size in KB): estimate = 1.1143, so each additional KB of cache is associated with a 1.11-unit increase in estimated performance; larger caches speed up data retrieval. 5. chmax (maximum number of channels): estimate = 1.9061, so each additional channel is associated with a 1.91-unit increase; more memory channels improve data access speeds.
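The "1% increase" arithmetic comes from the log-level form: a coefficient β on log(x) implies a change of β·log(1.01) ≈ β/100 in estperf per 1% increase in x. A quick check against the coefficients shown above:
# Effect on estperf of a 1% increase in each log-transformed predictor
b <- coef(updated_model)[c("log_syct", "log_mmin", "log_mmax")]
round(b * log(1.01), 4) # approximately beta / 100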
# Calculate Variance Inflation Factor (VIF) for the final model
vif_values <- vif(final_model)
print(vif_values)
## log_syct log_mmin log_mmax cach chmax
## 2.302272 2.693482 2.185522 1.671445 1.311107
# Check for multicollinearity issues
if(any(vif_values > 5)) {
cat("Multicollinearity detected. Variables with VIF > 5:\n")
print(vif_values[vif_values > 5])
} else {
cat("No severe multicollinearity detected.\n")
}
## No severe multicollinearity detected.
# Identify highly correlated predictors
cor_matrix <- cor(cpus_clean[, c("log_syct", "log_mmin", "log_mmax", "cach", "chmax")])
print(cor_matrix)
## log_syct log_mmin log_mmax cach chmax
## log_syct 1.0000000 -0.7212896 -0.6246186 -0.5222026 -0.3359599
## log_mmin -0.7212896 1.0000000 0.6922197 0.5415716 0.3324815
## log_mmax -0.6246186 0.6922197 1.0000000 0.5116483 0.3995411
## cach -0.5222026 0.5415716 0.5116483 1.0000000 0.4418353
## chmax -0.3359599 0.3324815 0.3995411 0.4418353 1.0000000
# If `log_mmin` shows a problematic VIF (> 5), remove it and refit
if(vif_values["log_mmin"] > 5) {
cat("Removing log_mmin due to high multicollinearity.\n")
# Refit the model without `log_mmin`
final_model_updated <- lm(estperf ~ log_syct + log_mmax + cach + chmax, data = cpus_clean)
# Print updated model summary
summary(final_model_updated)
# Recalculate VIF for the updated model
vif_updated <- vif(final_model_updated)
print(vif_updated)
# Check if multicollinearity issue is resolved
if(any(vif_updated > 5)) {
cat("Multicollinearity persists. Further refinement may be needed.\n")
} else {
cat("Multicollinearity resolved in updated model.\n")
}
# Generate residual diagnostic plots for the updated model
par(mfrow = c(2, 2))
plot(final_model_updated)
} else {
cat("No variables need to be removed based on VIF.\n")
}
## No variables need to be removed based on VIF.
There is no evidence of severe multicollinearity: all Variance Inflation Factor (VIF) values are below 5, so the predictor variables are not excessively correlated, the model remains stable, and the coefficient estimates are reliable. No variables need to be removed or adjusted, and the model remains interpretable. Implications: no predictor dominates another, meaning each variable contributes unique information, and no action is required since multicollinearity does not significantly impact the model.
# Compute Cook’s Distance for all observations
cooks_dist <- cooks.distance(final_model)
# Identify influential points using the rule of thumb: Cook's Distance > 4/n
threshold <- 4 / nrow(cpus_clean)
influential_points <- which(cooks_dist > threshold)
# Print influential observations
cat("Influential Observations (Cook's Distance > 4/n):\n")
## Influential Observations (Cook's Distance > 4/n):
print(influential_points)
## 7 8 23 24 31 32 47 66 67 79 82 90 96 97 98 154 170 193 197 198
## 6 7 18 19 26 27 42 61 62 74 77 84 90 91 92 146 161 184 188 189
# 3. Compute leverage values (hat values)
leverage_values <- hatvalues(final_model)
# Identify high leverage points using the rule: leverage > 2 * (p+1)/n
high_leverage_threshold <- (2 * (length(coef(final_model)) - 1)) / nrow(cpus_clean)
high_leverage_points <- which(leverage_values > high_leverage_threshold)
# Print high leverage points
cat("High Leverage Observations (Leverage > 2*(p+1)/n):\n")
## High Leverage Observations (Leverage > 2*(p+1)/n):
print(high_leverage_points)
## 23 24 25 31 32 37 47 66 67 79 82 91 92 107 108 112 123 124 154 169
## 18 19 20 26 27 32 42 61 62 74 77 85 86 100 101 105 116 117 146 160
## 170 193 197 198
## 161 184 188 189
# 4. Visualize residual diagnostics to confirm presence of outliers
par(mfrow = c(2, 2))
plot(final_model)
# 5. If influential points exist, remove them and refit the model
if (length(influential_points) > 0) {
cat("Removing influential observations and refitting model...\n")
# Remove influential points
cpus_clean <- cpus_clean[-influential_points, ]
# Refit model without influential points (log_mmin is also dropped from this refit)
final_model_updated <- lm(estperf ~ log_syct + log_mmax + cach + chmax, data = cpus_clean)
# Print updated model summary
summary(final_model_updated)
# 6. Recompute Cook’s Distance and leverage for updated model
cooks_dist_updated <- cooks.distance(final_model_updated)
leverage_values_updated <- hatvalues(final_model_updated)
# 7. Generate residual plots for the updated model
par(mfrow = c(2, 2))
plot(final_model_updated)
# 8. Recheck multicollinearity after outlier removal
vif_updated <- vif(final_model_updated)
print(vif_updated)
} else {
cat("No major influential points detected. No changes needed.\n")
}
## Removing influential observations and refitting model...
## log_syct log_mmax cach chmax
## 1.718126 2.090291 1.617295 1.291004
The analysis of Cook's distance and leverage values identified influential observations in the dataset; these were removed and the model was refitted. Cook's distance: the observations listed above exceeded the 4/n threshold, meaning they had an outsized impact on model predictions, and excluding them improves model stability. Leverage: the high-leverage points exceeded the rule-of-thumb threshold 2(p+1)/n, and excluding them reduces bias in the coefficient estimates. After removing the outliers and high-leverage points and refitting, the updated VIF values are all below 5, confirming that multicollinearity is not an issue.
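For a visual confirmation of the remaining influence structure, plot.lm can show Cook's distance directly (a quick check, not run above):
# Influence diagnostics for the refitted model
par(mfrow = c(1, 2))
plot(final_model_updated, which = 4) # Cook's distance per observation
plot(final_model_updated, which = 5) # residuals vs leverage with Cook's distance contours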
#a) Investigate the relationships between variables in the birthwt dataset. Do you see anything surprising? Use both numeric and visual summaries. Create and comment on visualizations, specifically between the outcome variable and the predictor/independent variables. Also note that qualitative/categorical variables should be visualized in an alternative manner, not just with scatterplots/correlations as in the case of quantitative variables.
# Load dataset
data("birthwt")
# Convert categorical variables into factors for proper visualization
birthwt <- birthwt %>%
mutate(low = factor(low, labels = c("Not Low", "Low")), # Outcome variable
race = factor(race, labels = c("White", "Black", "Other")),
smoke = factor(smoke, labels = c("Non-Smoker", "Smoker")),
ht = factor(ht, labels = c("No Hypertension", "Hypertension")),
ui = factor(ui, labels = c("No Uterine Irritability", "Uterine Irritability")))
# View dataset structure
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : Factor w/ 2 levels "Not Low","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: Factor w/ 2 levels "Non-Smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : Factor w/ 2 levels "No Hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ui : Factor w/ 2 levels "No Uterine Irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
# Summary statistics
summary(birthwt)
## low age lwt race smoke
## Not Low:130 Min. :14.00 Min. : 80.0 White:96 Non-Smoker:115
## Low : 59 1st Qu.:19.00 1st Qu.:110.0 Black:26 Smoker : 74
## Median :23.00 Median :121.0 Other:67
## Mean :23.24 Mean :129.8
## 3rd Qu.:26.00 3rd Qu.:140.0
## Max. :45.00 Max. :250.0
## ptl ht ui
## Min. :0.0000 No Hypertension:177 No Uterine Irritability:161
## 1st Qu.:0.0000 Hypertension : 12 Uterine Irritability : 28
## Median :0.0000
## Mean :0.1958
## 3rd Qu.:0.0000
## Max. :3.0000
## ftv bwt
## Min. :0.0000 Min. : 709
## 1st Qu.:0.0000 1st Qu.:2414
## Median :0.0000 Median :2977
## Mean :0.7937 Mean :2945
## 3rd Qu.:1.0000 3rd Qu.:3487
## Max. :6.0000 Max. :4990
# Compute correlation matrix for numeric variables
numeric_vars <- birthwt %>% select_if(is.numeric)
cor_matrix <- cor(numeric_vars)
# Visualizing correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)
# Plot distribution of birth weight by low birth weight category
ggplot(birthwt, aes(x = bwt, fill = low)) +
geom_histogram(binwidth = 250, alpha = 0.6, position = "identity") +
labs(title = "Distribution of Birth Weight by Low Birthweight",
x = "Birth Weight (grams)", y = "Count") +
theme_minimal()
# Boxplot of birth weight by smoking status
ggplot(birthwt, aes(x = smoke, y = bwt, fill = smoke)) +
geom_boxplot(alpha = 0.7) +
labs(title = "Impact of Smoking on Birth Weight", x = "Smoking Status", y = "Birth Weight (grams)") +
theme_minimal()
# Scatterplot of mother's weight vs. birth weight
ggplot(birthwt, aes(x = lwt, y = bwt, color = low)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Mother's Weight vs. Birth Weight", x = "Mother's Weight (lbs)", y = "Birth Weight (grams)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Bar plot of low birth weight by race
ggplot(birthwt, aes(x = race, fill = low)) +
geom_bar(position = "fill") +
labs(title = "Proportion of Low Birth Weight by Race", x = "Race", y = "Proportion") +
theme_minimal()
# Bar plot of hypertension vs. low birth weight
ggplot(birthwt, aes(x = ht, fill = low)) +
geom_bar(position = "fill") +
labs(title = "Impact of Hypertension on Low Birthweight", x = "Hypertension", y = "Proportion") +
theme_minimal()
The birthwt dataset contains birth weights and potential risk factors
for low birth weight. Risk factors for low birth weight: smoking has a
clear negative impact on birth weight; low maternal weight matters,
since heavier mothers tend to have heavier babies; hypertension and
uterine irritability indicate higher risk; a history of preterm birth
may signal complications; and inadequate prenatal care (low ftv values)
may also contribute. Protective factors against low birth weight
include higher maternal weight (lwt) and more physician visits (ftv).
Racial disparities: Black mothers appear to have higher rates of low
birth weight.
# Convert categorical variables into factors
birthwt$low <- factor(birthwt$low)
birthwt$smoke <- factor(birthwt$smoke)
birthwt$ht <- factor(birthwt$ht)
birthwt$ui <- factor(birthwt$ui)
birthwt$race <- factor(birthwt$race)
# Fit logistic regression model
logit_model <- glm(low ~ age + lwt + race + smoke + ht + ui,
data = birthwt, family = binomial)
# Display model summary
summary(logit_model)
##
## Call:
## glm(formula = low ~ age + lwt + race + smoke + ht + ui, family = binomial,
## data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.437240 1.191931 0.367 0.71374
## age -0.018256 0.035354 -0.516 0.60559
## lwt -0.016285 0.006859 -2.374 0.01758 *
## raceBlack 1.280641 0.526695 2.431 0.01504 *
## raceOther 0.901880 0.434362 2.076 0.03786 *
## smokeSmoker 1.027571 0.393931 2.609 0.00909 **
## htHypertension 1.857617 0.688848 2.697 0.00700 **
## uiUterine Irritability 0.895387 0.448494 1.996 0.04589 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 203.95 on 181 degrees of freedom
## AIC: 219.95
##
## Number of Fisher Scoring iterations: 4
Smoking, low maternal weight, hypertension, race, and uterine irritability significantly increase the odds of low birth weight, while maternal age is not a significant predictor. Lower maternal weight (lwt) increases the risk: each additional pound reduces the odds by about 2% (OR ≈ 0.98). Mothers of Black or Other race have higher odds than White mothers. Smoking multiplies the odds roughly 2.8 times and hypertension roughly 6.4 times; uterine irritability also raises the risk. Model fit: the AIC of 219.95 and the drop in deviance from 234.67 (null) to 203.95 (residual) show the model improves substantially over the null model. Further validation (cross-validation, an ROC curve) could be used to evaluate predictive performance. Conclusion: smoking, race, hypertension, and maternal weight are the major risk factors for low birth weight.
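For reporting, the fitted log-odds can be converted to odds ratios with confidence intervals; a minimal sketch using the model above:
# Odds ratios with 95% profile-likelihood confidence intervals
exp(cbind(OR = coef(logit_model), confint(logit_model)))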
ptl (preterm births) represents the number of previous preterm births; it is numeric but is treated as categorical in some analyses, and more preterm births likely increase the risk of low birth weight. ftv (number of physician visits) measures the frequency of prenatal care; its effect could be nonlinear, since too few visits might indicate poor prenatal care while too many visits could indicate pregnancy complications. Modeling assumptions: additivity (the effects of ptl and ftv are additive on the log-odds scale); a linear relationship if treated as continuous (each additional preterm birth or visit has a constant effect on the log-odds); and homogeneity (the effects of ptl and ftv do not depend on the other variables).
# Create a new variable ptl2 by collapsing categories
birthwt <- birthwt %>%
mutate(ptl2 = case_when(
ptl == 0 ~ "0",
ptl == 1 ~ "1",
ptl >= 2 ~ "2+"
))
# Convert to factor
birthwt$ptl2 <- factor(birthwt$ptl2, levels = c("0", "1", "2+"))
# Check distribution
table(birthwt$ptl2)
##
## 0 1 2+
## 159 24 6
0 preterm births: 159 cases; 1 preterm birth: 24 cases; 2 or more preterm births: 6 cases. The majority (~84%) had no preterm births, making the "0" category a strong baseline. The "2+" category has a very small count (6 cases), but collapsing prevents excessive fragmentation of an already sparse variable.
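A quick cross-tabulation shows how the collapsed categories relate to the outcome (row proportions give within-group rates, computed from the data above):
# Cross-tabulate ptl2 against the outcome; row proportions are within-group rates
tab <- table(birthwt$ptl2, birthwt$low)
tab
round(prop.table(tab, 1), 3)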
# Convert categorical variables into factors
birthwt$low <- factor(birthwt$low)
# Check distribution of ftv
table(birthwt$ftv)
##
## 0 1 2 3 4 6
## 100 47 30 7 4 1
# Create new `ftv2` variable by collapsing categories
birthwt <- birthwt %>%
mutate(ftv2 = case_when(
ftv == 0 ~ "0", # No visits
ftv == 1 ~ "1", # 1 visit
ftv >= 2 ~ "2+" # 2 or more visits (collapsed)
))
# Convert to factor
birthwt$ftv2 <- factor(birthwt$ftv2, levels = c("0", "1", "2+"))
# Check distribution of ftv2
table(birthwt$ftv2)
##
## 0 1 2+
## 100 47 42
# Summarize counts by ftv2 and low status (proportions are shares of the full sample)
summary_table <- birthwt %>%
group_by(ftv2, low) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(proportion = count / sum(count))
# Print summary table
print(summary_table)
## # A tibble: 6 × 4
## ftv2 low count proportion
## <fct> <fct> <int> <dbl>
## 1 0 Not Low 64 0.339
## 2 0 Low 36 0.190
## 3 1 Not Low 36 0.190
## 4 1 Low 11 0.0582
## 5 2+ Not Low 30 0.159
## 6 2+ Low 12 0.0635
Note that the proportions above are shares of the full sample, not within-group rates. Computing rates within each ftv2 group from the counts: 36/100 = 36% of mothers with 0 visits had low-birthweight babies, versus 11/47 ≈ 23% with 1 visit and 12/42 ≈ 29% with 2+ visits. Mothers with no physician visits thus have the highest low-birthweight rate, suggesting that at least some prenatal care may reduce risk, although the trend is not monotone.
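A sketch of the within-group calculation, which avoids the whole-sample denominators used in the table above:
# Rate of low birthweight within each ftv2 group
birthwt %>%
group_by(ftv2) %>%
summarise(n = n(), low_rate = mean(low == "Low"), .groups = "drop")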
# Convert categorical variables into factors
birthwt <- birthwt %>%
mutate(low = factor(low, labels = c("Not Low", "Low")),
race = factor(race, labels = c("White", "Black", "Other")),
smoke = factor(smoke, labels = c("Non-Smoker", "Smoker")),
ht = factor(ht, labels = c("No Hypertension", "Hypertension")),
ui = factor(ui, labels = c("No Uterine Irritability", "Uterine Irritability")),
ptl2 = case_when(ptl == 0 ~ "0",
ptl == 1 ~ "1",
ptl >= 2 ~ "2+"),
ftv2 = case_when(ftv == 0 ~ "0",
ftv == 1 ~ "1",
ftv >= 2 ~ "2+"))
# Convert new variables to factors
birthwt$ptl2 <- factor(birthwt$ptl2, levels = c("0", "1", "2+"))
birthwt$ftv2 <- factor(birthwt$ftv2, levels = c("0", "1", "2+"))
# Fit the new logistic regression model including ptl2 and ftv2
logit_model_new <- glm(low ~ age + lwt + race + smoke + ht + ui + ptl2 + ftv2,
data = birthwt, family = binomial)
# Display model summary
summary(logit_model_new)
##
## Call:
## glm(formula = low ~ age + lwt + race + smoke + ht + ui + ptl2 +
## ftv2, family = binomial, data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.03635 1.26647 0.818 0.41319
## age -0.04079 0.03922 -1.040 0.29832
## lwt -0.01636 0.00721 -2.270 0.02323 *
## raceBlack 1.12251 0.54311 2.067 0.03875 *
## raceOther 0.69388 0.46950 1.478 0.13943
## smokeSmoker 0.75024 0.43167 1.738 0.08221 .
## htHypertension 1.90929 0.72963 2.617 0.00888 **
## uiUterine Irritability 0.75203 0.47273 1.591 0.11165
## ptl21 1.71542 0.54301 3.159 0.00158 **
## ptl22+ -0.02002 0.96939 -0.021 0.98352
## ftv21 -0.48603 0.48814 -0.996 0.31941
## ftv22+ 0.11418 0.46233 0.247 0.80494
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 192.54 on 177 degrees of freedom
## AIC: 216.54
##
## Number of Fisher Scoring iterations: 4
# Compute Odds Ratios
exp(coef(logit_model_new))
## (Intercept) age lwt
## 2.8189141 0.9600292 0.9837702
## raceBlack raceOther smokeSmoker
## 3.0725652 2.0014570 2.1175091
## htHypertension uiUterine Irritability ptl21
## 6.7483180 2.1213104 5.5589861
## ptl22+ ftv21 ftv22+
## 0.9801773 0.6150656 1.1209489
# Compare AIC with the previous model (logit_model from b)
AIC_old <- AIC(logit_model) # Model without ptl2 and ftv2
AIC_new <- AIC(logit_model_new) # Model with ptl2 and ftv2
cat("AIC of Old Model:", AIC_old, "\nAIC of New Model:", AIC_new, "\n")
## AIC of Old Model: 219.9481
## AIC of New Model: 216.537
# Compare deviance reduction
anova(logit_model, logit_model_new, test = "Chisq")
ptl2 is important: ptl21 (one previous preterm birth) has p = 0.00158 and OR = 5.56, a strong positive effect on low-birthweight risk; ptl22+ (two or more preterm births) has p = 0.98 and is not significant, reflecting the uncertainty from its small sample size (6 cases). ftv2 is not significant: ftv21 (1 visit) has p = 0.319 (OR = 0.62) and ftv22+ (2+ visits) has p = 0.805 (OR = 1.12), so the number of physician visits does not significantly predict low birthweight once the other covariates are included. Overall, ptl2 (preterm births) is important for predicting low birthweight, ftv2 (physician visits) is not and does not significantly improve the model, and the new model is preferred (lower AIC: 216.54 vs 219.95).
library(caret) # For confusion matrix calculations
## Loading required package: lattice
library(TeachingDemos) # For setting the seed
# Set seed for reproducibility
char2seed("MyUniqueSeed") # Replace "MyUniqueSeed" with any string
# Split the data into training (80%) and test (20%) sets
# (note: this set.seed() call overrides char2seed() above and determines the split)
set.seed(123)
train_indices <- sample(1:nrow(birthwt), size = 0.8 * nrow(birthwt), replace = FALSE)
train_data <- birthwt[train_indices, ]
test_data <- birthwt[-train_indices, ]
# Define predictors and outcome variable
predictors <- c("age", "lwt", "race", "smoke", "ht", "ui", "ptl2", "ftv2")
outcome <- "low"
# Fit LDA Model
lda_model <- lda(low ~ age + lwt + race + smoke + ht + ui + ptl2, data = train_data)
lda_pred <- predict(lda_model, test_data)$class
# Fit QDA Model
qda_model <- qda(low ~ age + lwt + race + smoke + ht + ui + ptl2, data = train_data)
qda_pred <- predict(qda_model, test_data)$class
# Fit Logistic Regression Models
logit_model_b <- glm(low ~ age + lwt + race + smoke + ht + ui, data = train_data, family = binomial)
logit_model_f <- glm(low ~ age + lwt + race + smoke + ht + ui + ptl2 + ftv2, data = train_data, family = binomial)
# Predictions for logistic regression
logit_b_prob <- predict(logit_model_b, test_data, type = "response")
logit_f_prob <- predict(logit_model_f, test_data, type = "response")
logit_b_pred <- factor(ifelse(logit_b_prob > 0.5, "Low", "Not Low"), levels = c("Not Low", "Low"))
logit_f_pred <- factor(ifelse(logit_f_prob > 0.5, "Low", "Not Low"), levels = c("Not Low", "Low"))
# Compute Confusion Matrices
lda_cm <- confusionMatrix(lda_pred, test_data$low)
qda_cm <- confusionMatrix(qda_pred, test_data$low)
logit_b_cm <- confusionMatrix(logit_b_pred, test_data$low)
logit_f_cm <- confusionMatrix(logit_f_pred, test_data$low)
# Extract Performance Metrics
models <- c("LDA", "QDA", "Logistic (b)", "Logistic (f)")
accuracies <- c(lda_cm$overall["Accuracy"], qda_cm$overall["Accuracy"], logit_b_cm$overall["Accuracy"], logit_f_cm$overall["Accuracy"])
sensitivities <- c(lda_cm$byClass["Sensitivity"], qda_cm$byClass["Sensitivity"], logit_b_cm$byClass["Sensitivity"], logit_f_cm$byClass["Sensitivity"])
specificities <- c(lda_cm$byClass["Specificity"], qda_cm$byClass["Specificity"], logit_b_cm$byClass["Specificity"], logit_f_cm$byClass["Specificity"])
# Create a summary table
performance_summary <- data.frame(Model = models, Accuracy = accuracies, Sensitivity = sensitivities, Specificity = specificities)
print(performance_summary)
## Model Accuracy Sensitivity Specificity
## 1 LDA 0.7368421 0.8888889 0.3636364
## 2 QDA 0.6842105 0.8148148 0.3636364
## 3 Logistic (b) 0.6842105 0.8148148 0.3636364
## 4 Logistic (f) 0.7105263 0.8888889 0.2727273
After splitting the birthwt dataset into an 80% training set and a 20% test set, we fit LDA, QDA, and logistic regression models to predict low birthweight (low). LDA has the best overall accuracy (73.7%) and, tied with logistic model (f), the highest sensitivity (88.9%; note that caret's positive class here defaults to "Not Low"). Logistic regression (f) is close in performance, but adding ptl2 and ftv2 slightly lowers specificity. QDA does not outperform logistic regression and may be less stable given the small test set. Thus, LDA is the preferred model for predicting low birthweight in this dataset.
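Because caret's confusionMatrix() treats the first factor level ("Not Low") as the positive class by default, the sensitivities above describe detection of normal-weight births. To report sensitivity for the clinically relevant class instead, the positive level can be set explicitly:
# Recompute metrics with "Low" as the positive class
confusionMatrix(lda_pred, test_data$low, positive = "Low")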
We use the logistic regression model from (f) to interpret the effect of each covariate on the log-odds of low birthweight (low = 1). Key significant predictors (p < 0.05): hypertension (ht), with 6.75× higher odds, is the strongest risk factor; one previous preterm birth (ptl21) carries 5.56× higher odds; Black race (raceBlack) carries 3.07× higher odds; and lower maternal weight (lwt) raises risk, with each additional pound reducing the odds by about 2% (OR = 0.98). Borderline predictor (p ≈ 0.05-0.1): smoking (smokeSmoker) roughly doubles the odds (OR = 2.12) and is likely important despite borderline significance. Not significant (p > 0.1): age, physician visits (ftv2), and 2+ preterm births (ptl22+) do not significantly predict low birthweight. Overall, hypertension, preterm-birth history, race, smoking, and low maternal weight are the strongest predictors of low birthweight.
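As suggested earlier, an ROC curve gives a threshold-free view of the model from (f). A minimal sketch, assuming the pROC package is installed (it is not used elsewhere in this document):
# ROC curve and AUC for the logistic model from (f) on the test set
library(pROC)
roc_f <- roc(test_data$low, logit_f_prob, levels = c("Not Low", "Low"))
auc(roc_f)
plot(roc_f, main = "ROC: logistic regression (f)")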