if (!requireNamespace("GGally", quietly = TRUE)) {
install.packages("GGally")
}
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.5
## âś” forcats 1.0.0 âś” stringr 1.5.1
## âś” ggplot2 3.5.1 âś” tibble 3.2.1
## âś” lubridate 1.9.4 âś” tidyr 1.3.1
## âś” purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## âś– dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
# Load data
data("cpus")
# Display summary of dataset
summary(cpus)
## name syct mmin mmax
## ADVISOR 32/60 : 1 Min. : 17.0 Min. : 64 Min. : 64
## AMDAHL 470/7A : 1 1st Qu.: 50.0 1st Qu.: 768 1st Qu.: 4000
## AMDAHL 470V/7 : 1 Median : 110.0 Median : 2000 Median : 8000
## AMDAHL 470V/7B: 1 Mean : 203.8 Mean : 2868 Mean :11796
## AMDAHL 470V/7C: 1 3rd Qu.: 225.0 3rd Qu.: 4000 3rd Qu.:16000
## AMDAHL 470V/8 : 1 Max. :1500.0 Max. :32000 Max. :64000
## (Other) :203
## cach chmin chmax perf
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 6.0
## 1st Qu.: 0.00 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 27.0
## Median : 8.00 Median : 2.000 Median : 8.00 Median : 50.0
## Mean : 25.21 Mean : 4.699 Mean : 18.27 Mean : 105.6
## 3rd Qu.: 32.00 3rd Qu.: 6.000 3rd Qu.: 24.00 3rd Qu.: 113.0
## Max. :256.00 Max. :52.000 Max. :176.00 Max. :1150.0
##
## estperf
## Min. : 15.00
## 1st Qu.: 28.00
## Median : 45.00
## Mean : 99.33
## 3rd Qu.: 101.00
## Max. :1238.00
##
# Compute correlation matrix
cor_matrix <- cor(cpus[, -c(1,2)]) # Exclude 'name' and categorical variables if any
cor_matrix
## mmin mmax cach chmin chmax perf estperf
## mmin 1.0000000 0.7581573 0.5347291 0.5171892 0.2669074 0.7949233 0.8192915
## mmax 0.7581573 1.0000000 0.5379898 0.5605134 0.5272462 0.8629942 0.9012024
## cach 0.5347291 0.5379898 1.0000000 0.5822455 0.4878458 0.6626135 0.6486203
## chmin 0.5171892 0.5605134 0.5822455 1.0000000 0.5482812 0.6089025 0.6105802
## chmax 0.2669074 0.5272462 0.4878458 0.5482812 1.0000000 0.6052193 0.5921556
## perf 0.7949233 0.8629942 0.6626135 0.6089025 0.6052193 1.0000000 0.9664687
## estperf 0.8192915 0.9012024 0.6486203 0.6105802 0.5921556 0.9664687 1.0000000
# Visualize relationships using scatterplot matrix
ggpairs(cpus[, -c(1,2)])
Observations:
- The scatterplot matrix and correlation matrix indicate strong
correlations between some variables (e.g., mmin
,
mmax
, and estperf
).
- There is potential non-linearity in some relationships, suggesting the
need for transformations.
# Initial model
lm_full <- lm(estperf ~ . - name, data = cpus)
summary(lm_full)
##
## Call:
## lm(formula = estperf ~ . - name, data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -117.481 -9.549 2.864 15.258 182.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.423e+01 4.732e+00 -7.233 9.71e-12 ***
## syct 3.777e-02 9.434e-03 4.003 8.79e-05 ***
## mmin 5.483e-03 1.120e-03 4.894 2.02e-06 ***
## mmax 3.376e-03 3.974e-04 8.495 4.41e-15 ***
## cach 1.245e-01 7.751e-02 1.606 0.10977
## chmin -1.651e-02 4.523e-01 -0.037 0.97091
## chmax 3.457e-01 1.287e-01 2.686 0.00783 **
## perf 5.770e-01 3.718e-02 15.519 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.7 on 201 degrees of freedom
## Multiple R-squared: 0.9595, Adjusted R-squared: 0.958
## F-statistic: 679.5 on 7 and 201 DF, p-value: < 2.2e-16
# Perform stepwise selection
library(MASS)
lm_step <- stepAIC(lm_full, direction = "both")
## Start: AIC=1452.58
## estperf ~ (name + syct + mmin + mmax + cach + chmin + chmax +
## perf) - name
##
## Df Sum of Sq RSS AIC
## - chmin 1 1 201983 1450.6
## <none> 201981 1452.6
## - cach 1 2593 204574 1453.2
## - chmax 1 7252 209233 1458.0
## - syct 1 16104 218086 1466.6
## - mmin 1 24068 226049 1474.1
## - mmax 1 72509 274491 1514.7
## - perf 1 242027 444009 1615.2
##
## Step: AIC=1450.58
## estperf ~ syct + mmin + mmax + cach + chmax + perf
##
## Df Sum of Sq RSS AIC
## <none> 201983 1450.6
## - cach 1 2743 204726 1451.4
## + chmin 1 1 201981 1452.6
## - chmax 1 7920 209902 1456.6
## - syct 1 16144 218127 1464.7
## - mmin 1 24795 226778 1472.8
## - mmax 1 72679 274661 1512.8
## - perf 1 242172 444155 1613.3
summary(lm_step)
##
## Call:
## lm(formula = estperf ~ syct + mmin + mmax + cach + chmax + perf,
## data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -117.541 -9.553 2.867 15.213 182.172
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.424e+01 4.713e+00 -7.264 8.01e-12 ***
## syct 3.778e-02 9.403e-03 4.018 8.27e-05 ***
## mmin 5.476e-03 1.100e-03 4.980 1.36e-06 ***
## mmax 3.375e-03 3.959e-04 8.526 3.55e-15 ***
## cach 1.238e-01 7.473e-02 1.656 0.09920 .
## chmax 3.443e-01 1.223e-01 2.814 0.00537 **
## perf 5.770e-01 3.708e-02 15.563 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.62 on 202 degrees of freedom
## Multiple R-squared: 0.9595, Adjusted R-squared: 0.9582
## F-statistic: 796.7 on 6 and 202 DF, p-value: < 2.2e-16
Process:
- Stepwise selection (AIC-based) was used to determine the most
significant predictors. The final model includes variables
syct
, mmin
, mmax
,
cach
, chmax
, and perf
.
# Residual plots
par(mfrow = c(2, 2))
plot(lm_step)
Observations:
- The residual vs. fitted plot shows signs of
heteroskedasticity.
- The Q-Q plot indicates some deviation from normality.
- A log transformation of estperf
may improve model
assumptions.
# Log-transforming response variable to address heteroskedasticity
lm_log <- lm(log(estperf) ~ . - name, data = cpus)
summary(lm_log)
##
## Call:
## lm(formula = log(estperf) ~ . - name, data = cpus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78118 -0.10386 0.01431 0.10284 0.74145
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.191e+00 2.834e-02 112.602 < 2e-16 ***
## syct -3.618e-04 5.651e-05 -6.403 1.06e-09 ***
## mmin 4.070e-05 6.710e-06 6.065 6.45e-09 ***
## mmax 6.403e-05 2.380e-06 26.900 < 2e-16 ***
## cach 8.194e-03 4.642e-04 17.652 < 2e-16 ***
## chmin 4.315e-03 2.709e-03 1.593 0.113
## chmax 1.108e-03 7.707e-04 1.438 0.152
## perf -1.825e-03 2.227e-04 -8.197 2.87e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1899 on 201 degrees of freedom
## Multiple R-squared: 0.9603, Adjusted R-squared: 0.9589
## F-statistic: 694.8 on 7 and 201 DF, p-value: < 2.2e-16
# Check residual plots again
par(mfrow = c(2, 2))
plot(lm_log)
Adjustments:
- The log transformation of estperf
improves residual
spread and normality, making the model more reliable
# Check R-squared and Adjusted R-squared
summary(lm_log)$r.squared
## [1] 0.9603119
summary(lm_log)$adj.r.squared
## [1] 0.9589297
Observations:
- The R² value of approximately 0.96 suggests an excellent model
fit.
- Adjusted R² confirms that the model is robust without unnecessary
predictors.
estperf
.0.2
means a 20% increase in
estperf
per unit increase in the predictor.perf
and mmax
have the
strongest influence on estimated performance.library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Compute Variance Inflation Factors (VIF)
vif(lm_log)
## syct mmin mmax cach chmin chmax perf
## 1.247917 3.908761 4.495093 2.052522 1.967203 2.316488 7.400569
Observations:
- VIF values indicate moderate multicollinearity, particularly in
perf
and mmax
.
- No immediate action needed, but removing one of the highly correlated
predictors could be considered.
# Cook's Distance
cooksd <- cooks.distance(lm_log)
plot(cooksd, type = "h", main = "Cook's Distance for Influential Points")
abline(h = 4/length(cpus$estperf), col = "red")
# Leverage values
plot(hatvalues(lm_log), main = "Leverage Values", ylab = "Hat values")
abline(h = 2*mean(hatvalues(lm_log)), col = "red")
Observations:
- A few points show high leverage or Cook’s distance, suggesting
influential observations.
- These should be further examined to determine whether they should be
removed or accounted for in the model.
library(MASS)
library(tidyverse)
library(GGally)
# Load data
data("birthwt")
# Convert categorical variables to factors
birthwt <- birthwt %>%
mutate(low = factor(low), race = factor(race), smoke = factor(smoke),
ptl = factor(ptl), ht = factor(ht), ui = factor(ui), ftv = factor(ftv))
# Summary statistics
summary(birthwt)
## low age lwt race smoke ptl ht ui
## 0:130 Min. :14.00 Min. : 80.0 1:96 0:115 0:159 0:177 0:161
## 1: 59 1st Qu.:19.00 1st Qu.:110.0 2:26 1: 74 1: 24 1: 12 1: 28
## Median :23.00 Median :121.0 3:67 2: 5
## Mean :23.24 Mean :129.8 3: 1
## 3rd Qu.:26.00 3rd Qu.:140.0
## Max. :45.00 Max. :250.0
## ftv bwt
## 0:100 Min. : 709
## 1: 47 1st Qu.:2414
## 2: 30 Median :2977
## 3: 7 Mean :2945
## 4: 4 3rd Qu.:3487
## 6: 1 Max. :4990
# Correlation matrix for numeric variables
cor_matrix <- cor(birthwt %>% select(-low, -race, -smoke, -ptl, -ht, -ui, -ftv))
cor_matrix
## age lwt bwt
## age 1.00000000 0.1800732 0.09031781
## lwt 0.18007315 1.0000000 0.18573328
## bwt 0.09031781 0.1857333 1.00000000
# Visualizing relationships
ggpairs(birthwt, aes(color = low, alpha = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Observations:
- Certain predictors, like smoke
and ht
,
seem to influence low birth weight.
- Categorical variables require bar plots for better visualization.
# Initial logistic regression model
logit_model <- glm(low ~ . - bwt, data = birthwt, family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = low ~ . - bwt, family = binomial, data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.986e-01 1.299e+00 0.692 0.48908
## age -3.889e-02 3.952e-02 -0.984 0.32508
## lwt -1.536e-02 7.483e-03 -2.052 0.04017 *
## race2 1.110e+00 5.509e-01 2.016 0.04382 *
## race3 6.698e-01 4.774e-01 1.403 0.16061
## smoke1 7.073e-01 4.399e-01 1.608 0.10784
## ptl1 1.865e+00 5.722e-01 3.259 0.00112 **
## ptl2 4.875e-01 1.008e+00 0.484 0.62858
## ptl3 -1.553e+01 1.455e+03 -0.011 0.99148
## ht1 1.805e+00 7.488e-01 2.410 0.01595 *
## ui1 7.930e-01 4.780e-01 1.659 0.09712 .
## ftv1 -5.598e-01 4.958e-01 -1.129 0.25879
## ftv2 -8.141e-02 5.447e-01 -0.149 0.88120
## ftv3 1.103e+00 8.648e-01 1.276 0.20213
## ftv4 -9.225e-01 1.384e+00 -0.667 0.50496
## ftv6 -1.291e+01 1.455e+03 -0.009 0.99292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 188.60 on 173 degrees of freedom
## AIC: 220.6
##
## Number of Fisher Scoring iterations: 14
# Stepwise selection
library(MASS)
logit_step <- stepAIC(logit_model, direction = "both")
## Start: AIC=220.6
## low ~ (age + lwt + race + smoke + ptl + ht + ui + ftv + bwt) -
## bwt
##
## Df Deviance AIC
## - ftv 5 192.45 214.45
## - age 1 189.58 219.58
## <none> 188.60 220.60
## - smoke 1 191.20 221.20
## - race 2 193.23 221.23
## - ui 1 191.32 221.32
## - lwt 1 193.13 223.13
## - ht 1 194.72 224.72
## - ptl 3 202.21 228.21
##
## Step: AIC=214.45
## low ~ age + lwt + race + smoke + ptl + ht + ui
##
## Df Deviance AIC
## - age 1 193.59 213.59
## <none> 192.45 214.45
## - ui 1 195.67 215.67
## - race 2 197.91 215.91
## - smoke 1 196.91 216.91
## - lwt 1 198.05 218.05
## - ht 1 199.64 219.64
## - ptl 3 203.95 219.95
## + ftv 5 188.60 220.60
##
## Step: AIC=213.59
## low ~ lwt + race + smoke + ptl + ht + ui
##
## Df Deviance AIC
## <none> 193.59 213.59
## + age 1 192.45 214.45
## - ui 1 197.17 215.17
## - race 2 200.27 216.27
## - smoke 1 198.40 216.40
## - ptl 3 204.22 218.22
## - lwt 1 200.29 218.29
## - ht 1 200.94 218.94
## + ftv 5 189.58 219.58
summary(logit_step)
##
## Call:
## glm(formula = low ~ lwt + race + smoke + ptl + ht + ui, family = binomial,
## data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.030369 0.986202 0.031 0.97543
## lwt -0.017173 0.007121 -2.412 0.01588 *
## race2 1.248872 0.535197 2.333 0.01962 *
## race3 0.796707 0.447359 1.781 0.07493 .
## smoke1 0.885373 0.409389 2.163 0.03057 *
## ptl1 1.457868 0.507406 2.873 0.00406 **
## ptl2 0.273850 0.980762 0.279 0.78007
## ptl3 -14.744564 882.743533 -0.017 0.98667
## ht1 1.898206 0.717535 2.645 0.00816 **
## ui1 0.894205 0.469649 1.904 0.05691 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 193.59 on 179 degrees of freedom
## AIC: 213.59
##
## Number of Fisher Scoring iterations: 13
Process:
- Stepwise selection removes non-significant predictors, improving model interpretability.
table(birthwt$ptl, birthwt$low)
##
## 0 1
## 0 118 41
## 1 8 16
## 2 3 2
## 3 1 0
table(birthwt$ftv, birthwt$low)
##
## 0 1
## 0 64 36
## 1 36 11
## 2 23 7
## 3 3 4
## 4 3 1
## 6 1 0
Observations:
- Sparse data in ptl
and ftv
might affect
regression accuracy.
- The model assumes a linear relationship, which may not be
appropriate.
birthwt <- birthwt %>% mutate(ptl = as.numeric(as.character(ptl)), # Convert factor to numeric
ptl2 = ifelse(ptl >= 2, "2+", as.character(ptl)))
birthwt$ptl2 <- factor(birthwt$ptl2) # Convert new variable to factor
table(birthwt$ptl2, birthwt$low)
##
## 0 1
## 0 118 41
## 1 8 16
## 2+ 4 2
Observations:
- ptl2
collapses multiple categories for better
statistical power.
birthwt <- birthwt %>%
mutate(ftv = as.numeric(as.character(ftv)), # Convert factor to numeric
ftv2 = ifelse(ftv >= 2, "2+", as.character(ftv)))
birthwt$ftv2 <- factor(birthwt$ftv2) # Convert new variable to factor
table(birthwt$ftv2, birthwt$low)
##
## 0 1
## 0 64 36
## 1 36 11
## 2+ 30 12
Observations:
- ftv2
simplifies interpretation and statistical
robustness.
logit_final <- glm(low ~ age + lwt + race + smoke + ht + ui + ptl2 + ftv2,
data = birthwt, family = binomial)
summary(logit_final)
##
## Call:
## glm(formula = low ~ age + lwt + race + smoke + ht + ui + ptl2 +
## ftv2, family = binomial, data = birthwt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.03635 1.26647 0.818 0.41319
## age -0.04079 0.03922 -1.040 0.29832
## lwt -0.01636 0.00721 -2.270 0.02323 *
## race2 1.12251 0.54311 2.067 0.03875 *
## race3 0.69388 0.46950 1.478 0.13943
## smoke1 0.75024 0.43167 1.738 0.08221 .
## ht1 1.90929 0.72963 2.617 0.00888 **
## ui1 0.75203 0.47273 1.591 0.11165
## ptl21 1.71542 0.54301 3.159 0.00158 **
## ptl22+ -0.02002 0.96939 -0.021 0.98352
## ftv21 -0.48603 0.48814 -0.996 0.31941
## ftv22+ 0.11418 0.46233 0.247 0.80494
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.67 on 188 degrees of freedom
## Residual deviance: 192.54 on 177 degrees of freedom
## AIC: 216.54
##
## Number of Fisher Scoring iterations: 4
Observations:
- ptl2
and ftv2
improve model
interpretability.
- The significant predictors remain stable.
if (!requireNamespace("TeachingDemos", quietly = TRUE)) {
install.packages("TeachingDemos")
}
library(MASS)
library(TeachingDemos)
set.seed(1234)
# Split data into training (80%) and test (20%) sets
train_indices <- sample(1:nrow(birthwt), 0.8 * nrow(birthwt))
train_data <- birthwt[train_indices, ]
test_data <- birthwt[-train_indices, ]
# Logistic regression
logit_pred <- predict(logit_final, test_data, type = "response")
logit_class <- ifelse(logit_pred > 0.5, 1, 0)
# LDA model
lda_model <- lda(low ~ age + lwt + race + smoke + ht + ui + ptl2 + ftv2, data = train_data)
lda_pred <- predict(lda_model, test_data)$class
# QDA model
qda_model <- qda(low ~ age + lwt + race + smoke + ht + ui + ptl2 + ftv2, data = train_data)
qda_pred <- predict(qda_model, test_data)$class
# Confusion matrices
logit_cm <- table(logit_class, test_data$low)
lda_cm <- table(lda_pred, test_data$low)
qda_cm <- table(qda_pred, test_data$low)
logit_cm
##
## logit_class 0 1
## 0 27 4
## 1 4 3
lda_cm
##
## lda_pred 0 1
## 0 26 4
## 1 5 3
qda_cm
##
## qda_pred 0 1
## 0 24 4
## 1 7 3
Model Performance:
- Comparing accuracy, sensitivity, and specificity of models.
- Logistic regression provides interpretable coefficients, while LDA and
QDA may offer better classification.
exp(coef(logit_final))
## (Intercept) age lwt race2 race3 smoke1
## 2.8189141 0.9600292 0.9837702 3.0725652 2.0014570 2.1175091
## ht1 ui1 ptl21 ptl22+ ftv21 ftv22+
## 6.7483180 2.1213104 5.5589861 0.9801773 0.6150656 1.1209489
Interpretations:
- Each coefficient represents an odds ratio.
- Example: An odds ratio of 2 for smoke
means smokers have
twice the risk of low birth weight.