Load the Required Library
library(MASS)
Load the Data
# Load the `biopsy` data set shipped with MASS into the workspace, then keep
# it under a descriptive name for the rest of the analysis.
data("biopsy", package = "MASS")
biopsy_data <- biopsy
1. Get the Dimensions of the Data
# Rows and columns of the raw data, before the ID column and the rows with
# missing values are removed.
dim(biopsy_data)
## [1] 699 11
2. Show the Top Ten Rows
# Preview the first ten rows of the raw data, ID column included.
head(biopsy_data, 10)
## ID V1 V2 V3 V4 V5 V6 V7 V8 V9 class
## 1 1000025 5 1 1 1 2 1 3 1 1 benign
## 2 1002945 5 4 4 5 7 10 3 2 1 benign
## 3 1015425 3 1 1 1 2 2 3 1 1 benign
## 4 1016277 6 8 8 1 3 4 3 7 1 benign
## 5 1017023 4 1 1 3 2 1 3 1 1 benign
## 6 1017122 8 10 10 8 7 10 9 7 1 malignant
## 7 1018099 1 1 1 1 2 10 3 1 1 benign
## 8 1018561 2 1 2 1 2 1 3 1 1 benign
## 9 1033078 2 1 1 1 2 1 1 1 5 benign
## 10 1033078 4 2 1 1 2 1 2 1 1 benign
3. Remove the First Column (ID)
# Drop the sample identifier by name rather than by position, so the correct
# column is removed even if the column order changes (and nothing is removed
# if ID is absent). drop = FALSE guarantees the result stays a data frame.
biopsy_cleaned <- biopsy_data[, names(biopsy_data) != "ID", drop = FALSE]
4. Describe the First Nine Columns (the output also lists `class`)
# Show the type of every remaining column: V1-V9 are integer scores and
# class is a two-level factor.
str(biopsy_cleaned)
## 'data.frame': 699 obs. of 10 variables:
## $ V1 : int 5 5 3 6 4 8 1 2 2 4 ...
## $ V2 : int 1 4 1 8 1 10 1 1 1 2 ...
## $ V3 : int 1 4 1 8 1 10 1 2 1 1 ...
## $ V4 : int 1 5 1 1 3 8 1 1 1 1 ...
## $ V5 : int 2 7 2 3 2 7 2 2 2 2 ...
## $ V6 : int 1 10 2 4 1 10 10 1 1 1 ...
## $ V7 : int 3 3 3 3 3 9 3 3 1 2 ...
## $ V8 : int 1 2 1 7 1 7 1 1 1 1 ...
## $ V9 : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
5. Obtain Summary Statistics
# Quartiles and mean of each score plus the class counts; an "NA's" entry in
# the output marks a column that contains missing values.
summary(biopsy_cleaned)
## V1 V2 V3 V4
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.418 Mean : 3.134 Mean : 3.207 Mean : 2.807
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## V5 V6 V7 V8
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.000
## Mean : 3.216 Mean : 3.545 Mean : 3.438 Mean : 2.867
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :16
## V9 class
## Min. : 1.000 benign :458
## 1st Qu.: 1.000 malignant:241
## Median : 1.000
## Mean : 1.589
## 3rd Qu.: 1.000
## Max. :10.000
##
6. Identify and Remove Missing Observations
# na.omit() drops every row that contains at least one NA; dim() then shows
# how many complete rows remain.
biopsy_cleaned <- na.omit(biopsy_cleaned)
dim(biopsy_cleaned)
## [1] 683 10
Check for Missing Values
# Count the NAs left in each column; every count should be zero at this point.
colSums(is.na(biopsy_cleaned))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 class
## 0 0 0 0 0 0 0 0 0 0
# NOTE: this lists the columns of the original data frame (biopsy_data),
# which still includes ID; biopsy_cleaned no longer has that column.
colnames(biopsy_data)
## [1] "ID" "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8"
## [10] "V9" "class"
Ensure the `class` Column Exists and Convert It to Binary (malignant = 1, benign = 0)
# Class counts among the complete rows, before the labels are recoded to 0/1.
table(biopsy_cleaned$class)
##
## benign malignant
## 444 239
# Recode the outcome as a 0/1 factor (malignant = 1, benign = 0) for glm().
# Fail fast if the column is absent instead of erroring later in the model.
if (!("class" %in% colnames(biopsy_cleaned))) {
  stop("Class column is missing from the dataset!")
}
# Explicit levels keep both codes present even when the data contain only one
# class; as.factor() would silently return a single-level factor in that case.
biopsy_cleaned$class <- factor(
  as.integer(biopsy_cleaned$class == "malignant"),
  levels = c(0L, 1L)
)
7. Summary Statistics of Cleaned Data
# Repeat the summary on the complete rows; class is now reported as counts
# of the 0/1 codes.
summary(biopsy_cleaned)
## V1 V2 V3 V4
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.00
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.00
## Mean : 4.442 Mean : 3.151 Mean : 3.215 Mean : 2.83
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## V5 V6 V7 V8
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.00
## Mean : 3.234 Mean : 3.545 Mean : 3.445 Mean : 2.87
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## V9 class
## Min. : 1.000 0:444
## 1st Qu.: 1.000 1:239
## Median : 1.000
## Mean : 1.603
## 3rd Qu.: 1.000
## Max. :10.000
8. Fit Logistic Regression Model
# Logistic regression of class on every other column (the `.` in the
# formula). family = binomial uses the logit link; with a factor response the
# first level ("0") is treated as failure, so the model describes class "1".
logistic_model <- glm(class ~ ., data = biopsy_cleaned, family = binomial)
8. Prediction Model Summary
# Coefficient table with z-tests, followed by the null and residual deviance
# and the AIC of the fitted model.
summary(logistic_model)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = biopsy_cleaned)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.10394 1.17488 -8.600 < 2e-16 ***
## V1 0.53501 0.14202 3.767 0.000165 ***
## V2 -0.00628 0.20908 -0.030 0.976039
## V3 0.32271 0.23060 1.399 0.161688
## V4 0.33064 0.12345 2.678 0.007400 **
## V5 0.09663 0.15659 0.617 0.537159
## V6 0.38303 0.09384 4.082 4.47e-05 ***
## V7 0.44719 0.17138 2.609 0.009073 **
## V8 0.21303 0.11287 1.887 0.059115 .
## V9 0.53484 0.32877 1.627 0.103788
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 884.35 on 682 degrees of freedom
## Residual deviance: 102.89 on 673 degrees of freedom
## AIC: 122.89
##
## Number of Fisher Scoring iterations: 8
8. Check Goodness-of-Fit
# Sequential analysis of deviance with chi-squared tests: each row tests the
# term added after the ones listed above it, so the p-values depend on the
# order of the predictors in the formula.
anova(logistic_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: class
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 682 884.35
## V1 1 425.87 681 458.48 < 2.2e-16 ***
## V2 1 261.91 680 196.58 < 2.2e-16 ***
## V3 1 20.08 679 176.50 7.427e-06 ***
## V4 1 21.39 678 155.11 3.750e-06 ***
## V5 1 6.45 677 148.66 0.01111 *
## V6 1 28.97 676 119.69 7.348e-08 ***
## V7 1 9.21 675 110.48 0.00241 **
## V8 1 3.87 674 106.61 0.04906 *
## V9 1 3.72 673 102.89 0.05378 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
8. Identify Significant Predictors
# `significant_predictors` holds the full coefficient table (estimate,
# standard error, z value, p-value); the expression below prints only the rows
# significant at the 5% level. The intercept row is included whenever its own
# p-value passes the cut-off.
significant_predictors <- summary(logistic_model)$coefficients
# Select the p-value column by name instead of position, and use drop = FALSE
# so that a single matching row still prints as a labelled matrix.
significant_predictors[
  significant_predictors[, "Pr(>|z|)"] < 0.05,
  ,
  drop = FALSE
]
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.1039422 1.17487744 -8.599997 7.971831e-18
## V1 0.5350141 0.14201743 3.767242 1.650608e-04
## V4 0.3306369 0.12345089 2.678287 7.399977e-03
## V6 0.3830246 0.09384327 4.081535 4.473930e-05
## V7 0.4471879 0.17138238 2.609299 9.072785e-03
9. Report Odds Ratios and Confidence Intervals
# Exponentiate the logit-scale coefficients and their confidence limits to
# express them as odds ratios.
odds_ratios <- exp(coef(logistic_model))
# confint() profiles the likelihood for this model, which is why a progress
# message is printed.
conf_intervals <- exp(confint(logistic_model))
## Waiting for profiling to be done...
# Print the odds ratio for the intercept and for each predictor.
odds_ratios
## (Intercept) V1 V2 V3 V4 V5
## 4.091793e-05 1.707472e+00 9.937400e-01 1.380860e+00 1.391854e+00 1.101459e+00
## V6 V7 V8 V9
## 1.466714e+00 1.563908e+00 1.237423e+00 1.707168e+00
# Print the 2.5% and 97.5% confidence limits on the odds-ratio scale.
conf_intervals
## 2.5 % 97.5 %
## (Intercept) 2.878802e-06 0.0003081064
## V1 1.315481e+00 2.3111005101
## V2 6.737927e-01 1.5494825697
## V3 8.622282e-01 2.1562448344
## V4 1.097470e+00 1.7984446648
## V5 8.050104e-01 1.4990926079
## V6 1.229862e+00 1.7836925585
## V7 1.131130e+00 2.2251645818
## V8 9.981573e-01 1.5613578633
## V9 9.933376e-01 3.0237053763
10. Obtain Confusion Matrix
# Fitted probabilities of class 1 for the same rows the model was trained on
# (no newdata is supplied, so this is an in-sample evaluation).
predicted_probs <- predict(logistic_model, type = "response")
# Label an observation 1 when its fitted probability exceeds 0.5, else 0.
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
# Fix the predicted levels so the confusion matrix is always 2 x 2, even if
# the model happens to predict a single class for every row.
table(
  Predicted = factor(predicted_classes, levels = c(0, 1)),
  Actual = biopsy_cleaned$class
)
## Actual
## Predicted 0 1
## 0 434 11
## 1 10 228
11. Calculate Misclassification Rate
# Share of observations whose predicted label differs from the observed one.
# The factor labels ("0"/"1") are converted back to numbers explicitly rather
# than relying on the implicit numeric-vs-factor comparison, which would
# silently report a rate of 1 if the labels were ever not "0"/"1".
misclassification_rate <- mean(
  predicted_classes != as.numeric(as.character(biopsy_cleaned$class))
)
misclassification_rate
## [1] 0.03074671