Logistic Regression on Biopsy Data

Load the Required Library

library(MASS)

Load the Data

# Load the `biopsy` data frame that ships with MASS into the workspace, then
# keep a working copy under the name used throughout the analysis.
data(list = "biopsy", package = "MASS")
biopsy_data <- biopsy

1. Get the Dimensions of the Data

# Report the number of rows and columns in the raw data.
dim(biopsy_data)

## [1] 699  11

2. Show the Top Ten Rows

# Preview the first ten observations of the raw data (ID column included).
head(biopsy_data, 10)

##         ID V1 V2 V3 V4 V5 V6 V7 V8 V9     class
## 1  1000025  5  1  1  1  2  1  3  1  1    benign
## 2  1002945  5  4  4  5  7 10  3  2  1    benign
## 3  1015425  3  1  1  1  2  2  3  1  1    benign
## 4  1016277  6  8  8  1  3  4  3  7  1    benign
## 5  1017023  4  1  1  3  2  1  3  1  1    benign
## 6  1017122  8 10 10  8  7 10  9  7  1 malignant
## 7  1018099  1  1  1  1  2 10  3  1  1    benign
## 8  1018561  2  1  2  1  2  1  3  1  1    benign
## 9  1033078  2  1  1  1  2  1  1  1  5    benign
## 10 1033078  4  2  1  1  2  1  2  1  1    benign

3. Remove the First Column (ID)

# Drop the sample identifier: it is a label, not a predictor, and must not
# enter the model formula `class ~ .` used later. The column is selected by
# name rather than by position (-1) so the right column is removed even if
# the column order changes; drop = FALSE guarantees a data frame is returned.
biopsy_cleaned <- biopsy_data[, names(biopsy_data) != "ID", drop = FALSE]

4. Describe the First Nine Columns (V1–V9) and the Class Label

# Show the storage type of every column that is left: the integer-valued
# predictors V1-V9 and the two-level factor `class`.
str(biopsy_cleaned)

## 'data.frame':    699 obs. of  10 variables:
##  $ V1   : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ V2   : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ V3   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ V4   : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ V5   : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ V6   : int  1 10 2 4 1 10 10 1 1 1 ...
##  $ V7   : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ V8   : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ V9   : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...

5. Obtain Summary Statistics

# Per-column summaries before cleaning; a column containing missing values
# reports its NA count in the last row of its summary.
summary(biopsy_cleaned)

##        V1               V2               V3               V4        
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.000  
##  Mean   : 4.418   Mean   : 3.134   Mean   : 3.207   Mean   : 2.807  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                                                     
##        V5               V6               V7               V8        
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.216   Mean   : 3.545   Mean   : 3.438   Mean   : 2.867  
##  3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                   NA's   :16                                        
##        V9               class    
##  Min.   : 1.000   benign   :458  
##  1st Qu.: 1.000   malignant:241  
##  Median : 1.000                  
##  Mean   : 1.589                  
##  3rd Qu.: 1.000                  
##  Max.   :10.000                  
##

6. Identify and Remove Missing Observations

# Drop every row that contains at least one NA, then confirm the new size.
biopsy_cleaned <- na.omit(biopsy_cleaned)
dim(biopsy_cleaned)

## [1] 683  10

Verify That No Missing Values Remain

# Count the NAs left in each column; every count should be zero once
# incomplete rows have been removed.
colSums(is.na(biopsy_cleaned))

##    V1    V2    V3    V4    V5    V6    V7    V8    V9 class 
##     0     0     0     0     0     0     0     0     0     0

# NOTE(review): this lists the columns of the raw `biopsy_data` (ID included),
# not of `biopsy_cleaned` -- confirm which data frame was meant.
colnames(biopsy_data)

##  [1] "ID"    "V1"    "V2"    "V3"    "V4"    "V5"    "V6"    "V7"    "V8"   
## [10] "V9"    "class"

Ensure the ‘class’ Column Exists and Convert It to Binary (1 = malignant, 0 = benign)

# Tabulate how many observations fall into each level of `class`.
table(biopsy_cleaned$class)

## 
##    benign malignant 
##       444       239

# Fail fast if the response column is absent; nothing below can work
# without it.
if (!("class" %in% colnames(biopsy_cleaned))) {
  stop("Class column is missing from the dataset!")
}

# Recode the response as a factor with 1 = malignant and 0 = everything else.
biopsy_cleaned$class <- as.factor(
  as.numeric(biopsy_cleaned$class == "malignant")
)

7. Summary Statistics of Cleaned Data

# Per-column summaries of the data frame that will be passed to the model.
summary(biopsy_cleaned)

##        V1               V2               V3               V4       
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.00  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.00  
##  Mean   : 4.442   Mean   : 3.151   Mean   : 3.215   Mean   : 2.83  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.00  
##        V5               V6               V7               V8       
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.00  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.00  
##  Mean   : 3.234   Mean   : 3.545   Mean   : 3.445   Mean   : 2.87  
##  3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.00  
##        V9         class  
##  Min.   : 1.000   0:444  
##  1st Qu.: 1.000   1:239  
##  Median : 1.000          
##  Mean   : 1.603          
##  3rd Qu.: 1.000          
##  Max.   :10.000

8. Fit Logistic Regression Model

# Regress `class` on every other column of `biopsy_cleaned` using a binomial
# GLM (the binomial family's default link is the logit).
logistic_model <- glm(class ~ ., data = biopsy_cleaned, family = binomial)

8. Prediction Model Summary

# Coefficient table on the log-odds scale (estimate, standard error, z value,
# p-value), followed by the null/residual deviances and the AIC.
summary(logistic_model)

## 
## Call:
## glm(formula = class ~ ., family = binomial, data = biopsy_cleaned)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.10394    1.17488  -8.600  < 2e-16 ***
## V1            0.53501    0.14202   3.767 0.000165 ***
## V2           -0.00628    0.20908  -0.030 0.976039    
## V3            0.32271    0.23060   1.399 0.161688    
## V4            0.33064    0.12345   2.678 0.007400 ** 
## V5            0.09663    0.15659   0.617 0.537159    
## V6            0.38303    0.09384   4.082 4.47e-05 ***
## V7            0.44719    0.17138   2.609 0.009073 ** 
## V8            0.21303    0.11287   1.887 0.059115 .  
## V9            0.53484    0.32877   1.627 0.103788    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 884.35  on 682  degrees of freedom
## Residual deviance: 102.89  on 673  degrees of freedom
## AIC: 122.89
## 
## Number of Fisher Scoring iterations: 8

8. Check Goodness-of-Fit (Sequential Analysis of Deviance)

# Sequential analysis of deviance with chi-squared tests: each row tests the
# term added after the ones above it, so the p-values depend on term order.
anova(logistic_model, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: class
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   682     884.35              
## V1    1   425.87       681     458.48 < 2.2e-16 ***
## V2    1   261.91       680     196.58 < 2.2e-16 ***
## V3    1    20.08       679     176.50 7.427e-06 ***
## V4    1    21.39       678     155.11 3.750e-06 ***
## V5    1     6.45       677     148.66   0.01111 *  
## V6    1    28.97       676     119.69 7.348e-08 ***
## V7    1     9.21       675     110.48   0.00241 ** 
## V8    1     3.87       674     106.61   0.04906 *  
## V9    1     3.72       673     102.89   0.05378 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8. Identify Significant Predictors

# Full coefficient table of the fitted model (one row per term).
significant_predictors <- summary(logistic_model)$coefficients

# Show the rows whose p-value is below 0.05. The p-value column is addressed
# by name instead of by position, and drop = FALSE keeps the result a matrix
# even when exactly one row qualifies (plain `[` would otherwise collapse it
# to a vector and lose the row label). The intercept row is shown too
# whenever it passes the threshold.
p_values <- significant_predictors[, "Pr(>|z|)"]
significant_predictors[p_values < 0.05, , drop = FALSE]

##                Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -10.1039422 1.17487744 -8.599997 7.971831e-18
## V1            0.5350141 0.14201743  3.767242 1.650608e-04
## V4            0.3306369 0.12345089  2.678287 7.399977e-03
## V6            0.3830246 0.09384327  4.081535 4.473930e-05
## V7            0.4471879 0.17138238  2.609299 9.072785e-03

9. Report Odds Ratios and Confidence Intervals

# Exponentiate the log-odds coefficients to obtain odds ratios, and apply the
# same transformation to the confint() limits so both are on the same scale.
# confint() is what emits the "Waiting for profiling to be done..." message.
odds_ratios <- exp(coef(logistic_model))
conf_intervals <- exp(confint(logistic_model))

## Waiting for profiling to be done...

# Print the exponentiated coefficients (one odds ratio per model term).
odds_ratios

##  (Intercept)           V1           V2           V3           V4           V5 
## 4.091793e-05 1.707472e+00 9.937400e-01 1.380860e+00 1.391854e+00 1.101459e+00 
##           V6           V7           V8           V9 
## 1.466714e+00 1.563908e+00 1.237423e+00 1.707168e+00

# Print the exponentiated lower and upper confidence limits for each term.
conf_intervals

##                    2.5 %       97.5 %
## (Intercept) 2.878802e-06 0.0003081064
## V1          1.315481e+00 2.3111005101
## V2          6.737927e-01 1.5494825697
## V3          8.622282e-01 2.1562448344
## V4          1.097470e+00 1.7984446648
## V5          8.050104e-01 1.4990926079
## V6          1.229862e+00 1.7836925585
## V7          1.131130e+00 2.2251645818
## V8          9.981573e-01 1.5613578633
## V9          9.933376e-01 3.0237053763

10. Obtain Confusion Matrix

# Fitted probabilities for the observations the model was trained on
# (no `newdata` is supplied, so predict() uses the fitting data).
predicted_probs <- predict(logistic_model, type = "response")

# Classify at the 0.5 cut-off. The result is stored as a factor with both
# levels fixed up front, so the confusion matrix always has a row for 0 and a
# row for 1 (even if one class is never predicted) and so the predictions have
# the same type as the 0/1 factor `biopsy_cleaned$class`.
predicted_classes <- factor(
  as.integer(predicted_probs > 0.5),
  levels = c(0, 1)
)

# Rows are predictions, columns are the observed classes.
table(Predicted = predicted_classes, Actual = biopsy_cleaned$class)

##          Actual
## Predicted   0   1
##         0 434  11
##         1  10 228

11. Calculate Misclassification Rate

# Share of observations whose predicted label differs from the observed one.
# Both sides are compared as character labels, which is the comparison R
# performs implicitly when a factor is tested against a non-factor.
misclassification_rate <- mean(
  as.character(predicted_classes) != as.character(biopsy_cleaned$class)
)
misclassification_rate

## [1] 0.03074671

Logistic Regression on Biopsy Data

Grandhe Venkata Guna Sundhar – M16552512

2025-02-18