WK6Assignment

Question 14.1 — Breast Cancer Missing Data Imputation

Load the data

# Read the comma-separated data file. It has no header row, and "?" entries
# are converted to NA on import.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this on another computer.
breastcancer <- read.table(
  file = "C:/Users/moham/Downloads/Georgia Tech University/WK6/week 6 Homework-Summer/week 6 data-summer/data 14.1/breast-cancer-wisconsin.txt.txt",
  sep = ",",
  na.strings = "?",
  header = FALSE
)

# Preview the first rows to check how the file was parsed into columns.
head(breastcancer)

##        V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025  5  1  1  1  2  1  3  1   1   2
## 2 1002945  5  4  4  5  7 10  3  2   1   2
## 3 1015425  3  1  1  1  2  2  3  1   1   2
## 4 1016277  6  8  8  1  3  4  3  7   1   2
## 5 1017023  4  1  1  3  2  1  3  1   1   2
## 6 1017122  8 10 10  8  7 10  9  7   1   4

# Report the number of rows and columns in the data frame.
dim(breastcancer)

## [1] 699  11

# Show each column's storage type together with a sample of its values.
str(breastcancer)

## 'data.frame':    699 obs. of  11 variables:
##  $ V1 : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ V2 : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ V3 : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ V4 : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ V5 : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ V6 : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ V7 : int  1 10 2 4 1 10 10 1 1 1 ...
##  $ V8 : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ V9 : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ V10: int  1 1 1 1 1 1 1 1 5 1 ...
##  $ V11: int  2 2 2 2 2 4 2 2 2 2 ...

# Per-column summary statistics; columns that contain missing values also get
# an "NA's" count in this output.
summary(breastcancer)

##        V1                 V2               V3               V4        
##  Min.   :   61634   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:  870689   1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 1171710   Median : 4.000   Median : 1.000   Median : 1.000  
##  Mean   : 1071704   Mean   : 4.418   Mean   : 3.134   Mean   : 3.207  
##  3rd Qu.: 1238298   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000  
##  Max.   :13454352   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                                                       
##        V5               V6               V7               V8        
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000  
##  Median : 1.000   Median : 2.000   Median : 1.000   Median : 3.000  
##  Mean   : 2.807   Mean   : 3.216   Mean   : 3.545   Mean   : 3.438  
##  3rd Qu.: 4.000   3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                    NA's   :16                       
##        V9              V10              V11      
##  Min.   : 1.000   Min.   : 1.000   Min.   :2.00  
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:2.00  
##  Median : 1.000   Median : 1.000   Median :2.00  
##  Mean   : 2.867   Mean   : 1.589   Mean   :2.69  
##  3rd Qu.: 4.000   3rd Qu.: 1.000   3rd Qu.:4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :4.00  
##

Check for missing values

# TRUE when at least one value anywhere in the data frame is NA.
anyNA(breastcancer)

## [1] TRUE

# Count the missing values in each column.
colSums(is.na(breastcancer))

##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 
##   0   0   0   0   0   0  16   0   0   0   0

# Total number of missing values across the whole data frame.
sum(is.na(breastcancer))

## [1] 16

There are 16 missing values, all in a single column (V7).

Assign column names

# Replace the default V1..V11 column labels with descriptive names.
# The names are matched to the columns by position.
names(breastcancer) <- c(
  "Sample_code_number", "Clump_Thickness",
  "Uniformity_of_Cell_Size", "Uniformity_of_Cell_Shape",
  "Marginal_Adhesion", "Single_Epithelial_Cell_Size",
  "Bare_Nuclei", "Bland_Chromatin",
  "Normal_Nucleoli", "Mitoses",
  "Class"
)

# Count the missing values in each column, reported under the current column names.
colSums(is.na(breastcancer))

##          Sample_code_number             Clump_Thickness 
##                           0                           0 
##     Uniformity_of_Cell_Size    Uniformity_of_Cell_Shape 
##                           0                           0 
##           Marginal_Adhesion Single_Epithelial_Cell_Size 
##                           0                           0 
##                 Bare_Nuclei             Bland_Chromatin 
##                          16                           0 
##             Normal_Nucleoli                     Mitoses 
##                           0                           0 
##                       Class 
##                           0

Missing values are confirmed to be only in Bare_Nuclei.

Method 1: Mean Imputation

# Mean of the observed Bare_Nuclei values; NAs are excluded from the average.
mean_bare_nuclei <- mean(breastcancer$Bare_Nuclei, na.rm = TRUE)

# Work on a copy so the original data frame keeps its NAs for the other methods.
breastcancer_mean <- breastcancer
breastcancer_mean$Bare_Nuclei <- replace(
  breastcancer_mean$Bare_Nuclei,
  is.na(breastcancer_mean$Bare_Nuclei),
  mean_bare_nuclei
)

# Count the NAs left in the imputed column.
sum(is.na(breastcancer_mean$Bare_Nuclei))

## [1] 0

# Display the mean that was substituted for the missing Bare_Nuclei entries.
mean_bare_nuclei

## [1] 3.544656

Method 2: Regression Imputation

# Work on a copy so the original data frame keeps its NAs.
breastcancer_reg <- breastcancer

# Split the rows by whether Bare_Nuclei is missing: rows with NA are the ones
# to impute, the remaining rows are used to fit the model.
missing_rows  <- subset(breastcancer_reg, is.na(Bare_Nuclei))
complete_rows <- subset(breastcancer_reg, !is.na(Bare_Nuclei))

# Linear model for Bare_Nuclei on the other eight measurement columns.
# Sample_code_number and Class are not used as predictors.
reg_model <- lm(
  Bare_Nuclei ~ Clump_Thickness +
    Uniformity_of_Cell_Size +
    Uniformity_of_Cell_Shape +
    Marginal_Adhesion +
    Single_Epithelial_Cell_Size +
    Bland_Chromatin +
    Normal_Nucleoli +
    Mitoses,
  data = complete_rows
)

# Coefficient table and fit statistics for the imputation model.
summary(reg_model)

## 
## Call:
## lm(formula = Bare_Nuclei ~ Clump_Thickness + Uniformity_of_Cell_Size + 
##     Uniformity_of_Cell_Shape + Marginal_Adhesion + Single_Epithelial_Cell_Size + 
##     Bland_Chromatin + Normal_Nucleoli + Mitoses, data = complete_rows)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7316 -0.9426 -0.3002  0.6725  8.6998 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -0.616652   0.194975  -3.163  0.00163 ** 
## Clump_Thickness              0.230156   0.041691   5.521 4.83e-08 ***
## Uniformity_of_Cell_Size     -0.067980   0.076170  -0.892  0.37246    
## Uniformity_of_Cell_Shape     0.340442   0.073420   4.637 4.25e-06 ***
## Marginal_Adhesion            0.339705   0.045919   7.398 4.13e-13 ***
## Single_Epithelial_Cell_Size  0.090392   0.062541   1.445  0.14883    
## Bland_Chromatin              0.320577   0.059047   5.429 7.91e-08 ***
## Normal_Nucleoli              0.007293   0.044486   0.164  0.86983    
## Mitoses                     -0.075230   0.059331  -1.268  0.20524    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.274 on 674 degrees of freedom
## Multiple R-squared:  0.615,  Adjusted R-squared:  0.6104 
## F-statistic: 134.6 on 8 and 674 DF,  p-value: < 2.2e-16

# Predict Bare_Nuclei for the rows where it is missing, then show the raw
# (unrounded) predictions; the vector is named by the original row labels.
predicted_values <- predict(object = reg_model, newdata = missing_rows)
print(predicted_values)

##        24        41       140       146       159       165       236       250 
## 5.3660666 8.2259101 0.8892805 1.6605574 1.0899300 2.2208736 2.7818889 1.7605617 
##       276       293       295       298       316       322       412       618 
## 2.1208694 5.8459477 0.9796727 2.3918282 5.5419942 1.7605617 0.8892805 0.5687034

# Round the predictions to whole numbers, then clamp them to the range 1..10.
predicted_values_clamped <- pmin(pmax(round(predicted_values), 1), 10)

# Fill the missing entries with the clamped predictions; the values line up
# positionally with the NA rows, in row order.
breastcancer_reg$Bare_Nuclei <- replace(
  breastcancer_reg$Bare_Nuclei,
  is.na(breastcancer_reg$Bare_Nuclei),
  predicted_values_clamped
)

# Count the NAs left in the imputed column.
sum(is.na(breastcancer_reg$Bare_Nuclei))

## [1] 0

Method 3: Regression Imputation with Perturbation

# Work on a copy so the original data frame keeps its NAs.
breastcancer_pert <- breastcancer

# The residual standard error of the fitted model sets the scale of the noise.
residual_sd <- summary(reg_model)[["sigma"]]
predicted_values_pert <- predict(object = reg_model, newdata = missing_rows)

# Fixed seed so the random perturbation is reproducible.
set.seed(42)
noise <- rnorm(n = length(predicted_values_pert), sd = residual_sd)

# Add the noise, round to whole numbers, and clamp to the range 1..10.
predicted_values_pert_noisy <- pmin(
  pmax(round(predicted_values_pert + noise), 1),
  10
)

# Fill the missing entries with the perturbed predictions (positional match
# with the NA rows, in row order).
breastcancer_pert$Bare_Nuclei <- replace(
  breastcancer_pert$Bare_Nuclei,
  is.na(breastcancer_pert$Bare_Nuclei),
  predicted_values_pert_noisy
)

# Count the NAs left in the imputed column.
sum(is.na(breastcancer_pert$Bare_Nuclei))

## [1] 0

All three imputed datasets (breastcancer_mean, breastcancer_reg, breastcancer_pert) now have zero missing values in Bare_Nuclei.