library(readr)
# Load the raw data file. No header row is read (col_names = FALSE), so readr
# assigns the default column names X1..X11.
data <- read_csv(file = "breast-cancer-wisconsin.data", col_names = FALSE)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): X7
## dbl (10): X1, X2, X3, X4, X5, X6, X8, X9, X10, X11
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the freshly loaded table.
head(data, n = 6L)
## # A tibble: 6 × 11
##        X1    X2    X3    X4    X5    X6 X7       X8    X9   X10   X11
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1000025     5     1     1     1     2 1         3     1     1     2
## 2 1002945     5     4     4     5     7 10        3     2     1     2
## 3 1015425     3     1     1     1     2 2         3     1     1     2
## 4 1016277     6     8     8     1     3 4         3     7     1     2
## 5 1017023     4     1     1     3     2 1         3     1     1     2
## 6 1017122     8    10    10     8     7 10        9     7     1     4
# Inspect the column types (vapply keeps the result type-stable). X7 is
# character and needs to become numeric to find the percentage of missing data.
vapply(data, class, character(1))
##          X1          X2          X3          X4          X5          X6 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##          X7          X8          X9         X10         X11 
## "character"   "numeric"   "numeric"   "numeric"   "numeric"
# Missing entries are written as "?" in the file. Mark them as NA explicitly
# before converting, so the expected case converts silently and any other
# malformed entry would still surface as a coercion warning.
data$X7[data$X7 %in% "?"] <- NA
data$X7 <- as.numeric(data$X7)
# Is the share of missing values below 5%? Measure it on X7, the column that
# was just converted: dividing the NA count by every cell of the table would
# understate the share by the number of columns.
mean(is.na(data$X7)) * 100 # less than 5%, so we can go ahead with imputation
## [1] 2.288984

1 - Use the mean/mode imputation method to impute values for the missing data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Count the missing values in every column. The "?" entries of X7 already
# became NA when the column was converted to numeric, so no further
# replacement is needed here. across() replaces the deprecated funs().
data %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) # X7 has 16 NA values
## # A tibble: 1 × 11
##      X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11
##   <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1     0     0     0     0     0     0    16     0     0     0     0
# Mean imputation: every missing X7 entry receives the mean of the observed
# X7 values.
data$X7 <- replace(data$X7, is.na(data$X7), mean(data$X7, na.rm = TRUE))

# Sanity check: the number of X7 values that are still missing.
sum(is.na(data[["X7"]]))
## [1] 0

2 - Use regression to impute values for the missing data

# Read a fresh copy for the regression imputation. Declaring "?" as a
# missing-value marker (next to readr's defaults "" and "NA") lets X7 be parsed
# as numeric straight away, so all columns are numeric without a separate
# coercion step and without its warning.
data2 <- read_csv(
  "breast-cancer-wisconsin.data",
  col_names = FALSE,
  na = c("", "NA", "?")
)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rows with an observed X7 form the training set for the regression.
df <- data2 %>% filter(!is.na(X7))

# X1 (id number) and X11 (type of tumor) are left out of the predictors.
# X7 is the response, so it is not repeated on the right-hand side of the
# formula; the fitted coefficients are unaffected because lm() drops a
# response that appears among the predictors.
model <- lm(X7 ~ X2 + X3 + X4 + X5 + X6 + X8 + X9 + X10, data = df)
summary(model)
## 
## Call:
## lm(formula = X7 ~ X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7316 -0.9426 -0.3002  0.6725  8.6998 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.616652   0.194975  -3.163  0.00163 ** 
## X2           0.230156   0.041691   5.521 4.83e-08 ***
## X3          -0.067980   0.076170  -0.892  0.37246    
## X4           0.340442   0.073420   4.637 4.25e-06 ***
## X5           0.339705   0.045919   7.398 4.13e-13 ***
## X6           0.090392   0.062541   1.445  0.14883    
## X8           0.320577   0.059047   5.429 7.91e-08 ***
## X9           0.007293   0.044486   0.164  0.86983    
## X10         -0.075230   0.059331  -1.268  0.20524    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.274 on 674 degrees of freedom
## Multiple R-squared:  0.615,  Adjusted R-squared:  0.6104 
## F-statistic: 134.6 on 8 and 674 DF,  p-value: < 2.2e-16
# Refit with only the predictors that were significant in the full model.

model2 <- lm(formula = X7 ~ X2 + X4 + X5 + X8, data = df)

summary(object = model2)
## 
## Call:
## lm(formula = X7 ~ X2 + X4 + X5 + X8, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8115 -0.9531 -0.3111  0.6678  8.6889 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.53601    0.17514  -3.060   0.0023 ** 
## X2           0.22617    0.04121   5.488 5.75e-08 ***
## X4           0.31729    0.05086   6.239 7.76e-10 ***
## X5           0.33227    0.04431   7.499 2.03e-13 ***
## X8           0.32378    0.05606   5.775 1.17e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.274 on 678 degrees of freedom
## Multiple R-squared:  0.6129, Adjusted R-squared:  0.6107 
## F-statistic: 268.4 on 4 and 678 DF,  p-value: < 2.2e-16
# Replace each missing X7 with the value predicted by the reduced regression
# model. Predictions are computed only for the rows that need them and written
# straight into X7, so no helper column is left behind in data2.
x7_missing <- is.na(data2$X7)
data2$X7[x7_missing] <- predict(model2, newdata = data2[x7_missing, ])

sum(is.na(data2$X7)) # verify that no NA values are there
## [1] 0

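Part 2 (alternative approach): the same regression imputation done with the mice package, which I only discovered after writing the manual version above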

set.seed(2) 

library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Declare "?" as a missing-value marker while reading (next to readr's
# defaults "" and "NA"). Without it X7 is parsed as character and contains no
# NA at all: mice() then logs the column as an event, imputes nothing, and the
# NA check further down passes trivially.
mice_data <- read_csv(
  "breast-cancer-wisconsin.data",
  col_names = FALSE,
  na = c("", "NA", "?")
)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# norm.predict: imputes univariate missing data using the predicted value from
# a linear regression.
# NOTE: the mice() console output recorded below was captured before "?" was
# declared as missing; re-run the script to refresh it.
reg <- mice(mice_data, method = "norm.predict")
## 
##  iter imp variable
##   1   1
##   1   2
##   1   3
##   1   4
##   1   5
##   2   1
##   2   2
##   2   3
##   2   4
##   2   5
##   3   1
##   3   2
##   3   3
##   3   4
##   3   5
##   4   1
##   4   2
##   4   3
##   4   4
##   4   5
##   5   1
##   5   2
##   5   3
##   5   4
##   5   5
## Warning: Number of logged events: 1
reg
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##  X1  X2  X3  X4  X5  X6  X7  X8  X9 X10 X11 
##  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## PredictorMatrix:
##    X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## X1  0  1  1  1  1  1  0  1  1   1   1
## X2  1  0  1  1  1  1  0  1  1   1   1
## X3  1  1  0  1  1  1  0  1  1   1   1
## X4  1  1  1  0  1  1  0  1  1   1   1
## X5  1  1  1  1  0  1  0  1  1   1   1
## X6  1  1  1  1  1  0  0  1  1   1   1
## Number of logged events:  1 
##   it im dep     meth out
## 1  0  0     constant  X7
# Extract the first completed data set from the imputation object.
mice_data <- complete(reg, action = 1L)

# Count the X7 values that are still missing.
sum(is.na(mice_data[["X7"]]))
## [1] 0

3 - Use regression with perturbation to impute values for the missing data

# Declare "?" as a missing-value marker while reading (next to readr's
# defaults "" and "NA"). Without it X7 is parsed as character and contains no
# NA at all, so mice() has nothing to impute in that column.
data3 <- read_csv(
  "breast-cancer-wisconsin.data",
  col_names = FALSE,
  na = c("", "NA", "?")
)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# norm.nob: imputes univariate missing data using linear regression without
# accounting for the uncertainty of the model parameters.
# NOTE: the mice() console output recorded below was captured before "?" was
# declared as missing; re-run the script to refresh it.

p1 <- mice(data3, method = "norm.nob")
## 
##  iter imp variable
##   1   1
##   1   2
##   1   3
##   1   4
##   1   5
##   2   1
##   2   2
##   2   3
##   2   4
##   2   5
##   3   1
##   3   2
##   3   3
##   3   4
##   3   5
##   4   1
##   4   2
##   4   3
##   4   4
##   4   5
##   5   1
##   5   2
##   5   3
##   5   4
##   5   5
## Warning: Number of logged events: 1
p1
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##  X1  X2  X3  X4  X5  X6  X7  X8  X9 X10 X11 
##  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## PredictorMatrix:
##    X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## X1  0  1  1  1  1  1  0  1  1   1   1
## X2  1  0  1  1  1  1  0  1  1   1   1
## X3  1  1  0  1  1  1  0  1  1   1   1
## X4  1  1  1  0  1  1  0  1  1   1   1
## X5  1  1  1  1  0  1  0  1  1   1   1
## X6  1  1  1  1  1  0  0  1  1   1   1
## Number of logged events:  1 
##   it im dep     meth out
## 1  0  0     constant  X7
# Extract the first completed data set from the imputation object.
data3 <- complete(p1, action = 1L)


# Sanity check: the number of X7 values that are still missing.
sum(is.na(data3[["X7"]]))
## [1] 0