library(readr)
# Load the raw file; it has no header row, so readr assigns the default
# column names X1..X11.
data <- read_csv(file = "breast-cancer-wisconsin.data", col_names = FALSE)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X7
## dbl (10): X1, X2, X3, X4, X5, X6, X8, X9, X10, X11
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the freshly loaded table
head(data, n = 6L)
## # A tibble: 6 × 11
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1000025 5 1 1 1 2 1 3 1 1 2
## 2 1002945 5 4 4 5 7 10 3 2 1 2
## 3 1015425 3 1 1 1 2 2 3 1 1 2
## 4 1016277 6 8 8 1 3 4 3 7 1 2
## 5 1017023 4 1 1 3 2 1 3 1 1 2
## 6 1017122 8 10 10 8 7 10 9 7 1 4
# Report each column's class as a named character vector. vapply() pins the
# result type; sapply() would silently return a list if any column carried more
# than one class. X7 comes back as character and has to be converted to numeric
# before the share of missing values can be computed.
vapply(data, function(col) paste(class(col), collapse = "/"), character(1))
## X1 X2 X3 X4 X5 X6
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## X7 X8 X9 X10 X11
## "character" "numeric" "numeric" "numeric" "numeric"
# X7 was parsed as character because missing entries are written as "?".
# Turn that marker into NA explicitly before converting, so the expected case is
# silent and as.numeric() only warns if some other, unexpected non-numeric
# token is present.
data$X7 <- as.numeric(replace(data$X7, data$X7 == "?", NA))
# Is the share of missing values below 5%?
# Measure it on X7 itself (the only column with NAs): dividing by every cell of
# the table, including the ID and class columns, would understate the rate.
mean(is.na(data$X7)) * 100 # less than 5%, so we can go ahead with imputation
## [1] 2.288984
1 - Use the mean/mode imputation method to impute values for the
missing data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.9
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# The "?" markers in X7 already became NA when the column was converted to
# numeric, so no further replacement is needed before counting.
# Count the NAs per column; across() replaces the deprecated funs() helper.
data %>% summarise(across(everything(), ~ sum(is.na(.x)))) # X7 has 16 NA values
## # A tibble: 1 × 11
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 16 0 0 0 0
# Mean imputation: fill every missing X7 entry with the mean of the observed ones
data$X7 <- replace(data$X7, is.na(data$X7), mean(data$X7, na.rm = TRUE))
sum(is.na(data[["X7"]])) # confirm that no NA values remain
## [1] 0
2 - Use regression to impute values for the missing data
# Start the regression approach from a fresh copy of the raw file
data2 <- read_csv(file = "breast-cancer-wisconsin.data", col_names = FALSE)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X7
## dbl (10): X1, X2, X3, X4, X5, X6, X8, X9, X10, X11
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Make the data numeric: only X7 is read as character (because of its "?"
# markers), so convert that column alone. Mapping "?" to NA first keeps the
# conversion warning-free and avoids the superseded mutate_all() round trip
# through character for columns that are already numeric.
data2 <- data2 %>% mutate(X7 = as.numeric(na_if(X7, "?")))
# Rows where X7 is observed; these are the training data for the imputation model
df <- data2 %>% filter(!is.na(X7))
# Skip X1 and X11 because they are the ID number and the type of tumor.
# X7 is the response, so it must not also be listed as a predictor: lm() only
# dropped it again with a warning, so the fitted coefficients are unchanged.
model <- lm(X7 ~ X2 + X3 + X4 + X5 + X6 + X8 + X9 + X10, data = df)
summary(model)
##
## Call:
## lm(formula = X7 ~ X2 + X3 + X4 + X5 + X6 + X8 + X9 + X10, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7316 -0.9426 -0.3002 0.6725 8.6998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.616652 0.194975 -3.163 0.00163 **
## X2 0.230156 0.041691 5.521 4.83e-08 ***
## X3 -0.067980 0.076170 -0.892 0.37246
## X4 0.340442 0.073420 4.637 4.25e-06 ***
## X5 0.339705 0.045919 7.398 4.13e-13 ***
## X6 0.090392 0.062541 1.445 0.14883
## X8 0.320577 0.059047 5.429 7.91e-08 ***
## X9 0.007293 0.044486 0.164 0.86983
## X10 -0.075230 0.059331 -1.268 0.20524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.274 on 674 degrees of freedom
## Multiple R-squared: 0.615, Adjusted R-squared: 0.6104
## F-statistic: 134.6 on 8 and 674 DF, p-value: < 2.2e-16
# Refit using only the predictors that were significant in the full model
model2 <- lm(formula = X7 ~ X2 + X4 + X5 + X8, data = df)
summary(object = model2)
##
## Call:
## lm(formula = X7 ~ X2 + X4 + X5 + X8, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8115 -0.9531 -0.3111 0.6678 8.6889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.53601 0.17514 -3.060 0.0023 **
## X2 0.22617 0.04121 5.488 5.75e-08 ***
## X4 0.31729 0.05086 6.239 7.76e-10 ***
## X5 0.33227 0.04431 7.499 2.03e-13 ***
## X8 0.32378 0.05606 5.775 1.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.274 on 678 degrees of freedom
## Multiple R-squared: 0.6129, Adjusted R-squared: 0.6107
## F-statistic: 268.4 on 4 and 678 DF, p-value: < 2.2e-16
# Replace each NA in X7 with the value model2 predicts for that row.
# coalesce() keeps observed values and only falls back to the prediction where
# X7 is missing, without leaving a helper `pred` column behind in data2.
# NOTE(review): predictions are used as-is (not rounded or clamped) — confirm
# that is acceptable for later use of X7.
data2 <- data2 %>%
  mutate(X7 = coalesce(X7, unname(predict(model2, newdata = .))))
sum(is.na(data2$X7)) # verify that no NA values are there
## [1] 0
Part 2 again, this time using the mice package (which I only discovered after writing the manual version above)
# Fix the random seed before the mice runs so repeated executions of the script
# start from the same RNG state.
set.seed(2)
# Attach mice, which supplies the mice() imputation function used below.
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Declare "?" as a missing-value marker so X7 is parsed as numeric with real NAs.
# Without this X7 stays a character column, mice() sets it aside as a constant
# and imputes nothing, and the "?" strings survive into the completed data.
mice_data <- read_csv(
  "breast-cancer-wisconsin.data",
  col_names = FALSE,
  na = c("", "NA", "?")
)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X7
## dbl (10): X1, X2, X3, X4, X5, X6, X8, X9, X10, X11
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# "norm.predict" imputes univariate missing data with the value predicted by a
# linear regression
reg <- mice(data = mice_data, method = "norm.predict")
##
## iter imp variable
## 1 1
## 1 2
## 1 3
## 1 4
## 1 5
## 2 1
## 2 2
## 2 3
## 2 4
## 2 5
## 3 1
## 3 2
## 3 3
## 3 4
## 3 5
## 4 1
## 4 2
## 4 3
## 4 4
## 4 5
## 5 1
## 5 2
## 5 3
## 5 4
## 5 5
## Warning: Number of logged events: 1
# Show the mids object: imputation methods, predictor matrix and logged events
print(reg)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## "" "" "" "" "" "" "" "" "" "" ""
## PredictorMatrix:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## X1 0 1 1 1 1 1 0 1 1 1 1
## X2 1 0 1 1 1 1 0 1 1 1 1
## X3 1 1 0 1 1 1 0 1 1 1 1
## X4 1 1 1 0 1 1 0 1 1 1 1
## X5 1 1 1 1 0 1 0 1 1 1 1
## X6 1 1 1 1 1 0 0 1 1 1 1
## Number of logged events: 1
## it im dep meth out
## 1 0 0 constant X7
# Extract the completed data set from the mids object, then count the NAs
# remaining in X7
mice_data <- complete(reg)
sum(is.na(mice_data[["X7"]]))
## [1] 0
3 - Use regression with perturbation to impute values for the
missing data
# Declare "?" as a missing-value marker so X7 is parsed as numeric with real NAs.
# Without this X7 stays a character column, mice() sets it aside as a constant
# and imputes nothing, and the "?" strings survive into the completed data.
data3 <- read_csv(
  "breast-cancer-wisconsin.data",
  col_names = FALSE,
  na = c("", "NA", "?")
)
## Rows: 699 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X7
## dbl (10): X1, X2, X3, X4, X5, X6, X8, X9, X10, X11
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# "norm.nob" imputes univariate missing data by linear regression, ignoring the
# uncertainty of the model parameters
p1 <- mice(data = data3, method = "norm.nob")
##
## iter imp variable
## 1 1
## 1 2
## 1 3
## 1 4
## 1 5
## 2 1
## 2 2
## 2 3
## 2 4
## 2 5
## 3 1
## 3 2
## 3 3
## 3 4
## 3 5
## 4 1
## 4 2
## 4 3
## 4 4
## 4 5
## 5 1
## 5 2
## 5 3
## 5 4
## 5 5
## Warning: Number of logged events: 1
# Show the mids object: imputation methods, predictor matrix and logged events
print(p1)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## "" "" "" "" "" "" "" "" "" "" ""
## PredictorMatrix:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## X1 0 1 1 1 1 1 0 1 1 1 1
## X2 1 0 1 1 1 1 0 1 1 1 1
## X3 1 1 0 1 1 1 0 1 1 1 1
## X4 1 1 1 0 1 1 0 1 1 1 1
## X5 1 1 1 1 0 1 0 1 1 1 1
## X6 1 1 1 1 1 0 0 1 1 1 1
## Number of logged events: 1
## it im dep meth out
## 1 0 0 constant X7
# Extract the completed data set from the mids object
data3 <- complete(p1)
sum(is.na(data3[["X7"]])) # confirm that no NA values remain
## [1] 0