# Read the serialized object stored in "loan_data_ch1.rds".
new_data <- readRDS(file = "loan_data_ch1.rds")

# Summarise emp_length; the recorded output below reports 809 missing values.
summary(object = new_data$emp_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 4.000 6.145 8.000 62.000 809
# Strategy 1: work on a copy and delete every row whose emp_length is missing.
data_delete <- new_data
summary(data_delete$emp_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 4.000 6.145 8.000 62.000 809
## Delete rows
# Select rows with a logical mask instead of negative `which()` indices:
# when nothing is missing, `which()` returns integer(0) and
# `x[-integer(0), ]` selects zero rows, which would empty the data frame.
data_delete <- data_delete[!is.na(data_delete$emp_length), ]
## Delete columns
# Alternative (disabled): drop the whole column instead of the affected rows.
# data_delete$emp_length <- NULL
summary(data_delete$emp_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 4.000 6.145 8.000 62.000
# Strategy 2: work on a copy and replace each missing emp_length with the
# median of the observed (non-missing) values.
data_replace <- new_data
index_NA <- which(is.na(data_replace$emp_length))
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the meaning of `na.rm = T`.
data_replace$emp_length[index_NA] <- median(
  data_replace$emp_length,
  na.rm = TRUE
)
summary(data_replace$emp_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 4.000 6.086 8.000 62.000
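When the missing values are informative in themselves, it’s fine to keep them. However, some functions may delete rows that contain NA. To avoid this, we can use coarse classification, i.e. group the values of a variable into “bins” and give the missing values a bin of their own.
For example, we could divide emp_length into five categories: “0-15”, “15-30”, “30-45”, “45+” and “missing”. The code below applies the same idea to int_rate, using the bins “0-8”, “8-11”, “11-13.5”, “13.5+” and “Missing”.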
# Strategy 3: keep the rows with missing values and coarse-classify int_rate
# into bins, with a dedicated "Missing" bin for NA.
data_keep <- new_data

# `cut()` uses right-closed intervals by default, so the bins are
# [-Inf, 8], (8, 11], (11, 13.5] and (13.5, Inf]; `include.lowest = TRUE`
# closes the first interval on the left as well. NA inputs stay NA here.
data_keep$int_cat <- cut(
  data_keep$int_rate,
  breaks = c(-Inf, 8, 11, 13.5, Inf),
  labels = c("0-8", "8-11", "11-13.5", "13.5+"),
  include.lowest = TRUE
)

# Append "Missing" as the last level and move the NA entries into it.
levels(data_keep$int_cat) <- c(levels(data_keep$int_cat), "Missing")
data_keep$int_cat[is.na(data_keep$int_cat)] <- "Missing"

# Drop bins that contain no observations. The remaining levels stay in bin
# order (not alphabetical order), so the bars are drawn from the lowest bin
# to the highest, with "Missing" last.
data_keep$int_cat <- droplevels(data_keep$int_cat)
plot(data_keep$int_cat)