Data 624 - Predictive Analytics
Chapter 3
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DataExplorer)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(caret)
## Loading required package: lattice
library(summarytools)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
## For best results, restart R session and update pander using devtools:: or remotes::install_github('rapporter/pander')
library (e1071)
library(mlbench)
# Load the Soybean data set that ships with the mlbench package
data("Soybean", package = "mlbench")
# Column positions of the near-zero-variance (degenerate) predictors
caret::nearZeroVar(Soybean)
## [1] 19 26 28
# Report the degenerate predictors by name. Index the vector of column names
# rather than the data frame itself: Soybean[, idx] drops to a bare vector
# when only one column is flagged, and names() of that vector is NULL.
paste('The degenerate variables are:', paste(names(Soybean)[nearZeroVar(Soybean)], collapse = ', '))
## [1] "The degenerate variables are: leaf.mild, mycelium, sclerotia"
# Level counts (including NA) for the first degenerate predictor
summary(Soybean["leaf.mild"])
## leaf.mild
## 0 :535
## 1 : 20
## 2 : 20
## NA's:108
# Level counts (including NA) for the second degenerate predictor
summary(Soybean["mycelium"])
## mycelium
## 0 :639
## 1 : 6
## NA's: 38
# Level counts (including NA) for the third degenerate predictor
summary(Soybean["sclerotia"])
## sclerotia
## 0 :625
## 1 : 20
## NA's: 38
# Flag the rows that have missing values and belong to the "phytophthora-rot" class
eliminate <- Soybean$Class == 'phytophthora-rot' & !complete.cases(Soybean)
# Keep every row that was not flagged
Soybean.a <- Soybean[!eliminate,]
paste('Eliminated', sum(eliminate), 'rows.')
## [1] "Eliminated 68 rows."
# Incomplete rows = all rows minus the complete ones
paste(nrow(Soybean.a) - sum(complete.cases(Soybean.a)), 'rows still contain missing values.')
## [1] "53 rows still contain missing values."
# Impute missing values with the per-column mode.
#
# Every column except the first is processed (the first column is never
# touched; in this script it holds the `Class` label). For each processed
# column the number of missing values and the most frequent observed value
# are printed, and the NA positions are overwritten with that value.
#
# Args:
#   df: a data frame (or tibble) whose columns from the second onward are
#       to be imputed.
#
# Returns:
#   `df` with the NAs in columns 2..ncol(df) replaced by each column's most
#   frequent value. Column types are preserved. A column with no observed
#   values has no mode and is returned unchanged.
fill_na <- function(df){
  separator <- '------------------------------------------------'
  # seq_len(n)[-1] is empty for a one-column frame, whereas 2:n would count
  # down to c(2, 1) and index a column that does not exist.
  for (i in seq_len(ncol(df))[-1]) {
    column <- df[[i]]
    is_missing <- is.na(column)
    print(paste('Filling', sum(is_missing), 'missing values for feature: ', names(df)[i], '.'))
    if (all(is_missing)) {
      # Nothing observed (or zero rows): there is no mode to impute with.
      print(paste('Feature', names(df)[i], 'has no observed values; nothing was filled.'))
      print(separator)
      next
    }
    # Relative frequency of each observed value, most frequent first.
    find.mode <- round(prop.table(sort(table(column), decreasing = TRUE)), 4)
    mode.name <- names(find.mode)[1]
    print(paste('The most frequent factor of this feature is:', mode.name, ', which is', find.mode[mode.name]*100, '% of the class.'))
    if (any(is_missing)) {
      # Take the replacement from the column itself instead of from the
      # table's character label, so numeric/logical columns keep their type.
      df[[i]][is_missing] <- column[match(mode.name, as.character(column))]
    }
    print(separator)
  }
  df
}
# Impute the remaining missing values in every column after the first with
# that column's most frequent value; fill_na() prints a short report per column.
Soybean.b <- fill_na(Soybean.a)
## [1] "Filling 1 missing values for feature: date ."
## [1] "The most frequent factor of this feature is: 5 , which is 24.27 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 36 missing values for feature: plant.stand ."
## [1] "The most frequent factor of this feature is: 0 , which is 61.14 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: precip ."
## [1] "The most frequent factor of this feature is: 2 , which is 72.27 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 30 missing values for feature: temp ."
## [1] "The most frequent factor of this feature is: 1 , which is 57.09 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 53 missing values for feature: hail ."
## [1] "The most frequent factor of this feature is: 0 , which is 77.4 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 16 missing values for feature: crop.hist ."
## [1] "The most frequent factor of this feature is: 3 , which is 32.39 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 1 missing values for feature: area.dam ."
## [1] "The most frequent factor of this feature is: 3 , which is 30.46 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 53 missing values for feature: sever ."
## [1] "The most frequent factor of this feature is: 1 , which is 57.3 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 53 missing values for feature: seed.tmt ."
## [1] "The most frequent factor of this feature is: 0 , which is 54.27 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 44 missing values for feature: germ ."
## [1] "The most frequent factor of this feature is: 1 , which is 37.3 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 16 missing values for feature: plant.growth ."
## [1] "The most frequent factor of this feature is: 0 , which is 73.62 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 0 missing values for feature: leaves ."
## [1] "The most frequent factor of this feature is: 1 , which is 87.48 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 29 missing values for feature: leaf.halo ."
## [1] "The most frequent factor of this feature is: 2 , which is 58.36 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 29 missing values for feature: leaf.marg ."
## [1] "The most frequent factor of this feature is: 0 , which is 60.92 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 29 missing values for feature: leaf.size ."
## [1] "The most frequent factor of this feature is: 1 , which is 55.8 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 45 missing values for feature: leaf.shread ."
## [1] "The most frequent factor of this feature is: 0 , which is 83.16 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 29 missing values for feature: leaf.malf ."
## [1] "The most frequent factor of this feature is: 0 , which is 92.32 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 53 missing values for feature: leaf.mild ."
## [1] "The most frequent factor of this feature is: 0 , which is 92.88 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 16 missing values for feature: stem ."
## [1] "The most frequent factor of this feature is: 1 , which is 50.58 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 53 missing values for feature: lodging ."
## [1] "The most frequent factor of this feature is: 0 , which is 92.53 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: stem.cankers ."
## [1] "The most frequent factor of this feature is: 0 , which is 64.64 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: canker.lesion ."
## [1] "The most frequent factor of this feature is: 0 , which is 55.46 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: fruiting.bodies ."
## [1] "The most frequent factor of this feature is: 0 , which is 81.98 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: ext.decay ."
## [1] "The most frequent factor of this feature is: 0 , which is 76.6 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: mycelium ."
## [1] "The most frequent factor of this feature is: 0 , which is 98.96 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: int.discolor ."
## [1] "The most frequent factor of this feature is: 0 , which is 88.91 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: sclerotia ."
## [1] "The most frequent factor of this feature is: 0 , which is 96.53 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 16 missing values for feature: fruit.pods ."
## [1] "The most frequent factor of this feature is: 0 , which is 67.95 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: fruit.spots ."
## [1] "The most frequent factor of this feature is: 0 , which is 59.79 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 24 missing values for feature: seed ."
## [1] "The most frequent factor of this feature is: 0 , which is 80.54 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 24 missing values for feature: mold.growth ."
## [1] "The most frequent factor of this feature is: 0 , which is 88.66 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: seed.discolor ."
## [1] "The most frequent factor of this feature is: 0 , which is 88.91 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 24 missing values for feature: seed.size ."
## [1] "The most frequent factor of this feature is: 0 , which is 90.02 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 38 missing values for feature: shriveling ."
## [1] "The most frequent factor of this feature is: 0 , which is 93.41 % of the class."
## [1] "------------------------------------------------"
## [1] "Filling 31 missing values for feature: roots ."
## [1] "The most frequent factor of this feature is: 0 , which is 94.35 % of the class."
## [1] "------------------------------------------------"
# Confirm the row count and that no incomplete rows remain after imputation
paste(
  'There are now', nrow(Soybean.b), 'rows.',
  sum(!complete.cases(Soybean.b)), 'rows have missing values.'
)
## [1] "There are now 615 rows. 0 rows have missing values."
# Draw the missingness map of the imputed data with rows ordered by Class
missmap(arrange(Soybean.b, Class), main = "Missing vs Observed")