## plot histograms
ggplot(gather(Glass[,1:8]), aes(value),) +
geom_histogram() +
facet_wrap(~key, scales = 'free_x')## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
col_names_list <- colnames(Glass)
par(mfrow = c(2, 4))
for (col in 1:8) {
hist(Glass[,col],main=col_names_list[col])
}## explore correlations
nearZeroVar(Glass)## integer(0)
correlations <- (cor(Glass[1:8]))
corrplot(correlations)highCorr <- findCorrelation(correlations, cutoff = .75)
length(highCorr)## [1] 1
description <- psych::describe(Glass[,1:8])
kable(description,digits =2,'markdown',booktabs =T)| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RI | 1 | 214 | 1.52 | 0.00 | 1.52 | 1.52 | 0.00 | 1.51 | 1.53 | 0.02 | 1.60 | 4.72 | 0.00 |
| Na | 2 | 214 | 13.41 | 0.82 | 13.30 | 13.38 | 0.64 | 10.73 | 17.38 | 6.65 | 0.45 | 2.90 | 0.06 |
| Mg | 3 | 214 | 2.68 | 1.44 | 3.48 | 2.87 | 0.30 | 0.00 | 4.49 | 4.49 | -1.14 | -0.45 | 0.10 |
| Al | 4 | 214 | 1.44 | 0.50 | 1.36 | 1.41 | 0.31 | 0.29 | 3.50 | 3.21 | 0.89 | 1.94 | 0.03 |
| Si | 5 | 214 | 72.65 | 0.77 | 72.79 | 72.71 | 0.57 | 69.81 | 75.41 | 5.60 | -0.72 | 2.82 | 0.05 |
| K | 6 | 214 | 0.50 | 0.65 | 0.56 | 0.43 | 0.17 | 0.00 | 6.21 | 6.21 | 6.46 | 52.87 | 0.04 |
| Ca | 7 | 214 | 8.96 | 1.42 | 8.60 | 8.74 | 0.66 | 5.43 | 16.19 | 10.76 | 2.02 | 6.41 | 0.10 |
| Ba | 8 | 214 | 0.18 | 0.50 | 0.00 | 0.03 | 0.00 | 0.00 | 3.15 | 3.15 | 3.37 | 12.08 | 0.03 |
# Run shapiro test
apply(Glass[,1:8], 2,shapiro.test)## $RI
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.86757, p-value = 1.077e-12
##
##
## $Na
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.94576, p-value = 3.466e-07
##
##
## $Mg
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.69934, p-value < 2.2e-16
##
##
## $Al
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.94341, p-value = 2.083e-07
##
##
## $Si
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.91966, p-value = 2.175e-09
##
##
## $K
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.44162, p-value < 2.2e-16
##
##
## $Ca
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.79387, p-value = 4.287e-16
##
##
## $Ba
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.40857, p-value < 2.2e-16
#boxplots
ggplot(stack(Glass[,c(1,2,3,4,6,7,8)]), aes(x = ind, y = values)) +
geom_boxplot()boxplot(Glass[,5],main="Si")## qq plots
par(mfrow = c(2, 4))
for (x in 1:8){
qqplot(Glass[,x],Glass[,9])
}## get lambda values
lambda_list <- round(apply(Glass[,1:8], 2,BoxCox.lambda),2)
lambda_list## RI Na Mg Al Si K Ca Ba
## -1.00 -0.74 1.00 0.36 -1.00 0.06 -0.39 0.09
## make transformations- could have liekly used some sort of mapping apply
transformed_RI <- DescTools::BoxCox(Glass[,"RI"],lambda_list[1])
transformed_Na <- DescTools::BoxCox(Glass[,"Na"],lambda_list[2])
transformed_Mg <- DescTools::BoxCox(Glass[,"Mg"]+.0001,lambda_list[3])
transformed_Al <- DescTools::BoxCox(Glass[,"Al"],lambda_list[4])
transformed_Si <- DescTools::BoxCox(Glass[,"Si"],lambda_list[5])
transformed_K <- DescTools::BoxCox(Glass[,"K"]+.0001,lambda_list[6])
transformed_Ca <- DescTools::BoxCox(Glass[,"Ca"],lambda_list[7])
transformed_Ba <- DescTools::BoxCox(Glass[,"Ba"]+.0001,lambda_list[8])
## build df to store transformations for comparison
transfomed_vals_df <- cbind(transformed_RI,transformed_Na,transformed_Mg,transformed_Al,transformed_Si,transformed_K,transformed_Ca,transformed_Ba)
## compare original skew to new skew and print
original_vals <- round(apply(Glass[,1:8], 2, e1071::skewness),2)
transformed_vals <- round(apply(transfomed_vals_df, 2, e1071::skewness),2)
comparison_df <- tibble::as_tibble(rbind(original_vals,transformed_vals))
rownames(comparison_df) <- c("original skew ","transformed skew")## Warning: Setting row names on a tibble is deprecated.
kable(comparison_df,'markdown',booktabs =T)| RI | Na | Mg | Al | Si | K | Ca | Ba | |
|---|---|---|---|---|---|---|---|---|
| original skew | 1.60 | 0.45 | -1.14 | 0.89 | -0.72 | 6.46 | 2.02 | 3.37 |
| transformed skew | 1.58 | -0.27 | -1.14 | -0.15 | -0.86 | -1.65 | 0.64 | 1.79 |
#plot hists
par(mfrow = c(2, 2))
hist(transformed_RI )
hist(Glass[,"RI"])
hist(transformed_Na)
hist(Glass[,"Na"])## $transformed_RI
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.86975, p-value = 1.416e-12
##
##
## $transformed_Na
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.94669, p-value = 4.248e-07
##
##
## $transformed_Mg
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.69934, p-value < 2.2e-16
##
##
## $transformed_Al
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.96967, p-value = 0.0001463
##
##
## $transformed_Si
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.91385, p-value = 8.116e-10
##
##
## $transformed_K
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.65918, p-value < 2.2e-16
##
##
## $transformed_Ca
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.88015, p-value = 5.462e-12
##
##
## $transformed_Ba
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.48899, p-value < 2.2e-16
trans <- preProcess(Glass[1:8], method = c("BoxCox", "center", "scale", "pca"))
trans## Created from 214 samples and 8 variables
##
## Pre-processing:
## - Box-Cox transformation (5)
## - centered (8)
## - ignored (0)
## - principal component signal extraction (8)
## - scaled (8)
##
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, 0.5, 2, -1.1
## PCA needed 6 components to capture 95 percent of the variance
transformed <- predict(trans, Glass[1:8])
head(transformed[, 1:5])## PC1 PC2 PC3 PC4 PC5
## 1 -1.33901195 -0.4927873 -0.33303617 1.51077970 -0.2622680
## 2 0.51903168 -0.7427874 0.28396049 0.81767438 0.1580859
## 3 0.90824298 -0.9117231 0.49487035 0.31088338 -0.1136152
## 4 0.06764426 -0.9477904 -0.08650409 0.27208686 -0.1564156
## 5 0.27237037 -1.0598101 0.38886907 0.08780329 -0.1862970
## 6 0.76258760 -1.1746396 -0.02049669 -0.47101783 -0.4150976
apply(transformed[, 1:5], 2,hist)## $PC1
## $breaks
## [1] -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6
##
## $counts
## [1] 1 4 1 13 21 71 68 15 14 1 2 3
##
## $density
## [1] 0.004672897 0.018691589 0.004672897 0.060747664 0.098130841
## [6] 0.331775701 0.317757009 0.070093458 0.065420561 0.004672897
## [11] 0.009345794 0.014018692
##
## $mids
## [1] -5.5 -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $PC2
## $breaks
## [1] -2 -1 0 1 2 3 4 5 6
##
## $counts
## [1] 50 97 19 18 19 10 0 1
##
## $density
## [1] 0.233644860 0.453271028 0.088785047 0.084112150 0.088785047 0.046728972
## [7] 0.000000000 0.004672897
##
## $mids
## [1] -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $PC3
## $breaks
## [1] -8 -6 -4 -2 0 2 4 6
##
## $counts
## [1] 2 2 3 89 114 3 1
##
## $density
## [1] 0.004672897 0.004672897 0.007009346 0.207943925 0.266355140 0.007009346
## [7] 0.002336449
##
## $mids
## [1] -7 -5 -3 -1 1 3 5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $PC4
## $breaks
## [1] -5 -4 -3 -2 -1 0 1 2 3 4
##
## $counts
## [1] 1 4 4 17 82 76 23 6 1
##
## $density
## [1] 0.004672897 0.018691589 0.018691589 0.079439252 0.383177570 0.355140187
## [7] 0.107476636 0.028037383 0.004672897
##
## $mids
## [1] -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $PC5
## $breaks
## [1] -5 -4 -3 -2 -1 0 1 2 3 4
##
## $counts
## [1] 1 0 1 6 119 74 10 1 2
##
## $density
## [1] 0.004672897 0.000000000 0.004672897 0.028037383 0.556074766 0.345794393
## [7] 0.046728972 0.004672897 0.009345794
##
## $mids
## [1] -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
summary(transformed)## PC1 PC2 PC3
## Min. :-5.78438 Min. :-1.5855 Min. :-7.41802
## 1st Qu.:-0.61576 1st Qu.:-0.9758 1st Qu.:-0.30967
## Median :-0.01952 Median :-0.6592 Median : 0.07274
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.67777 3rd Qu.: 0.7074 3rd Qu.: 0.38584
## Max. : 5.13739 Max. : 5.7588 Max. : 5.31220
## PC4 PC5 PC6
## Min. :-4.26759 Min. :-4.67554 Min. :-1.36928
## 1st Qu.:-0.46781 1st Qu.:-0.33779 1st Qu.:-0.35231
## Median :-0.02018 Median :-0.08574 Median :-0.05271
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.49207 3rd Qu.: 0.27991 3rd Qu.: 0.21843
## Max. : 3.54995 Max. : 3.79830 Max. : 3.23628
new_df <- spatialSign(Glass[1:8])
apply(new_df[, 1:8], 2,hist)## $RI
## $breaks
## [1] 0.0194 0.0196 0.0198 0.0200 0.0202 0.0204 0.0206 0.0208 0.0210 0.0212
## [11] 0.0214
##
## $counts
## [1] 1 1 2 24 110 55 13 6 1 1
##
## $density
## [1] 23.36449 23.36449 46.72897 560.74766 2570.09346 1285.04673
## [7] 303.73832 140.18692 23.36449 23.36449
##
## $mids
## [1] 0.0195 0.0197 0.0199 0.0201 0.0203 0.0205 0.0207 0.0209 0.0211 0.0213
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $Na
## $breaks
## [1] 0.14 0.15 0.16 0.17 0.18 0.19 0.20 0.21 0.22 0.23
##
## $counts
## [1] 4 3 20 93 59 29 4 1 1
##
## $density
## [1] 1.8691589 1.4018692 9.3457944 43.4579439 27.5700935 13.5514019
## [7] 1.8691589 0.4672897 0.4672897
##
## $mids
## [1] 0.145 0.155 0.165 0.175 0.185 0.195 0.205 0.215 0.225
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $Mg
## $breaks
## [1] 0.000 0.005 0.010 0.015 0.020 0.025 0.030 0.035 0.040 0.045 0.050
## [12] 0.055 0.060 0.065
##
## $counts
## [1] 43 0 2 1 5 4 5 11 10 104 28 0 1
##
## $density
## [1] 40.1869159 0.0000000 1.8691589 0.9345794 4.6728972 3.7383178
## [7] 4.6728972 10.2803738 9.3457944 97.1962617 26.1682243 0.0000000
## [13] 0.9345794
##
## $mids
## [1] 0.0025 0.0075 0.0125 0.0175 0.0225 0.0275 0.0325 0.0375 0.0425 0.0475
## [11] 0.0525 0.0575 0.0625
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $Al
## $breaks
## [1] 0.000 0.005 0.010 0.015 0.020 0.025 0.030 0.035 0.040 0.045 0.050
##
## $counts
## [1] 2 12 26 90 53 16 7 5 2 1
##
## $density
## [1] 1.8691589 11.2149533 24.2990654 84.1121495 49.5327103 14.9532710
## [7] 6.5420561 4.6728972 1.8691589 0.9345794
##
## $mids
## [1] 0.0025 0.0075 0.0125 0.0175 0.0225 0.0275 0.0325 0.0375 0.0425 0.0475
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $Si
## $breaks
## [1] 0.960 0.962 0.964 0.966 0.968 0.970 0.972 0.974 0.976 0.978 0.980
##
## $counts
## [1] 1 0 1 3 9 18 41 57 78 6
##
## $density
## [1] 2.336449 0.000000 2.336449 7.009346 21.028037 42.056075
## [7] 95.794393 133.177570 182.242991 14.018692
##
## $mids
## [1] 0.961 0.963 0.965 0.967 0.969 0.971 0.973 0.975 0.977 0.979
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $K
## $breaks
## [1] 0.00 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09
##
## $counts
## [1] 202 7 2 1 0 0 0 0 2
##
## $density
## [1] 94.3925234 3.2710280 0.9345794 0.4672897 0.0000000 0.0000000
## [7] 0.0000000 0.0000000 0.9345794
##
## $mids
## [1] 0.005 0.015 0.025 0.035 0.045 0.055 0.065 0.075 0.085
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $Ca
## $breaks
## [1] 0.06 0.08 0.10 0.12 0.14 0.16 0.18 0.20 0.22 0.24
##
## $counts
## [1] 2 8 135 47 13 3 5 0 1
##
## $density
## [1] 0.4672897 1.8691589 31.5420561 10.9813084 3.0373832 0.7009346
## [7] 1.1682243 0.0000000 0.2336449
##
## $mids
## [1] 0.07 0.09 0.11 0.13 0.15 0.17 0.19 0.21 0.23
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $Ba
## $breaks
## [1] 0.000 0.005 0.010 0.015 0.020 0.025 0.030 0.035 0.040 0.045
##
## $counts
## [1] 185 11 3 2 10 0 1 1 1
##
## $density
## [1] 172.8971963 10.2803738 2.8037383 1.8691589 9.3457944 0.0000000
## [7] 0.9345794 0.9345794 0.9345794
##
## $mids
## [1] 0.0025 0.0075 0.0125 0.0175 0.0225 0.0275 0.0325 0.0375 0.0425
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
glass_pca <- prcomp(transformed,center = FALSE)
summary(glass_pca)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.5726 1.3845 1.1651 1.0757 0.75380 0.65100
## Proportion of Variance 0.3132 0.2428 0.1719 0.1465 0.07196 0.05367
## Cumulative Proportion 0.3132 0.5559 0.7278 0.8744 0.94633 1.00000
data(Soybean)
nearZeroVar(Soybean)## [1] 19 26 28
apply(Soybean[,c(19,26,28)],2,count)## $leaf.mild
## x freq
## 1 0 535
## 2 1 20
## 3 2 20
## 4 <NA> 108
##
## $mycelium
## x freq
## 1 0 639
## 2 1 6
## 3 <NA> 38
##
## $sclerotia
## x freq
## 1 0 625
## 2 1 20
## 3 <NA> 38
the logical condition returns true
toward the end below, I rpint out the distribution of the classes in observations which have missing values, versus in the general dataset. You can see that the NA dataset
## missing values
pMiss <- function(x){sum(is.na(x))/length(x)*100}
apply(Soybean,2,pMiss)## Class date plant.stand precip
## 0.0000000 0.1464129 5.2708638 5.5636896
## temp hail crop.hist area.dam
## 4.3923865 17.7159590 2.3426061 0.1464129
## sever seed.tmt germ plant.growth
## 17.7159590 17.7159590 16.3982430 2.3426061
## leaves leaf.halo leaf.marg leaf.size
## 0.0000000 12.2986823 12.2986823 12.2986823
## leaf.shread leaf.malf leaf.mild stem
## 14.6412884 12.2986823 15.8125915 2.3426061
## lodging stem.cankers canker.lesion fruiting.bodies
## 17.7159590 5.5636896 5.5636896 15.5197657
## ext.decay mycelium int.discolor sclerotia
## 5.5636896 5.5636896 5.5636896 5.5636896
## fruit.pods fruit.spots seed mold.growth
## 12.2986823 15.5197657 13.4699854 13.4699854
## seed.discolor seed.size shriveling roots
## 15.5197657 13.4699854 15.5197657 4.5387994
aggr_plot <- aggr(Soybean, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))##
## Variables sorted by number of missings:
## Variable Count
## hail 0.177159590
## sever 0.177159590
## seed.tmt 0.177159590
## lodging 0.177159590
## germ 0.163982430
## leaf.mild 0.158125915
## fruiting.bodies 0.155197657
## fruit.spots 0.155197657
## seed.discolor 0.155197657
## shriveling 0.155197657
## leaf.shread 0.146412884
## seed 0.134699854
## mold.growth 0.134699854
## seed.size 0.134699854
## leaf.halo 0.122986823
## leaf.marg 0.122986823
## leaf.size 0.122986823
## leaf.malf 0.122986823
## fruit.pods 0.122986823
## precip 0.055636896
## stem.cankers 0.055636896
## canker.lesion 0.055636896
## ext.decay 0.055636896
## mycelium 0.055636896
## int.discolor 0.055636896
## sclerotia 0.055636896
## plant.stand 0.052708638
## roots 0.045387994
## temp 0.043923865
## crop.hist 0.023426061
## plant.growth 0.023426061
## stem 0.023426061
## date 0.001464129
## area.dam 0.001464129
## Class 0.000000000
## leaves 0.000000000
is.na(Soybean[1:50,c("leaf.halo",'leaf.marg',"leaf.size","leaf.malf")])## leaf.halo leaf.marg leaf.size leaf.malf
## 1 FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE FALSE
## 6 FALSE FALSE FALSE FALSE
## 7 FALSE FALSE FALSE FALSE
## 8 FALSE FALSE FALSE FALSE
## 9 FALSE FALSE FALSE FALSE
## 10 FALSE FALSE FALSE FALSE
## 11 FALSE FALSE FALSE FALSE
## 12 FALSE FALSE FALSE FALSE
## 13 FALSE FALSE FALSE FALSE
## 14 FALSE FALSE FALSE FALSE
## 15 FALSE FALSE FALSE FALSE
## 16 FALSE FALSE FALSE FALSE
## 17 FALSE FALSE FALSE FALSE
## 18 FALSE FALSE FALSE FALSE
## 19 FALSE FALSE FALSE FALSE
## 20 FALSE FALSE FALSE FALSE
## 21 FALSE FALSE FALSE FALSE
## 22 FALSE FALSE FALSE FALSE
## 23 FALSE FALSE FALSE FALSE
## 24 FALSE FALSE FALSE FALSE
## 25 FALSE FALSE FALSE FALSE
## 26 FALSE FALSE FALSE FALSE
## 27 FALSE FALSE FALSE FALSE
## 28 FALSE FALSE FALSE FALSE
## 29 FALSE FALSE FALSE FALSE
## 30 FALSE FALSE FALSE FALSE
## 31 FALSE FALSE FALSE FALSE
## 32 FALSE FALSE FALSE FALSE
## 33 TRUE TRUE TRUE TRUE
## 34 FALSE FALSE FALSE FALSE
## 35 TRUE TRUE TRUE TRUE
## 36 TRUE TRUE TRUE TRUE
## 37 FALSE FALSE FALSE FALSE
## 38 FALSE FALSE FALSE FALSE
## 39 FALSE FALSE FALSE FALSE
## 40 FALSE FALSE FALSE FALSE
## 41 FALSE FALSE FALSE FALSE
## 42 TRUE TRUE TRUE TRUE
## 43 FALSE FALSE FALSE FALSE
## 44 FALSE FALSE FALSE FALSE
## 45 FALSE FALSE FALSE FALSE
## 46 TRUE TRUE TRUE TRUE
## 47 FALSE FALSE FALSE FALSE
## 48 FALSE FALSE FALSE FALSE
## 49 FALSE FALSE FALSE FALSE
## 50 FALSE FALSE FALSE FALSE
is.na(Soybean[1:50,c("hail",'sever')])## hail sever
## 1 FALSE FALSE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
## 5 FALSE FALSE
## 6 FALSE FALSE
## 7 FALSE FALSE
## 8 FALSE FALSE
## 9 FALSE FALSE
## 10 FALSE FALSE
## 11 FALSE FALSE
## 12 FALSE FALSE
## 13 FALSE FALSE
## 14 FALSE FALSE
## 15 FALSE FALSE
## 16 FALSE FALSE
## 17 FALSE FALSE
## 18 FALSE FALSE
## 19 FALSE FALSE
## 20 FALSE FALSE
## 21 FALSE FALSE
## 22 FALSE FALSE
## 23 FALSE FALSE
## 24 FALSE FALSE
## 25 FALSE FALSE
## 26 FALSE FALSE
## 27 FALSE FALSE
## 28 FALSE FALSE
## 29 FALSE FALSE
## 30 FALSE FALSE
## 31 FALSE FALSE
## 32 TRUE TRUE
## 33 TRUE TRUE
## 34 FALSE FALSE
## 35 TRUE TRUE
## 36 TRUE TRUE
## 37 FALSE FALSE
## 38 FALSE FALSE
## 39 TRUE TRUE
## 40 FALSE FALSE
## 41 TRUE TRUE
## 42 TRUE TRUE
## 43 FALSE FALSE
## 44 FALSE FALSE
## 45 FALSE FALSE
## 46 TRUE TRUE
## 47 FALSE FALSE
## 48 TRUE TRUE
## 49 FALSE FALSE
## 50 FALSE FALSE
## test where NAs are
truth <- is.na(Soybean$lodging)
truth2 <- is.na(Soybean$germ)
aa <- which(truth2!=truth)
length(aa)## [1] 9
##imputeby dropping all observations that are na, then compare to dropping NA from any of the top count predictors
no_na_df <- dim(na.omit(Soybean))[1]
hail_df <- length(na.omit(Soybean$hail))
no_na_df==hail_df## [1] TRUE
new_DF <- subset(Soybean, is.na(Soybean$hail))
table(new_DF$Class, dnn = "NA dataset")## NA dataset
## 2-4-d-injury alternarialeaf-spot
## 16 0
## anthracnose bacterial-blight
## 0 0
## bacterial-pustule brown-spot
## 0 0
## brown-stem-rot charcoal-rot
## 0 0
## cyst-nematode diaporthe-pod-&-stem-blight
## 14 15
## diaporthe-stem-canker downy-mildew
## 0 0
## frog-eye-leaf-spot herbicide-injury
## 0 8
## phyllosticta-leaf-spot phytophthora-rot
## 0 68
## powdery-mildew purple-seed-stain
## 0 0
## rhizoctonia-root-rot
## 0
table(Soybean$Class,dnn="original dataset")## original dataset
## 2-4-d-injury alternarialeaf-spot
## 16 91
## anthracnose bacterial-blight
## 44 20
## bacterial-pustule brown-spot
## 20 92
## brown-stem-rot charcoal-rot
## 44 20
## cyst-nematode diaporthe-pod-&-stem-blight
## 14 15
## diaporthe-stem-canker downy-mildew
## 20 20
## frog-eye-leaf-spot herbicide-injury
## 91 8
## phyllosticta-leaf-spot phytophthora-rot
## 20 88
## powdery-mildew purple-seed-stain
## 20 20
## rhizoctonia-root-rot
## 20
I think, as this is a unique case, the best strategy is to just omit all these observations. Looking at the imputation results above, we would need to eliminate the top 19 predictors in order to bring the missing data down to about 5% for most of the remaining columns. Imputation from there would lead to further information loss. However, if we chose to just omit the 17.7% of observations that have NA, we have fully cleaned our dataset.
This method still bares risk as well. Perhaps there is a reason these observations are missing data and we are dropping nearly 20% of our observations. We can see from the above tables that the missing data does moslty associate with only 3 classes.