library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.2.3
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.2.3
library(ggplot2)
library(lattice)
We would expect that a flexible model performs better than an inflexible model, given that the former will provide a closer fit with the large sample size.
We would expect that a flexible model performs worse than an inflexible model because of the possible overfitting issue of a flexible model.
We would expect that a flexible model performs better than an inflexible model due to its greater flexibility in terms of degrees of freedom.
We would expect that a flexible model performs worse than an inflexible model, since the flexible model will be heavily influenced by the noise of the error terms (it ends up fitting the noise), thus resulting in an increased variance.
(i): (squared) bias - decreases monotonically since increases in flexibility result in a closer fit.
variance - increases monotonically since increases in flexibility result in overfitting.
training error - decreases monotonically since increases in flexibility result in a closer fit.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
## The following object is masked from 'package:dplyr':
##
## select
# Report the number of rows and columns of the Boston data frame.
dim(Boston)
## [1] 506 14
Some findings about the pairwise correlations can be summarized as follows:
Several predictors in this data set are highly correlated; for example, the correlation between rad and tax is more than 0.91.
Some predictors are negatively correlated; for example, the correlation between nox and dis is -0.77.
The predictors age and nox seem to be highly correlated with many other predictors.
# Scatter-plot matrix of every pair of columns in Boston.
pairs(Boston)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
# Pairwise correlation matrix of all Boston columns. `<-` is used instead of
# `=` so that top-level assignment follows the usual R convention.
correlationvalue <- cor(Boston)
# Mixed correlogram of that matrix, using corrplot's "AOE" variable ordering.
corrplot.mixed(correlationvalue, order = "AOE")
# Scatter plots of the crime rate (crim) against six other predictors, laid
# out on a 3 x 2 grid. The comment after each plot records what it suggests.
par(mfrow=c(3,2))
plot(Boston$age, Boston$crim)
# Older homes, more crime.
plot(Boston$nox, Boston$crim)
# Crime is highest at middle values of nox.
plot(Boston$dis, Boston$crim)
# Closer to the work area, more crime.
plot(Boston$rad, Boston$crim)
# Higher index of accessibility to radial highways, more crime.
plot(Boston$tax, Boston$crim)
# Higher tax rate, more crime.
plot(Boston$ptratio, Boston$crim)
# Higher pupil:teacher ratio, more crime.
# Histograms of crim (restricted to values above 1), tax and ptratio, side by
# side on a 1 x 3 grid. `freq = TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
par(mfrow = c(1, 3))
hist(Boston$crim[Boston$crim > 1], breaks = 20, freq = TRUE,
     main = "Crime More than 1")
# The crime rates are heavily right-skewed, indicating that most cities have
# low crime rates, but a few cities have a high crime rate, which can reach
# above 80.
hist(Boston$tax, breaks = 20, freq = TRUE, main = "Tax")
# There is a large divide between suburbs with low tax rates and a peak at
# 600-700.
hist(Boston$ptratio, breaks = 20, freq = TRUE, main = "Ptratio")
# ptratio seems to be slightly left-skewed, i.e. skewed towards high ratios.
library(e1071)
# Sample skewness of ptratio; a negative value means the left tail is longer.
skewness(Boston$ptratio)
## [1] -0.7975743
# Number of rows whose chas indicator equals 1.
sum(Boston$chas == 1)
## [1] 35
# Median of ptratio over all rows.
with(Boston, median(ptratio))
## [1] 19.05
# Rows attaining the minimum medv, transposed so each row prints as a column.
t(Boston[Boston$medv == min(Boston$medv), ])
## 399 406
## crim 38.3518 67.9208
## zn 0.0000 0.0000
## indus 18.1000 18.1000
## chas 0.0000 0.0000
## nox 0.6930 0.6930
## rm 5.4530 5.6830
## age 100.0000 100.0000
## dis 1.4896 1.4254
## rad 24.0000 24.0000
## tax 666.0000 666.0000
## ptratio 20.2000 20.2000
## black 396.9000 384.9700
## lstat 30.5900 22.9800
## medv 5.0000 5.0000
# Per-column summary statistics (min, quartiles, median, mean, max) of Boston.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Size of the subset of rows with rm above 7.
dim(Boston[Boston$rm > 7, ])
## [1] 64 14
# Size of the subset of rows with rm above 8.
dim(Boston[Boston$rm > 8, ])
## [1] 13 14
# Summary statistics restricted to the rows with rm above 8.
summary(Boston[Boston$rm > 8, ])
## crim zn indus chas
## Min. :0.02009 Min. : 0.00 Min. : 2.680 Min. :0.0000
## 1st Qu.:0.33147 1st Qu.: 0.00 1st Qu.: 3.970 1st Qu.:0.0000
## Median :0.52014 Median : 0.00 Median : 6.200 Median :0.0000
## Mean :0.71879 Mean :13.62 Mean : 7.078 Mean :0.1538
## 3rd Qu.:0.57834 3rd Qu.:20.00 3rd Qu.: 6.200 3rd Qu.:0.0000
## Max. :3.47428 Max. :95.00 Max. :19.580 Max. :1.0000
## nox rm age dis
## Min. :0.4161 Min. :8.034 Min. : 8.40 Min. :1.801
## 1st Qu.:0.5040 1st Qu.:8.247 1st Qu.:70.40 1st Qu.:2.288
## Median :0.5070 Median :8.297 Median :78.30 Median :2.894
## Mean :0.5392 Mean :8.349 Mean :71.54 Mean :3.430
## 3rd Qu.:0.6050 3rd Qu.:8.398 3rd Qu.:86.50 3rd Qu.:3.652
## Max. :0.7180 Max. :8.780 Max. :93.90 Max. :8.907
## rad tax ptratio black
## Min. : 2.000 Min. :224.0 Min. :13.00 Min. :354.6
## 1st Qu.: 5.000 1st Qu.:264.0 1st Qu.:14.70 1st Qu.:384.5
## Median : 7.000 Median :307.0 Median :17.40 Median :386.9
## Mean : 7.462 Mean :325.1 Mean :16.36 Mean :385.2
## 3rd Qu.: 8.000 3rd Qu.:307.0 3rd Qu.:17.40 3rd Qu.:389.7
## Max. :24.000 Max. :666.0 Max. :20.20 Max. :396.9
## lstat medv
## Min. :2.47 Min. :21.9
## 1st Qu.:3.32 1st Qu.:41.7
## Median :4.14 Median :48.3
## Mean :4.31 Mean :44.2
## 3rd Qu.:5.12 3rd Qu.:50.0
## Max. :7.44 Max. :50.0
# Summary of the full Boston data again, presumably for comparison with the
# rm > 8 subset summarised just above.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
#install.packages('mlbench') # if it is required.
library(mlbench)
# Load the Glass data and keep its first nine columns as Glass1.
data(Glass)
Glass1 <- Glass[, seq_len(9)]
# Histograms before transformation, to see how skewed each predictor is.
# `type` is not an argument of hist(); passing type = "count" only triggered
# 'graphical parameter "type" is obsolete' warnings on every call, so it is
# dropped. Counts are what hist() draws by default anyway.
par(mfrow = c(3, 3))
for (predictor in names(Glass1)) {
  hist(Glass1[[predictor]], xlab = predictor, main = "Original")
}
We draw the scatter plot among the predictors as follows:
# Scatter-plot matrix of the predictors Na through Fe (RI is not included).
pairs(
  Glass1[c("Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")],
  main = "Scatter Plot Matrix for Glass"
)
In addition, we can draw the correlation plot below and then find the highly correlated predictors with a cutoff value of 0.75.
# Correlation matrix of the nine Glass1 columns, drawn with numeric labels.
glassCorr <- cor(Glass1[, seq_len(9)])
corrplot(glassCorr, tl.cex = 0.35, method = "number")
# Column indices returned by findCorrelation() for a cutoff of 0.75.
highcorr <- findCorrelation(glassCorr, cutoff = 0.75)
head(highcorr)
## [1] 7
### Outliers: one horizontal boxplot per predictor
# NOTE: despite its name, `residuals` holds the raw predictor columns, not
# model residuals. The name is kept in case other code refers to it.
residuals <- Glass1[, 1:9]
par(mfrow = c(3, 3))
# seq_along() instead of 1:ncol(): it stays correct for zero columns.
for (i in seq_along(residuals)) {
  predictor <- names(residuals)[i]
  # With horizontal = TRUE the values run along the x axis, so the variable
  # name belongs on xlab; ylab would label the axis that carries no scale.
  boxplot(residuals[, i], xlab = predictor, horizontal = TRUE,
          main = paste(predictor, "Boxplot"), col = "steelblue")
}
# Skewness of each predictor. vapply() works column by column and guarantees
# one numeric value per column, whereas apply() first coerces the whole data
# frame to a matrix.
skewed <- vapply(Glass1[, 1:9], skewness, numeric(1))
skewed
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
# As an illustration, we consider the data transformation for Ca.
# Density histograms before and after taking the reciprocal. The argument is
# written out as `probability = TRUE` rather than relying on partial matching
# (`prob`) and on the reassignable shorthand `T`.
par(mfrow = c(1, 2))
hist(Glass1$Ca, probability = TRUE, main = "Original")
hist(1 / Glass1$Ca, probability = TRUE, main = "Transformation")
# Load the Soybean data.
data(Soybean)
# Per-column summary: level counts for each variable, including NA counts.
summary(Soybean)
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
# Frequency plot for every column except the first (Class), nine per page.
par(mfrow = c(3, 3))
for (predictor in names(Soybean)[-1]) {
  plot(Soybean[predictor], main = predictor)
}
# Near-zero-variance predictors: compute the column indices once and reuse
# them for the name lookup instead of calling nearZeroVar() a second time.
nzv_cols <- nearZeroVar(Soybean)
nzv_cols
## [1] 19 26 28
names(Soybean)[nzv_cols]
## [1] "leaf.mild" "mycelium" "sclerotia"
# Number of missing (NA) entries in each column of Soybean.
colSums(is.na(Soybean))
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 84 100 84 108 16
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 121 38 38 106 38
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 38 38 38 84 106
## seed mold.growth seed.discolor seed.size shriveling
## 92 92 106 92 106
## roots
## 31
# Graph of the missing data in Soybean.
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.2.3
plot_missing(Soybean)
# Sort/filter missing data together.
# Check the number of missing values of each predictor. vapply() pins the
# result to exactly one integer per column, whereas the return type of
# sapply() depends on its input.
vapply(Soybean, function(x) sum(is.na(x)), integer(1))
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 84 100 84 108 16
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 121 38 38 106 38
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 38 38 38 84 106
## seed mold.growth seed.discolor seed.size shriveling
## 92 92 106 92 106
## roots
## 31
library(VIM)
## Warning: package 'VIM' was built under R version 4.2.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Aggregation plot of the missing values in Soybean. With sortVars = TRUE the
# call also prints the variables sorted by their number of missing values.
aggr(
  Soybean,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  combined = TRUE,
  prop = FALSE,
  only.miss = TRUE,
  labels = names(Soybean),
  cex.axis = 0.7,
  gap = 3,
  ylab = "Missing Data"
)
##
## Variables sorted by number of missings:
## Variable Count
## hail 121
## sever 121
## seed.tmt 121
## lodging 121
## germ 112
## leaf.mild 108
## fruiting.bodies 106
## fruit.spots 106
## seed.discolor 106
## shriveling 106
## leaf.shread 100
## seed 92
## mold.growth 92
## seed.size 92
## leaf.halo 84
## leaf.marg 84
## leaf.size 84
## leaf.malf 84
## fruit.pods 84
## precip 38
## stem.cankers 38
## canker.lesion 38
## ext.decay 38
## mycelium 38
## int.discolor 38
## sclerotia 38
## plant.stand 36
## roots 31
## temp 30
## crop.hist 16
## plant.growth 16
## stem 16
## date 1
## area.dam 1
## Class 0
## leaves 0
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Impute the missing Soybean values with mice, using method "pmm" for every
# incomplete column and a fixed seed for reproducibility. `printFlag = FALSE`
# is spelled out because `F` is an ordinary variable that can be reassigned.
# NOTE(review): this call logs a large number of events (see the warning in
# the output); confirm that "pmm" is the intended method for these columns.
set.seed(1)
missingSoybeans <- mice(Soybean, method = "pmm", printFlag = FALSE, seed = 112)
## Warning: Number of logged events: 1662
# List the components stored in the object returned by mice().
names(missingSoybeans)
## [1] "data" "imp" "m" "where"
## [5] "blocks" "call" "nmis" "method"
## [9] "predictorMatrix" "visitSequence" "formulas" "post"
## [13] "blots" "ignore" "seed" "iteration"
## [17] "lastSeedValue" "chainMean" "chainVar" "loggedEvents"
## [21] "version" "date"
# Print the imputation summary: methods, predictor matrix and logged events.
summary(missingSoybeans)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## Class date plant.stand precip temp
## "" "pmm" "pmm" "pmm" "pmm"
## hail crop.hist area.dam sever seed.tmt
## "pmm" "pmm" "pmm" "pmm" "pmm"
## germ plant.growth leaves leaf.halo leaf.marg
## "pmm" "pmm" "" "pmm" "pmm"
## leaf.size leaf.shread leaf.malf leaf.mild stem
## "pmm" "pmm" "pmm" "pmm" "pmm"
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## "pmm" "pmm" "pmm" "pmm" "pmm"
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## "pmm" "pmm" "pmm" "pmm" "pmm"
## seed mold.growth seed.discolor seed.size shriveling
## "pmm" "pmm" "pmm" "pmm" "pmm"
## roots
## "pmm"
## PredictorMatrix:
## Class date plant.stand precip temp hail crop.hist area.dam sever
## Class 0 1 1 1 1 1 1 1 1
## date 1 0 1 1 1 1 1 1 1
## plant.stand 1 1 0 1 1 1 1 1 1
## precip 1 1 1 0 1 1 1 1 1
## temp 1 1 1 1 0 1 1 1 1
## hail 1 1 1 1 1 0 1 1 1
## seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## Class 1 1 1 1 1 1 1
## date 1 1 1 1 1 1 1
## plant.stand 1 1 1 1 1 1 1
## precip 1 1 1 1 1 1 1
## temp 1 1 1 1 1 1 1
## hail 1 1 1 1 1 1 1
## leaf.shread leaf.malf leaf.mild stem lodging stem.cankers
## Class 1 1 1 1 1 1
## date 1 1 1 1 1 1
## plant.stand 1 1 1 1 1 1
## precip 1 1 1 1 1 1
## temp 1 1 1 1 1 1
## hail 1 1 1 1 1 1
## canker.lesion fruiting.bodies ext.decay mycelium int.discolor
## Class 1 1 1 1 1
## date 1 1 1 1 1
## plant.stand 1 1 1 1 1
## precip 1 1 1 1 1
## temp 1 1 1 1 1
## hail 1 1 1 1 1
## sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor
## Class 1 1 1 1 1 1
## date 1 1 1 1 1 1
## plant.stand 1 1 1 1 1 1
## precip 1 1 1 1 1 1
## temp 1 1 1 1 1 1
## hail 1 1 1 1 1 1
## seed.size shriveling roots
## Class 1 1 1
## date 1 1 1
## plant.stand 1 1 1
## precip 1 1 1
## temp 1 1 1
## hail 1 1 1
## Number of logged events: 1662
## it im dep meth
## 1 1 1 plant.stand pmm
## 2 1 1 precip pmm
## 3 1 1 precip pmm
## 4 1 1 temp pmm
## 5 1 1 temp pmm
## 6 1 1 hail pmm
## out
## 1 Classcharcoal-rot, Classcyst-nematode, int.discolor1, sclerotia1
## 2 Classbrown-spot, Classcharcoal-rot, Classcyst-nematode, Classdiaporthe-pod-&-stem-blight, Classherbicide-injury, fruit.pods3
## 3 mice detected that your data are (nearly) multi-collinear.\nIt applied a ridge penalty to continue calculations, but the results can be unstable.\nDoes your dataset contain duplicates, linear transformation, or factors with unique respondent names?
## 4 Classbrown-spot, Classcyst-nematode, int.discolor1, fruit.pods2
## 5 mice detected that your data are (nearly) multi-collinear.\nIt applied a ridge penalty to continue calculations, but the results can be unstable.\nDoes your dataset contain duplicates, linear transformation, or factors with unique respondent names?
## 6 Classbrown-spot, Classcharcoal-rot, Classcyst-nematode, Classdiaporthe-pod-&-stem-blight, Classherbicide-injury, Classpurple-seed-stain, leaf.marg1, lodging1, stem.cankers2, ext.decay2, sclerotia1, fruit.pods2, seed1
# Visualize after imputation to see whether any missing data remain.
# md.pattern() has to be given the imputed data set. The predictor matrix is a
# 0/1 configuration matrix that never contains NA, so inspecting it always
# reports "completely observed" and says nothing about the imputation.
# complete() extracts the first completed data set from the mids object.
# NOTE: the printed pattern that follows was generated from the predictor
# matrix and must be regenerated after this change.
md.pattern(mice::complete(missingSoybeans))
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt
## 36 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0
## germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf
## 36 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 36 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth
## 36 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## seed.discolor seed.size shriveling roots
## 36 1 1 1 1 0
## 0 0 0 0 0