## Load the Glass data set and list the type of every column
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
## Quartiles and mean of each numeric predictor, plus the count per Type level
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
## Keep every column except the Type response
predictors <- Glass %>% dplyr::select(-Type)

## Plot histograms of the predictor variables.
## The loop walks the column names, so the axis label always belongs to the
## column being drawn and no separate position counter has to be kept in step.
## `.data[[...]]` maps the aesthetic to the named column of `predictors`
## instead of to a free-standing vector that lives outside the plot data.
for (predictor_name in colnames(predictors)) {
  plt <- ggplot(data = predictors, aes(x = .data[[predictor_name]])) +
    geom_histogram(color = "black", fill = "white") +
    xlab(predictor_name)
  # Plots built inside a for loop are not auto-printed, so print explicitly.
  print(plt)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Bar chart of the number of observations in each level of the response, Type
ggplot(data = Glass, mapping = aes(x = Type)) +
  geom_bar()
## Generate the correlation matrix for the predictor variables
cor_matrix <- cor(predictors)

## Red-white-blue colour ramp for the shaded cells. It has its own name so
## that it does not mask base::col().
shade_palette <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)

## Draw the upper triangle only, with the coefficient written in black in each
## cell, black variable labels rotated 45 degrees, and 200 colours taken from
## the ramp above.
corrplot(
  cor_matrix,
  method = "shade",
  addCoef.col = "black",
  shade.col = NA,
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  col = shade_palette(200),
  cl.pos = "n"
)
b. Do there appear to be any outliers in the data? Are any predictors
skewed?
## Show the skewness of each predictor
## (positive values indicate a longer right tail, negative a longer left tail)
skewness(predictors)
## RI Na Mg Al Si K Ca
## 1.6140150 0.4509917 -1.1444648 0.9009179 -0.7253173 6.5056358 2.0326774
## Ba Fe
## 3.3924309 1.7420068
## Report one extreme value per predictor
## NOTE(review): presumably outliers::outlier(), which returns the observation
## farthest from the column mean -- confirm which package provides it
outlier(predictors)
## RI Na Mg Al Si K Ca Ba
## 1.53393 17.38000 0.00000 3.50000 69.81000 6.21000 16.19000 3.15000
## Fe
## 0.51000
## Plot boxplots of the predictor variables.
## boxplot() draws the figure as a side effect and returns the statistics it
## computed. print() is kept on purpose so that those statistics (the `$stats`
## box and whisker values and the `$out` points beyond the whiskers) are
## listed for every predictor.
## The loop walks the column names, so the axis label always belongs to the
## column being drawn and no separate position counter is needed.
for (predictor_name in colnames(predictors)) {
  box_stats <- boxplot(
    predictors[[predictor_name]],
    horizontal = TRUE,
    xlab = predictor_name
  )
  print(box_stats)
}
## $stats
## [,1]
## [1,] 1.51299
## [2,] 1.51652
## [3,] 1.51768
## [4,] 1.51916
## [5,] 1.52300
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 1.517395
## [2,] 1.517965
##
## $out
## [1] 1.52667 1.52320 1.51215 1.52725 1.52410 1.52475 1.53125 1.53393 1.52664
## [10] 1.52739 1.52777 1.52614 1.52369 1.51115 1.51131 1.52315 1.52365
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 11.56
## [2,] 12.90
## [3,] 13.30
## [4,] 13.83
## [5,] 15.15
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 13.19955
## [2,] 13.40045
##
## $out
## [1] 11.45 10.73 11.23 11.02 11.03 17.38 15.79
##
## $group
## [1] 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 0.00
## [2,] 2.09
## [3,] 3.48
## [4,] 3.60
## [5,] 4.49
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 3.31691
## [2,] 3.64309
##
## $out
## numeric(0)
##
## $group
## numeric(0)
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 0.56
## [2,] 1.19
## [3,] 1.36
## [4,] 1.63
## [5,] 2.27
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 1.312477
## [2,] 1.407523
##
## $out
## [1] 0.29 0.47 0.47 0.51 3.50 3.04 3.02 0.34 2.38 2.79 2.68 2.54 2.34 2.66 2.51
## [16] 2.42 2.74 2.88
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 71.15
## [2,] 72.28
## [3,] 72.79
## [4,] 73.09
## [5,] 73.88
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 72.70251
## [2,] 72.87749
##
## $out
## [1] 70.57 69.81 70.16 74.45 69.89 70.48 70.70 74.55 75.41 70.26 70.43 75.18
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 0.000
## [2,] 0.120
## [3,] 0.555
## [4,] 0.610
## [5,] 1.100
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 0.5020768
## [2,] 0.6079232
##
## $out
## [1] 1.68 6.21 6.21 1.76 1.46 2.70 1.41
##
## $group
## [1] 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 6.93
## [2,] 8.24
## [3,] 8.60
## [4,] 9.18
## [5,] 10.56
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 8.498474
## [2,] 8.701526
##
## $out
## [1] 11.64 10.79 13.24 13.30 16.19 11.52 10.99 14.68 14.96 14.40 11.14 13.44
## [13] 5.87 11.41 11.62 11.53 11.32 12.24 12.50 11.27 10.88 11.22 6.65 5.43
## [25] 5.79 6.47
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] 0
## [2,] 0
##
## $out
## [1] 0.09 0.11 0.69 0.14 0.11 3.15 0.27 0.09 0.06 0.15 2.20 0.24 1.19 1.63 1.68
## [16] 0.76 0.64 0.40 1.59 1.57 0.61 0.81 0.66 0.64 0.53 0.63 0.56 1.71 0.67 1.55
## [31] 1.38 2.88 0.54 1.06 1.59 1.64 1.57 1.67
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] ""
## $stats
## [,1]
## [1,] 0.00
## [2,] 0.00
## [3,] 0.00
## [4,] 0.10
## [5,] 0.25
##
## $n
## [1] 214
##
## $conf
## [,1]
## [1,] -0.01080066
## [2,] 0.01080066
##
## $out
## [1] 0.26 0.30 0.31 0.32 0.34 0.28 0.29 0.28 0.35 0.37 0.51 0.28
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] ""
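Yes. The histograms and boxplots show outliers in most of the predictors: every boxplot except the one for Mg flags points beyond the whiskers. The skewness values also show that every predictor is skewed to some degree: K (6.51), Ba (3.39), and Ca (2.03) are the most strongly right-skewed, Mg (-1.14) and Si (-0.73) are left-skewed, and Na (0.45) is the closest to symmetric.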
Several predictors can benefit from a transformation. For Mg I used a cube transformation, whereas for Al, Si, K, Ca, Ba, and Fe I used a log transformation. These transformations make the distributions closer to normal, which should help improve predictions in the classification model. Note that K, Ba, and Fe contain zeros (their minimum is 0) and log(0) is -Inf, so those observations need an offset such as log(x + 1) to be retained.
## Histograms of the transformed predictors: Mg is cubed, the others are logged.
## K, Ba and Fe contain zeros. log(0) is -Inf and hist() silently discards
## non-finite values, so a plain log would drop every zero observation from
## those three plots. log1p(x) = log(1 + x) is defined at zero and keeps them.
hist((predictors$Mg)^3)
hist(log(predictors$Al))
hist(log(predictors$Si))
hist(log1p(predictors$K))
hist(log(predictors$Ca))
hist(log1p(predictors$Ba))
hist(log1p(predictors$Fe))
When looking at the
## Load the Soybean data set and tabulate the levels and NA count of each column
data("Soybean")
summary(Soybean)
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
## Bar chart of the level counts for every column except the first (Class).
## The loop walks the column names, so the axis label always belongs to the
## column being drawn and no separate position counter has to be kept in step.
## `.data[[...]]` maps the aesthetic to the named column of `Soybean` instead
## of to a free-standing vector that lives outside the plot data.
for (predictor_name in colnames(Soybean)[-1]) {
  plt <- ggplot(data = Soybean, aes(x = .data[[predictor_name]])) +
    geom_bar() +
    xlab(predictor_name)
  # Plots built inside a for loop are not auto-printed, so print explicitly.
  print(plt)
}
Looking at the data, none of the variables is completely degenerate (no column takes only a single value). However, some columns are heavily dominated by one level. mycelium and sclerotia are two such variables: 639 of the 645 observed mycelium values and 625 of the 645 observed sclerotia values are level 0.
## Fraction of missing values in each column of Soybean
lapply(Soybean, function(column) {
  sum(is.na(column)) / length(column)
})
## $Class
## [1] 0
##
## $date
## [1] 0.001464129
##
## $plant.stand
## [1] 0.05270864
##
## $precip
## [1] 0.0556369
##
## $temp
## [1] 0.04392387
##
## $hail
## [1] 0.1771596
##
## $crop.hist
## [1] 0.02342606
##
## $area.dam
## [1] 0.001464129
##
## $sever
## [1] 0.1771596
##
## $seed.tmt
## [1] 0.1771596
##
## $germ
## [1] 0.1639824
##
## $plant.growth
## [1] 0.02342606
##
## $leaves
## [1] 0
##
## $leaf.halo
## [1] 0.1229868
##
## $leaf.marg
## [1] 0.1229868
##
## $leaf.size
## [1] 0.1229868
##
## $leaf.shread
## [1] 0.1464129
##
## $leaf.malf
## [1] 0.1229868
##
## $leaf.mild
## [1] 0.1581259
##
## $stem
## [1] 0.02342606
##
## $lodging
## [1] 0.1771596
##
## $stem.cankers
## [1] 0.0556369
##
## $canker.lesion
## [1] 0.0556369
##
## $fruiting.bodies
## [1] 0.1551977
##
## $ext.decay
## [1] 0.0556369
##
## $mycelium
## [1] 0.0556369
##
## $int.discolor
## [1] 0.0556369
##
## $sclerotia
## [1] 0.0556369
##
## $fruit.pods
## [1] 0.1229868
##
## $fruit.spots
## [1] 0.1551977
##
## $seed
## [1] 0.1346999
##
## $mold.growth
## [1] 0.1346999
##
## $seed.discolor
## [1] 0.1551977
##
## $seed.size
## [1] 0.1346999
##
## $shriveling
## [1] 0.1551977
##
## $roots
## [1] 0.04538799
As shown in the output above, several predictors have more than 10% of their values missing; the highest rate is 17.7%, for hail, sever, seed.tmt, and lodging. The NAs appear to be concentrated in the variables that describe the seed, the fruit, and the leaves.
In looking at the data