Do problems 3.1 and 3.2 in the Kuhn and Johnson book Applied Predictive Modeling.
Question 3.1
The UC Irvine Machine Learning Repository contains a dataset related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via:
library(mlbench)
# Load the Glass data set and show the type and first values of each column.
data("Glass")
str(object = Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
library(lattice)
library(ggplot2)
library(tidyr)
library(purrr)
library(corrplot)
## corrplot 0.95 loaded
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Open the lattice overview page, then preview the first six glass samples.
?lattice
head(Glass, n = 6L)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
Exploring Predictor Variables # The predictor variable is on the x-axis # Na - Sodium
# Scatter plots of the refractive index (RI) against each element percentage.
#
# plot_ri_against() draws one lattice scatter plot with the element on the
# x-axis (predictor) and RI on the y-axis, points grouped by glass Type.
#   element      - name of the element column in Glass, e.g. "Na"
#   element_name - full element name used in the x-axis label and the title
# Returns the lattice plot object; calling it at top level prints the plot.
plot_ri_against <- function(element, element_name) {
  xyplot(
    reformulate(element, response = "RI"), # builds the formula RI ~ <element>
    data = Glass,
    groups = Type, # looked up inside `data`; Type has 6 levels in this sample
    xlab = paste("Percentage of", element_name),
    ylab = "Refractive Index (RI)",
    auto.key = list(columns = 2), # legend laid out in two columns
    type = c("p", "g"), # points plus a background grid
    main = paste("Refractive Index vs", element_name, "Percentage"),
    aspect = 1
  )
}

# Na - Sodium
plot_ri_against("Na", "Sodium")
# Mg - Magnesium
plot_ri_against("Mg", "Magnesium")
# Al - Aluminum
plot_ri_against("Al", "Aluminum")
# Si - Silicon
plot_ri_against("Si", "Silicon")
# K - Potassium
plot_ri_against("K", "Potassium")
# Ca - Calcium
plot_ri_against("Ca", "Calcium")
# Ba - Barium
plot_ri_against("Ba", "Barium")
# Fe - Iron
plot_ri_against("Fe", "Iron")
Observing the relationships between Predictor variables
# Pairwise correlation matrix of the numeric columns of Glass (RI and the
# eight element percentages; the factor Type is dropped by keep()).
# cor() comes from the stats package; corrplot is only used for drawing.
glass_cor <- cor(Glass %>% purrr::keep(is.numeric))

# Draw the matrix as coloured circles. The default margins are all zero, so
# the top margin is enlarged with `mar` to keep the title from being clipped.
corrplot(
  glass_cor,
  method = "circle",
  title = "Correlation Heatmap of Predictor Variables(Elements) For Glass Dataset",
  mar = c(0, 0, 2, 0)
)

# Open the help page of the Glass data set.
help("Glass")
This graph allows us to observe the relationships between all of the numeric variables. Positive correlations are shown in blue and negative correlations in red. A deeper blue indicates a correlation closer to 1, meaning that as one predictor increases the other tends to increase as well. A deeper red indicates a correlation closer to -1, meaning that as one predictor increases the other tends to decrease. A light shade indicates weak or no correlation between the variables. The larger the circle, the stronger the relationship between the two variables.
# Box-and-whisker plots: one plot per element, showing how its percentage is
# distributed within each glass Type (used here to look for outliers).
bwplot(
  x = Na ~ Type,
  data = Glass,
  main = "(Na) Sodium Distribution by Type"
)
bwplot(
  x = Mg ~ Type,
  data = Glass,
  main = " (Mg) Magnesium Distribution by Type"
)
bwplot(
  x = Al ~ Type,
  data = Glass,
  main = "(Al) Aluminum Distribution by Type"
)
bwplot(
  x = Si ~ Type,
  data = Glass,
  main = "(Si) Silicon Distribution by Type"
)
bwplot(
  x = K ~ Type,
  data = Glass,
  main = " (K) Potassium Distribution by Type"
)
bwplot(
  x = Ca ~ Type,
  data = Glass,
  main = " (Ca) Calcium Distribution by Type"
)
bwplot(
  x = Ba ~ Type,
  data = Glass,
  main = " (Ba) Barium Distribution by Type"
)
bwplot(
  x = Fe ~ Type,
  data = Glass,
  main = " (Fe) Iron Distribution by Type"
)
For skewness
library(e1071) # package with skewness fuction
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
# Table with one row per element predictor and its sample skewness.
# USE.NAMES = FALSE keeps the default 1..8 row names of the data frame.
skew_values <- local({
  elements <- c("Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")
  data.frame(
    Predictor = elements,
    Skewness = vapply(Glass[elements], skewness, numeric(1), USE.NAMES = FALSE)
  )
})
print(skew_values)
## Predictor Skewness
## 1 Na 0.4478343
## 2 Mg -1.1364523
## 3 Al 0.8946104
## 4 Si -0.7202392
## 5 K 6.4600889
## 6 Ca 2.0184463
## 7 Ba 3.3686800
## 8 Fe 1.7298107
# Sort the table by ascending skewness (most left-skewed predictor first);
# the original row numbers are kept as row names.
skew_values <- skew_values[with(skew_values, order(Skewness)), ]
print(skew_values)
## Predictor Skewness
## 2 Mg -1.1364523
## 4 Si -0.7202392
## 1 Na 0.4478343
## 3 Al 0.8946104
## 8 Fe 1.7298107
## 6 Ca 2.0184463
## 7 Ba 3.3686800
## 5 K 6.4600889
# Horizontal bar chart of skewness, predictors ordered by their skewness and
# bars shaded by the skewness value. coord_flip() puts the predictors on the
# vertical axis, which is easier to read.
ggplot(
  data = skew_values,
  mapping = aes(
    x = reorder(Predictor, Skewness),
    y = Skewness,
    fill = Skewness
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Skewness of Glass Dataset Predictors",
    x = "Predictor",
    y = "Skewness"
  ) +
  theme_minimal()
On the skewness scale, a value close to zero indicates a roughly symmetric, bell-shaped distribution. A skewness below -0.5 indicates a negatively (left) skewed distribution, where the long tail is on the left side and most of the values lie on the right; a skewness above 0.5 indicates a positively (right) skewed distribution, where the long tail is on the right side and most of the values lie on the left. Using the skewness function, K (Potassium) and Ba (Barium) are strongly right skewed; Fe (Iron), Ca (Calcium), and Al (Aluminum) are moderately right skewed; Na (Sodium) is close to a symmetric bell shape; Si (Silicon) is slightly left skewed; and lastly Mg (Magnesium) is moderately left skewed.
library(caret) # provides preProcess() for estimating data transformations

# Estimate a Box-Cox transformation for the predictors, i.e. every column
# except the outcome `Type` (selected by name instead of by position).
# NOTE(review): Box-Cox is only defined for strictly positive values, so
# predictors containing zeros (Ba and Fe show zeros in str(Glass)) are not
# transformed by this step; method = "YeoJohnson" would also handle zeros.
Glass_trans <- preProcess(Glass[names(Glass) != "Type"], method = "BoxCox")

# Apply the estimated transformation to the same predictor columns.
Glass_pred <- predict(Glass_trans, Glass[names(Glass) != "Type"])

# Density plot of every transformed predictor, one facet per predictor with
# its own axis scales.
Glass_pred %>%
  purrr::keep(is.numeric) %>% # use only numeric columns
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_density()
Need to make graph to compare the before and after like Chapter 3
# Reshape the original predictors from wide to long format with reshape2
# (columns `variable` and `value`); column 10, Type, is left out.
Glass_long <- reshape2::melt(data = Glass[, -10])
## No id variables; using all as measure variables
# Same wide-to-long reshaping for the Box-Cox transformed predictors.
Glass_trans_long <- reshape2::melt(data = Glass_pred)
## No id variables; using all as measure variables
# Histograms for the before/after comparison, one panel per predictor on a
# 3 x 3 grid.
# Original data:
histogram(
  x = ~ value | variable,
  data = Glass_long,
  layout = c(3, 3),
  main = "Original Predictor Variable Distributions",
  xlab = "Value"
)
# Transformed data:
histogram(
  x = ~ value | variable,
  data = Glass_trans_long,
  layout = c(3, 3),
  main = "Box-Cox Transformed Predictor Variable Distributions",
  xlab = "Value"
)
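Here we use the Box-Cox transformation to address the skewness among the predictor variables. Note that Box-Cox is only defined for strictly positive values, so predictors that contain zeros (such as Ba and Fe) are not changed by this step.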
Question 3.2
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
library(mlbench)
# A data set called "Soybean" exists in both mlbench and nlme, so the package
# is named explicitly to always load and document the mlbench version.
data("Soybean", package = "mlbench")
help("Soybean", package = "mlbench")
## Help on topic 'Soybean' was found in the following packages:
##
## Package Library
## mlbench /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## nlme /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##
##
## Using the first match ...
# NOTE(review): second, identical help lookup for the "Soybean" topic.
help("Soybean")
## Help on topic 'Soybean' was found in the following packages:
##
## Package Library
## mlbench /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## nlme /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##
##
## Using the first match ...
# Per-variable overview (n, missing, distinct values, level frequencies) of
# the raw data, before any transformation.
Hmisc::describe(Soybean)
## Soybean
##
## 36 Variables 683 Observations
## --------------------------------------------------------------------------------
## Class
## n missing distinct
## 683 0 19
##
## lowest : 2-4-d-injury alternarialeaf-spot anthracnose bacterial-blight bacterial-pustule
## highest: phyllosticta-leaf-spot phytophthora-rot powdery-mildew purple-seed-stain rhizoctonia-root-rot
## --------------------------------------------------------------------------------
## date
## n missing distinct
## 682 1 7
##
## Value 0 1 2 3 4 5 6
## Frequency 26 75 93 118 131 149 90
## Proportion 0.038 0.110 0.136 0.173 0.192 0.218 0.132
## --------------------------------------------------------------------------------
## plant.stand
## n missing distinct
## 647 36 2
##
## Value 0 1
## Frequency 354 293
## Proportion 0.547 0.453
## --------------------------------------------------------------------------------
## precip
## n missing distinct
## 645 38 3
##
## Value 0 1 2
## Frequency 74 112 459
## Proportion 0.115 0.174 0.712
## --------------------------------------------------------------------------------
## temp
## n missing distinct
## 653 30 3
##
## Value 0 1 2
## Frequency 80 374 199
## Proportion 0.123 0.573 0.305
## --------------------------------------------------------------------------------
## hail
## n missing distinct
## 562 121 2
##
## Value 0 1
## Frequency 435 127
## Proportion 0.774 0.226
## --------------------------------------------------------------------------------
## crop.hist
## n missing distinct
## 667 16 4
##
## Value 0 1 2 3
## Frequency 65 165 219 218
## Proportion 0.097 0.247 0.328 0.327
## --------------------------------------------------------------------------------
## area.dam
## n missing distinct
## 682 1 4
##
## Value 0 1 2 3
## Frequency 123 227 145 187
## Proportion 0.180 0.333 0.213 0.274
## --------------------------------------------------------------------------------
## sever
## n missing distinct
## 562 121 3
##
## Value 0 1 2
## Frequency 195 322 45
## Proportion 0.347 0.573 0.080
## --------------------------------------------------------------------------------
## seed.tmt
## n missing distinct
## 562 121 3
##
## Value 0 1 2
## Frequency 305 222 35
## Proportion 0.543 0.395 0.062
## --------------------------------------------------------------------------------
## germ
## n missing distinct
## 571 112 3
##
## Value 0 1 2
## Frequency 165 213 193
## Proportion 0.289 0.373 0.338
## --------------------------------------------------------------------------------
## plant.growth
## n missing distinct
## 667 16 2
##
## Value 0 1
## Frequency 441 226
## Proportion 0.661 0.339
## --------------------------------------------------------------------------------
## leaves
## n missing distinct
## 683 0 2
##
## Value 0 1
## Frequency 77 606
## Proportion 0.113 0.887
## --------------------------------------------------------------------------------
## leaf.halo
## n missing distinct
## 599 84 3
##
## Value 0 1 2
## Frequency 221 36 342
## Proportion 0.369 0.060 0.571
## --------------------------------------------------------------------------------
## leaf.marg
## n missing distinct
## 599 84 3
##
## Value 0 1 2
## Frequency 357 21 221
## Proportion 0.596 0.035 0.369
## --------------------------------------------------------------------------------
## leaf.size
## n missing distinct
## 599 84 3
##
## Value 0 1 2
## Frequency 51 327 221
## Proportion 0.085 0.546 0.369
## --------------------------------------------------------------------------------
## leaf.shread
## n missing distinct
## 583 100 2
##
## Value 0 1
## Frequency 487 96
## Proportion 0.835 0.165
## --------------------------------------------------------------------------------
## leaf.malf
## n missing distinct
## 599 84 2
##
## Value 0 1
## Frequency 554 45
## Proportion 0.925 0.075
## --------------------------------------------------------------------------------
## leaf.mild
## n missing distinct
## 575 108 3
##
## Value 0 1 2
## Frequency 535 20 20
## Proportion 0.930 0.035 0.035
## --------------------------------------------------------------------------------
## stem
## n missing distinct
## 667 16 2
##
## Value 0 1
## Frequency 296 371
## Proportion 0.444 0.556
## --------------------------------------------------------------------------------
## lodging
## n missing distinct
## 562 121 2
##
## Value 0 1
## Frequency 520 42
## Proportion 0.925 0.075
## --------------------------------------------------------------------------------
## stem.cankers
## n missing distinct
## 645 38 4
##
## Value 0 1 2 3
## Frequency 379 39 36 191
## Proportion 0.588 0.060 0.056 0.296
## --------------------------------------------------------------------------------
## canker.lesion
## n missing distinct
## 645 38 4
##
## Value 0 1 2 3
## Frequency 320 83 177 65
## Proportion 0.496 0.129 0.274 0.101
## --------------------------------------------------------------------------------
## fruiting.bodies
## n missing distinct
## 577 106 2
##
## Value 0 1
## Frequency 473 104
## Proportion 0.82 0.18
## --------------------------------------------------------------------------------
## ext.decay
## n missing distinct
## 645 38 3
##
## Value 0 1 2
## Frequency 497 135 13
## Proportion 0.771 0.209 0.020
## --------------------------------------------------------------------------------
## mycelium
## n missing distinct
## 645 38 2
##
## Value 0 1
## Frequency 639 6
## Proportion 0.991 0.009
## --------------------------------------------------------------------------------
## int.discolor
## n missing distinct
## 645 38 3
##
## Value 0 1 2
## Frequency 581 44 20
## Proportion 0.901 0.068 0.031
## --------------------------------------------------------------------------------
## sclerotia
## n missing distinct
## 645 38 2
##
## Value 0 1
## Frequency 625 20
## Proportion 0.969 0.031
## --------------------------------------------------------------------------------
## fruit.pods
## n missing distinct
## 599 84 4
##
## Value 0 1 2 3
## Frequency 407 130 14 48
## Proportion 0.679 0.217 0.023 0.080
## --------------------------------------------------------------------------------
## fruit.spots
## n missing distinct
## 577 106 4
##
## Value 0 1 2 4
## Frequency 345 75 57 100
## Proportion 0.598 0.130 0.099 0.173
## --------------------------------------------------------------------------------
## seed
## n missing distinct
## 591 92 2
##
## Value 0 1
## Frequency 476 115
## Proportion 0.805 0.195
## --------------------------------------------------------------------------------
## mold.growth
## n missing distinct
## 591 92 2
##
## Value 0 1
## Frequency 524 67
## Proportion 0.887 0.113
## --------------------------------------------------------------------------------
## seed.discolor
## n missing distinct
## 577 106 2
##
## Value 0 1
## Frequency 513 64
## Proportion 0.889 0.111
## --------------------------------------------------------------------------------
## seed.size
## n missing distinct
## 591 92 2
##
## Value 0 1
## Frequency 532 59
## Proportion 0.9 0.1
## --------------------------------------------------------------------------------
## shriveling
## n missing distinct
## 577 106 2
##
## Value 0 1
## Frequency 539 38
## Proportion 0.934 0.066
## --------------------------------------------------------------------------------
## roots
## n missing distinct
## 652 31 3
##
## Value 0 1 2
## Frequency 551 86 15
## Proportion 0.845 0.132 0.023
## --------------------------------------------------------------------------------
# Level counts and NA counts for every column.
summary(object = Soybean)
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
# Keep the factor (categorical) columns of Soybean.
# NOTE(review): every column, including the outcome `Class`, is a factor, so
# this keeps all 36 variables and `Class` ends up in the plots as well.
soybean_cat <- Soybean %>%
  select(where(is.factor))

# Long format for plotting: one row per (variable, category) pair. The factors
# have different level sets, so they are converted to character explicitly
# before stacking instead of relying on attributes being dropped with a
# warning.
soybean_tall <- soybean_cat %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "category")
## Warning: attributes are not identical across measure variables; they will be
## dropped
# Bar chart of category counts, one facet per variable with its own x-axis;
# bars are coloured by category and the legend is hidden.
ggplot(data = soybean_tall, mapping = aes(x = category, fill = category)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(~ variable, scales = "free_x") +
  labs(
    title = "Frequency Distributions for the Categorical Predictors For Soybean Data ",
    x = "Category",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# NOTE: the faceted figure is crowded and hard to read.
Observing the frequency distributions, we can see that some categories have dominant frequencies while others have much lower frequencies. Some variables are skewed heavily toward one category, like plant.stand and seed.tmt. Shriveling is dominated by one category. There also seem to be a lot of missing values, which does have an impact on our data.
# Preview the first six observations.
head(Soybean, n = 6L)
## Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker 6 0 2 1 0 1 1
## 2 diaporthe-stem-canker 4 0 2 1 0 2 0
## 3 diaporthe-stem-canker 3 0 2 1 0 1 0
## 4 diaporthe-stem-canker 3 0 2 1 0 1 0
## 5 diaporthe-stem-canker 6 0 2 1 0 2 0
## 6 diaporthe-stem-canker 5 0 2 1 0 3 0
## sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1 1 0 0 1 1 0 2 2
## 2 2 1 1 1 1 0 2 2
## 3 2 1 2 1 1 0 2 2
## 4 2 0 1 1 1 0 2 2
## 5 1 0 2 1 1 0 2 2
## 6 1 0 1 1 1 0 2 2
## leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 1 0 0 0 1 1 3 1
## 2 0 0 0 1 0 3 1
## 3 0 0 0 1 0 3 0
## 4 0 0 0 1 0 3 0
## 5 0 0 0 1 0 3 1
## 6 0 0 0 1 0 3 0
## fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 1 1 1 0 0 0 0
## 2 1 1 0 0 0 0
## 3 1 1 0 0 0 0
## 4 1 1 0 0 0 0
## 5 1 1 0 0 0 0
## 6 1 1 0 0 0 0
## fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1 4 0 0 0 0 0 0
## 2 4 0 0 0 0 0 0
## 3 4 0 0 0 0 0 0
## 4 4 0 0 0 0 0 0
## 5 4 0 0 0 0 0 0
## 6 4 0 0 0 0 0 0
# Percentage of missing values in each column: is.na() gives a logical matrix
# and the column means of that matrix are the missing fractions.
colMeans(is.na(Soybean)) * 100
## Class date plant.stand precip temp
## 0.0000000 0.1464129 5.2708638 5.5636896 4.3923865
## hail crop.hist area.dam sever seed.tmt
## 17.7159590 2.3426061 0.1464129 17.7159590 17.7159590
## germ plant.growth leaves leaf.halo leaf.marg
## 16.3982430 2.3426061 0.0000000 12.2986823 12.2986823
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 12.2986823 14.6412884 12.2986823 15.8125915 2.3426061
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 17.7159590 5.5636896 5.5636896 15.5197657 5.5636896
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 5.5636896 5.5636896 5.5636896 12.2986823 15.5197657
## seed mold.growth seed.discolor seed.size shriveling
## 13.4699854 13.4699854 15.5197657 13.4699854 15.5197657
## roots
## 4.5387994
We can find the percentage of NA’s in each column.
library("mice") # this package will help with dealing with missing data
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library("naniar") # this package will help with analyzing missing data.
# Overview of where values are missing across the whole data set.
vis_miss(Soybean)

# Missing values per variable: first as counts, then as percentages
# (the percentage version is easier to compare).
gg_miss_var(Soybean) +
  labs(title = "Number of Missing Values With Each Predictor")
gg_miss_var(Soybean, show_pct = TRUE) +
  labs(title = " Percentage of Missing Values With Each Predictor")
About 9.5 percent of the values in the dataset are missing, while 90.5 percent
are present. The gg_miss_var function helps us evaluate the number and
percentage of missing values for each predictor variable. sever (tied with
hail, seed.tmt, and lodging at 121 missing values each) has the most missing
values, while Class and leaves have none.
# Preview the first six rows of the categorical subset.
head(soybean_cat, n = 6L)
## Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker 6 0 2 1 0 1 1
## 2 diaporthe-stem-canker 4 0 2 1 0 2 0
## 3 diaporthe-stem-canker 3 0 2 1 0 1 0
## 4 diaporthe-stem-canker 3 0 2 1 0 1 0
## 5 diaporthe-stem-canker 6 0 2 1 0 2 0
## 6 diaporthe-stem-canker 5 0 2 1 0 3 0
## sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1 1 0 0 1 1 0 2 2
## 2 2 1 1 1 1 0 2 2
## 3 2 1 2 1 1 0 2 2
## 4 2 0 1 1 1 0 2 2
## 5 1 0 2 1 1 0 2 2
## 6 1 0 1 1 1 0 2 2
## leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 1 0 0 0 1 1 3 1
## 2 0 0 0 1 0 3 1
## 3 0 0 0 1 0 3 0
## 4 0 0 0 1 0 3 0
## 5 0 0 0 1 0 3 1
## 6 0 0 0 1 0 3 0
## fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 1 1 1 0 0 0 0
## 2 1 1 0 0 0 0
## 3 1 1 0 0 0 0
## 4 1 1 0 0 0 0
## 5 1 1 0 0 0 0
## 6 1 1 0 0 0 0
## fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1 4 0 0 0 0 0 0
## 2 4 0 0 0 0 0 0
## 3 4 0 0 0 0 0 0
## 4 4 0 0 0 0 0 0
## 5 4 0 0 0 0 0 0
## 6 4 0 0 0 0 0 0
# Per-variable overview of the categorical subset (describe() is from Hmisc).
Hmisc::describe(soybean_cat)
## soybean_cat
##
## 36 Variables 683 Observations
## --------------------------------------------------------------------------------
## Class
## n missing distinct
## 683 0 19
##
## lowest : 2-4-d-injury alternarialeaf-spot anthracnose bacterial-blight bacterial-pustule
## highest: phyllosticta-leaf-spot phytophthora-rot powdery-mildew purple-seed-stain rhizoctonia-root-rot
## --------------------------------------------------------------------------------
## date
## n missing distinct
## 682 1 7
##
## Value 0 1 2 3 4 5 6
## Frequency 26 75 93 118 131 149 90
## Proportion 0.038 0.110 0.136 0.173 0.192 0.218 0.132
## --------------------------------------------------------------------------------
## plant.stand
## n missing distinct
## 647 36 2
##
## Value 0 1
## Frequency 354 293
## Proportion 0.547 0.453
## --------------------------------------------------------------------------------
## precip
## n missing distinct
## 645 38 3
##
## Value 0 1 2
## Frequency 74 112 459
## Proportion 0.115 0.174 0.712
## --------------------------------------------------------------------------------
## temp
## n missing distinct
## 653 30 3
##
## Value 0 1 2
## Frequency 80 374 199
## Proportion 0.123 0.573 0.305
## --------------------------------------------------------------------------------
## hail
## n missing distinct
## 562 121 2
##
## Value 0 1
## Frequency 435 127
## Proportion 0.774 0.226
## --------------------------------------------------------------------------------
## crop.hist
## n missing distinct
## 667 16 4
##
## Value 0 1 2 3
## Frequency 65 165 219 218
## Proportion 0.097 0.247 0.328 0.327
## --------------------------------------------------------------------------------
## area.dam
## n missing distinct
## 682 1 4
##
## Value 0 1 2 3
## Frequency 123 227 145 187
## Proportion 0.180 0.333 0.213 0.274
## --------------------------------------------------------------------------------
## sever
## n missing distinct
## 562 121 3
##
## Value 0 1 2
## Frequency 195 322 45
## Proportion 0.347 0.573 0.080
## --------------------------------------------------------------------------------
## seed.tmt
## n missing distinct
## 562 121 3
##
## Value 0 1 2
## Frequency 305 222 35
## Proportion 0.543 0.395 0.062
## --------------------------------------------------------------------------------
## germ
## n missing distinct
## 571 112 3
##
## Value 0 1 2
## Frequency 165 213 193
## Proportion 0.289 0.373 0.338
## --------------------------------------------------------------------------------
## plant.growth
## n missing distinct
## 667 16 2
##
## Value 0 1
## Frequency 441 226
## Proportion 0.661 0.339
## --------------------------------------------------------------------------------
## leaves
## n missing distinct
## 683 0 2
##
## Value 0 1
## Frequency 77 606
## Proportion 0.113 0.887
## --------------------------------------------------------------------------------
## leaf.halo
## n missing distinct
## 599 84 3
##
## Value 0 1 2
## Frequency 221 36 342
## Proportion 0.369 0.060 0.571
## --------------------------------------------------------------------------------
## leaf.marg
## n missing distinct
## 599 84 3
##
## Value 0 1 2
## Frequency 357 21 221
## Proportion 0.596 0.035 0.369
## --------------------------------------------------------------------------------
## leaf.size
## n missing distinct
## 599 84 3
##
## Value 0 1 2
## Frequency 51 327 221
## Proportion 0.085 0.546 0.369
## --------------------------------------------------------------------------------
## leaf.shread
## n missing distinct
## 583 100 2
##
## Value 0 1
## Frequency 487 96
## Proportion 0.835 0.165
## --------------------------------------------------------------------------------
## leaf.malf
## n missing distinct
## 599 84 2
##
## Value 0 1
## Frequency 554 45
## Proportion 0.925 0.075
## --------------------------------------------------------------------------------
## leaf.mild
## n missing distinct
## 575 108 3
##
## Value 0 1 2
## Frequency 535 20 20
## Proportion 0.930 0.035 0.035
## --------------------------------------------------------------------------------
## stem
## n missing distinct
## 667 16 2
##
## Value 0 1
## Frequency 296 371
## Proportion 0.444 0.556
## --------------------------------------------------------------------------------
## lodging
## n missing distinct
## 562 121 2
##
## Value 0 1
## Frequency 520 42
## Proportion 0.925 0.075
## --------------------------------------------------------------------------------
## stem.cankers
## n missing distinct
## 645 38 4
##
## Value 0 1 2 3
## Frequency 379 39 36 191
## Proportion 0.588 0.060 0.056 0.296
## --------------------------------------------------------------------------------
## canker.lesion
## n missing distinct
## 645 38 4
##
## Value 0 1 2 3
## Frequency 320 83 177 65
## Proportion 0.496 0.129 0.274 0.101
## --------------------------------------------------------------------------------
## fruiting.bodies
## n missing distinct
## 577 106 2
##
## Value 0 1
## Frequency 473 104
## Proportion 0.82 0.18
## --------------------------------------------------------------------------------
## ext.decay
## n missing distinct
## 645 38 3
##
## Value 0 1 2
## Frequency 497 135 13
## Proportion 0.771 0.209 0.020
## --------------------------------------------------------------------------------
## mycelium
## n missing distinct
## 645 38 2
##
## Value 0 1
## Frequency 639 6
## Proportion 0.991 0.009
## --------------------------------------------------------------------------------
## int.discolor
## n missing distinct
## 645 38 3
##
## Value 0 1 2
## Frequency 581 44 20
## Proportion 0.901 0.068 0.031
## --------------------------------------------------------------------------------
## sclerotia
## n missing distinct
## 645 38 2
##
## Value 0 1
## Frequency 625 20
## Proportion 0.969 0.031
## --------------------------------------------------------------------------------
## fruit.pods
## n missing distinct
## 599 84 4
##
## Value 0 1 2 3
## Frequency 407 130 14 48
## Proportion 0.679 0.217 0.023 0.080
## --------------------------------------------------------------------------------
## fruit.spots
## n missing distinct
## 577 106 4
##
## Value 0 1 2 4
## Frequency 345 75 57 100
## Proportion 0.598 0.130 0.099 0.173
## --------------------------------------------------------------------------------
## seed
## n missing distinct
## 591 92 2
##
## Value 0 1
## Frequency 476 115
## Proportion 0.805 0.195
## --------------------------------------------------------------------------------
## mold.growth
## n missing distinct
## 591 92 2
##
## Value 0 1
## Frequency 524 67
## Proportion 0.887 0.113
## --------------------------------------------------------------------------------
## seed.discolor
## n missing distinct
## 577 106 2
##
## Value 0 1
## Frequency 513 64
## Proportion 0.889 0.111
## --------------------------------------------------------------------------------
## seed.size
## n missing distinct
## 591 92 2
##
## Value 0 1
## Frequency 532 59
## Proportion 0.9 0.1
## --------------------------------------------------------------------------------
## shriveling
## n missing distinct
## 577 106 2
##
## Value 0 1
## Frequency 539 38
## Proportion 0.934 0.066
## --------------------------------------------------------------------------------
## roots
## n missing distinct
## 652 31 3
##
## Value 0 1 2
## Frequency 551 86 15
## Proportion 0.845 0.132 0.023
## --------------------------------------------------------------------------------
# Total number of missing cells across the whole data frame.
sum(is.na(soybean_cat))
## [1] 2337
# Percentage of missing values in each column. vapply() pins the result to one
# numeric value per column, whereas sapply()'s return type depends on its input.
vapply(soybean_cat, function(x) mean(is.na(x)) * 100, numeric(1))
## Class date plant.stand precip temp
## 0.0000000 0.1464129 5.2708638 5.5636896 4.3923865
## hail crop.hist area.dam sever seed.tmt
## 17.7159590 2.3426061 0.1464129 17.7159590 17.7159590
## germ plant.growth leaves leaf.halo leaf.marg
## 16.3982430 2.3426061 0.0000000 12.2986823 12.2986823
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 12.2986823 14.6412884 12.2986823 15.8125915 2.3426061
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 17.7159590 5.5636896 5.5636896 15.5197657 5.5636896
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 5.5636896 5.5636896 5.5636896 12.2986823 15.5197657
## seed mold.growth seed.discolor seed.size shriveling
## 13.4699854 13.4699854 15.5197657 13.4699854 15.5197657
## roots
## 4.5387994
# gg_miss_var() plots the number of missing values by default; show_pct = TRUE
# puts the percentage on the axis so the chart agrees with its title.
gg_miss_var(soybean_cat, show_pct = TRUE) +
  labs(title = "Percentage of Missing With Each Predictor")
Step 2: Handle missing data
# Listwise deletion: keep only the rows that have no missing value in any column.
soybean_complete <- na.omit(soybean_cat)
# Confirm that no missing values remain.
sum(is.na(soybean_complete))
## [1] 0
# Number of rows and columns left after dropping the incomplete rows.
dim(soybean_complete)
## [1] 562 36
Step 3: Analyze the impact of missing data
# Multiple imputation by chained equations.
#   method = "polyreg": polytomous logistic regression, for categorical data
#   m = 5:              create five differently imputed versions of the data set
#   seed = 123:         fix the random draws so the imputations are reproducible
soybean_mice <- mice(soybean_cat, method = "polyreg", m = 5, seed = 123)
##
## iter imp variable
## 1 1 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1 2 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1 3 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1 4 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1 5 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 2 1 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 2 2 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 2 3 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 2 4 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 2 5 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 3 1 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 3 2 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 3 3 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 3 4 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 3 5 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 4 1 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 4 2 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 4 3 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 4 4 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 4 5 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 5 1 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 5 2 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 5 3 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 5 4 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 5 5 date plant.stand precip temp hail crop.hist area.dam sever seed.tmt germ plant.growth leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## Warning: Number of logged events: 844
# Extract the first completed data set (action = 1 is the default). The explicit
# namespace prevents an accidental call to tidyr::complete(), which is also
# attached in this session and shares the same name.
soybean_imputed_mice <- mice::complete(soybean_mice, action = 1)
# Compare the number of rows before and after imputation.
dim(soybean_complete)
## [1] 562 36
# Imputation keeps every original row, whereas na.omit() dropped the incomplete ones.
dim(soybean_imputed_mice)
## [1] 683 36
# Reshape both data frames into long format (one row per observation/variable
# pair) so their category frequencies can be compared.
# pivot_longer() replaces the superseded gather(). Converting every column to
# character first gives the differently levelled factors a common type, so no
# "attributes are not identical" warning is raised.
# Note: rows come out ordered by observation rather than by variable, which
# makes no difference to frequency counts.
soybean_cat_tall <- soybean_cat %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "category",
    values_transform = list(category = as.character)
  )
soybean_imputed_mice_tall <- soybean_imputed_mice %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "category",
    values_transform = list(category = as.character)
  )
# Draw one bar chart per variable showing how often each category occurs.
#   data:  long-format data frame with `variable` and `category` columns
#   title: plot title
# Returns the ggplot object (auto-printed when called at top level).
plot_category_frequencies <- function(data, title) {
  ggplot(data, aes(x = category)) +
    geom_bar(aes(fill = category), show.legend = FALSE) +
    facet_wrap(~ variable, scales = "free_x") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = title, x = "Category", y = "Frequency")
}

# Distinct titles make clear which chart shows the original data (missing
# values included) and which shows the imputed data.
plot_category_frequencies(
  soybean_cat_tall,
  "Frequency Distributions for the Categorical Predictors For Soybean Data (Before Imputation)"
)
plot_category_frequencies(
  soybean_imputed_mice_tall,
  "Frequency Distributions for the Categorical Predictors For Soybean Data (After MICE Imputation)"
)
I used the MICE (Multivariate Imputation by Chained Equations) strategy because it can handle both the categorical and the continuous variables within a data set. The polyreg method (polytomous logistic regression) was important here because many predictors have multiple categories, so an imputed value can take one of several possible outcomes. Using polyreg, we can handle categorical predictors that have different numbers of levels. This is seen in the coded values 0, 1, 2, 3, and 4, each of which represents a different level of a predictor variable. There are also several levels of categories in the data, for example in mold growth and germ, which indicate the health of the bean. Using MICE, we can create five different versions of the imputed values so that we can compare how each version handles the missing data. Also, by using MICE we obtain a more complete data set, which can improve the performance of the model, and we eliminate the missing values without losing crucial information.