## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Homework Assignment 4

3.1

# Load the Glass data set and show the type and first values of each column.
data("Glass")
str(object = Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

a.

# Draw one histogram per column of Glass, in column order (the nine predictors
# followed by the Type factor), instead of ten copy-pasted calls.
# The x axis is labelled with the bare column name rather than "Glass$<name>".
# NOTE(review): histogram() is presumably lattice::histogram(); lattice plots
# are not auto-printed inside a loop, hence the explicit print().
for (glass_col in names(Glass)) {
  print(histogram(Glass[[glass_col]], xlab = glass_col))
}

In these histograms, the x axis shows the values that each predictor takes across the samples, and the y axis shows the percentage of all samples that fall into each bin.

b.

None of the predictor variables is free of skew; each one is skewed either to the right or to the left. Ca, Si, and RI have a high concentration of data points around the center but very few observations in either tail. Fe, Ba, and K all have several zero values, which in some instances creates a bimodal distribution of the data. K and Na might have some outliers in the data set.

c.

Because several predictors contain zeros, Box-Cox and log transformations cannot be applied directly to resolve the skewness, since both require strictly positive values. To reduce the sensitivity to outliers, the spatial sign transformation can be used.

3.2

# Load the Soybean data set and show the type and levels of each column.
data(Soybean)
str(object = Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

a.

?Soybean
# Frequency table of the `date` factor, including its NA level.
# plyr::count() is called through its namespace rather than via library(plyr):
# attaching plyr after dplyr masks dplyr verbs such as count(), mutate() and
# summarise(), which plyr's own startup message warns against.
y1 <- plyr::count(Soybean, "date")
y1
##   date freq
## 1    0   26
## 2    1   75
## 3    2   93
## 4    3  118
## 5    4  131
## 6    5  149
## 7    6   90
## 8 <NA>    1
# Tabulate every column's levels, always listing an <NA> count (even when 0).
lapply(X = Soybean, FUN = table, useNA = "always")
## $Class
## 
##                2-4-d-injury         alternarialeaf-spot 
##                          16                          91 
##                 anthracnose            bacterial-blight 
##                          44                          20 
##           bacterial-pustule                  brown-spot 
##                          20                          92 
##              brown-stem-rot                charcoal-rot 
##                          44                          20 
##               cyst-nematode diaporthe-pod-&-stem-blight 
##                          14                          15 
##       diaporthe-stem-canker                downy-mildew 
##                          20                          20 
##          frog-eye-leaf-spot            herbicide-injury 
##                          91                           8 
##      phyllosticta-leaf-spot            phytophthora-rot 
##                          20                          88 
##              powdery-mildew           purple-seed-stain 
##                          20                          20 
##        rhizoctonia-root-rot                        <NA> 
##                          20                           0 
## 
## $date
## 
##    0    1    2    3    4    5    6 <NA> 
##   26   75   93  118  131  149   90    1 
## 
## $plant.stand
## 
##    0    1 <NA> 
##  354  293   36 
## 
## $precip
## 
##    0    1    2 <NA> 
##   74  112  459   38 
## 
## $temp
## 
##    0    1    2 <NA> 
##   80  374  199   30 
## 
## $hail
## 
##    0    1 <NA> 
##  435  127  121 
## 
## $crop.hist
## 
##    0    1    2    3 <NA> 
##   65  165  219  218   16 
## 
## $area.dam
## 
##    0    1    2    3 <NA> 
##  123  227  145  187    1 
## 
## $sever
## 
##    0    1    2 <NA> 
##  195  322   45  121 
## 
## $seed.tmt
## 
##    0    1    2 <NA> 
##  305  222   35  121 
## 
## $germ
## 
##    0    1    2 <NA> 
##  165  213  193  112 
## 
## $plant.growth
## 
##    0    1 <NA> 
##  441  226   16 
## 
## $leaves
## 
##    0    1 <NA> 
##   77  606    0 
## 
## $leaf.halo
## 
##    0    1    2 <NA> 
##  221   36  342   84 
## 
## $leaf.marg
## 
##    0    1    2 <NA> 
##  357   21  221   84 
## 
## $leaf.size
## 
##    0    1    2 <NA> 
##   51  327  221   84 
## 
## $leaf.shread
## 
##    0    1 <NA> 
##  487   96  100 
## 
## $leaf.malf
## 
##    0    1 <NA> 
##  554   45   84 
## 
## $leaf.mild
## 
##    0    1    2 <NA> 
##  535   20   20  108 
## 
## $stem
## 
##    0    1 <NA> 
##  296  371   16 
## 
## $lodging
## 
##    0    1 <NA> 
##  520   42  121 
## 
## $stem.cankers
## 
##    0    1    2    3 <NA> 
##  379   39   36  191   38 
## 
## $canker.lesion
## 
##    0    1    2    3 <NA> 
##  320   83  177   65   38 
## 
## $fruiting.bodies
## 
##    0    1 <NA> 
##  473  104  106 
## 
## $ext.decay
## 
##    0    1    2 <NA> 
##  497  135   13   38 
## 
## $mycelium
## 
##    0    1 <NA> 
##  639    6   38 
## 
## $int.discolor
## 
##    0    1    2 <NA> 
##  581   44   20   38 
## 
## $sclerotia
## 
##    0    1 <NA> 
##  625   20   38 
## 
## $fruit.pods
## 
##    0    1    2    3 <NA> 
##  407  130   14   48   84 
## 
## $fruit.spots
## 
##    0    1    2    4 <NA> 
##  345   75   57  100  106 
## 
## $seed
## 
##    0    1 <NA> 
##  476  115   92 
## 
## $mold.growth
## 
##    0    1 <NA> 
##  524   67   92 
## 
## $seed.discolor
## 
##    0    1 <NA> 
##  513   64  106 
## 
## $seed.size
## 
##    0    1 <NA> 
##  532   59   92 
## 
## $shriveling
## 
##    0    1 <NA> 
##  539   38  106 
## 
## $roots
## 
##    0    1    2 <NA> 
##  551   86   15   31

There appear to be many NA values in this dataset. The missing data will have a significant effect on the frequency distributions of the predictors. We could find out which predictors carry the most NAs and use an imputation technique to predict what those values might have been.

b.

# Cross-tabulate each class against whether its rows have no missing values.
table(Soybean[["Class"]], complete.cases(Soybean))
##                              
##                               FALSE TRUE
##   2-4-d-injury                   16    0
##   alternarialeaf-spot             0   91
##   anthracnose                     0   44
##   bacterial-blight                0   20
##   bacterial-pustule               0   20
##   brown-spot                      0   92
##   brown-stem-rot                  0   44
##   charcoal-rot                    0   20
##   cyst-nematode                  14    0
##   diaporthe-pod-&-stem-blight    15    0
##   diaporthe-stem-canker           0   20
##   downy-mildew                    0   20
##   frog-eye-leaf-spot              0   91
##   herbicide-injury                8    0
##   phyllosticta-leaf-spot          0   20
##   phytophthora-rot               68   20
##   powdery-mildew                  0   20
##   purple-seed-stain               0   20
##   rhizoctonia-root-rot            0   20
# Count the NA cells within each Class group. do() evaluates the expression
# once per group with `.` bound to that group's rows; data.frame() derives the
# result column's name from the expression itself.
soybean3 <- do(group_by(Soybean, Class), data.frame(sum(is.na(.))))
soybean3
## # A tibble: 19 x 2
## # Groups:   Class [19]
##    Class                       sum.is.na....
##    <fct>                               <int>
##  1 2-4-d-injury                          450
##  2 alternarialeaf-spot                     0
##  3 anthracnose                             0
##  4 bacterial-blight                        0
##  5 bacterial-pustule                       0
##  6 brown-spot                              0
##  7 brown-stem-rot                          0
##  8 charcoal-rot                            0
##  9 cyst-nematode                         336
## 10 diaporthe-pod-&-stem-blight           177
## 11 diaporthe-stem-canker                   0
## 12 downy-mildew                            0
## 13 frog-eye-leaf-spot                      0
## 14 herbicide-injury                      160
## 15 phyllosticta-leaf-spot                  0
## 16 phytophthora-rot                     1214
## 17 powdery-mildew                          0
## 18 purple-seed-stain                       0
## 19 rhizoctonia-root-rot                    0
# Give the second column (the per-class NA count) a readable name.
names(soybean3)[2] <- "NAs"
soybean3
## # A tibble: 19 x 2
## # Groups:   Class [19]
##    Class                         NAs
##    <fct>                       <int>
##  1 2-4-d-injury                  450
##  2 alternarialeaf-spot             0
##  3 anthracnose                     0
##  4 bacterial-blight                0
##  5 bacterial-pustule               0
##  6 brown-spot                      0
##  7 brown-stem-rot                  0
##  8 charcoal-rot                    0
##  9 cyst-nematode                 336
## 10 diaporthe-pod-&-stem-blight   177
## 11 diaporthe-stem-canker           0
## 12 downy-mildew                    0
## 13 frog-eye-leaf-spot              0
## 14 herbicide-injury              160
## 15 phyllosticta-leaf-spot          0
## 16 phytophthora-rot             1214
## 17 powdery-mildew                  0
## 18 purple-seed-stain               0
## 19 rhizoctonia-root-rot            0
# Bar chart of the number of NA cells per class, tallest bar first.
# The x axis shows Class (the outcome), so it is labelled "Class" rather than
# "Explanatory Variable".
ggplot(soybean3, aes(x = reorder(Class, -NAs), y = NAs)) +
  geom_bar(stat = "identity") +
  # Rotate the long class names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Class") +
  ylab("Number of NAs")

We can see that all of the missing values come from five classes: phytophthora-rot, 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight, and herbicide-injury.

# Number of missing values in each column of Soybean.
Soybean %>%
  is.na() %>%
  colSums()
##           Class            date     plant.stand          precip 
##               0               1              36              38 
##            temp            hail       crop.hist        area.dam 
##              30             121              16               1 
##           sever        seed.tmt            germ    plant.growth 
##             121             121             112              16 
##          leaves       leaf.halo       leaf.marg       leaf.size 
##               0              84              84              84 
##     leaf.shread       leaf.malf       leaf.mild            stem 
##             100              84             108              16 
##         lodging    stem.cankers   canker.lesion fruiting.bodies 
##             121              38              38             106 
##       ext.decay        mycelium    int.discolor       sclerotia 
##              38              38              38              38 
##      fruit.pods     fruit.spots            seed     mold.growth 
##              84             106              92              92 
##   seed.discolor       seed.size      shriveling           roots 
##             106              92             106              31

The predictors with the most missing values are fruiting.bodies, leaf.mild, leaf.shread, lodging, sever, seed.tmt, germ, hail, and seed.discolor. These missing values most likely all belong to the few classes identified above as having incomplete records.

c.

To deal with the large number of missing values, I would omit phytophthora-rot from the model and then use a modeling technique that is not sensitive to NA values, such as a tree-based model.