library(mlbench)
library(corrplot)
library(ggplot2)
library(gridExtra)
library(car)
library(caret)
library(tidyverse)
library(DT)
The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories (type 4 does not occur in the sample, so the Type factor has only six levels). There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via:
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
grid.arrange(ggplot(Glass, aes(RI)) + geom_histogram(binwidth = .001), #very narrow, roughly normal distribution
             ggplot(Glass, aes(Na)) + geom_histogram(binwidth = .5),   #roughly normal distribution
             ggplot(Glass, aes(Mg)) + geom_histogram(binwidth = .25),  #left-skewed, with a large cluster at zero
             ggplot(Glass, aes(Al)) + geom_histogram(binwidth = .25),  #roughly normal distribution
             ggplot(Glass, aes(Si)) + geom_histogram(binwidth = .5),   #roughly normal distribution
             ggplot(Glass, aes(K)) + geom_histogram(binwidth = .25),   #right-skewed, seems to have an outlier
             ggplot(Glass, aes(Ca)) + geom_histogram(binwidth = 1),    #right-skewed distribution
             ggplot(Glass, aes(Ba)) + geom_histogram(binwidth = .25),  #right-skewed, mostly zero
             ggplot(Glass, aes(Fe)) + geom_histogram(binwidth = .05),  #right-skewed, mostly zero
             ncol=3)
A closer look at these distributions shows that Mg, Fe, Ba, and Ca are highly skewed.
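To quantify what the histograms suggest, we can compute the sample skewness of each predictor; a quick sketch using e1071::skewness:
library(e1071)
sapply(subset(Glass, select = -c(Type)), skewness) # values far from zero indicate strong skew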
colnames(subset(Glass, select = -c(Type)))
## [1] "RI" "Na" "Mg" "Al" "Si" "K" "Ca" "Ba" "Fe"
par(mfrow=c(3,3), cex=.8, mai=c(0,0,0.2,0))
invisible(qqPlot(~ RI, data = Glass, main = "RI"))
invisible(qqPlot(~ Na, data = Glass, main = "Na"))
invisible(qqPlot(~ Mg, data = Glass, main = "Mg"))
invisible(qqPlot(~ Al, data = Glass, main = "Al"))
invisible(qqPlot(~ Si, data = Glass, main = "Si"))
invisible(qqPlot(~ K, data = Glass, main = "K"))
invisible(qqPlot(~ Ca, data = Glass, main = "Ca"))
invisible(qqPlot(~ Ba, data = Glass, main = "Ba"))
invisible(qqPlot(~ Fe, data = Glass, main = "Fe"))
A few outliers were already visible in the histograms. Boxplots of the features make them easier to identify: it looks like every field except Mg has outliers.
grid.arrange(ggplot(Glass, aes(x="x", y = RI)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Na)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Mg)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Al)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Si)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = K)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Ca)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Ba)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ggplot(Glass, aes(x="x", y = Fe)) + geom_boxplot(outlier.colour="red", outlier.shape=16, outlier.size=1),
             ncol=3)
The correlation plot below highlights relationships among the glass features: Ca is strongly positively correlated with RI, while Si is negatively correlated with RI. A VIF statistic can also indicate multicollinearity in the data.
corrplot(cor(subset(Glass, select = -c(Type)), use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)
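As a sketch of the VIF check, one predictor can be regressed on the others and car::vif applied to the fit; treating RI as the response here is purely an illustrative assumption:
vif(lm(RI ~ ., data = subset(Glass, select = -c(Type)))) # values well above 5-10 suggest multicollinearity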
Let’s take a look at scatter plots of some of the highly correlated features. From these scatter plots, it seems that the Ca/RI and Si/RI correlations may cause issues in the model.
grid.arrange(ggplot(Glass, aes(Ca, RI)) + geom_point(),
ggplot(Glass, aes(Si, RI)) + geom_point(),
ggplot(Glass, aes(Mg, Ba)) + geom_point(),
ggplot(Glass, aes(Al, Ba)) + geom_point(),
ncol=2)
Based on the boxplots, it looks like all fields other than Mg have outliers. The histograms and Q-Q plots indicate that the distributions of Mg, Fe, Ba, and Ca are highly skewed.
A common way to address skewed features is a Box-Cox, log, square-root, or inverse transformation. However, none of these transformations seemed to significantly improve this model. Scaling may also help center a distribution, but it did not help in this case. Since some features have many values near 0, it may help to bin those predictors, as sketched below.
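A minimal sketch of such binning, assuming a simple zero/positive split is reasonable for a mostly-zero predictor like Ba:
ba_bin <- cut(Glass$Ba, breaks = c(-Inf, 0, Inf), labels = c("zero", "positive")) # presence/absence indicator
table(ba_bin) # most samples fall in the "zero" bin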
BoxCoxTrans(Glass$Na)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.73 12.91 13.30 13.41 13.82 17.38
##
## Largest/Smallest: 1.62
## Sample Skewness: 0.448
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
BoxCoxTrans(Glass$Ca)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.430 8.240 8.600 8.957 9.172 16.190
##
## Largest/Smallest: 2.98
## Sample Skewness: 2.02
##
## Estimated Lambda: -1.1
BoxCoxTrans(Glass$Si)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.81 72.28 72.79 72.65 73.09 75.41
##
## Largest/Smallest: 1.08
## Sample Skewness: -0.72
##
## Estimated Lambda: 2
BoxCoxTrans(Glass$Al)
## Box-Cox Transformation
##
## 214 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.290 1.190 1.360 1.445 1.630 3.500
##
## Largest/Smallest: 12.1
## Sample Skewness: 0.895
##
## Estimated Lambda: 0.5
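To actually apply one of these transformations, the object returned by BoxCoxTrans can be passed to predict(); for example, for Al (estimated lambda = 0.5, roughly a square root):
al_trans <- predict(BoxCoxTrans(Glass$Al), Glass$Al) # transformed copy of Al; Glass itself is unchanged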
RI exhibits multicollinearity with other features in the dataset. One approach to resolve this is to remove the predictor; another is to apply PCA to the dataset.
Glass = subset(Glass, select = -c(RI)) # drop RI to address the multicollinearity
pca <- prcomp(subset(Glass, select = -c(Type)), scale. = TRUE, center = TRUE)$x # principal component scores
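Before carrying all of the component scores forward, it is worth checking how much variance each component captures; summary() on the prcomp object (before extracting $x) reports the cumulative proportions:
summary(prcomp(subset(Glass, select = -c(Type)), scale. = TRUE, center = TRUE)) # proportion of variance per component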
We can also apply a spatial sign transformation, which projects each sample onto the unit sphere and dampens the influence of outliers. (The model below uses the PCA scores rather than the spatial-sign data.)
sst <- spatialSign(subset(Glass, select = -c(Type)))
Glass_sst <- data.frame(sst, Type = Glass$Type) # data.frame() keeps Type a factor; cbind() would coerce it to integer codes
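Each spatially signed sample lies on the unit sphere, which can be verified directly:
range(rowSums(sst^2)) # every row should have a squared norm of 1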
library(caTools)
set.seed(123)
Type <- Glass$Type
Glass <- data.frame(pca, Type) # again use data.frame() so Type keeps its factor levels (1, 2, 3, 5, 6, 7)
split = sample.split(Glass$Type, SplitRatio = 0.75)
training_set = subset(Glass, split == TRUE)
test_set = subset(Glass, split == FALSE)
library("e1071")
classifier = svm(formula = Type ~ .,
data = training_set,
type = 'C-classification',
kernel = 'linear')
y_pred = predict(classifier, newdata = test_set[-9]) # drop the Type column before predicting
cm = table(test_set[, 9], y_pred) # rows: actual class, columns: predicted class
prop.table(cm, margin = 2) # column-wise proportions
## y_pred
##              1          2          3          5          6
##   1 0.70000000 0.18181818 0.00000000 0.00000000 0.00000000
##   2 0.20000000 0.59090909 0.40000000 0.00000000 0.00000000
##   3 0.05000000 0.13636364 0.00000000 0.00000000 0.00000000
##   5 0.00000000 0.00000000 0.60000000 0.00000000 0.00000000
##   6 0.00000000 0.04545455 0.00000000 1.00000000 0.00000000
##   7 0.05000000 0.04545455 0.00000000 0.00000000 1.00000000
#check how many accurate predictions there were
summary(test_set[, 9] == y_pred)
## Mode FALSE TRUE
## logical 17 36
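That is 36 correct predictions out of 53 test samples, an accuracy of roughly 68%.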
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
data(Soybean)
## See ?Soybean for details
For the following predictors the fraction of unique values is low, i.e., they are degenerate: leaf.malf, leaf.mild, lodging, mycelium, sclerotia, and shriveling. These predictors should probably be removed.
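One way to flag such degenerate predictors programmatically is caret's near-zero-variance check, which applies the same idea; a sketch:
nzv <- nearZeroVar(Soybean, saveMetrics = TRUE) # freqRatio and percentUnique for every column
rownames(nzv)[nzv$nzv] # names of the near-zero-variance predictors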
summary(Soybean)
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
library(tidyr)
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
res <- Soybean %>%
  gather("predictor", "value", 2:36) %>% # long format: one row per (sample, predictor) pair
  group_by(Class, predictor) %>%
  summarise(na.count=sum(is.na(value))) %>% # count missing values per class/predictor
  filter(na.count > 1)
datatable(cast(res, predictor ~ Class))
## Using na.count as value column. Use the value argument to cast to override this choice
There are 19 classes, only the first 15 of which have been used in prior work. The folklore seems to be that the last four classes are unjustified by the data since they have so few examples. There are 35 categorical attributes, some nominal and some ordered. The value “dna” means does not apply. The values for attributes are encoded numerically, with the first value encoded as “0,” the second as “1,” and so forth.
The following table shows the missing rate of each predictor.
res <- Soybean %>%
  gather("predictor", "value", 2:36) %>%
  group_by(predictor) %>%
  summarise(na.percent=sum(is.na(value))/n()) %>%
  arrange(desc(na.percent))
## Warning: attributes are not identical across measure variables;
## they will be dropped
res
## # A tibble: 35 x 2
## predictor na.percent
## <chr> <dbl>
## 1 hail 0.177
## 2 lodging 0.177
## 3 seed.tmt 0.177
## 4 sever 0.177
## 5 germ 0.164
## 6 leaf.mild 0.158
## 7 fruit.spots 0.155
## 8 fruiting.bodies 0.155
## 9 seed.discolor 0.155
## 10 shriveling 0.155
## # ... with 25 more rows
There is considerable overlap among the rows that are missing data, so one option would be to drop incomplete rows. The concern with dropping rows with multiple missing values, however, is that certain classes would be left with too little data to classify. Instead, I will drop the columns that are degenerate and have many missing values, and impute the rest.
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Drop the degenerate predictors identified above, then impute the remaining NAs with k nearest neighbours
new_Soybean <- knnImputation(subset(Soybean, select = -c(leaf.malf, leaf.mild, lodging, mycelium, sclerotia, shriveling)))
summary(new_Soybean)
## Class date plant.stand precip temp hail
## brown-spot : 92 0: 26 0:383 0: 86 0: 96 0:542
## alternarialeaf-spot: 91 1: 76 1:300 1:136 1:382 1:141
## frog-eye-leaf-spot : 91 2: 93 2:461 2:205
## phytophthora-rot : 88 3:118
## anthracnose : 44 4:131
## brown-stem-rot : 44 5:149
## (Other) :233 6: 90
## crop.hist area.dam sever seed.tmt germ plant.growth leaves leaf.halo
## 0: 71 0:124 0:242 0:356 0:176 0:444 0: 77 0:284
## 1:167 1:227 1:390 1:292 1:263 1:239 1:606 1: 57
## 2:227 2:145 2: 51 2: 35 2:244 2:342
## 3:218 3:187
##
##
##
## leaf.marg leaf.size leaf.shread stem stem.cankers canker.lesion
## 0:431 0:107 0:587 0:300 0:397 0:327
## 1: 31 1:351 1: 96 1:383 1: 54 1:105
## 2:221 2:225 2: 38 2:185
## 3:194 3: 66
##
##
##
## fruiting.bodies ext.decay int.discolor fruit.pods fruit.spots seed
## 0:574 0:525 0:614 0:484 0:429 0:568
## 1:109 1:145 1: 44 1:137 1: 92 1:115
## 2: 13 2: 25 2: 14 2: 62
## 3: 48 4:100
##
##
##
## mold.growth seed.discolor seed.size roots
## 0:616 0:617 0:624 0:582
## 1: 67 1: 66 1: 59 1: 86
## 2: 15
##
##
##
##
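As a final sanity check, the imputed data should contain no remaining missing values:
sum(is.na(new_Soybean)) # expected to be 0 after imputation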