Source Code: https://github.com/djlofland/DATA624_PredictiveAnalytics/tree/master/Homework_4

Problem 3.1

The UC Irvine Machine Learning Repository6 contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

The data can be accessed via:

> library(mlbench)
> data(Glass)
> str(Glass)

'data.frame': 214 obs. of 10 variables:
$ RI : num 1.52 1.52 1.52 1.52 1.52 ...
$ Na : num 13.6 13.9 13.5 13.2 13.3 ...
$ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
$ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
$ Si : num 71.8 72.7 73 72.6 73.1 ...
$ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
$ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
$ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
$ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
$ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
  1. Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
data(Glass)

# Display summary statistics
summary(Glass)
##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29
# Prepare data for ggplot (remove the Target 'Type' column)
feature_df <- Glass %>% 
  select(-Type)

feature_gather_df <- feature_df %>% 
  gather(key = 'variable', value = 'value')

# Histogram plots of each variable
ggplot(feature_gather_df) + 
  geom_histogram(aes(x=value, y = ..density..), bins=30) + 
  geom_density(aes(x=value), color='blue') +
  facet_wrap(. ~variable, scales='free', ncol=4)

# Skewness for each Predictor
(skewValues <- apply(feature_df, 2, skewness))
##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107
# Boxplots for each variable
ggplot(feature_gather_df, aes(variable, value)) + 
  geom_boxplot() + 
  facet_wrap(. ~variable, scales='free', ncol=6)