Data Exploration

We will load the data into a dataframe and get started

library(missForest)
library(corrgram)
library('caret')

dfBevMod <- read.csv("https://github.com/ChristopheHunt/CUNY-DATA624/raw/master/data/StudentData.csv", header = TRUE)
dfBevPred <- read.csv("https://raw.githubusercontent.com/ChristopheHunt/CUNY-DATA624/master/data/StudentEvaluation-%20TO%20PREDICT.csv", header =TRUE)

First we will examine the training dataset we want to see how many predictor variables we are dealing with and if we are missing any data

dim(dfBevMod)
## [1] 2571   33

Looks like we have a total of 32 predictor variables and a target variable. Next we will check for any missing data

summary(dfBevMod)
##  Brand.Code  Carb.Volume     Fill.Ounces      PC.Volume      
##   : 120     Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  A: 293     1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  B:1239     Median :5.347   Median :23.97   Median :0.27133  
##  C: 304     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##  D: 615     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##             Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##             NA's   :10      NA's   :38      NA's   :39       
##  Carb.Pressure     Carb.Temp          PSC             PSC.Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC.CO2           Mnf.Flow       Carb.Pressure1  Fill.Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd.Pressure1   Hyd.Pressure2   Hyd.Pressure3   Hyd.Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler.Level    Filler.Speed   Temperature      Usage.cont   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5      
##    Carb.Flow       Density           MFR           Balling      
##  Min.   :  26   Min.   :0.240   Min.   : 31.4   Min.   :-0.170  
##  1st Qu.:1144   1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496  
##  Median :3028   Median :0.980   Median :724.0   Median : 1.648  
##  Mean   :2468   Mean   :1.174   Mean   :704.0   Mean   : 2.198  
##  3rd Qu.:3186   3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292  
##  Max.   :5104   Max.   :1.920   Max.   :868.6   Max.   : 4.012  
##  NA's   :2      NA's   :1       NA's   :212     NA's   :1       
##  Pressure.Vacuum        PH        Oxygen.Filler     Bowl.Setpoint  
##  Min.   :-6.600   Min.   :7.880   Min.   :0.00240   Min.   : 70.0  
##  1st Qu.:-5.600   1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0  
##  Median :-5.400   Median :8.540   Median :0.03340   Median :120.0  
##  Mean   :-5.216   Mean   :8.546   Mean   :0.04684   Mean   :109.3  
##  3rd Qu.:-5.000   3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0  
##  Max.   :-3.600   Max.   :9.360   Max.   :0.40000   Max.   :140.0  
##                   NA's   :4       NA's   :12        NA's   :2      
##  Pressure.Setpoint Air.Pressurer      Alch.Rel        Carb.Rel    
##  Min.   :44.00     Min.   :140.8   Min.   :5.280   Min.   :4.960  
##  1st Qu.:46.00     1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340  
##  Median :46.00     Median :142.6   Median :6.560   Median :5.400  
##  Mean   :47.62     Mean   :142.8   Mean   :6.897   Mean   :5.437  
##  3rd Qu.:50.00     3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540  
##  Max.   :52.00     Max.   :148.2   Max.   :8.620   Max.   :6.060  
##  NA's   :12                        NA's   :9       NA's   :10     
##   Balling.Lvl  
##  Min.   :0.00  
##  1st Qu.:1.38  
##  Median :1.48  
##  Mean   :2.05  
##  3rd Qu.:3.14  
##  Max.   :3.66  
##  NA's   :1

Appears that we have one nominal variable: Brand.Code

Barchart

barchart(dfBevMod[,1], col="Gold")

Nominal Variable Histogram

dfBevModH <- dfBevMod[2:ncol(dfBevMod)] #removing factor var
par(mfrow = c(3,5), cex = .5)
for(i in colnames(dfBevModH)){
hist(dfBevModH[,i], xlab = names(dfBevMod[i]),
  main = names(dfBevModH[i]), col="grey", ylab="")
}

Mnf.Flow and Hyd.Pressure 1,2,3 each have many values below 0 – possibly null-type entered values. Several variables are strongly skewed – some of which appear to have outliers.

Variable Scatterplots

par(mfrow = c(3,5), cex = .5)
for(i in colnames(dfBevModH)){
plot(dfBevModH[,i], xlab = names(dfBevModH[i]),
  main = names(dfBevModH[i]), col="grey", ylab="")
}

Due to the way the data is cut – seemingly by timeline, my hunch is that a MARS algorithm could be a strong fit. A few variables have outliers.

We should probably discuss as a group how to deal with some of the 0 values, though, especially in Mnf.Flow, which could be a function of the the underlying patterns of the variable, or a just a null type non-data input. The Hyd.Pressures (1,2,3) also start collecting non-zero type values around the same point as Mnf.Flow. Usage.cont has a similar, but reversed pattern, but doesn’t seem to be correlated, at least visually.

Density Plot

par(mfrow = c(3,5), cex = .5)
for (i in colnames(dfBevModH)) {
 smoothScatter(dfBevModH[,i], main = names(dfBevModH[i]), ylab = "", 
   xlab = "", colramp = colorRampPalette(c("white", "red")))
 }

BoxPlots

par(mfrow = c(3,5), cex = .5)
for(i in colnames(dfBevModH)){
boxplot(dfBevModH[,i], xlab = names(dfBevModH[i]),
  main = names(dfBevModH[i]), col="grey", ylab="")
}

Again, several variables with large skews and outliers

Principle Component Analysis

PCA <- function(X) {
  Xpca <- prcomp(na.omit(X), center = T, scale. = T) 
  M <- as.matrix(na.omit(X)); R <- as.matrix(Xpca$rotation); score <- M %*% R
  print(list("Importance of Components" = summary(Xpca)$importance[ ,1:5], 
             "Rotation (Variable Loadings)" = Xpca$rotation[ ,1:5],
             "Correlation between X and PC" = cor(na.omit(X), score)[ ,1:5]))
  par(mfrow=c(2,3))
  barplot(Xpca$sdev^2, ylab = "Component Variance")
  barplot(cor(cbind(X)), ylab = "Correlations")
  barplot(Xpca$rotation, ylab = "Loadings")  
  biplot(Xpca); barplot(M); barplot(score)
}
PCA(dfBevModH)
## $`Importance of Components`
##                             PC1      PC2      PC3     PC4      PC5
## Standard deviation     2.540088 2.449148 1.477634 1.47603 1.310421
## Proportion of Variance 0.201630 0.187450 0.068230 0.06808 0.053660
## Cumulative Proportion  0.201630 0.389070 0.457310 0.52539 0.579050
## 
## $`Rotation (Variable Loadings)`
##                            PC1          PC2          PC3           PC4
## Carb.Volume       -0.320871550  0.126525924 -0.063988169  0.0689976465
## Fill.Ounces        0.033313610 -0.010524814  0.073850141  0.2358285280
## PC.Volume          0.072479708 -0.108552900 -0.069562774 -0.3956296184
## Carb.Pressure     -0.208006297  0.060068377 -0.068128144  0.1465200114
## Carb.Temp         -0.033305138 -0.006187462 -0.040764654  0.1184540029
## PSC                0.034729843 -0.007932387  0.007672846  0.0166335686
## PSC.Fill           0.005088232 -0.025583769 -0.008017467  0.0803836356
## PSC.CO2            0.029539988  0.013537129 -0.010869845  0.1017169746
## Mnf.Flow           0.080644367  0.358177152  0.044915507  0.0252965168
## Carb.Pressure1     0.037910196  0.212302734  0.028684308  0.1765112631
## Fill.Pressure      0.162738779  0.217724987 -0.208305845  0.0873750286
## Hyd.Pressure1      0.040714287  0.156202166  0.028117601 -0.4998183585
## Hyd.Pressure2      0.057453860  0.313403614  0.076107892 -0.3208825682
## Hyd.Pressure3      0.071682634  0.348252038  0.032135037 -0.2278539437
## Hyd.Pressure4      0.268513018 -0.023674824 -0.168038357 -0.0027778884
## Filler.Level      -0.098309141 -0.276213319  0.109275148 -0.2476992005
## Filler.Speed      -0.036355308  0.001197480  0.619301867  0.0628275426
## Temperature        0.105282829 -0.066203240 -0.105105120  0.0321416038
## Usage.cont         0.053341657  0.248028048  0.106147980  0.1153232541
## Carb.Flow         -0.019747290 -0.192400631 -0.190052445  0.0123151046
## Density           -0.355713158  0.107013359 -0.067887877 -0.0009745302
## MFR               -0.040893045 -0.001083672  0.620594464  0.0666977237
## Balling           -0.354342525  0.141148693 -0.040775757 -0.0141207975
## Pressure.Vacuum   -0.046505652 -0.268137181  0.018936117  0.2043414157
## PH                -0.115017697 -0.166446153 -0.043839565 -0.1546589220
## Oxygen.Filler     -0.017291886 -0.210617922 -0.096839733 -0.0417513911
## Bowl.Setpoint     -0.103923455 -0.273585690  0.086519475 -0.2471199864
## Pressure.Setpoint  0.181047509  0.208820308 -0.105820469  0.0562644751
## Air.Pressurer      0.031569404 -0.035033996 -0.071242723  0.2609739989
## Alch.Rel          -0.364650212  0.099430202 -0.076928665 -0.0194895814
## Carb.Rel          -0.352557712  0.079691885 -0.073882285 -0.0418290381
## Balling.Lvl       -0.361905544  0.108856318 -0.069790925  0.0073156499
##                             PC5
## Carb.Volume       -0.1289819052
## Fill.Ounces       -0.1229858345
## PC.Volume          0.1598460185
## Carb.Pressure      0.5485412373
## Carb.Temp          0.6879491706
## PSC               -0.0797130932
## PSC.Fill          -0.1062661776
## PSC.CO2           -0.0212550379
## Mnf.Flow          -0.0159976226
## Carb.Pressure1    -0.0290714575
## Fill.Pressure      0.0003314597
## Hyd.Pressure1      0.0951143049
## Hyd.Pressure2      0.0947102915
## Hyd.Pressure3      0.0851266883
## Hyd.Pressure4     -0.0064600449
## Filler.Level      -0.0659270082
## Filler.Speed       0.1239047672
## Temperature        0.1020654842
## Usage.cont        -0.1185660052
## Carb.Flow          0.1730300450
## Density           -0.0203213313
## MFR                0.1118598210
## Balling           -0.0496622835
## Pressure.Vacuum   -0.0377417295
## PH                 0.0428191812
## Oxygen.Filler      0.0913810644
## Bowl.Setpoint     -0.0734787536
## Pressure.Setpoint  0.0479371223
## Air.Pressurer      0.0998622820
## Alch.Rel          -0.0372397655
## Carb.Rel          -0.0243017040
## Balling.Lvl       -0.0570128001
## 
## $`Correlation between X and PC`
##                            PC1          PC2          PC3          PC4
## Carb.Volume        0.002672235  0.111348522  0.063613298 -0.011847085
## Fill.Ounces        0.040529153  0.087715367  0.097669459  0.059771410
## PC.Volume         -0.180340769 -0.271489773 -0.199880836 -0.115598261
## Carb.Pressure     -0.066204190  0.025722112  0.036530429  0.061413374
## Carb.Temp         -0.069518362 -0.039493236 -0.003344882  0.074340078
## PSC                0.040662924  0.037176123  0.018514307  0.006021336
## PSC.Fill          -0.021548310 -0.028862504 -0.023591759  0.007699516
## PSC.CO2            0.035580707  0.022245482  0.003727000  0.020391565
## Mnf.Flow           0.723291814  0.693451292  0.295235376 -0.147391760
## Carb.Pressure1     0.375637900  0.371884050  0.168815533  0.006765179
## Fill.Pressure      0.498230248  0.300422256 -0.075514622 -0.207326749
## Hyd.Pressure1      0.275073293  0.247417853  0.076691158 -0.437731705
## Hyd.Pressure2      0.487809573  0.485370725  0.238650794 -0.300196428
## Hyd.Pressure3      0.580859072  0.545232467  0.225502363 -0.280195784
## Hyd.Pressure4      0.296714868  0.029386412 -0.206499978 -0.204162315
## Filler.Level      -0.389979312 -0.217909690  0.031229026 -0.134337700
## Filler.Speed      -0.470607794  0.059325780  0.832380594  0.798933852
## Temperature       -0.032400811 -0.131025575 -0.131935567  0.013501453
## Usage.cont         0.515662927  0.567466858  0.306198713 -0.105542769
## Carb.Flow         -0.728865476 -0.966223188 -0.604417233  0.368469211
## Density           -0.117230400 -0.008474646  0.016244812  0.022657754
## MFR               -0.455045229  0.066334864  0.812095962  0.771973495
## Balling           -0.017953566  0.121197295  0.097470770 -0.026074148
## Pressure.Vacuum   -0.457986670 -0.407097197 -0.119183474  0.240746451
## PH                -0.345239874 -0.323918002 -0.164411012 -0.017197343
## Oxygen.Filler     -0.424027239 -0.467026196 -0.247762941  0.093078019
## Bowl.Setpoint     -0.373730212 -0.214155886  0.007211244 -0.163635722
## Pressure.Setpoint  0.411830072  0.282182390  0.035998404 -0.095125936
## Air.Pressurer     -0.075890499 -0.120293496 -0.078403739  0.157033933
## Alch.Rel          -0.102425243  0.006988524  0.003604029 -0.018809036
## Carb.Rel          -0.108413466  0.008531971  0.019415972 -0.047949293
## Balling.Lvl       -0.060110506  0.060109493  0.043885246 -0.025858266
##                            PC5
## Carb.Volume       -0.093658221
## Fill.Ounces       -0.100673596
## PC.Volume          0.233542757
## Carb.Pressure      0.011754537
## Carb.Temp          0.072779034
## PSC               -0.036486635
## PSC.Fill           0.008941563
## PSC.CO2           -0.014123789
## Mnf.Flow          -0.460810554
## Carb.Pressure1    -0.260300884
## Fill.Pressure     -0.238063463
## Hyd.Pressure1     -0.154832629
## Hyd.Pressure2     -0.275314801
## Hyd.Pressure3     -0.323771761
## Hyd.Pressure4     -0.090604320
## Filler.Level       0.028038837
## Filler.Speed       0.247282208
## Temperature        0.103770602
## Usage.cont        -0.454543589
## Carb.Flow          0.950568080
## Density            0.034322442
## MFR                0.224469906
## Balling           -0.084814051
## Pressure.Vacuum    0.277242437
## PH                 0.211975600
## Oxygen.Filler      0.354347799
## Bowl.Setpoint      0.012366381
## Pressure.Setpoint -0.169959804
## Air.Pressurer      0.125926876
## Alch.Rel           0.005207278
## Carb.Rel          -0.012596007
## Balling.Lvl       -0.050934875

First two components account for most of the variance, although Mnf.Flow is highly prioritized, so I’m concerned that it may be a function of the null-like values.

Data Transformation

Before we proceed we will also remove Brand.Code as it is the only categorical variable and has a lot of empty non Na values. AG: I’m concerned about removing this variable altogether, going to just impute as nulls for now, but we should discuss

#df_imputed$Brand.Code = NULL
dfBevMod$Brand.Code[dfBevMod$Brand.Code == ""] <- NA
dfBevMod$Brand.Code <- droplevels(dfBevMod$Brand.Code)

dfBevPred$Brand.Code[dfBevPred$Brand.Code == ""] <- NA
dfBevPred$Brand.Code <- droplevels(dfBevPred$Brand.Code)

#Recode categorical factor
dfBevMod$A <- ifelse(dfBevMod$Brand.Code == "A", 1, 0)
dfBevMod$B <- ifelse(dfBevMod$Brand.Code == "B", 1, 0)
dfBevMod$C <- ifelse(dfBevMod$Brand.Code == "C", 1, 0)
dfBevMod$D <- ifelse(dfBevMod$Brand.Code == "D", 1, 0)
dfBevMod$Brand.Code <- NULL

dfBevPred$A <- ifelse(dfBevPred$Brand.Code == "A", 1, 0)
dfBevPred$B <- ifelse(dfBevPred$Brand.Code == "B", 1, 0)
dfBevPred$C <- ifelse(dfBevPred$Brand.Code == "C", 1, 0)
dfBevPred$D <- ifelse(dfBevPred$Brand.Code == "D", 1, 0)
dfBevPred$Brand.Code <- NULL

It appears we have missing data for pretty much every variable. Our target variable has 4 missing values out of 2571, so we will remove the data from those instead of imputing the missing values for the target variable. AG: I’m a little uncomfortable here too, going to model these points for now, but we should discuss

#df = subset(df, !is.na(PH))
#dim(df)

Imputation

Next we will use the missForest library to impute the missing variable of the predictor variables

#dfImpMod = missForest(dfBevMod)
#dfImpMod$OOBerror #error rate looks good?
#dfModImp <- dfImpMod$ximp

#dfImpPred <- missForest(dfBevPred)
#dfPredImp <- dfImpPred$ximp
#dfPredImp$PH <- NA #redadding PH

#write.csv(dfModImp, "TrainImputeData.csv")
#write.csv(dfPredImp, "PredictImputeData.csv")

#Stored current imputation results on github to quicken knitr iterations
dfModImp <- read.csv("https://raw.githubusercontent.com/ChristopheHunt/CUNY-DATA624/master/data/TrainImputeData.csv")
dfPredImp <- read.csv("https://raw.githubusercontent.com/ChristopheHunt/CUNY-DATA624/master/data/PredictImputeData.csv")

Preprocessing

Due to the different types of model imputs (some preprocess, other’s dont), creating a range of preprocessed variables

dfModImpX <- dfModImp[,!(names(dfModImp) == "PH")]
dfModImpY <- dfModImp[, names(dfModImp) == "PH"]

dfPredImpX <- dfPredImp[,!(names(dfPredImp) == "PH")]
dfPredImpY <- dfPredImp[, names(dfPredImp) == "PH"]

#Spatial Sign outlier processing
dfModImpSsX <- spatialSign(dfModImpX)
dfPredImpSsX <- spatialSign(dfPredImpX)

#BoxCox Only
transModB <- preProcess(dfModImpSsX, method = "BoxCox") #transformed all 22 variables
dfModBX <- predict(transModB, dfModImpSsX)

transPredB <- preProcess(dfPredImpSsX, method = "BoxCox") #transformed 23 variables (should we use the Modeling model from above, instead of predicting model?)
dfPredBX <- predict(transPredB, dfPredImpSsX)

#BoxCox, Centering, and Scaling
transModBCS <- preProcess(dfModImpSsX, method = c("BoxCox", "center", "scale")) #22 BC, 35 centered, 35 scaled
dfModBCSX <- predict(transModBCS, dfModImpSsX)

transPredBCS <- preProcess(dfPredImpSsX, method = c("BoxCox", "center", "scale")) #23 BC, 35 centered, 35 scaled
dfPredBCSX <- predict(transPredBCS, dfPredImpSsX)

#BoxCox, Centering, Scaling, and PCA
transModBCSP <- preProcess(dfModImpSsX, method = c("BoxCox", "center", "scale", "pca")) #22 BC, 35 centered, 35 scaled
dfModBCSPX <- predict(transModBCSP, dfModImpSsX)

transPredBCSP <- preProcess(dfPredImpSsX, method = c("BoxCox", "center", "scale", "pca")) #23 BC, 35 centered, 35 scaled
dfPredBCSPX <- predict(transPredBCSP, dfPredImpSsX)

Now we are ready to move on and review the predictor variables and attempt to reduce the number of predictor variables. We will use corrgram library to review the correlation between the variables and find the highly correlated ones that can be reduced. AG: I find the correlation matrix from KJ to be the easiest to read, but totally fine with either

#corrgram(dfModBCSX, order=TRUE,
#         upper.panel=panel.cor, main="Correlation Matrix")
library(corrplot)
correlations <- cor(dfModBCSX)
corrplot(correlations, order = "hclust", tl.cex = 0.55)

Several very highly correlated variables.

Next we will reduce our dataset and remove pairs that have correlation above 0.75

hc = findCorrelation(correlations, cutoff=0.75)
length(hc) #18 vars
## [1] 16
#Reducing
dfModBCSRX = dfModBCSX[,-c(hc)] #Box-Cox, Center, Scale
dfPredBCSRX = dfPredBCSX[,-c(hc)]

dfModBRX = dfModBX[,-c(hc)] #Box-Cox
dfPredBRX = dfPredBCSX[,-c(hc)]

dfModSRX = dfModImpSsX[,-c(hc)] #Only Spatial Sign
dfPredSRX = dfPredImpSsX[,-c(hc)]

Ready variables for modelling

set.seed(2017)
n75 <- floor(0.75 * nrow(dfBevMod)) #75$ of sample size
n <- sample(seq_len(nrow(dfBevMod)), size = n75)

#Box-Cox, Center, Scale
dfTrainBCSX <- dfModBCSRX[n,]
dfTestBCSX <- dfModBCSRX[-n,]

#Box-Cox
dfTrainBX <- dfModBX[n,]
dfTestBX <- dfModBX[-n,]

#Only Spatial Sign
dfTrainX <- dfModSRX[n,]
dfTestX <- dfModSRX[-n,]

#Response variable
dfTrainY <- dfModImpY[n]
dfTestY <- dfModImpY[-n]