We will load the data into a dataframe and get started
library(missForest)
library(corrgram)
library('caret')
dfBevMod <- read.csv("https://github.com/ChristopheHunt/CUNY-DATA624/raw/master/data/StudentData.csv", header = TRUE)
dfBevPred <- read.csv("https://raw.githubusercontent.com/ChristopheHunt/CUNY-DATA624/master/data/StudentEvaluation-%20TO%20PREDICT.csv", header =TRUE)
First we will examine the training dataset we want to see how many predictor variables we are dealing with and if we are missing any data
dim(dfBevMod)
## [1] 2571 33
Looks like we have a total of 32 predictor variables and a target variable. Next we will check for any missing data
summary(dfBevMod)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## : 120 Min. :5.040 Min. :23.63 Min. :0.07933
## A: 293 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## B:1239 Median :5.347 Median :23.97 Median :0.27133
## C: 304 Mean :5.370 Mean :23.97 Mean :0.27712
## D: 615 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb.Pressure Carb.Temp PSC PSC.Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler.Level Filler.Speed Temperature Usage.cont
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36
## Median :118.4 Median :3982 Median :65.60 Median :21.79
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90
## NA's :20 NA's :57 NA's :14 NA's :5
## Carb.Flow Density MFR Balling
## Min. : 26 Min. :0.240 Min. : 31.4 Min. :-0.170
## 1st Qu.:1144 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496
## Median :3028 Median :0.980 Median :724.0 Median : 1.648
## Mean :2468 Mean :1.174 Mean :704.0 Mean : 2.198
## 3rd Qu.:3186 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292
## Max. :5104 Max. :1.920 Max. :868.6 Max. : 4.012
## NA's :2 NA's :1 NA's :212 NA's :1
## Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint
## Min. :-6.600 Min. :7.880 Min. :0.00240 Min. : 70.0
## 1st Qu.:-5.600 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0
## Median :-5.400 Median :8.540 Median :0.03340 Median :120.0
## Mean :-5.216 Mean :8.546 Mean :0.04684 Mean :109.3
## 3rd Qu.:-5.000 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0
## Max. :-3.600 Max. :9.360 Max. :0.40000 Max. :140.0
## NA's :4 NA's :12 NA's :2
## Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## Min. :44.00 Min. :140.8 Min. :5.280 Min. :4.960
## 1st Qu.:46.00 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340
## Median :46.00 Median :142.6 Median :6.560 Median :5.400
## Mean :47.62 Mean :142.8 Mean :6.897 Mean :5.437
## 3rd Qu.:50.00 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540
## Max. :52.00 Max. :148.2 Max. :8.620 Max. :6.060
## NA's :12 NA's :9 NA's :10
## Balling.Lvl
## Min. :0.00
## 1st Qu.:1.38
## Median :1.48
## Mean :2.05
## 3rd Qu.:3.14
## Max. :3.66
## NA's :1
Appears that we have one nominal variable: Brand.Code
barchart(dfBevMod[,1], col="Gold")
dfBevModH <- dfBevMod[2:ncol(dfBevMod)] #removing factor var
par(mfrow = c(3,5), cex = .5)
for(i in colnames(dfBevModH)){
hist(dfBevModH[,i], xlab = names(dfBevMod[i]),
main = names(dfBevModH[i]), col="grey", ylab="")
}
Mnf.Flow and Hyd.Pressure 1,2,3 each have many values below 0 – possibly null-type entered values. Several variables are strongly skewed – some of which appear to have outliers.
par(mfrow = c(3,5), cex = .5)
for(i in colnames(dfBevModH)){
plot(dfBevModH[,i], xlab = names(dfBevModH[i]),
main = names(dfBevModH[i]), col="grey", ylab="")
}
Due to the way the data is cut – seemingly by timeline, my hunch is that a MARS algorithm could be a strong fit. A few variables have outliers.
We should probably discuss as a group how to deal with some of the 0 values, though, especially in Mnf.Flow, which could be a function of the the underlying patterns of the variable, or a just a null type non-data input. The Hyd.Pressures (1,2,3) also start collecting non-zero type values around the same point as Mnf.Flow. Usage.cont has a similar, but reversed pattern, but doesn’t seem to be correlated, at least visually.
par(mfrow = c(3,5), cex = .5)
for (i in colnames(dfBevModH)) {
smoothScatter(dfBevModH[,i], main = names(dfBevModH[i]), ylab = "",
xlab = "", colramp = colorRampPalette(c("white", "red")))
}
par(mfrow = c(3,5), cex = .5)
for(i in colnames(dfBevModH)){
boxplot(dfBevModH[,i], xlab = names(dfBevModH[i]),
main = names(dfBevModH[i]), col="grey", ylab="")
}
Again, several variables with large skews and outliers
PCA <- function(X) {
Xpca <- prcomp(na.omit(X), center = T, scale. = T)
M <- as.matrix(na.omit(X)); R <- as.matrix(Xpca$rotation); score <- M %*% R
print(list("Importance of Components" = summary(Xpca)$importance[ ,1:5],
"Rotation (Variable Loadings)" = Xpca$rotation[ ,1:5],
"Correlation between X and PC" = cor(na.omit(X), score)[ ,1:5]))
par(mfrow=c(2,3))
barplot(Xpca$sdev^2, ylab = "Component Variance")
barplot(cor(cbind(X)), ylab = "Correlations")
barplot(Xpca$rotation, ylab = "Loadings")
biplot(Xpca); barplot(M); barplot(score)
}
PCA(dfBevModH)
## $`Importance of Components`
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 2.540088 2.449148 1.477634 1.47603 1.310421
## Proportion of Variance 0.201630 0.187450 0.068230 0.06808 0.053660
## Cumulative Proportion 0.201630 0.389070 0.457310 0.52539 0.579050
##
## $`Rotation (Variable Loadings)`
## PC1 PC2 PC3 PC4
## Carb.Volume -0.320871550 0.126525924 -0.063988169 0.0689976465
## Fill.Ounces 0.033313610 -0.010524814 0.073850141 0.2358285280
## PC.Volume 0.072479708 -0.108552900 -0.069562774 -0.3956296184
## Carb.Pressure -0.208006297 0.060068377 -0.068128144 0.1465200114
## Carb.Temp -0.033305138 -0.006187462 -0.040764654 0.1184540029
## PSC 0.034729843 -0.007932387 0.007672846 0.0166335686
## PSC.Fill 0.005088232 -0.025583769 -0.008017467 0.0803836356
## PSC.CO2 0.029539988 0.013537129 -0.010869845 0.1017169746
## Mnf.Flow 0.080644367 0.358177152 0.044915507 0.0252965168
## Carb.Pressure1 0.037910196 0.212302734 0.028684308 0.1765112631
## Fill.Pressure 0.162738779 0.217724987 -0.208305845 0.0873750286
## Hyd.Pressure1 0.040714287 0.156202166 0.028117601 -0.4998183585
## Hyd.Pressure2 0.057453860 0.313403614 0.076107892 -0.3208825682
## Hyd.Pressure3 0.071682634 0.348252038 0.032135037 -0.2278539437
## Hyd.Pressure4 0.268513018 -0.023674824 -0.168038357 -0.0027778884
## Filler.Level -0.098309141 -0.276213319 0.109275148 -0.2476992005
## Filler.Speed -0.036355308 0.001197480 0.619301867 0.0628275426
## Temperature 0.105282829 -0.066203240 -0.105105120 0.0321416038
## Usage.cont 0.053341657 0.248028048 0.106147980 0.1153232541
## Carb.Flow -0.019747290 -0.192400631 -0.190052445 0.0123151046
## Density -0.355713158 0.107013359 -0.067887877 -0.0009745302
## MFR -0.040893045 -0.001083672 0.620594464 0.0666977237
## Balling -0.354342525 0.141148693 -0.040775757 -0.0141207975
## Pressure.Vacuum -0.046505652 -0.268137181 0.018936117 0.2043414157
## PH -0.115017697 -0.166446153 -0.043839565 -0.1546589220
## Oxygen.Filler -0.017291886 -0.210617922 -0.096839733 -0.0417513911
## Bowl.Setpoint -0.103923455 -0.273585690 0.086519475 -0.2471199864
## Pressure.Setpoint 0.181047509 0.208820308 -0.105820469 0.0562644751
## Air.Pressurer 0.031569404 -0.035033996 -0.071242723 0.2609739989
## Alch.Rel -0.364650212 0.099430202 -0.076928665 -0.0194895814
## Carb.Rel -0.352557712 0.079691885 -0.073882285 -0.0418290381
## Balling.Lvl -0.361905544 0.108856318 -0.069790925 0.0073156499
## PC5
## Carb.Volume -0.1289819052
## Fill.Ounces -0.1229858345
## PC.Volume 0.1598460185
## Carb.Pressure 0.5485412373
## Carb.Temp 0.6879491706
## PSC -0.0797130932
## PSC.Fill -0.1062661776
## PSC.CO2 -0.0212550379
## Mnf.Flow -0.0159976226
## Carb.Pressure1 -0.0290714575
## Fill.Pressure 0.0003314597
## Hyd.Pressure1 0.0951143049
## Hyd.Pressure2 0.0947102915
## Hyd.Pressure3 0.0851266883
## Hyd.Pressure4 -0.0064600449
## Filler.Level -0.0659270082
## Filler.Speed 0.1239047672
## Temperature 0.1020654842
## Usage.cont -0.1185660052
## Carb.Flow 0.1730300450
## Density -0.0203213313
## MFR 0.1118598210
## Balling -0.0496622835
## Pressure.Vacuum -0.0377417295
## PH 0.0428191812
## Oxygen.Filler 0.0913810644
## Bowl.Setpoint -0.0734787536
## Pressure.Setpoint 0.0479371223
## Air.Pressurer 0.0998622820
## Alch.Rel -0.0372397655
## Carb.Rel -0.0243017040
## Balling.Lvl -0.0570128001
##
## $`Correlation between X and PC`
## PC1 PC2 PC3 PC4
## Carb.Volume 0.002672235 0.111348522 0.063613298 -0.011847085
## Fill.Ounces 0.040529153 0.087715367 0.097669459 0.059771410
## PC.Volume -0.180340769 -0.271489773 -0.199880836 -0.115598261
## Carb.Pressure -0.066204190 0.025722112 0.036530429 0.061413374
## Carb.Temp -0.069518362 -0.039493236 -0.003344882 0.074340078
## PSC 0.040662924 0.037176123 0.018514307 0.006021336
## PSC.Fill -0.021548310 -0.028862504 -0.023591759 0.007699516
## PSC.CO2 0.035580707 0.022245482 0.003727000 0.020391565
## Mnf.Flow 0.723291814 0.693451292 0.295235376 -0.147391760
## Carb.Pressure1 0.375637900 0.371884050 0.168815533 0.006765179
## Fill.Pressure 0.498230248 0.300422256 -0.075514622 -0.207326749
## Hyd.Pressure1 0.275073293 0.247417853 0.076691158 -0.437731705
## Hyd.Pressure2 0.487809573 0.485370725 0.238650794 -0.300196428
## Hyd.Pressure3 0.580859072 0.545232467 0.225502363 -0.280195784
## Hyd.Pressure4 0.296714868 0.029386412 -0.206499978 -0.204162315
## Filler.Level -0.389979312 -0.217909690 0.031229026 -0.134337700
## Filler.Speed -0.470607794 0.059325780 0.832380594 0.798933852
## Temperature -0.032400811 -0.131025575 -0.131935567 0.013501453
## Usage.cont 0.515662927 0.567466858 0.306198713 -0.105542769
## Carb.Flow -0.728865476 -0.966223188 -0.604417233 0.368469211
## Density -0.117230400 -0.008474646 0.016244812 0.022657754
## MFR -0.455045229 0.066334864 0.812095962 0.771973495
## Balling -0.017953566 0.121197295 0.097470770 -0.026074148
## Pressure.Vacuum -0.457986670 -0.407097197 -0.119183474 0.240746451
## PH -0.345239874 -0.323918002 -0.164411012 -0.017197343
## Oxygen.Filler -0.424027239 -0.467026196 -0.247762941 0.093078019
## Bowl.Setpoint -0.373730212 -0.214155886 0.007211244 -0.163635722
## Pressure.Setpoint 0.411830072 0.282182390 0.035998404 -0.095125936
## Air.Pressurer -0.075890499 -0.120293496 -0.078403739 0.157033933
## Alch.Rel -0.102425243 0.006988524 0.003604029 -0.018809036
## Carb.Rel -0.108413466 0.008531971 0.019415972 -0.047949293
## Balling.Lvl -0.060110506 0.060109493 0.043885246 -0.025858266
## PC5
## Carb.Volume -0.093658221
## Fill.Ounces -0.100673596
## PC.Volume 0.233542757
## Carb.Pressure 0.011754537
## Carb.Temp 0.072779034
## PSC -0.036486635
## PSC.Fill 0.008941563
## PSC.CO2 -0.014123789
## Mnf.Flow -0.460810554
## Carb.Pressure1 -0.260300884
## Fill.Pressure -0.238063463
## Hyd.Pressure1 -0.154832629
## Hyd.Pressure2 -0.275314801
## Hyd.Pressure3 -0.323771761
## Hyd.Pressure4 -0.090604320
## Filler.Level 0.028038837
## Filler.Speed 0.247282208
## Temperature 0.103770602
## Usage.cont -0.454543589
## Carb.Flow 0.950568080
## Density 0.034322442
## MFR 0.224469906
## Balling -0.084814051
## Pressure.Vacuum 0.277242437
## PH 0.211975600
## Oxygen.Filler 0.354347799
## Bowl.Setpoint 0.012366381
## Pressure.Setpoint -0.169959804
## Air.Pressurer 0.125926876
## Alch.Rel 0.005207278
## Carb.Rel -0.012596007
## Balling.Lvl -0.050934875
First two components account for most of the variance, although Mnf.Flow is highly prioritized, so I’m concerned that it may be a function of the null-like values.
Before we proceed we will also remove Brand.Code as it is the only categorical variable and has a lot of empty non Na values. AG: I’m concerned about removing this variable altogether, going to just impute as nulls for now, but we should discuss
#df_imputed$Brand.Code = NULL
dfBevMod$Brand.Code[dfBevMod$Brand.Code == ""] <- NA
dfBevMod$Brand.Code <- droplevels(dfBevMod$Brand.Code)
dfBevPred$Brand.Code[dfBevPred$Brand.Code == ""] <- NA
dfBevPred$Brand.Code <- droplevels(dfBevPred$Brand.Code)
#Recode categorical factor
dfBevMod$A <- ifelse(dfBevMod$Brand.Code == "A", 1, 0)
dfBevMod$B <- ifelse(dfBevMod$Brand.Code == "B", 1, 0)
dfBevMod$C <- ifelse(dfBevMod$Brand.Code == "C", 1, 0)
dfBevMod$D <- ifelse(dfBevMod$Brand.Code == "D", 1, 0)
dfBevMod$Brand.Code <- NULL
dfBevPred$A <- ifelse(dfBevPred$Brand.Code == "A", 1, 0)
dfBevPred$B <- ifelse(dfBevPred$Brand.Code == "B", 1, 0)
dfBevPred$C <- ifelse(dfBevPred$Brand.Code == "C", 1, 0)
dfBevPred$D <- ifelse(dfBevPred$Brand.Code == "D", 1, 0)
dfBevPred$Brand.Code <- NULL
It appears we have missing data for pretty much every variable. Our target variable has 4 missing values out of 2571, so we will remove the data from those instead of imputing the missing values for the target variable. AG: I’m a little uncomfortable here too, going to model these points for now, but we should discuss
#df = subset(df, !is.na(PH))
#dim(df)
Next we will use the missForest library to impute the missing variable of the predictor variables
#dfImpMod = missForest(dfBevMod)
#dfImpMod$OOBerror #error rate looks good?
#dfModImp <- dfImpMod$ximp
#dfImpPred <- missForest(dfBevPred)
#dfPredImp <- dfImpPred$ximp
#dfPredImp$PH <- NA #redadding PH
#write.csv(dfModImp, "TrainImputeData.csv")
#write.csv(dfPredImp, "PredictImputeData.csv")
#Stored current imputation results on github to quicken knitr iterations
dfModImp <- read.csv("https://raw.githubusercontent.com/ChristopheHunt/CUNY-DATA624/master/data/TrainImputeData.csv")
dfPredImp <- read.csv("https://raw.githubusercontent.com/ChristopheHunt/CUNY-DATA624/master/data/PredictImputeData.csv")
Due to the different types of model imputs (some preprocess, other’s dont), creating a range of preprocessed variables
dfModImpX <- dfModImp[,!(names(dfModImp) == "PH")]
dfModImpY <- dfModImp[, names(dfModImp) == "PH"]
dfPredImpX <- dfPredImp[,!(names(dfPredImp) == "PH")]
dfPredImpY <- dfPredImp[, names(dfPredImp) == "PH"]
#Spatial Sign outlier processing
dfModImpSsX <- spatialSign(dfModImpX)
dfPredImpSsX <- spatialSign(dfPredImpX)
#BoxCox Only
transModB <- preProcess(dfModImpSsX, method = "BoxCox") #transformed all 22 variables
dfModBX <- predict(transModB, dfModImpSsX)
transPredB <- preProcess(dfPredImpSsX, method = "BoxCox") #transformed 23 variables (should we use the Modeling model from above, instead of predicting model?)
dfPredBX <- predict(transPredB, dfPredImpSsX)
#BoxCox, Centering, and Scaling
transModBCS <- preProcess(dfModImpSsX, method = c("BoxCox", "center", "scale")) #22 BC, 35 centered, 35 scaled
dfModBCSX <- predict(transModBCS, dfModImpSsX)
transPredBCS <- preProcess(dfPredImpSsX, method = c("BoxCox", "center", "scale")) #23 BC, 35 centered, 35 scaled
dfPredBCSX <- predict(transPredBCS, dfPredImpSsX)
#BoxCox, Centering, Scaling, and PCA
transModBCSP <- preProcess(dfModImpSsX, method = c("BoxCox", "center", "scale", "pca")) #22 BC, 35 centered, 35 scaled
dfModBCSPX <- predict(transModBCSP, dfModImpSsX)
transPredBCSP <- preProcess(dfPredImpSsX, method = c("BoxCox", "center", "scale", "pca")) #23 BC, 35 centered, 35 scaled
dfPredBCSPX <- predict(transPredBCSP, dfPredImpSsX)
Now we are ready to move on and review the predictor variables and attempt to reduce the number of predictor variables. We will use corrgram library to review the correlation between the variables and find the highly correlated ones that can be reduced. AG: I find the correlation matrix from KJ to be the easiest to read, but totally fine with either
#corrgram(dfModBCSX, order=TRUE,
# upper.panel=panel.cor, main="Correlation Matrix")
library(corrplot)
correlations <- cor(dfModBCSX)
corrplot(correlations, order = "hclust", tl.cex = 0.55)
Several very highly correlated variables.
Next we will reduce our dataset and remove pairs that have correlation above 0.75
hc = findCorrelation(correlations, cutoff=0.75)
length(hc) #18 vars
## [1] 16
#Reducing
dfModBCSRX = dfModBCSX[,-c(hc)] #Box-Cox, Center, Scale
dfPredBCSRX = dfPredBCSX[,-c(hc)]
dfModBRX = dfModBX[,-c(hc)] #Box-Cox
dfPredBRX = dfPredBCSX[,-c(hc)]
dfModSRX = dfModImpSsX[,-c(hc)] #Only Spatial Sign
dfPredSRX = dfPredImpSsX[,-c(hc)]
Ready variables for modelling
set.seed(2017)
n75 <- floor(0.75 * nrow(dfBevMod)) #75$ of sample size
n <- sample(seq_len(nrow(dfBevMod)), size = n75)
#Box-Cox, Center, Scale
dfTrainBCSX <- dfModBCSRX[n,]
dfTestBCSX <- dfModBCSRX[-n,]
#Box-Cox
dfTrainBX <- dfModBX[n,]
dfTestBX <- dfModBX[-n,]
#Only Spatial Sign
dfTrainX <- dfModSRX[n,]
dfTestX <- dfModSRX[-n,]
#Response variable
dfTrainY <- dfModImpY[n]
dfTestY <- dfModImpY[-n]