#M MAx code Chapter 18
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.1.1
data(solubility)
cor(solTrainXtrans$NumCarbon, solTrainY)
## [1] -0.6067917
#to get results for all numeric predictors, use apply
fpCols <- grepl("FP", names(solTrainXtrans))
numericPreds <- names(solTrainXtrans)[!fpCols]
corrValues <- apply(solTrainXtrans[, numericPreds],
MARGIN=2,
FUN=function(x,y) cor(x,y),
y=solTrainY)
#rank correl use method="spearman"
#The lOESS smoother
library(stats)
smoother <- loess(solTrainY ~ solTrainXtrans$NumCarbon)
smoother
## Call:
## loess(formula = solTrainY ~ solTrainXtrans$NumCarbon)
##
## Number of Observations: 951
## Equivalent Number of Parameters: 5.3
## Residual Standard Error: 1.548
library(lattice)
## Warning: package 'lattice' was built under R version 3.1.2
xyplot(solTrainY ~ solTrainXtrans$NumCarbon,
type= c("p", "smooth"),
xlab="# Carbosn", ylab="Solubility")

# for categorical predictors, the simple t-test function computes the difference in means and the p=value. For one predictor
t.test(solTrainY ~ solTrainXtrans$FP044)
##
## Welch Two Sample t-test
##
## data: solTrainY by solTrainXtrans$FP044
## t = 15.1984, df = 61.891, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.569300 4.650437
## sample estimates:
## mean in group 0 mean in group 1
## -2.472237 -6.582105
#apply to all predictors
getTstats <- function(x,y)
{
tTest <- t.test(y~x)
out <- c(tStat= tTest$statistic, p=tTest$p.value)
out
}
tVals <- apply(solTrainXtrans[ , fpCols],
MARGIN=2,
FUN=getTstats,
y=solTrainY)
tVals <- t(tVals); head(tVals)
## tStat.t p
## FP001 -4.022040 6.287404e-05
## FP002 10.286727 1.351580e-23
## FP003 -2.036442 4.198619e-02
## FP004 -4.948958 9.551772e-07
## FP005 10.282475 1.576549e-23
## FP006 -7.875838 9.287835e-15
#VARIABLE IMPORTANCE (CONTINUOUS VARIABLES)
#OPTION A
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: ggplot2
#filterVarImp with nonpara=T option creates a LOESS model for each predictor and quantifies the relationship w/ outcome
loessResults <- filterVarImp(x=solTrainXtrans[, numericPreds],
y=solTrainY,
nonpara=T)
head(loessResults)
## Overall
## MolWeight 0.4443931
## NumAtoms 0.1899315
## NumNonHAtoms 0.3406166
## NumBonds 0.2107173
## NumNonHBonds 0.3424552
## NumMultBonds 0.2307995
#OPTION B
library(minerva)
## Warning: package 'minerva' was built under R version 3.1.1
micValues <- mine(x=solTrainXtrans[, numericPreds], y=solTrainY,)
names(micValues)
## [1] "MIC" "MAS" "MEV" "MCN" "MICR2"
head(micValues$MIC)
## Y
## MolWeight 0.4679277
## NumAtoms 0.2896815
## NumNonHAtoms 0.3947092
## NumBonds 0.3268683
## NumNonHBonds 0.3919627
## NumMultBonds 0.2792600
#CATEGORICAL OUTCOMES
#filterVarImp also calcualtes AUC ROC curve for discrete variables
library(caret)
data(segmentationData)
cellData <- subset(segmentationData, Case=="Train")
cellData$Case <- cellData$Cell <- NULL
head(names(cellData))
## [1] "Class" "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2"
## [6] "AvgIntenCh3"
rocValues <- filterVarImp(x = cellData[,-1],
y=cellData$Class)
# col is created for each class
head(rocValues)
## PS WS
## AngleCh1 0.5025967 0.5025967
## AreaCh1 0.5709170 0.5709170
## AvgIntenCh1 0.7662375 0.7662375
## AvgIntenCh2 0.7866146 0.7866146
## AvgIntenCh3 0.5214098 0.5214098
## AvgIntenCh4 0.6473814 0.6473814
# the above is simple wrapper for the fns roc and auc in the pROC package
# when 3+ classes, filterVarImp will computer ROC for each class versus the others and then returns the largest AUC
# MIC statistic can be computer as before with dummy data
library(minerva)
micValues <- mine(x=cellData[,-1], y=ifelse(cellData$Class =="PS",1,0))
names(micValues)
## [1] "MIC" "MAS" "MEV" "MCN" "MICR2"
head(micValues$MIC)
## Y
## AngleCh1 0.1310570
## AreaCh1 0.1080839
## AvgIntenCh1 0.2920461
## AvgIntenCh2 0.3294846
## AvgIntenCh3 0.1354438
## AvgIntenCh4 0.1665450
#to calculate odds ratio and statistical test of association, the fisher.test fn =
# table1 <- table(training[,"class'], training[, 2])
# fisher.test(table1)
#When predictor has 2+ classes, single odds ratio cannot be computed, p=value can still be utilized
#ciTable <- table(training[,"classes (success /unsuccess)'], training[, "variable with 2+ factors""])