Tmp

#M MAx code Chapter 18

library(AppliedPredictiveModeling)

## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.1.1

data(solubility)
cor(solTrainXtrans$NumCarbon, solTrainY)

## [1] -0.6067917

#to get results for all numeric predictors, use apply
fpCols <- grepl("FP", names(solTrainXtrans))
numericPreds <- names(solTrainXtrans)[!fpCols]

corrValues <- apply(solTrainXtrans[, numericPreds],
                    MARGIN=2,
                    FUN=function(x,y) cor(x,y),
                    y=solTrainY)

#rank correl use method="spearman"
#The lOESS smoother
library(stats)
smoother <- loess(solTrainY ~ solTrainXtrans$NumCarbon)
smoother

## Call:
## loess(formula = solTrainY ~ solTrainXtrans$NumCarbon)
## 
## Number of Observations: 951 
## Equivalent Number of Parameters: 5.3 
## Residual Standard Error: 1.548

library(lattice)

## Warning: package 'lattice' was built under R version 3.1.2

xyplot(solTrainY ~ solTrainXtrans$NumCarbon,
       type= c("p", "smooth"),
        xlab="# Carbosn", ylab="Solubility")

# for categorical predictors, the simple t-test function computes the difference in means and the p=value. For one predictor
t.test(solTrainY ~ solTrainXtrans$FP044)

## 
##  Welch Two Sample t-test
## 
## data:  solTrainY by solTrainXtrans$FP044
## t = 15.1984, df = 61.891, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.569300 4.650437
## sample estimates:
## mean in group 0 mean in group 1 
##       -2.472237       -6.582105

#apply to all predictors
getTstats <- function(x,y)
{
    tTest <- t.test(y~x)
    out <- c(tStat= tTest$statistic, p=tTest$p.value)
    out
}
tVals <- apply(solTrainXtrans[ , fpCols],
               MARGIN=2,
               FUN=getTstats,
               y=solTrainY)
tVals <- t(tVals); head(tVals)

##         tStat.t            p
## FP001 -4.022040 6.287404e-05
## FP002 10.286727 1.351580e-23
## FP003 -2.036442 4.198619e-02
## FP004 -4.948958 9.551772e-07
## FP005 10.282475 1.576549e-23
## FP006 -7.875838 9.287835e-15

#VARIABLE IMPORTANCE (CONTINUOUS VARIABLES)
#OPTION A
library(caret)

## Warning: package 'caret' was built under R version 3.1.3

## Loading required package: ggplot2

#filterVarImp with nonpara=T option creates a LOESS model for each predictor and quantifies the relationship w/ outcome

loessResults <- filterVarImp(x=solTrainXtrans[, numericPreds],
                             y=solTrainY,
                             nonpara=T)
head(loessResults)

##                Overall
## MolWeight    0.4443931
## NumAtoms     0.1899315
## NumNonHAtoms 0.3406166
## NumBonds     0.2107173
## NumNonHBonds 0.3424552
## NumMultBonds 0.2307995

#OPTION B
library(minerva)

## Warning: package 'minerva' was built under R version 3.1.1

micValues <- mine(x=solTrainXtrans[, numericPreds], y=solTrainY,)
names(micValues)

## [1] "MIC"   "MAS"   "MEV"   "MCN"   "MICR2"

head(micValues$MIC)

##                      Y
## MolWeight    0.4679277
## NumAtoms     0.2896815
## NumNonHAtoms 0.3947092
## NumBonds     0.3268683
## NumNonHBonds 0.3919627
## NumMultBonds 0.2792600

#CATEGORICAL OUTCOMES
#filterVarImp also calcualtes AUC ROC curve for discrete variables

library(caret)
data(segmentationData)
cellData <- subset(segmentationData, Case=="Train")
cellData$Case <- cellData$Cell <- NULL
head(names(cellData))

## [1] "Class"       "AngleCh1"    "AreaCh1"     "AvgIntenCh1" "AvgIntenCh2"
## [6] "AvgIntenCh3"

rocValues <- filterVarImp(x = cellData[,-1],
                          y=cellData$Class)
# col is created for each class
head(rocValues)

##                    PS        WS
## AngleCh1    0.5025967 0.5025967
## AreaCh1     0.5709170 0.5709170
## AvgIntenCh1 0.7662375 0.7662375
## AvgIntenCh2 0.7866146 0.7866146
## AvgIntenCh3 0.5214098 0.5214098
## AvgIntenCh4 0.6473814 0.6473814

# the above is simple wrapper for the fns roc and auc in the pROC package
# when 3+ classes, filterVarImp will computer ROC for each class versus the others and then returns the largest AUC


# MIC statistic can be computer as before with dummy data
library(minerva)
micValues <- mine(x=cellData[,-1], y=ifelse(cellData$Class =="PS",1,0))
names(micValues)

## [1] "MIC"   "MAS"   "MEV"   "MCN"   "MICR2"

head(micValues$MIC)

##                     Y
## AngleCh1    0.1310570
## AreaCh1     0.1080839
## AvgIntenCh1 0.2920461
## AvgIntenCh2 0.3294846
## AvgIntenCh3 0.1354438
## AvgIntenCh4 0.1665450

#to calculate odds ratio and statistical test of association, the fisher.test fn = 
# table1 <- table(training[,"class'], training[, 2])
# fisher.test(table1)

#When predictor has 2+ classes, single odds ratio  cannot be computed, p=value can still be utilized

#ciTable <- table(training[,"classes (success /unsuccess)'], training[, "variable with 2+ factors""])