R Codes

Amazon link for Applied Predictive Models

Book Webpage for Data and Codes

R package on the Book Content

### Applied Predictive models
library(AppliedPredictiveModeling)
data(segmentationOriginal)
head(segmentationOriginal, 2)
##        Cell  Case Class AngleCh1 AngleStatusCh1 AreaCh1 AreaStatusCh1
## 1 207827637  Test    PS 143.2477              1     185             0
## 2 207932307 Train    PS 133.7520              0     819             1
##   AvgIntenCh1 AvgIntenCh2 AvgIntenCh3 AvgIntenCh4 AvgIntenStatusCh1
## 1    15.71186    3.954802    9.548023    2.214689                 0
## 2    31.92327  205.878517   69.916880  164.153453                 0
##   AvgIntenStatusCh2 AvgIntenStatusCh3 AvgIntenStatusCh4
## 1                 2                 2                 2
## 2                 0                 0                 0
##   ConvexHullAreaRatioCh1 ConvexHullAreaRatioStatusCh1
## 1               1.124509                            0
## 2               1.263158                            1
##   ConvexHullPerimRatioCh1 ConvexHullPerimRatioStatusCh1
## 1               0.9196827                             0
## 2               0.7970801                             2
##   DiffIntenDensityCh1 DiffIntenDensityCh3 DiffIntenDensityCh4
## 1            29.51923            13.77564            6.826923
## 2            31.87500            43.12228           79.308424
##   DiffIntenDensityStatusCh1 DiffIntenDensityStatusCh3
## 1                         2                         2
## 2                         0                         0
##   DiffIntenDensityStatusCh4 EntropyIntenCh1 EntropyIntenCh3
## 1                         2        4.969781        4.371017
## 2                         0        6.087592        6.642761
##   EntropyIntenCh4 EntropyIntenStatusCh1 EntropyIntenStatusCh3
## 1        2.718884                     2                     0
## 2        7.880155                     0                     1
##   EntropyIntenStatusCh4 EqCircDiamCh1 EqCircDiamStatusCh1 EqEllipseLWRCh1
## 1                     2      15.36954                   0        3.060676
## 2                     1      32.30558                   1        1.558394
##   EqEllipseLWRStatusCh1 EqEllipseOblateVolCh1 EqEllipseOblateVolStatusCh1
## 1                     1              336.9691                           0
## 2                     0             2232.9055                           1
##   EqEllipseProlateVolCh1 EqEllipseProlateVolStatusCh1 EqSphereAreaCh1
## 1               110.0963                            0        742.1156
## 2              1432.8246                            1       3278.7256
##   EqSphereAreaStatusCh1 EqSphereVolCh1 EqSphereVolStatusCh1 FiberAlign2Ch3
## 1                     0       1900.996                    0      0.0000000
## 2                     1      17653.525                    1      0.4879354
##   FiberAlign2Ch4 FiberAlign2StatusCh3 FiberAlign2StatusCh4 FiberLengthCh1
## 1      0.0000000                    2                    2       26.98132
## 2      0.3523742                    0                    0       64.28230
##   FiberLengthStatusCh1 FiberWidthCh1 FiberWidthStatusCh1 IntenCoocASMCh3
## 1                    0      7.410365                   2      0.01118390
## 2                    1     13.167079                   0      0.02805106
##   IntenCoocASMCh4 IntenCoocASMStatusCh3 IntenCoocASMStatusCh4
## 1      0.05044801                     0                     0
## 2      0.01259498                     0                     0
##   IntenCoocContrastCh3 IntenCoocContrastCh4 IntenCoocContrastStatusCh3
## 1            40.751777            13.895439                          1
## 2             8.227953             6.984046                          0
##   IntenCoocContrastStatusCh4 IntenCoocEntropyCh3 IntenCoocEntropyCh4
## 1                          1            7.199458            5.249744
## 2                          0            6.822138            7.098988
##   IntenCoocEntropyStatusCh3 IntenCoocEntropyStatusCh4 IntenCoocMaxCh3
## 1                         0                         0      0.07741935
## 2                         0                         0      0.15321477
##   IntenCoocMaxCh4 IntenCoocMaxStatusCh3 IntenCoocMaxStatusCh4 KurtIntenCh1
## 1      0.17197452                     0                     0   -0.6567441
## 2      0.07387141                     0                     0   -0.2487691
##   KurtIntenCh3 KurtIntenCh4 KurtIntenStatusCh1 KurtIntenStatusCh3
## 1   -0.6080583    0.7258145                  0                  0
## 2   -0.3307839   -0.2652638                  0                  0
##   KurtIntenStatusCh4 LengthCh1 LengthStatusCh1 MemberAvgAvgIntenStatusCh2
## 1                  0  26.20779               0                          0
## 2                  0  47.21855               1                          0
##   MemberAvgTotalIntenStatusCh2 NeighborAvgDistCh1 NeighborAvgDistStatusCh1
## 1                            0           370.4543                        1
## 2                            0           174.4442                        2
##   NeighborMinDistCh1 NeighborMinDistStatusCh1 NeighborVarDistCh1
## 1           99.10349                        1          127.96080
## 2           30.11114                        0           81.38063
##   NeighborVarDistStatusCh1  PerimCh1 PerimStatusCh1 ShapeBFRCh1
## 1                        0  68.78338              0   0.6651480
## 2                        2 154.89876              1   0.5397584
##   ShapeBFRStatusCh1 ShapeLWRCh1 ShapeLWRStatusCh1 ShapeP2ACh1
## 1                 0    2.462450                 0    1.883006
## 2                 2    1.468181                 0    2.255810
##   ShapeP2AStatusCh1 SkewIntenCh1 SkewIntenCh3 SkewIntenCh4
## 1                 0    0.4545048    0.4603934    1.2327736
## 2                 0    0.3987047    0.6197308    0.5272631
##   SkewIntenStatusCh1 SkewIntenStatusCh3 SkewIntenStatusCh4
## 1                  0                  0                  0
## 2                  0                  0                  0
##   SpotFiberCountCh3 SpotFiberCountCh4 SpotFiberCountStatusCh3
## 1                 1                 4                       0
## 2                 4                11                       1
##   SpotFiberCountStatusCh4 TotalIntenCh1 TotalIntenCh2 TotalIntenCh3
## 1                       0          2781           700          1690
## 2                       1         24964        160997         54675
##   TotalIntenCh4 TotalIntenStatusCh1 TotalIntenStatusCh2
## 1           392                   0                   2
## 2        128368                   0                   1
##   TotalIntenStatusCh3 TotalIntenStatusCh4 VarIntenCh1 VarIntenCh3
## 1                   0                   2    12.47468    7.609035
## 2                   1                   1    18.80923   56.715352
##   VarIntenCh4 VarIntenStatusCh1 VarIntenStatusCh3 VarIntenStatusCh4
## 1      2.7141                 0                 2                 2
## 2    118.3881                 0                 0                 0
##   WidthCh1 WidthStatusCh1 XCentroid YCentroid
## 1 10.64297              2        42        14
## 2 32.16126              1       215       347
dim(segmentationOriginal)
## [1] 2019  119
## Prepare train data
segData <- subset(segmentationOriginal, Case == "Train")
dim(segData)
## [1] 1009  119
## Delete few columns
cellID <- segData$Cell
class <- segData$Class
case <- segData$Case
segData <- segData[, -(1:3)]
names(segData)
##   [1] "AngleCh1"                      "AngleStatusCh1"               
##   [3] "AreaCh1"                       "AreaStatusCh1"                
##   [5] "AvgIntenCh1"                   "AvgIntenCh2"                  
##   [7] "AvgIntenCh3"                   "AvgIntenCh4"                  
##   [9] "AvgIntenStatusCh1"             "AvgIntenStatusCh2"            
##  [11] "AvgIntenStatusCh3"             "AvgIntenStatusCh4"            
##  [13] "ConvexHullAreaRatioCh1"        "ConvexHullAreaRatioStatusCh1" 
##  [15] "ConvexHullPerimRatioCh1"       "ConvexHullPerimRatioStatusCh1"
##  [17] "DiffIntenDensityCh1"           "DiffIntenDensityCh3"          
##  [19] "DiffIntenDensityCh4"           "DiffIntenDensityStatusCh1"    
##  [21] "DiffIntenDensityStatusCh3"     "DiffIntenDensityStatusCh4"    
##  [23] "EntropyIntenCh1"               "EntropyIntenCh3"              
##  [25] "EntropyIntenCh4"               "EntropyIntenStatusCh1"        
##  [27] "EntropyIntenStatusCh3"         "EntropyIntenStatusCh4"        
##  [29] "EqCircDiamCh1"                 "EqCircDiamStatusCh1"          
##  [31] "EqEllipseLWRCh1"               "EqEllipseLWRStatusCh1"        
##  [33] "EqEllipseOblateVolCh1"         "EqEllipseOblateVolStatusCh1"  
##  [35] "EqEllipseProlateVolCh1"        "EqEllipseProlateVolStatusCh1" 
##  [37] "EqSphereAreaCh1"               "EqSphereAreaStatusCh1"        
##  [39] "EqSphereVolCh1"                "EqSphereVolStatusCh1"         
##  [41] "FiberAlign2Ch3"                "FiberAlign2Ch4"               
##  [43] "FiberAlign2StatusCh3"          "FiberAlign2StatusCh4"         
##  [45] "FiberLengthCh1"                "FiberLengthStatusCh1"         
##  [47] "FiberWidthCh1"                 "FiberWidthStatusCh1"          
##  [49] "IntenCoocASMCh3"               "IntenCoocASMCh4"              
##  [51] "IntenCoocASMStatusCh3"         "IntenCoocASMStatusCh4"        
##  [53] "IntenCoocContrastCh3"          "IntenCoocContrastCh4"         
##  [55] "IntenCoocContrastStatusCh3"    "IntenCoocContrastStatusCh4"   
##  [57] "IntenCoocEntropyCh3"           "IntenCoocEntropyCh4"          
##  [59] "IntenCoocEntropyStatusCh3"     "IntenCoocEntropyStatusCh4"    
##  [61] "IntenCoocMaxCh3"               "IntenCoocMaxCh4"              
##  [63] "IntenCoocMaxStatusCh3"         "IntenCoocMaxStatusCh4"        
##  [65] "KurtIntenCh1"                  "KurtIntenCh3"                 
##  [67] "KurtIntenCh4"                  "KurtIntenStatusCh1"           
##  [69] "KurtIntenStatusCh3"            "KurtIntenStatusCh4"           
##  [71] "LengthCh1"                     "LengthStatusCh1"              
##  [73] "MemberAvgAvgIntenStatusCh2"    "MemberAvgTotalIntenStatusCh2" 
##  [75] "NeighborAvgDistCh1"            "NeighborAvgDistStatusCh1"     
##  [77] "NeighborMinDistCh1"            "NeighborMinDistStatusCh1"     
##  [79] "NeighborVarDistCh1"            "NeighborVarDistStatusCh1"     
##  [81] "PerimCh1"                      "PerimStatusCh1"               
##  [83] "ShapeBFRCh1"                   "ShapeBFRStatusCh1"            
##  [85] "ShapeLWRCh1"                   "ShapeLWRStatusCh1"            
##  [87] "ShapeP2ACh1"                   "ShapeP2AStatusCh1"            
##  [89] "SkewIntenCh1"                  "SkewIntenCh3"                 
##  [91] "SkewIntenCh4"                  "SkewIntenStatusCh1"           
##  [93] "SkewIntenStatusCh3"            "SkewIntenStatusCh4"           
##  [95] "SpotFiberCountCh3"             "SpotFiberCountCh4"            
##  [97] "SpotFiberCountStatusCh3"       "SpotFiberCountStatusCh4"      
##  [99] "TotalIntenCh1"                 "TotalIntenCh2"                
## [101] "TotalIntenCh3"                 "TotalIntenCh4"                
## [103] "TotalIntenStatusCh1"           "TotalIntenStatusCh2"          
## [105] "TotalIntenStatusCh3"           "TotalIntenStatusCh4"          
## [107] "VarIntenCh1"                   "VarIntenCh3"                  
## [109] "VarIntenCh4"                   "VarIntenStatusCh1"            
## [111] "VarIntenStatusCh3"             "VarIntenStatusCh4"            
## [113] "WidthCh1"                      "WidthStatusCh1"               
## [115] "XCentroid"                     "YCentroid"
## Delete columns with particular TERM
statusColNum <- grep("Status", names(segData))
statusColNum
##  [1]   2   4   9  10  11  12  14  16  20  21  22  26  27  28  30  32  34
## [18]  36  38  40  43  44  46  48  51  52  55  56  59  60  63  64  68  69
## [35]  70  72  73  74  76  78  80  82  84  86  88  92  93  94  97  98 103
## [52] 104 105 106 110 111 112 114
segData <- segData[, -statusColNum]
dim(segData)
## [1] 1009   58
## Check skewness
library(e1071)
skewness(segData$AngleCh1)
## [1] -0.02426252
skewValues <- apply(segData, 2, skewness)
head(skewValues)
##    AngleCh1     AreaCh1 AvgIntenCh1 AvgIntenCh2 AvgIntenCh3 AvgIntenCh4 
## -0.02426252  3.52510745  2.95918524  0.84816033  2.20234214  1.90047128
## Transformation
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
Ch1AreaTrans <- BoxCoxTrans(segData$AreaCh1)
Ch1AreaTrans
## Box-Cox Transformation
## 
## 1009 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   150.0   194.0   256.0   325.1   376.0  2186.0 
## 
## Largest/Smallest: 14.6 
## Sample Skewness: 3.53 
## 
## Estimated Lambda: -0.9
## original data
head(segData$AreaCh1)
## [1] 819 431 298 256 258 358
# After transformation
predict(Ch1AreaTrans, head(segData$AreaCh1))
## [1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523
pcaObject <- prcomp(segData, center = TRUE, scale. = TRUE)
str(pcaObject)
## List of 5
##  $ sdev    : num [1:58] 3.48 3.14 2.63 2.12 1.7 ...
##  $ rotation: num [1:58, 1:58] 0.00121 0.22917 -0.10271 -0.15483 -0.05804 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:58] "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2" ...
##   .. ..$ : chr [1:58] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:58] 91.1 325.1 127.9 185.2 96.1 ...
##   ..- attr(*, "names")= chr [1:58] "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2" ...
##  $ scale   : Named num [1:58] 48.9 216.6 164 154 93.5 ...
##   ..- attr(*, "names")= chr [1:58] "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2" ...
##  $ x       : num [1:1009, 1:58] 5.099 -0.255 1.293 -1.465 -0.876 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:1009] "2" "3" "4" "12" ...
##   .. ..$ : chr [1:58] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
# Calculate the cumulative percentage of variance which each component
# accounts for.
percentVariance <- pcaObject$sd^2/sum(pcaObject$sd^2)*100
percentVariance[1:3]
## [1] 20.91236 17.01330 11.88689
head(pcaObject$x[, 1:5])
##           PC1        PC2         PC3       PC4        PC5
## 2   5.0985749  4.5513804 -0.03345155 -2.640339  1.2783212
## 3  -0.2546261  1.1980326 -1.02059569 -3.731079  0.9994635
## 4   1.2928941 -1.8639348 -1.25110461 -2.414857 -1.4914838
## 12 -1.4646613 -1.5658327  0.46962088 -3.388716 -0.3302324
## 15 -0.8762771 -1.2790055 -1.33794261 -3.516794  0.3936099
## 16 -0.8615416 -0.3286842 -0.15546723 -2.206636  1.4731658
head(pcaObject$rotation[, 1:3])
##                      PC1         PC2          PC3
## AngleCh1     0.001213758 -0.01284461  0.006816473
## AreaCh1      0.229171873  0.16061734  0.089811727
## AvgIntenCh1 -0.102708778  0.17971332  0.067696745
## AvgIntenCh2 -0.154828672  0.16376018  0.073534399
## AvgIntenCh3 -0.058042158  0.11197704 -0.185473286
## AvgIntenCh4 -0.117343465  0.21039086 -0.105060977
trans <- preProcess(segData,method = c("BoxCox", "center", "scale", "pca"))
trans
## 
## Call:
## preProcess.default(x = segData, method = c("BoxCox", "center",
##  "scale", "pca"))
## 
## Created from 1009 samples and 58 variables
## Pre-processing: Box-Cox transformation, centered, scaled,
##  principal component signal extraction 
## 
## Lambda estimates for Box-Cox transformation:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -2.00000 -0.50000 -0.10000  0.05106  0.30000  2.00000       11 
## 
## PCA needed 19 components to capture 95 percent of the variance
# Apply the transformations:
transformed <- predict(trans, segData)
# These values are different than the previous PCA components since
# they were transformed prior to PCA
head(transformed[, 1:5])
##           PC1        PC2        PC3       PC4        PC5
## 2   1.5684742  6.2907855 -0.3333299 -3.063327 -1.3415782
## 3  -0.6664055  2.0455375 -1.4416841 -4.701183 -1.7422020
## 4   3.7500055 -0.3915610 -0.6690260 -4.020753  1.7927777
## 12  0.3768509 -2.1897554  1.4380167 -5.327116 -0.4066757
## 15  1.0644951 -1.4646516 -0.9900478 -5.627351 -0.8650174
## 16 -0.3798629  0.2173028  0.4387980 -2.069880 -1.9363920
nearZeroVar(segData)
## integer(0)
correlations <- cor(segData)
dim(correlations)
## [1] 58 58
correlations[1:4, 1:4]
##                 AngleCh1      AreaCh1 AvgIntenCh1 AvgIntenCh2
## AngleCh1     1.000000000 -0.002627172 -0.04300776 -0.01944681
## AreaCh1     -0.002627172  1.000000000 -0.02529739 -0.15330301
## AvgIntenCh1 -0.043007757 -0.025297394  1.00000000  0.52521711
## AvgIntenCh2 -0.019446810 -0.153303007  0.52521711  1.00000000
library(corrplot)
corrplot(correlations, order = "hclust",  tl.cex=0.6,
         tl.col = "black")

highCorr <- findCorrelation(correlations, cutoff = .75)
length(highCorr)
## [1] 33
head(highCorr)
## [1] 23 40 43 36  7 15
filteredSegData <- segData[, -highCorr]


correlations1 <- cor(filteredSegData)
dim(correlations1)
## [1] 25 25
correlations1[1:4, 1:4]
##                            AngleCh1 ConvexHullPerimRatioCh1
## AngleCh1                 1.00000000              0.04173698
## ConvexHullPerimRatioCh1  0.04173698              1.00000000
## EntropyIntenCh1         -0.01182293              0.29070898
## EqEllipseProlateVolCh1  -0.00844706             -0.39545727
##                         EntropyIntenCh1 EqEllipseProlateVolCh1
## AngleCh1                    -0.01182293            -0.00844706
## ConvexHullPerimRatioCh1      0.29070898            -0.39545727
## EntropyIntenCh1              1.00000000             0.10960130
## EqEllipseProlateVolCh1       0.10960130             1.00000000
library(corrplot)
corrplot(correlations1, order = "hclust",  tl.cex=0.7,
         tl.col = "black")

Compiled by: Subasish Das