Amazon link for Applied Predictive Models
Book Webpage for Data and Codes
### Applied Predictive models
library(AppliedPredictiveModeling)
data(segmentationOriginal)
head(segmentationOriginal, 2)
## Cell Case Class AngleCh1 AngleStatusCh1 AreaCh1 AreaStatusCh1
## 1 207827637 Test PS 143.2477 1 185 0
## 2 207932307 Train PS 133.7520 0 819 1
## AvgIntenCh1 AvgIntenCh2 AvgIntenCh3 AvgIntenCh4 AvgIntenStatusCh1
## 1 15.71186 3.954802 9.548023 2.214689 0
## 2 31.92327 205.878517 69.916880 164.153453 0
## AvgIntenStatusCh2 AvgIntenStatusCh3 AvgIntenStatusCh4
## 1 2 2 2
## 2 0 0 0
## ConvexHullAreaRatioCh1 ConvexHullAreaRatioStatusCh1
## 1 1.124509 0
## 2 1.263158 1
## ConvexHullPerimRatioCh1 ConvexHullPerimRatioStatusCh1
## 1 0.9196827 0
## 2 0.7970801 2
## DiffIntenDensityCh1 DiffIntenDensityCh3 DiffIntenDensityCh4
## 1 29.51923 13.77564 6.826923
## 2 31.87500 43.12228 79.308424
## DiffIntenDensityStatusCh1 DiffIntenDensityStatusCh3
## 1 2 2
## 2 0 0
## DiffIntenDensityStatusCh4 EntropyIntenCh1 EntropyIntenCh3
## 1 2 4.969781 4.371017
## 2 0 6.087592 6.642761
## EntropyIntenCh4 EntropyIntenStatusCh1 EntropyIntenStatusCh3
## 1 2.718884 2 0
## 2 7.880155 0 1
## EntropyIntenStatusCh4 EqCircDiamCh1 EqCircDiamStatusCh1 EqEllipseLWRCh1
## 1 2 15.36954 0 3.060676
## 2 1 32.30558 1 1.558394
## EqEllipseLWRStatusCh1 EqEllipseOblateVolCh1 EqEllipseOblateVolStatusCh1
## 1 1 336.9691 0
## 2 0 2232.9055 1
## EqEllipseProlateVolCh1 EqEllipseProlateVolStatusCh1 EqSphereAreaCh1
## 1 110.0963 0 742.1156
## 2 1432.8246 1 3278.7256
## EqSphereAreaStatusCh1 EqSphereVolCh1 EqSphereVolStatusCh1 FiberAlign2Ch3
## 1 0 1900.996 0 0.0000000
## 2 1 17653.525 1 0.4879354
## FiberAlign2Ch4 FiberAlign2StatusCh3 FiberAlign2StatusCh4 FiberLengthCh1
## 1 0.0000000 2 2 26.98132
## 2 0.3523742 0 0 64.28230
## FiberLengthStatusCh1 FiberWidthCh1 FiberWidthStatusCh1 IntenCoocASMCh3
## 1 0 7.410365 2 0.01118390
## 2 1 13.167079 0 0.02805106
## IntenCoocASMCh4 IntenCoocASMStatusCh3 IntenCoocASMStatusCh4
## 1 0.05044801 0 0
## 2 0.01259498 0 0
## IntenCoocContrastCh3 IntenCoocContrastCh4 IntenCoocContrastStatusCh3
## 1 40.751777 13.895439 1
## 2 8.227953 6.984046 0
## IntenCoocContrastStatusCh4 IntenCoocEntropyCh3 IntenCoocEntropyCh4
## 1 1 7.199458 5.249744
## 2 0 6.822138 7.098988
## IntenCoocEntropyStatusCh3 IntenCoocEntropyStatusCh4 IntenCoocMaxCh3
## 1 0 0 0.07741935
## 2 0 0 0.15321477
## IntenCoocMaxCh4 IntenCoocMaxStatusCh3 IntenCoocMaxStatusCh4 KurtIntenCh1
## 1 0.17197452 0 0 -0.6567441
## 2 0.07387141 0 0 -0.2487691
## KurtIntenCh3 KurtIntenCh4 KurtIntenStatusCh1 KurtIntenStatusCh3
## 1 -0.6080583 0.7258145 0 0
## 2 -0.3307839 -0.2652638 0 0
## KurtIntenStatusCh4 LengthCh1 LengthStatusCh1 MemberAvgAvgIntenStatusCh2
## 1 0 26.20779 0 0
## 2 0 47.21855 1 0
## MemberAvgTotalIntenStatusCh2 NeighborAvgDistCh1 NeighborAvgDistStatusCh1
## 1 0 370.4543 1
## 2 0 174.4442 2
## NeighborMinDistCh1 NeighborMinDistStatusCh1 NeighborVarDistCh1
## 1 99.10349 1 127.96080
## 2 30.11114 0 81.38063
## NeighborVarDistStatusCh1 PerimCh1 PerimStatusCh1 ShapeBFRCh1
## 1 0 68.78338 0 0.6651480
## 2 2 154.89876 1 0.5397584
## ShapeBFRStatusCh1 ShapeLWRCh1 ShapeLWRStatusCh1 ShapeP2ACh1
## 1 0 2.462450 0 1.883006
## 2 2 1.468181 0 2.255810
## ShapeP2AStatusCh1 SkewIntenCh1 SkewIntenCh3 SkewIntenCh4
## 1 0 0.4545048 0.4603934 1.2327736
## 2 0 0.3987047 0.6197308 0.5272631
## SkewIntenStatusCh1 SkewIntenStatusCh3 SkewIntenStatusCh4
## 1 0 0 0
## 2 0 0 0
## SpotFiberCountCh3 SpotFiberCountCh4 SpotFiberCountStatusCh3
## 1 1 4 0
## 2 4 11 1
## SpotFiberCountStatusCh4 TotalIntenCh1 TotalIntenCh2 TotalIntenCh3
## 1 0 2781 700 1690
## 2 1 24964 160997 54675
## TotalIntenCh4 TotalIntenStatusCh1 TotalIntenStatusCh2
## 1 392 0 2
## 2 128368 0 1
## TotalIntenStatusCh3 TotalIntenStatusCh4 VarIntenCh1 VarIntenCh3
## 1 0 2 12.47468 7.609035
## 2 1 1 18.80923 56.715352
## VarIntenCh4 VarIntenStatusCh1 VarIntenStatusCh3 VarIntenStatusCh4
## 1 2.7141 0 2 2
## 2 118.3881 0 0 0
## WidthCh1 WidthStatusCh1 XCentroid YCentroid
## 1 10.64297 2 42 14
## 2 32.16126 1 215 347
dim(segmentationOriginal)
## [1] 2019 119
## Prepare train data
segData <- subset(segmentationOriginal, Case == "Train")
dim(segData)
## [1] 1009 119
## Delete few columns
cellID <- segData$Cell
class <- segData$Class
case <- segData$Case
segData <- segData[, -(1:3)]
names(segData)
## [1] "AngleCh1" "AngleStatusCh1"
## [3] "AreaCh1" "AreaStatusCh1"
## [5] "AvgIntenCh1" "AvgIntenCh2"
## [7] "AvgIntenCh3" "AvgIntenCh4"
## [9] "AvgIntenStatusCh1" "AvgIntenStatusCh2"
## [11] "AvgIntenStatusCh3" "AvgIntenStatusCh4"
## [13] "ConvexHullAreaRatioCh1" "ConvexHullAreaRatioStatusCh1"
## [15] "ConvexHullPerimRatioCh1" "ConvexHullPerimRatioStatusCh1"
## [17] "DiffIntenDensityCh1" "DiffIntenDensityCh3"
## [19] "DiffIntenDensityCh4" "DiffIntenDensityStatusCh1"
## [21] "DiffIntenDensityStatusCh3" "DiffIntenDensityStatusCh4"
## [23] "EntropyIntenCh1" "EntropyIntenCh3"
## [25] "EntropyIntenCh4" "EntropyIntenStatusCh1"
## [27] "EntropyIntenStatusCh3" "EntropyIntenStatusCh4"
## [29] "EqCircDiamCh1" "EqCircDiamStatusCh1"
## [31] "EqEllipseLWRCh1" "EqEllipseLWRStatusCh1"
## [33] "EqEllipseOblateVolCh1" "EqEllipseOblateVolStatusCh1"
## [35] "EqEllipseProlateVolCh1" "EqEllipseProlateVolStatusCh1"
## [37] "EqSphereAreaCh1" "EqSphereAreaStatusCh1"
## [39] "EqSphereVolCh1" "EqSphereVolStatusCh1"
## [41] "FiberAlign2Ch3" "FiberAlign2Ch4"
## [43] "FiberAlign2StatusCh3" "FiberAlign2StatusCh4"
## [45] "FiberLengthCh1" "FiberLengthStatusCh1"
## [47] "FiberWidthCh1" "FiberWidthStatusCh1"
## [49] "IntenCoocASMCh3" "IntenCoocASMCh4"
## [51] "IntenCoocASMStatusCh3" "IntenCoocASMStatusCh4"
## [53] "IntenCoocContrastCh3" "IntenCoocContrastCh4"
## [55] "IntenCoocContrastStatusCh3" "IntenCoocContrastStatusCh4"
## [57] "IntenCoocEntropyCh3" "IntenCoocEntropyCh4"
## [59] "IntenCoocEntropyStatusCh3" "IntenCoocEntropyStatusCh4"
## [61] "IntenCoocMaxCh3" "IntenCoocMaxCh4"
## [63] "IntenCoocMaxStatusCh3" "IntenCoocMaxStatusCh4"
## [65] "KurtIntenCh1" "KurtIntenCh3"
## [67] "KurtIntenCh4" "KurtIntenStatusCh1"
## [69] "KurtIntenStatusCh3" "KurtIntenStatusCh4"
## [71] "LengthCh1" "LengthStatusCh1"
## [73] "MemberAvgAvgIntenStatusCh2" "MemberAvgTotalIntenStatusCh2"
## [75] "NeighborAvgDistCh1" "NeighborAvgDistStatusCh1"
## [77] "NeighborMinDistCh1" "NeighborMinDistStatusCh1"
## [79] "NeighborVarDistCh1" "NeighborVarDistStatusCh1"
## [81] "PerimCh1" "PerimStatusCh1"
## [83] "ShapeBFRCh1" "ShapeBFRStatusCh1"
## [85] "ShapeLWRCh1" "ShapeLWRStatusCh1"
## [87] "ShapeP2ACh1" "ShapeP2AStatusCh1"
## [89] "SkewIntenCh1" "SkewIntenCh3"
## [91] "SkewIntenCh4" "SkewIntenStatusCh1"
## [93] "SkewIntenStatusCh3" "SkewIntenStatusCh4"
## [95] "SpotFiberCountCh3" "SpotFiberCountCh4"
## [97] "SpotFiberCountStatusCh3" "SpotFiberCountStatusCh4"
## [99] "TotalIntenCh1" "TotalIntenCh2"
## [101] "TotalIntenCh3" "TotalIntenCh4"
## [103] "TotalIntenStatusCh1" "TotalIntenStatusCh2"
## [105] "TotalIntenStatusCh3" "TotalIntenStatusCh4"
## [107] "VarIntenCh1" "VarIntenCh3"
## [109] "VarIntenCh4" "VarIntenStatusCh1"
## [111] "VarIntenStatusCh3" "VarIntenStatusCh4"
## [113] "WidthCh1" "WidthStatusCh1"
## [115] "XCentroid" "YCentroid"
## Delete columns with particular TERM
statusColNum <- grep("Status", names(segData))
statusColNum
## [1] 2 4 9 10 11 12 14 16 20 21 22 26 27 28 30 32 34
## [18] 36 38 40 43 44 46 48 51 52 55 56 59 60 63 64 68 69
## [35] 70 72 73 74 76 78 80 82 84 86 88 92 93 94 97 98 103
## [52] 104 105 106 110 111 112 114
segData <- segData[, -statusColNum]
dim(segData)
## [1] 1009 58
## Check skewness
library(e1071)
skewness(segData$AngleCh1)
## [1] -0.02426252
skewValues <- apply(segData, 2, skewness)
head(skewValues)
## AngleCh1 AreaCh1 AvgIntenCh1 AvgIntenCh2 AvgIntenCh3 AvgIntenCh4
## -0.02426252 3.52510745 2.95918524 0.84816033 2.20234214 1.90047128
## Transformation
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
Ch1AreaTrans <- BoxCoxTrans(segData$AreaCh1)
Ch1AreaTrans
## Box-Cox Transformation
##
## 1009 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 150.0 194.0 256.0 325.1 376.0 2186.0
##
## Largest/Smallest: 14.6
## Sample Skewness: 3.53
##
## Estimated Lambda: -0.9
## original data
head(segData$AreaCh1)
## [1] 819 431 298 256 258 358
# After transformation
predict(Ch1AreaTrans, head(segData$AreaCh1))
## [1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523
pcaObject <- prcomp(segData, center = TRUE, scale. = TRUE)
str(pcaObject)
## List of 5
## $ sdev : num [1:58] 3.48 3.14 2.63 2.12 1.7 ...
## $ rotation: num [1:58, 1:58] 0.00121 0.22917 -0.10271 -0.15483 -0.05804 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:58] "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2" ...
## .. ..$ : chr [1:58] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:58] 91.1 325.1 127.9 185.2 96.1 ...
## ..- attr(*, "names")= chr [1:58] "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2" ...
## $ scale : Named num [1:58] 48.9 216.6 164 154 93.5 ...
## ..- attr(*, "names")= chr [1:58] "AngleCh1" "AreaCh1" "AvgIntenCh1" "AvgIntenCh2" ...
## $ x : num [1:1009, 1:58] 5.099 -0.255 1.293 -1.465 -0.876 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:1009] "2" "3" "4" "12" ...
## .. ..$ : chr [1:58] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
# Calculate the cumulative percentage of variance which each component
# accounts for.
percentVariance <- pcaObject$sd^2/sum(pcaObject$sd^2)*100
percentVariance[1:3]
## [1] 20.91236 17.01330 11.88689
head(pcaObject$x[, 1:5])
## PC1 PC2 PC3 PC4 PC5
## 2 5.0985749 4.5513804 -0.03345155 -2.640339 1.2783212
## 3 -0.2546261 1.1980326 -1.02059569 -3.731079 0.9994635
## 4 1.2928941 -1.8639348 -1.25110461 -2.414857 -1.4914838
## 12 -1.4646613 -1.5658327 0.46962088 -3.388716 -0.3302324
## 15 -0.8762771 -1.2790055 -1.33794261 -3.516794 0.3936099
## 16 -0.8615416 -0.3286842 -0.15546723 -2.206636 1.4731658
head(pcaObject$rotation[, 1:3])
## PC1 PC2 PC3
## AngleCh1 0.001213758 -0.01284461 0.006816473
## AreaCh1 0.229171873 0.16061734 0.089811727
## AvgIntenCh1 -0.102708778 0.17971332 0.067696745
## AvgIntenCh2 -0.154828672 0.16376018 0.073534399
## AvgIntenCh3 -0.058042158 0.11197704 -0.185473286
## AvgIntenCh4 -0.117343465 0.21039086 -0.105060977
trans <- preProcess(segData,method = c("BoxCox", "center", "scale", "pca"))
trans
##
## Call:
## preProcess.default(x = segData, method = c("BoxCox", "center",
## "scale", "pca"))
##
## Created from 1009 samples and 58 variables
## Pre-processing: Box-Cox transformation, centered, scaled,
## principal component signal extraction
##
## Lambda estimates for Box-Cox transformation:
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -2.00000 -0.50000 -0.10000 0.05106 0.30000 2.00000 11
##
## PCA needed 19 components to capture 95 percent of the variance
# Apply the transformations:
transformed <- predict(trans, segData)
# These values are different than the previous PCA components since
# they were transformed prior to PCA
head(transformed[, 1:5])
## PC1 PC2 PC3 PC4 PC5
## 2 1.5684742 6.2907855 -0.3333299 -3.063327 -1.3415782
## 3 -0.6664055 2.0455375 -1.4416841 -4.701183 -1.7422020
## 4 3.7500055 -0.3915610 -0.6690260 -4.020753 1.7927777
## 12 0.3768509 -2.1897554 1.4380167 -5.327116 -0.4066757
## 15 1.0644951 -1.4646516 -0.9900478 -5.627351 -0.8650174
## 16 -0.3798629 0.2173028 0.4387980 -2.069880 -1.9363920
nearZeroVar(segData)
## integer(0)
correlations <- cor(segData)
dim(correlations)
## [1] 58 58
correlations[1:4, 1:4]
## AngleCh1 AreaCh1 AvgIntenCh1 AvgIntenCh2
## AngleCh1 1.000000000 -0.002627172 -0.04300776 -0.01944681
## AreaCh1 -0.002627172 1.000000000 -0.02529739 -0.15330301
## AvgIntenCh1 -0.043007757 -0.025297394 1.00000000 0.52521711
## AvgIntenCh2 -0.019446810 -0.153303007 0.52521711 1.00000000
library(corrplot)
corrplot(correlations, order = "hclust", tl.cex=0.6,
tl.col = "black")
highCorr <- findCorrelation(correlations, cutoff = .75)
length(highCorr)
## [1] 33
head(highCorr)
## [1] 23 40 43 36 7 15
filteredSegData <- segData[, -highCorr]
correlations1 <- cor(filteredSegData)
dim(correlations1)
## [1] 25 25
correlations1[1:4, 1:4]
## AngleCh1 ConvexHullPerimRatioCh1
## AngleCh1 1.00000000 0.04173698
## ConvexHullPerimRatioCh1 0.04173698 1.00000000
## EntropyIntenCh1 -0.01182293 0.29070898
## EqEllipseProlateVolCh1 -0.00844706 -0.39545727
## EntropyIntenCh1 EqEllipseProlateVolCh1
## AngleCh1 -0.01182293 -0.00844706
## ConvexHullPerimRatioCh1 0.29070898 -0.39545727
## EntropyIntenCh1 1.00000000 0.10960130
## EqEllipseProlateVolCh1 0.10960130 1.00000000
library(corrplot)
corrplot(correlations1, order = "hclust", tl.cex=0.7,
tl.col = "black")
Compiled by: Subasish Das