This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Draw the default plot for the built-in `cars` data set.
plot(x = cars)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
# Dummy-variable encoding of the `etitanic` data.
#
# `dummyVars()` (and most functions used further down in this notebook) come
# from caret, which was never attached; load it here, ahead of its first use.
library(caret)
library(earth)

data(etitanic)

# Base-R design matrix for the same formula, shown for comparison.
head(model.matrix(survived ~ ., data = etitanic))

# Build the caret encoder from the formula, then apply it to the data.
dummies <- dummyVars(survived ~ ., data = etitanic)
head(predict(dummies, newdata = etitanic))
# Identify and drop near-zero-variance predictors from the `mdrr` descriptors.
# Lines starting with `#>` are the console output recorded in the notebook;
# they are kept as comments so the code itself stays parseable.
data(mdrr)

# Value counts for the `nR11` descriptor.
data.frame(table(mdrrDescr$nR11))

# With `saveMetrics = TRUE` the result is a per-predictor table; show the
# first ten rows whose `nzv` flag is set.
nzv <- nearZeroVar(mdrrDescr, saveMetrics = TRUE)
nzv[nzv$nzv, ][1:10, ]

dim(mdrrDescr)
#> [1] 528 342

# Without `saveMetrics` the result is used as column positions to remove.
nzv <- nearZeroVar(mdrrDescr)

# Guard the empty case: `x[, -integer(0)]` selects *no* columns rather than
# all of them, so only apply the negative index when something was flagged.
filteredDescr <- if (length(nzv) > 0) {
  mdrrDescr[, -nzv, drop = FALSE]
} else {
  mdrrDescr
}
dim(filteredDescr)
#> [1] 528 297
# Remove highly correlated descriptors.
# Lines starting with `#>` are the console output recorded in the notebook.

# Pairwise correlations of the remaining descriptors. The matrix is computed
# once and reused below (the original recomputed it immediately).
descrCor <- cor(filteredDescr)

# Number of descriptor pairs with absolute correlation above 0.999.
highCorr <- sum(abs(descrCor[upper.tri(descrCor)]) > 0.999)

summary(descrCor[upper.tri(descrCor)])
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
#> -0.99610 -0.05373  0.25010  0.26080  0.65530  1.00000

# Columns flagged by `findCorrelation()` at a 0.75 cutoff.
highlyCorDescr <- findCorrelation(descrCor, cutoff = 0.75)

# Guard the empty case: a negative index built from an empty vector would
# select no columns at all.
filteredDescr <- if (length(highlyCorDescr) > 0) {
  filteredDescr[, -highlyCorDescr, drop = FALSE]
} else {
  filteredDescr
}

# Correlations after filtering.
descrCor2 <- cor(filteredDescr)
summary(descrCor2[upper.tri(descrCor2)])
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
#> -0.70730 -0.05378  0.04418  0.06692  0.18860  0.74460
# Detect linear dependencies among the columns of a small design matrix.
# Lines starting with `#>` are the console output recorded in the notebook.

# 6 x 6 matrix, filled column by column (the default for `matrix()`).
# Columns 2 + 3 and columns 4 + 5 + 6 each add up to column 1.
ltfrDesign <- matrix(
  c(
    1, 1, 1, 1, 1, 1,
    1, 1, 1, 0, 0, 0,
    0, 0, 0, 1, 1, 1,
    1, 0, 0, 1, 0, 0,
    0, 1, 0, 0, 1, 0,
    0, 0, 1, 0, 0, 1
  ),
  nrow = 6,
  ncol = 6
)

comboInfo <- findLinearCombos(ltfrDesign)
comboInfo
#> $linearCombos
#> $linearCombos[[1]]
#> [1] 3 1 2
#>
#> $linearCombos[[2]]
#> [1] 6 1 4 5
#>
#>
#> $remove
#> [1] 3 6

# Show the matrix without the columns listed in `comboInfo$remove`. Positive
# indexing via `setdiff()` stays correct even if nothing is to be removed,
# where a negative index built from an empty vector would select no columns.
ltfrDesign[, setdiff(seq_len(ncol(ltfrDesign)), comboInfo$remove), drop = FALSE]
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    1    1    0
#> [2,]    1    1    0    1
#> [3,]    1    1    0    0
#> [4,]    1    0    1    0
#> [5,]    1    0    0    1
#> [6,]    1    0    0    0
# Split the filtered descriptors into training and test halves, then centre
# and scale both using statistics estimated on the training half only.
set.seed(96)

# `seq_along()` replaces `seq(along = ...)`, which relied on partial matching
# of the `along.with` argument. Integer division keeps the sample size a
# whole number when the number of observations is odd.
inTrain <- sample(seq_along(mdrrClass), length(mdrrClass) %/% 2)

training <- filteredDescr[inTrain, ]
test <- filteredDescr[-inTrain, ]
trainMDRR <- mdrrClass[inTrain]
testMDRR <- mdrrClass[-inTrain]

# Fit the pre-processing on `training`, then apply the same fitted object to
# both subsets.
preProcValues <- preProcess(training, method = c("center", "scale"))
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, test)
# Compare two descriptors before and after the spatial sign transformation.
library(AppliedPredictiveModeling)
transparentTheme(trans = 0.4)

# Scaled copies of the `nC` and `X4v` descriptors.
plotSubset <- data.frame(scale(mdrrDescr[, c("nC", "X4v")]))

# Before the transformation, points grouped by class.
xyplot(
  nC ~ X4v,
  data = plotSubset,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)

# After the transformation; converted to a data frame for plotting.
transformed <- as.data.frame(spatialSign(plotSubset))
xyplot(
  nC ~ X4v,
  data = transformed,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)
# Box-Cox transformation, estimated on the training set and applied to both
# the training and test sets.
# Lines starting with `#>` are the console output recorded in the notebook;
# they are kept as comments so the code itself stays parseable.
preProcValues2 <- preProcess(training, method = "BoxCox")
trainBC <- predict(preProcValues2, training)
testBC <- predict(preProcValues2, test)

preProcValues2
#> Created from 264 samples and 31 variables
#>
#> Pre-processing:
#>   - Box-Cox transformation (31)
#>   - ignored (0)
#>
#> Lambda estimates for Box-Cox transformation:
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#> -2.0000 -0.2000  0.3000  0.4097  1.7000  2.0000
# Pre-process the `schedulingData` predictors, with and without removing
# near-zero-variance columns.
# Lines starting with `#>` are the console output recorded in the notebook;
# they are kept as comments so the code itself stays parseable.
library(AppliedPredictiveModeling)
data(schedulingData)
str(schedulingData)
#> 'data.frame': 4331 obs. of 8 variables:
#>  $ Protocol   : Factor w/ 14 levels "A","C","D","E",..: 4 4 4 4 4 4 4 4 4 4 ...
#>  $ Compounds  : num 997 97 101 93 100 100 105 98 101 95 ...
#>  $ InputFields: num 137 103 75 76 82 82 88 95 91 92 ...
#>  $ Iterations : num 20 20 10 20 20 20 20 20 20 20 ...
#>  $ NumPending : num 0 0 0 0 0 0 0 0 0 0 ...
#>  $ Hour       : num 14 13.8 13.8 10.1 10.4 ...
#>  $ Day        : Factor w/ 7 levels "Mon","Tue","Wed",..: 2 2 4 5 5 3 5 5 5 3 ...
#>  $ Class      : Factor w/ 4 levels "VF","F","M","L": 2 1 1 1 1 1 1 1 1 1 ...

# Everything except `Class` (column 8 in the listing above). Selecting by
# name replaces the repeated positional index `-8`.
hpcPredictors <- schedulingData[, names(schedulingData) != "Class", drop = FALSE]

pp_hpc <- preProcess(
  hpcPredictors,
  method = c("center", "scale", "YeoJohnson")
)
pp_hpc
#> Created from 4331 samples and 7 variables
#>
#> Pre-processing:
#>   - centered (5)
#>   - ignored (2)
#>   - scaled (5)
#>   - Yeo-Johnson transformation (5)
#>
#> Lambda estimates for Yeo-Johnson transformation:
#> -0.08, -0.03, -1.05, -1.1, 1.44

# NOTE(review): this reuses the name `transformed`, overwriting the
# spatial-sign data frame created earlier in the notebook.
transformed <- predict(pp_hpc, newdata = hpcPredictors)
head(transformed)

# Share of rows where `NumPending` is exactly zero.
mean(schedulingData$NumPending == 0)
#> [1] 0.7561764

# Same pipeline with the "nzv" filter added to the methods.
pp_no_nzv <- preProcess(
  hpcPredictors,
  method = c("center", "scale", "YeoJohnson", "nzv")
)
pp_no_nzv
#> Created from 4331 samples and 7 variables
#>
#> Pre-processing:
#>   - centered (4)
#>   - ignored (2)
#>   - removed (1)
#>   - scaled (4)
#>   - Yeo-Johnson transformation (4)
#>
#> Lambda estimates for Yeo-Johnson transformation:
#> -0.08, -0.03, -1.05, 1.44

predict(pp_no_nzv, newdata = hpcPredictors[1:6, ])
# Class-distance calculations: fit on the Box-Cox-transformed training data,
# then compute the distances for the transformed test data.
centroids <- classDist(trainBC, trainMDRR)
distances <- as.data.frame(predict(centroids, testBC))
head(distances)

# Plot the two distance columns against each other, grouped by the test-set
# class labels.
xyplot(
  dist.Active ~ dist.Inactive,
  data = distances,
  groups = testMDRR,
  auto.key = list(columns = 2)
)