The caret package (short for
Classification And
REgression Training) is a front end
package that wraps around a lot of the prediction algorithms and tools
in the R programming language.
The package contains tools for:
preProcesscreateDataPartitioncreateResamplecreateTimeSlicestrainpredictconfusionMatrixMachine Learning algorithms in R
obj Class |
Package | predict Function
Syntax |
|---|---|---|
lda |
MASS | predict(obj) (no options needed) |
glm |
stats | predict(obj, type = 'response') |
gbm |
gbm | predict(obj, type = 'response', n.trees) |
mda |
mda | predict(obj, type = 'posterior') |
rpart |
rpart | predict(obj, type = 'prob') |
Weka |
RWeka | predict(obj, type = 'probability') |
LogitBoost |
caTools | predict(obj, type = 'raw', nIter) |
library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
p = 0.75,
list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training)
## [1] 3451 58
set.seed(32343)
modelFit <- train(type ~.,
data = training,
method = 'glm')
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9210735 0.8340096
modelFit$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) make address all
## -1.536e+00 -4.366e-01 -1.484e-01 1.270e-01
## num3d our over remove
## 2.215e+00 6.430e-01 1.333e+00 1.755e+00
## internet order mail receive
## 7.594e-01 6.331e-01 5.945e-02 -8.875e-02
## will people report addresses
## -1.597e-01 -1.176e-01 1.001e-01 9.088e-01
## free business email you
## 9.775e-01 9.257e-01 1.955e-01 7.291e-02
## credit your font num000
## 9.326e-01 1.964e-01 1.761e-01 1.905e+00
## money hp hpl george
## 3.249e-01 -2.059e+00 -1.535e+00 -7.255e+00
## num650 lab labs telnet
## 5.522e-01 -1.919e+00 -6.869e-03 -1.844e-01
## num857 data num415 num85
## -1.009e+02 -6.956e-01 -1.100e+01 -2.058e+00
## technology num1999 parts pm
## 6.014e-01 -9.794e-02 -7.144e-01 -1.064e+00
## direct cs meeting original
## -2.442e-01 -5.387e+02 -3.151e+00 -9.577e-01
## project re edu table
## -1.716e+00 -7.717e-01 -1.280e+00 -2.568e+00
## conference charSemicolon charRoundbracket charSquarebracket
## -3.242e+00 -1.374e+00 -4.010e-01 -4.556e-01
## charExclamation charDollar charHash capitalAve
## 2.605e-01 4.501e+00 2.572e+00 1.003e-01
## capitalLong capitalTotal
## 5.976e-03 7.274e-04
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 1399 AIC: 1515
predictions <- predict(modelFit,
newdata = testing)
predictions[1:30]
## [1] spam spam spam spam spam spam spam spam spam
## [10] nonspam spam spam spam spam spam spam spam spam
## [19] spam nonspam spam spam spam spam spam spam spam
## [28] nonspam spam nonspam
## Levels: nonspam spam
confusionMatrix(predictions, testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 665 50
## spam 32 403
##
## Accuracy : 0.9287
## 95% CI : (0.9123, 0.9429)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8496
##
## Mcnemar's Test P-Value : 0.06047
##
## Sensitivity : 0.9541
## Specificity : 0.8896
## Pos Pred Value : 0.9301
## Neg Pred Value : 0.9264
## Prevalence : 0.6061
## Detection Rate : 0.5783
## Detection Prevalence : 0.6217
## Balanced Accuracy : 0.9219
##
## 'Positive' Class : nonspam
##
library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
p = 0.75,
list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training)
## [1] 3451 58
set.seed(32323)
folds <- createFolds(y = spam$type,
k = 10,
list = TRUE,
returnTrain = TRUE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 4140 4142 4141 4140 4141 4141 4142 4141 4141 4140
folds[[1]][1:10]
## [1] 1 2 4 5 6 7 8 9 10 11
folds[[2]][1:10]
## [1] 1 2 3 4 6 7 8 10 11 12
set.seed(32323)
folds <- createFolds(y = spam$type,
k = 10,
list = TRUE,
returnTrain = FALSE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 461 459 460 461 460 460 459 460 460 461
folds[[1]][1:10]
## [1] 3 19 33 38 44 51 66 67 72 83
folds[[2]][1:10]
## [1] 5 9 24 30 37 46 52 55 62 69
set.seed(32323)
folds <- createResample(y = spam$type,
times = 10,
list = TRUE)
sapply(folds, length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06 Resample07
## 4601 4601 4601 4601 4601 4601 4601
## Resample08 Resample09 Resample10
## 4601 4601 4601
folds[[1]][1:10]
## [1] 1 1 2 2 3 3 4 5 5 7
folds[[2]][1:10]
## [1] 3 5 7 8 8 9 11 11 11 11
set.seed(32323)
time <- 1:1000
folds <- createTimeSlices(y = time,
initialWindow = 20,
horizon = 10)
names(folds)
## [1] "train" "test"
folds$train[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
folds$test[[1]]
## [1] 21 22 23 24 25 26 27 28 29 30
library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
p = 0.75,
list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
modelFit <- train(type ~. ,
data = training,
method = 'glm')
args(train)
## function (x, ...)
## NULL
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("[d_]cv$", method), 1, NA),
## p = 0.75, search = "grid", initialWindow = NULL, horizon = 1,
## fixedWindow = TRUE, skip = 0, verboseIter = FALSE, returnData = TRUE,
## returnResamp = "final", savePredictions = FALSE, classProbs = FALSE,
## summaryFunction = defaultSummary, selectionFunction = "best",
## preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5,
## freqCut = 95/5, uniqueCut = 10, cutoff = 0.9), sampling = NULL,
## index = NULL, indexOut = NULL, indexFinal = NULL, timingSamps = 0,
## predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5,
## alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE,
## allowParallel = TRUE)
## NULL
trainControl resamplingboot - bootstrappingboot632 - bootstrapping with adjustmentcv - cross validationreaptedcv - repeated cross validationLOOOCV - leave one out cross validationlibrary(ISLR)
library(ggplot2)
library(caret)
data(Wage)
summary(Wage)
## year age maritl race
## Min. :2003 Min. :18.00 1. Never Married: 648 1. White:2480
## 1st Qu.:2004 1st Qu.:33.75 2. Married :2074 2. Black: 293
## Median :2006 Median :42.00 3. Widowed : 19 3. Asian: 190
## Mean :2006 Mean :42.41 4. Divorced : 204 4. Other: 37
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## education region jobclass
## 1. < HS Grad :268 2. Middle Atlantic :3000 1. Industrial :1544
## 2. HS Grad :971 1. New England : 0 2. Information:1456
## 3. Some College :650 3. East North Central: 0
## 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## health health_ins logwage wage
## 1. <=Good : 858 1. Yes:2083 Min. :3.000 Min. : 20.09
## 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447 1st Qu.: 85.38
## Median :4.653 Median :104.92
## Mean :4.654 Mean :111.70
## 3rd Qu.:4.857 3rd Qu.:128.68
## Max. :5.763 Max. :318.34
##
dim(Wage)
## [1] 3000 11
inTrain <- createDataPartition(y = Wage$wage,
p = 0.7,
list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training)
## [1] 2102 11
dim(testing)
## [1] 898 11
featurePlot(x = training[,c('age', 'education', 'jobclass')],
y = training$wage,
plot = 'pairs')
library(ggplot2)
qplot(age, wage, data = training)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
plot(training$age, training$wage)
library(ggplot2)
qplot(age, wage,
colour = jobclass,
data = training)
q <- qplot(age, wage,
colour = education,
data = training)
q + geom_smooth(method = 'lm',
formula = y ~ x)
cut2)library(Hmisc)
cutWage <- cut2(training$wage,
g = 3)
table(cutWage)
## cutWage
## [ 20.1, 91.7) [ 91.7,118.9) [118.9,318.3]
## 701 728 673
c(min(training$wage), quantile(training$wage, probs = c(1/3, 2/3)), max(training$wage))
## 33.33333% 66.66667%
## 20.08554 91.70003 118.88436 318.34243
cutWage2 <- cut(training$wage,
breaks = c(min(training$wage), quantile(training$wage, probs = c(1/3, 2/3)), max(training$wage)), right = FALSE)
table(cutWage2)
## cutWage2
## [20.1,91.7) [91.7,119) [119,318)
## 701 641 759
g1 <- qplot(cutWage, age,
data = training,
fill = cutWage,
geom = 'boxplot')
g1
library(gridExtra)
library(grid)
g2 <- qplot(cutWage, age,
data = training,
fill = cutWage,
geom = c('boxplot','jitter'))
grid.arrange(g1, g2, ncol = 2)
t1 <- table(cutWage, training$jobclass)
t1
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 91.7) 442 259
## [ 91.7,118.9) 370 358
## [118.9,318.3] 254 419
prop.table(t1,1)
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 91.7) 0.6305278 0.3694722
## [ 91.7,118.9) 0.5082418 0.4917582
## [118.9,318.3] 0.3774146 0.6225854
qplot(wage, color = education, data = training, geom = 'density')
library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
p = 0.75,
list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
hist(training$capitalAve,
main = '',
xlab = 'ave. capital run length')
mean(training$capitalAve)
## [1] 5.452216
sd(training$capitalAve)
## [1] 32.04734
trainCapAve <- training$capitalAve
trainCapAveSt <- (trainCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(trainCapAveSt)
## [1] 5.80977e-16
sd(trainCapAveSt)
## [1] 1
testCapAve <- testing$capitalAve
testCapAveSt <- (testCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(testCapAveSt)
## [1] -0.03254657
sd(testCapAveSt)
## [1] 0.9597104
preProcess functionpreObj <- preProcess(training[,-58],
method = c('center', 'scale'))
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
mean(trainCapAveS)
## [1] 5.80977e-16
sd(trainCapAveS)
## [1] 1
testCapAveS <- predict(preObj, testing[,-58])$capitalAve
mean(testCapAveS)
## [1] -0.03254657
sd(testCapAveS)
## [1] 0.9597104
preProcess argumentset.seed(32343)
modelFit <- train(type ~ .,
data = training,
preProcess=c('center','scale'),
method = 'glm')
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9147189 0.8208559
preObj <- preProcess(training[,-58],
method = c("BoxCox"))
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
par(mfrow = c(1,2))
hist(trainCapAveS)
qqnorm(trainCapAveS)
set.seed(13343)
# Make some values NA
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1],
size = 1,
prob = 0.05)==1
training$capAve[selectNA] <- NA
# Impute and standardize
preObj <- preProcess(training[,-58], method = 'knnImpute')
capAve <- predict(preObj, training[,-58])$capAve
# Standardize
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth))/sd(capAveTruth)
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -1.512283460 0.001751426 0.002148402 0.002328065 0.708605908
quantile((capAve - capAveTruth)[selectNA])
## 0% 25% 50% 75% 100%
## -1.512283460 -0.006134518 0.007921762 0.027510466 0.708605908
quantile((capAve - capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## -0.279640397 0.001761607 0.002140206 0.002313019 0.002468544
Covariates (predictors/features) are the variables included in the model used (combined) to predict the outcome of interest.
Two levels of covariate (predictor/feature)
library(ISLR)
library(caret)
data(Wage)
inTraining <- createDataPartition(y = Wage$wage,
p = 0.7,
list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
Basic idea: convert factor variables to indicator variables
table(training$jobclass)
##
## 1. Industrial 2. Information
## 1151 1106
dummies <- dummyVars(wage ~ jobclass,
data = training)
head(predict(dummies, newdata = training))
## jobclass.1. Industrial jobclass.2. Information
## 231655 1 0
## 86582 0 1
## 161300 1 0
## 155159 0 1
## 11443 0 1
## 450601 1 0
nsv <- nearZeroVar(training, saveMetrics = TRUE)
nsv
## freqRatio percentUnique zeroVar nzv
## year 1.037736 0.20283976 FALSE FALSE
## age 1.129870 1.73862649 FALSE FALSE
## maritl 3.246888 0.14488554 FALSE FALSE
## race 8.440909 0.11590843 FALSE FALSE
## education 1.457594 0.14488554 FALSE FALSE
## region 0.000000 0.02897711 TRUE TRUE
## jobclass 1.040687 0.05795422 FALSE FALSE
## health 2.466974 0.05795422 FALSE FALSE
## health_ins 2.233524 0.05795422 FALSE FALSE
## logwage 1.033708 12.40220226 FALSE FALSE
## wage 1.033708 12.40220226 FALSE FALSE
library(splines)
bsBasis <- bs(training$age,
df = 3)
head(bsBasis)
## 1 2 3
## [1,] 0.0000000 0.00000000 0.000000000
## [2,] 0.2368501 0.02537679 0.000906314
## [3,] 0.4163380 0.32117502 0.082587862
## [4,] 0.4308138 0.29109043 0.065560908
## [5,] 0.3625256 0.38669397 0.137491189
## [6,] 0.4241549 0.30633413 0.073747105
lm1 <- lm(wage ~ bsBasis,
data = training)
plot(training$age, training$wage, pch = 19, cex = 0.5)
points(training$age, predict(lm1, newdata = training), col = 'red', pch = 19, cex = 0.5)
head(predict(bsBasis, age = testing$age))
## 1 2 3
## [1,] 0.0000000 0.00000000 0.000000000
## [2,] 0.2368501 0.02537679 0.000906314
## [3,] 0.4163380 0.32117502 0.082587862
## [4,] 0.4308138 0.29109043 0.065560908
## [5,] 0.3625256 0.38669397 0.137491189
## [6,] 0.4241549 0.30633413 0.073747105
preProcess in caret will
handle some preprocessinginTrain <- createDataPartition(y = spam$type,
p = 0.8,
list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
M <- abs(cor(training[,-58]))
diag(M) <- 0
which(M > 0.8, arr.ind = T)
## row col
## num857 32 31
## num415 34 31
## telnet 31 32
## num415 34 32
## direct 40 32
## telnet 31 34
## num857 32 34
## direct 40 34
## num857 32 40
## num415 34 40
names(spam)[c(31,32,34,40)]
## [1] "telnet" "num857" "num415" "direct"
plot(spam[,31], spam[,32],
xlab = names(spam)[31],
ylab = names(spam)[32])
par(mar=c(3.8, 3.8, 0.5, 0.7),
mfrow = c(2,2))
plot(spam[,31], spam[,32],
xlab = names(spam)[31],
ylab = names(spam)[32])
plot(spam[,32], spam[,34],
xlab = names(spam)[32],
ylab = names(spam)[34])
plot(spam[,34], spam[,40],
xlab = names(spam)[34],
ylab = names(spam)[40])
plot(spam[,32], spam[,40],
xlab = names(spam)[32],
ylab = names(spam)[40])
\[ X = 0.71 \times \text{num415} + 0.71 \times \text{num857} \\ Y = 0.71 \times \text{num415} - 0.71 \times \text{num857} \]
cor(training$num415,training$num857)
## [1] 0.9955794
X <- 0.71 * training$num415 + 0.71 * training$num857
Y <- 0.71 * training$num415 - 0.71 * training$num857
plot(X,Y)
prcompsmallSpam <- spam[,c(34,32)]
prComp <- prcomp(smallSpam)
plot(prComp$x[,1],prComp$x[,2],)
prComp$rotation
## PC1 PC2
## num415 0.7080625 0.7061498
## num857 0.7061498 -0.7080625
typeColor <- ((spam$type == 'spam') * 1 + 1)
prComp <- prcomp(log10(spam[,-58]+1))
plot(prComp$x[,1], prComp$x[,2],
col = typeColor,
xlab = 'PC1', ylab = 'PC2')
preProc <- preProcess(log10(spam[,-58]+1),
method = 'pca',
pcaComp = 2)
spamPCA <- predict(preProc, log10(spam[,-58]+1))
plot(spamPCA[,1],spamPCA[,2],
col = typeColor)
preProc <- preProcess(log10(training[,-58]+1),
method = 'pca',
pcaComp = 2)
trainPC <- predict(preProc,
log10(training[,-58]+1))
trainPC <- cbind(training$type, trainPC)
colnames(trainPC) <- c('type',"PC1", "PC2")
modelFit <- train(type ~ .,
method = 'glm',
data = trainPC)
testPC <- predict(preProc,
log10(testing[,-58]+1))
confusionMatrix(testing$type,
predict(modelFit, testPC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 526 31
## spam 53 309
##
## Accuracy : 0.9086
## 95% CI : (0.8881, 0.9264)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8065
##
## Mcnemar's Test P-Value : 0.02195
##
## Sensitivity : 0.9085
## Specificity : 0.9088
## Pos Pred Value : 0.9443
## Neg Pred Value : 0.8536
## Prevalence : 0.6300
## Detection Rate : 0.5724
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9086
##
## 'Positive' Class : nonspam
##
modelFit <- train(type ~ .,
method = 'glm',
preProcess = 'pca',
data = training)
confusionMatrix(testing$type, predict(modelFit, testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 532 25
## spam 36 326
##
## Accuracy : 0.9336
## 95% CI : (0.9155, 0.9489)
## No Information Rate : 0.6181
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8602
##
## Mcnemar's Test P-Value : 0.2004
##
## Sensitivity : 0.9366
## Specificity : 0.9288
## Pos Pred Value : 0.9551
## Neg Pred Value : 0.9006
## Prevalence : 0.6181
## Detection Rate : 0.5789
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9327
##
## 'Positive' Class : nonspam
##
library(caret)
data(faithful)
set.seed(333)
inTrain <- createDataPartition(y = faithful$waiting,
p = 0.5,
list = FALSE)
trainFaith <- faithful[inTrain,]
testFaith <- faithful[-inTrain,]
head(trainFaith)
## eruptions waiting
## 3 3.333 74
## 6 2.883 55
## 7 4.700 88
## 8 3.600 85
## 9 1.950 51
## 11 1.833 54
plot(trainFaith$waiting,
trainFaith$eruptions,
pch = 19, col = 'blue',
xlab = 'Waiting', ylab = 'Duration')
\[ \text{ED}_{i} = \beta_0 + \beta_1 \text{WT}_{i} + \epsilon_i \]
lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.13375 -0.36778 0.06064 0.36578 0.96057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.648629 0.226603 -7.275 2.55e-11 ***
## waiting 0.072211 0.003136 23.026 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4941 on 135 degrees of freedom
## Multiple R-squared: 0.7971, Adjusted R-squared: 0.7956
## F-statistic: 530.2 on 1 and 135 DF, p-value: < 2.2e-16
par(mfrow = c(2,2), mar=c(3.8, 3.8, 1.4, 1.4))
plot(trainFaith$waiting,
trainFaith$eruptions,
pch = 19, col = 'lightblue',
xlab = 'Waiting', ylab = 'Duration')
lines(trainFaith$waiting, lm1$fitted, lwd = 3)
plot(lm1, pch = 19, col = 'orange', which = 1)
plot(lm1, pch = 19, col = 'slateblue', which = 2)
plot(testFaith$waiting,
testFaith$eruptions,
pch = 19, col = 'salmon',
xlab = 'Waiting', ylab = 'Duration')
lines(testFaith$waiting, predict(lm1, testFaith), lwd = 3)
\[ \widehat{\text{ED}} = \widehat{\beta}_0 + \widehat{\beta}_1 \text{WT} \]
coef(lm1)[1] + coef(lm1)[2] * 80
## (Intercept)
## 4.128276
newdata <- data.frame(waiting = 80)
predict(lm1, newdata)
## 1
## 4.128276
sqrt(sum((lm1$fitted - trainFaith$eruptions)**2))
## [1] 5.740844
sqrt(sum((predict(lm1, testFaith) - testFaith$eruptions)**2))
## [1] 5.853745
pred1 <- predict(lm1, newdata = testFaith, interval = 'prediction')
ord <- order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions,
pch = 19, col = 'blue')
matlines(testFaith$waiting[ord],
pred1[ord,], type = 'l', col = c(1,2,2),
lty = c(1,1,1), lwd = 3)
caretmodFit <- train(eruptions ~ waiting,
data = trainFaith,
method = 'lm')
summary(modFit$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.13375 -0.36778 0.06064 0.36578 0.96057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.648629 0.226603 -7.275 2.55e-11 ***
## waiting 0.072211 0.003136 23.026 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4941 on 135 degrees of freedom
## Multiple R-squared: 0.7971, Adjusted R-squared: 0.7956
## F-statistic: 530.2 on 1 and 135 DF, p-value: < 2.2e-16
data(Wage)
Wage <- subset(Wage, select = -c(logwage))
summary(Wage)
## year age maritl race
## Min. :2003 Min. :18.00 1. Never Married: 648 1. White:2480
## 1st Qu.:2004 1st Qu.:33.75 2. Married :2074 2. Black: 293
## Median :2006 Median :42.00 3. Widowed : 19 3. Asian: 190
## Mean :2006 Mean :42.41 4. Divorced : 204 4. Other: 37
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## education region jobclass
## 1. < HS Grad :268 2. Middle Atlantic :3000 1. Industrial :1544
## 2. HS Grad :971 1. New England : 0 2. Information:1456
## 3. Some College :650 3. East North Central: 0
## 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## health health_ins wage
## 1. <=Good : 858 1. Yes:2083 Min. : 20.09
## 2. >=Very Good:2142 2. No : 917 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
inTrain <- createDataPartition(y = Wage$wage,
p = 0.7,
list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training)
## [1] 2102 10
dim(testing)
## [1] 898 10
featurePlot(x = training[,c('age','education','jobclass')],
y = training$wage,
plot = 'pairs')
qplot(age, wage, colour = jobclass, data=training)
qplot(age, wage, colour = education, data=training)
\[ \text{ED}_i + \beta_0 + \beta_1 \text{age} + \beta_2 I \left( \text{jobclass}_i = \text{information} \right) + \sum_{k = 1}^{4} \gamma_k I \left( \text{education}_i = \text{level}_k \right) \]
fitM <- train(wage ~ age + jobclass + education,
method = 'lm', data=training)
fitMFinal <- fitM$finalModel
print(fitM)
## Linear Regression
##
## 2102 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 35.56759 0.2589245 24.87554
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
plot(fitMFinal,1,pch = 19, cex = 0.5, col = 'gray')
plot(fitMFinal,1,pch = 19, cex = 0.5, col = 'gray')
qplot(fitMFinal$fitted, fitMFinal$residuals,
colour = race, data = training)
plot(fitMFinal$residuals, pch = 19)
pred <- predict(fitM, testing)
qplot(wage, pred, color = year, data = testing)
fitAll <- train(wage ~ .,
data = training,
method = 'lm')
pred <- predict(fitAll, testing)
qplot(wage, pred, data = testing)