Load example data
# Attach the packages used below and load the Wage data set.
library(ISLR)
library(caret)
data(Wage)

# Pick the row indices for a 70% training partition keyed on wage;
# every row not selected goes to the test set.
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]
Common covariates to add: dummy variables
Basic idea - convert factor variables to indicator variables
# Count how many training rows fall in each jobclass level.
table(training[["jobclass"]])
##
## 1. Industrial 2. Information
## 1050 1052
# Build a dummy-variable encoder for jobclass from the training data,
# then inspect the structure of the resulting object.
dummies <- dummyVars(wage ~ jobclass, data = training)
str(dummies)
## List of 9
## $ call : language dummyVars.default(formula = wage ~ jobclass, data = training)
## $ form :Class 'formula' language wage ~ jobclass
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ vars : chr [1:2] "wage" "jobclass"
## $ facVars : chr "jobclass"
## $ lvls :List of 1
## ..$ jobclass: chr [1:2] "1. Industrial" "2. Information"
## $ sep : chr "."
## $ terms :Classes 'terms', 'formula' language wage ~ jobclass
## .. ..- attr(*, "variables")= language list(wage, jobclass)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "wage" "jobclass"
## .. .. .. ..$ : chr "jobclass"
## .. ..- attr(*, "term.labels")= chr "jobclass"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(wage, jobclass)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
## .. .. ..- attr(*, "names")= chr [1:2] "wage" "jobclass"
## $ levelsOnly: logi FALSE
## $ fullRank : logi FALSE
## - attr(*, "class")= chr "dummyVars"
# Apply the encoder to the training data and show the first rows of the
# resulting 0/1 indicator columns (one column per jobclass level).
head(predict(dummies, newdata = training))
## jobclass.1. Industrial jobclass.2. Information
## 86582 0 1
## 155159 0 1
## 11443 0 1
## 376662 0 1
## 377954 0 1
## 228963 0 1
Removing near-zero-variance covariates
# Compute the near-zero-variance diagnostics for every training column.
# saveMetrics = TRUE returns the per-column metrics table rather than
# only the positions of the flagged columns.
nsv <- nearZeroVar(training, saveMetrics = TRUE)
print(nsv)
## freqRatio percentUnique zeroVar nzv
## year 1.055394 0.33301618 FALSE FALSE
## age 1.055556 2.90199810 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.173246 0.23786870 FALSE FALSE
## race 8.685000 0.19029496 FALSE FALSE
## education 1.389691 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.001905 0.09514748 FALSE FALSE
## health 2.451560 0.09514748 FALSE FALSE
## health_ins 2.233846 0.09514748 FALSE FALSE
## logwage 1.135802 19.60038059 FALSE FALSE
## wage 1.135802 19.60038059 FALSE FALSE
Spline basis
library(splines)

# Build a B-spline basis over the training ages.
# df is the degrees of freedom of the basis, i.e. the number of basis
# columns produced (three here). With bs()'s default cubic degree and
# df = 3 there are no interior knots, so the basis spans a cubic polynomial.
bsBasis <- bs(training$age, df = 3)

# Show the first ten rows of the basis matrix.
bsBasis[seq_len(10), ]
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4308138 0.29109043 0.065560908
## [3,] 0.3625256 0.38669397 0.137491189
## [4,] 0.3063341 0.42415495 0.195763821
## [5,] 0.3776308 0.09063140 0.007250512
## [6,] 0.4403553 0.25969672 0.051051492
## [7,] 0.3355376 0.40743849 0.164915579
## [8,] 0.4163380 0.32117502 0.082587862
## [9,] 0.4333314 0.16370296 0.020614447
## [10,] 0.4443582 0.22759810 0.038858212
Fitting curves with splines
# Regress wage on the three spline basis columns.
# bsBasis is a standalone matrix (it is not a column of training), so the
# model picks it up from the workspace rather than from `data`.
lm1 <- lm(wage ~ bsBasis, data = training)

# Scatter the raw training data, then overlay the fitted curve in red.
plot(training$age, training$wage, pch = 19, cex = 0.5)
points(
  training$age,
  predict(lm1, newdata = training),
  col = "red",
  pch = 19,
  cex = 0.5
)

Splines on the test set
# Evaluate the spline basis that was fitted on the training ages at the
# test-set ages, so the test covariates use the same knots and boundaries.
# predict() for a bs object receives the new values through `newx`. The
# previous `age = ...` argument matched no parameter and was silently
# ignored, so the call simply returned the training basis unchanged.
# NOTE(review): any recorded output shown after this call was produced by the
# old call (it equals the training basis) and should be regenerated.
predt <- predict(bsBasis, newx = testing$age)

# Show the first ten rows of the test-set basis matrix.
predt[1:10, ]
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4308138 0.29109043 0.065560908
## [3,] 0.3625256 0.38669397 0.137491189
## [4,] 0.3063341 0.42415495 0.195763821
## [5,] 0.3776308 0.09063140 0.007250512
## [6,] 0.4403553 0.25969672 0.051051492
## [7,] 0.3355376 0.40743849 0.164915579
## [8,] 0.4163380 0.32117502 0.082587862
## [9,] 0.4333314 0.16370296 0.020614447
## [10,] 0.4443582 0.22759810 0.038858212