Load example data

# Attach the packages used throughout and load the Wage data set.
library(ISLR)
library(caret)
data(Wage)

# Pick roughly 70% of the rows for training, sampling on the wage column;
# list = FALSE asks for the indices as a matrix rather than a list.
# NOTE(review): no seed is set in the visible code, so the split (and every
# printed result that depends on it) varies from run to run.
inTrain <- createDataPartition(
  y = Wage$wage,
  p = 0.7,
  list = FALSE
)

# Rows selected by inTrain form the training set; all remaining rows are
# held out as the test set.
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]

Common covariates to add, dummy variables

Basic idea - convert factor variables to indicator variables

# Tabulate how many training observations fall into each job class.
table(training[["jobclass"]])

## 
##  1. Industrial 2. Information 
##           1050           1052

# Create the dummy-variable encoder for the jobclass factor from the
# training data, then inspect the structure of the returned object.
dummies <- dummyVars(
  wage ~ jobclass,
  data = training
)
str(dummies)

## List of 9
##  $ call      : language dummyVars.default(formula = wage ~ jobclass, data = training)
##  $ form      :Class 'formula'  language wage ~ jobclass
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ vars      : chr [1:2] "wage" "jobclass"
##  $ facVars   : chr "jobclass"
##  $ lvls      :List of 1
##   ..$ jobclass: chr [1:2] "1. Industrial" "2. Information"
##  $ sep       : chr "."
##  $ terms     :Classes 'terms', 'formula'  language wage ~ jobclass
##   .. ..- attr(*, "variables")= language list(wage, jobclass)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "wage" "jobclass"
##   .. .. .. ..$ : chr "jobclass"
##   .. ..- attr(*, "term.labels")= chr "jobclass"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(wage, jobclass)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:2] "wage" "jobclass"
##  $ levelsOnly: logi FALSE
##  $ fullRank  : logi FALSE
##  - attr(*, "class")= chr "dummyVars"

# Run the encoder over the training rows and display the first six rows of
# the indicator matrix (one 0/1 column per jobclass level).
head(predict(dummies, newdata = training), n = 6L)

##        jobclass.1. Industrial jobclass.2. Information
## 86582                       0                       1
## 155159                      0                       1
## 11443                       0                       1
## 376662                      0                       1
## 377954                      0                       1
## 228963                      0                       1

Removing zero- and near-zero-variance covariates

# Compute variability diagnostics for every column of the training data.
# saveMetrics = TRUE returns the per-column table (freqRatio, percentUnique,
# zeroVar, nzv) that is printed next.
nsv <- nearZeroVar(
  training,
  saveMetrics = TRUE
)
print(nsv)

##            freqRatio percentUnique zeroVar   nzv
## year        1.055394    0.33301618   FALSE FALSE
## age         1.055556    2.90199810   FALSE FALSE
## sex         0.000000    0.04757374    TRUE  TRUE
## maritl      3.173246    0.23786870   FALSE FALSE
## race        8.685000    0.19029496   FALSE FALSE
## education   1.389691    0.23786870   FALSE FALSE
## region      0.000000    0.04757374    TRUE  TRUE
## jobclass    1.001905    0.09514748   FALSE FALSE
## health      2.451560    0.09514748   FALSE FALSE
## health_ins  2.233846    0.09514748   FALSE FALSE
## logwage     1.135802   19.60038059   FALSE FALSE
## wage        1.135802   19.60038059   FALSE FALSE

Spline basis

library(splines)

# Build a B-spline basis for age on the training set.
# `degree` is the polynomial order of the spline pieces (3 = cubic, which is
# also the default, spelled out here for clarity). `df` is the number of basis
# columns, not the polynomial order: with degree = 3 and no intercept,
# df = 3 leaves no interior knots, so this is a single cubic polynomial basis.
bsBasis <- bs(training$age, df = 3, degree = 3)

# Show the first ten rows of the basis matrix (one column per basis function).
bsBasis[1:10, ]

##               1          2           3
##  [1,] 0.2368501 0.02537679 0.000906314
##  [2,] 0.4308138 0.29109043 0.065560908
##  [3,] 0.3625256 0.38669397 0.137491189
##  [4,] 0.3063341 0.42415495 0.195763821
##  [5,] 0.3776308 0.09063140 0.007250512
##  [6,] 0.4403553 0.25969672 0.051051492
##  [7,] 0.3355376 0.40743849 0.164915579
##  [8,] 0.4163380 0.32117502 0.082587862
##  [9,] 0.4333314 0.16370296 0.020614447
## [10,] 0.4443582 0.22759810 0.038858212

Fitting curves with splines

# Fit a linear model of wage on the spline basis columns.
lm1 <- lm(wage ~ bsBasis, data = training)

# Scatter plot of the raw training data (small filled dots) ...
plot(
  x = training$age,
  y = training$wage,
  pch = 19,
  cex = 0.5
)

# ... with the model's predictions for the same rows overlaid in red.
points(
  x = training$age,
  y = predict(lm1, newdata = training),
  col = "red",
  pch = 19,
  cex = 0.5
)

Splines on the test set

# Evaluate the spline basis that was built on the training ages at the
# test-set ages, so the test covariates use the same knots and boundaries.
# predict() on a bs object receives the new values through `newx`; any other
# argument name (such as `age`) is absorbed by `...`, and the training basis
# is returned unchanged.
# NOTE(review): the output printed below was produced by the earlier
# `age = ` call (it repeats the training-set rows) -- re-run to refresh it.
predt <- predict(bsBasis, newx = testing$age)

# Show the first ten rows of the test-set basis matrix.
predt[1:10, ]

##               1          2           3
##  [1,] 0.2368501 0.02537679 0.000906314
##  [2,] 0.4308138 0.29109043 0.065560908
##  [3,] 0.3625256 0.38669397 0.137491189
##  [4,] 0.3063341 0.42415495 0.195763821
##  [5,] 0.3776308 0.09063140 0.007250512
##  [6,] 0.4403553 0.25969672 0.051051492
##  [7,] 0.3355376 0.40743849 0.164915579
##  [8,] 0.4163380 0.32117502 0.082587862
##  [9,] 0.4333314 0.16370296 0.020614447
## [10,] 0.4443582 0.22759810 0.038858212

Covariate Creation

Duc Nguyen

Load example data

Common covariates to add, dummy variables

Basic idea - convert factor variables to indicator variables

Removing zero- and near-zero-variance covariates

Spline basis

Fitting curves with splines

Splines on the test set