Exercise 5: Trees & MARS

Punto 1:Caricamento e pulizia dati

Importiamo e “modifichiamo” il dataset TL contenente dati assicurativi

set.seed(77)
data<-read.csv("TL.csv",header=TRUE,row.names = 1)

data$INCOME<-log(data$INCOME)
data$FACE<-log(data$FACE)

data$MARSTAT<-as.factor(data$MARSTAT)
levels(data$MARSTAT)<-c("Not single","Single")

In più dividiamolo già in train e test per le analisi successive

c<-sample(275,75)
test<-data[c,]
train<-data[-c,]

Punto 2: Stima del primo albero

Senza troppe indicazioni stimiamo il primo albero dal train set

library(tree)

t<-tree(train$FACE~.,train)
plot(t) 
text (t , pretty = 0,cex=0.5)

summary(t)

## 
## Regression tree:
## tree(formula = train$FACE ~ ., data = train)
## Variables actually used in tree construction:
## [1] "INCOME"    "EDUCATION" "NUMHH"     "AGE"      
## Number of terminal nodes:  13 
## Residual mean deviance:  1.722 = 322.1 / 187 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.10700 -0.60640  0.09494  0.00000  0.75170  4.14400

Dal summary dell’albero è possibile vedere quali siano le variabili effettivamente usate e il numero di nodi terminali generati; si vede come non tutte le variabili vengano utilizzate (viene fatta una sorta di “scelta”), infatti la covariata MARSTAT non è presente all’interno di nessun nodo interno.

s<-summary(t)
s$used

## [1] INCOME    EDUCATION NUMHH     AGE      
## Levels: <leaf> AGE MARSTAT EDUCATION NUMHH INCOME

s$size

## [1] 13

Per confrontare i risultati ottenuti stimiamo altri modelli com gli stessi dati

mod1<-lm(train$FACE~.,train)
summary(mod1)

## 
## Call:
## lm(formula = train$FACE ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5529 -0.8397  0.1247  0.8379  4.5714 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.8389793  1.1569590   2.454   0.0150 *  
## AGE            0.0009559  0.0097844   0.098   0.9223    
## MARSTATSingle -0.2415010  0.3206639  -0.753   0.4523    
## EDUCATION      0.1956800  0.0438493   4.463 1.37e-05 ***
## NUMHH          0.2250992  0.0885853   2.541   0.0118 *  
## INCOME         0.5119170  0.0962342   5.319 2.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.544 on 194 degrees of freedom
## Multiple R-squared:  0.3276, Adjusted R-squared:  0.3102 
## F-statistic:  18.9 on 5 and 194 DF,  p-value: 2.718e-15

Il modello lineare rispecchia quello che abbiamo visto con il tree: la covariata MARSTAT risutla non significativa come anche AGE che infatti viene utilizzata una sola volta nel tree per un nodo interno.

library(glmnet)

## Caricamento del pacchetto richiesto: Matrix

## Loaded glmnet 4.1-6

set.seed(77)
x<-model.matrix(FACE~.,data = train)
y<-train$FACE

lambda<-cv.glmnet(x,y,alpha=0)
minlambda<-lambda$lambda.min

rmod<-glmnet (x,y, alpha = 1,lambda = minlambda)
predict(rmod , type = "coefficients", s =minlambda)

## 7 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)   5.26633832
## (Intercept)   .         
## AGE           .         
## MARSTATSingle .         
## EDUCATION     0.12154206
## NUMHH         0.09866721
## INCOME        0.42400109

Nel caso di una regressione penalizzante di tipo Lasso, vengono ancora confermate le considerazioni fatte precedentemente.

Punto 3: Potatura albero

par(mfrow=c(1,2))
tp<-prune.tree(t)
plot(tp) 

cvt<-cv.tree(t)
min(cvt$dev)

## [1] 521.4501

plot(cvt$dev)

cvt$dev

##  [1] 524.6626 527.1557 534.0402 536.5577 531.7822 525.2594 532.5376 548.3671
##  [9] 548.3671 521.4501 574.3371 704.2635

tp<-prune.tree(t,best=8)
plot(tp)
text (tp , pretty = 0,cex=0.5)
plot(t) 
text (t , pretty = 0,cex=0.5)

s2<-summary(tp)

s$used

## [1] INCOME    EDUCATION NUMHH     AGE      
## Levels: <leaf> AGE MARSTAT EDUCATION NUMHH INCOME

s2$used

## [1] INCOME    EDUCATION AGE       NUMHH    
## Levels: <leaf> AGE MARSTAT EDUCATION NUMHH INCOME

s$size

## [1] 13

s2$size

## [1] 8

Punto 4: Calcolo del MSE e grafico delle significatività delle covariate

Calcoliamo l’MSE degli alberi fino ad avere 12 nodi finali, in modo tale da verificare ulteriormente se l’albero selezionato precedentemente come il migliore ( quello con 8 nodi finali) risulta tale anche per questa quantità.

par(mfrow=c(2,2))
d<-c()
rss<-c()
MSE<-c()
for(n in 1:11){
  tp<-prune.tree(t,best=n+1)
  plot(tp)
  text (tp , pretty = 0,cex=0.5)
  pred<-predict(tp,test)
for(i in 1:75){
 d[i]<-((test$FACE[i]-pred[i])^2)
  }
 rss[n]<-sum(d)
 MSE[n]<-rss[n]/75
}

which.min(MSE)+1#+1 in quanto il vettore parte da 2,non si può fare un albero a singolo nodo

## [1] 8

par(mfrow=c(1,2))

plot(rss,xlab="Tree size")
plot(MSE,xlab="Tree size")

Proviamo a riprodurre il grafico della slide 31 dove si confrontano gli MSE del train e teest set e quello della CV

par(mfrow=c(1,1))

d<-c()
rsstrain<-c()
MSEtrain<-c()
for(n in 1:11){
  tp<-prune.tree(t,best=n+1)
  pred<-predict(tp,train)
  for(i in 1:200){
    d[i]<-((train$FACE[i]-pred[i])^2)
  }
  rsstrain[n]<-sum(d)
  MSEtrain[n]<-rss[n]/200
}

sdMSE<-sd(MSE)
sdMSEt<-sd(MSEtrain)
cv.t<-cv.tree(t, FUN = prune.tree)
MSEcv<-(cv.t$dev[1:11]/275)
sdMSEcv<-sd(MSEcv)
x<-c(1:12)
plot(MSE,ylim=c(0.5,3),xlab="Tree size")
lines(MSE,lwd=2)
arrows(x, MSE-sdMSE, x, MSE+sdMSE, length=0.05, angle=90, code=3)
points(MSEtrain,col="red")
lines(MSEtrain,col="red",lwd=2)
arrows(x, MSEtrain-sdMSEt, x, MSEtrain+sdMSEt, length=0.05, angle=90, code=3,col="red")
points(MSEcv,col="green")
lines(MSEcv,col="green",lwd=2)
arrows(x, MSEcv-sdMSEcv, x, MSEcv+sdMSEcv, length=0.05, angle=90, code=3,col="green")
legend("top",legend=c("Test","CV","Train"), col=c("black","green","red"),box.lty=0,cex=0.6,lwd=2)

Punto 5: Modello MARS

library(earth)

## Caricamento del pacchetto richiesto: Formula

## Caricamento del pacchetto richiesto: plotmo

## Caricamento del pacchetto richiesto: plotrix

## Caricamento del pacchetto richiesto: TeachingDemos

library(vip)

## 
## Caricamento pacchetto: 'vip'

## Il seguente oggetto è mascherato da 'package:utils':
## 
##     vi

library(ggplot2)
Mars<-earth(train$FACE~.,train)
summary(Mars)

## Call: earth(formula=train$FACE~., data=train)
## 
##                   coefficients
## (Intercept)         13.4708094
## h(AGE-30)            2.1001764
## h(AGE-31)           -3.1253671
## h(AGE-34)            0.9536454
## h(55-AGE)           -0.1236887
## h(EDUCATION-13)      0.2878854
## h(NUMHH-2)           0.4307723
## h(NUMHH-5)          -0.6460907
## h(INCOME-10.5966)    0.7393625
## 
## Selected 9 of 15 terms, and 4 of 5 predictors
## Termination condition: Reached nk 21
## Importance: INCOME, EDUCATION, AGE, NUMHH, MARSTATSingle-unused
## Number of terms at each degree of interaction: 1 8 (additive model)
## GCV 2.295721    RSS 384.4069    GRSq 0.3388421    RSq 0.4408849

p<-vip(Mars) + ggtitle("Variable importance")

Mars2<-earth(train$FACE~.,train,degree=2)
summary(Mars2)

## Call: earth(formula=train$FACE~., data=train, degree=2)
## 
##                               coefficients
## (Intercept)                     11.4779209
## h(EDUCATION-13)                  0.5282926
## h(5-NUMHH)                      -0.3071351
## h(INCOME-10.5966)                0.6577009
## h(55-AGE) * h(EDUCATION-13)     -0.0192132
## h(AGE-55) * h(EDUCATION-13)     -0.0306185
## h(43-AGE) * h(INCOME-10.5966)    0.1164980
## 
## Selected 7 of 20 terms, and 4 of 5 predictors
## Termination condition: Reached nk 21
## Importance: INCOME, NUMHH, EDUCATION, AGE, MARSTATSingle-unused
## Number of terms at each degree of interaction: 1 3 3
## GCV 2.351142    RSS 398.0013    GRSq 0.322881    RSq 0.421112

p2<-vip(Mars2) + ggtitle("Variable importance w/degree=2")

p