In this lesson, students will learn how to implement…
Citation: Examples for this lesson come from https://bookdown.org/tpinto_home/Beyond-Additivity/
bmd.data<-read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/bmd.csv")
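After loading, it can help to glance at the data before modeling. A quick optional check (not part of the original lesson):

## INSPECT THE STRUCTURE AND FIRST ROWS
str(bmd.data)
head(bmd.data)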
The bone mineral density dataset contains 169 records with the following variables:

- `id`: patient’s number
- `age`: patient’s age
- `sex`: patient’s sex (female / male)
- `fracture`: hip fracture (fracture / no fracture)
- `weight_kg`: weight measured in kg
- `height_cm`: height measured in cm
- `waiting_time`: time the patient had to wait for the densitometry (in minutes)
- `bmd`: bone mineral density measured in the hip

Body mass index (BMI) is a common metric used as a medical screening tool that measures a ratio between weight and height (squared).
The `bmd` data are in metric units, kilograms (kg) and centimeters (cm), for weight and height respectively. Therefore, we must use the following equation to calculate BMI, first converting height from centimeters to meters:

\[BMI = \frac{weight\ (kg)}{[height\ (m)]^2}\]

Create a new variable for `bmi` in this dataframe.
bmd.data$bmi <- bmd.data$weight_kg / (bmd.data$height_cm/100)^2
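As an optional sanity check (not in the original lesson), the new variable should land in a plausible adult BMI range, roughly 15 to 40:

## CHECK THE RANGE OF THE NEW VARIABLE
summary(bmd.data$bmi)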
Make a pairs plot using `GGally` to explore relationships between `bmd`, `age`, `sex`, and `bmi`. The color aesthetic should be mapped to `sex`.
library(tidyverse)
library(GGally)

ggpairs(bmd.data, columns = c("bmd", "age", "sex", "bmi"),
        mapping = ggplot2::aes(color = sex))
Let’s dive further into the relationship between `bmd` and `age`. Make a scatterplot to show this relationship.
ggplot(bmd.data, aes(x=age, y=bmd))+
geom_point()
Does the relationship between `bmd` and `age` appear to be linear?

## FIT A MODEL
lmMod <- lm(bmd ~ age,
data = bmd.data)
## SUMMARY
summary(lmMod)
##
## Call:
## lm(formula = bmd ~ age, data = bmd.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34435 -0.10913 -0.01152 0.10327 0.60449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0460037 0.0643356 16.259 < 2e-16 ***
## age -0.0041316 0.0009926 -4.162 5.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.159 on 167 degrees of freedom
## Multiple R-squared: 0.09399, Adjusted R-squared: 0.08856
## F-statistic: 17.32 on 1 and 167 DF, p-value: 5.036e-05
## DIAGNOSTICS
plot(lmMod)
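`plot(lmMod)` steps through four diagnostic plots one at a time. A common convenience, sketched here rather than taken from the lesson, is to arrange all four in a single 2x2 grid:

## SHOW ALL FOUR DIAGNOSTIC PLOTS AT ONCE
par(mfrow = c(2, 2))
plot(lmMod)
par(mfrow = c(1, 1)) # reset the plotting layout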
Use `geom_smooth()` to fit a loess model to these data.
ggplot(bmd.data, aes(x=age, y=bmd))+
geom_point()+
geom_smooth(se=FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
It appears that these data are not linear. So now what?!
Decision trees can be built for continuous or discrete outcomes and are sometimes referred to as CART (Classification And Regression Trees).
Trees create a set of binary decision rules to partition the feature space into \(J\) distinct regions.
Optimal partitions are found by minimizing the loss function

\[\sum_{j=1}^{J} \sum_{i \in R_j} (y_i - \bar{y}_{R_j})^2\]

where \(\bar{y}_{R_j}\) is the mean of the response within region \(R_j\).
How should we find the optimal partition?
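To make the loss concrete, here is a minimal helper that evaluates this sum for a single candidate split of `age` at point `s`. This is a sketch; `splitRSS` is a name introduced here, not part of the lesson, and the grid search below applies the same idea across a whole sequence of ages.

## RSS LOSS FOR A SINGLE CANDIDATE SPLIT POINT
splitRSS <- function(s, x = bmd.data$age, y = bmd.data$bmd){
  left  <- y[x < s]   # observations below the split
  right <- y[x >= s]  # observations at or above the split
  sum((left - mean(left))^2) + sum((right - mean(right))^2)
}

splitRSS(70) # loss for a candidate split at age 70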
### MAKE A SEQUENCE TO PARTITION
ageSeq<-seq(36, 88, by=1)
length(ageSeq)
## [1] 53
### CALCULATE THE MEANS FOR THE REGIONS
low<-c()
up<-c()
for(i in 1:length(ageSeq)){
lowerMean<-mean(bmd.data$bmd[bmd.data$age<ageSeq[i]])
upperMean<-mean(bmd.data$bmd[bmd.data$age>=ageSeq[i]])
low<-c(low, lowerMean)
up<-c(up, upperMean)
}
### MAKE DATA FRAMES
minDF<-data.frame(x = rep(min(bmd.data$age), length(ageSeq)),
y = low,
group=1:length(ageSeq),
type=rep("lower", length(ageSeq)))
mid1DF<-data.frame(x = ageSeq,
y = low,
group=1:length(ageSeq),
type=rep("lower", length(ageSeq)))
mid2DF<-data.frame(x = ageSeq+0.1,
y = up,
group=1:length(ageSeq),
type=rep("upper", length(ageSeq)))
maxDF<-data.frame(x = rep(max(bmd.data$age), length(ageSeq)),
y = up,
group=1:length(ageSeq),
type=rep("upper", length(ageSeq)))
meanDF<-minDF%>%
rbind(mid1DF)%>%
rbind(mid2DF)%>%
rbind(maxDF)
### USE PLOTLY
#install.packages("plotly")
library(plotly)
p<-ggplot()+
geom_line(data=meanDF, aes(x=x, y=y,color=type, frame=group),
lwd=2)+
geom_point(data=bmd.data, aes(x=age, y=bmd))
### WOW! COOL!
ggplotly(p)
Calculate the residual sum of squares (RSS) for each candidate split:
RSS<-c()
for(i in 1:length(ageSeq)){
lowerRSS<-sum((bmd.data$bmd[bmd.data$age<ageSeq[i]]-mean(bmd.data$bmd[bmd.data$age<ageSeq[i]]))^2)
upperRSS<-sum((bmd.data$bmd[bmd.data$age>=ageSeq[i]]-mean(bmd.data$bmd[bmd.data$age>=ageSeq[i]]))^2)
RSS<-c(RSS, lowerRSS+upperRSS)
}
rssDF<-data.frame(ageSeq, RSS)
ggplot(rssDF, aes(x=ageSeq, y=RSS))+
geom_point()+
geom_line()
What is the best location for a partition?
## MIN RSS
min(RSS)
## [1] 4.063506
## WHICH INDEX
which.min(RSS)
## [1] 35
## WHICH CUT OFF
ageSeq[which.min(RSS)]
## [1] 70
### SHOW THE PARTITION
meanDF%>%
filter(group==35)%>%
ggplot()+
geom_line(aes(x=x, y=y,color=type),
lwd=2)+
geom_point(data=bmd.data, aes(x=age, y=bmd))
This is how trees work!
#install.packages("rpart")
#install.packages("rpart.plot")
library(rpart) #library for CART
library(rpart.plot)
### Read about the defaults
?rpart.control
defaultTree <- rpart(bmd ~ age,
data = bmd.data,
method = "anova")
rpart.plot(defaultTree)
fullTree <- rpart(bmd ~ age,
                  data = bmd.data,
                  method = "anova", # indicates the outcome is continuous
                  control = rpart.control(
                    minsplit = 1,  # min number of observations for a split
                    minbucket = 1, # min number of observations in terminal nodes
                    cp = 0)        # min decrease in complexity for a split; 0 grows the full tree
)
rpart.plot(fullTree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
plotcp(fullTree)
#printcp(fullTree)
## MIN ERROR
which.min(fullTree$cptable[,"xerror"])
## 2
## 2
## WHICH CP
fullTree$cptable[which.min(fullTree$cptable[,"xerror"]),"CP"]
## [1] 0.02285745
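Rather than hard-coding this value, we could store the selected cp and prune with it directly. A small sketch (`bestCP` and `bestPrune` are names introduced here):

## PRUNE WITH THE CP THAT MINIMIZES CROSS-VALIDATED ERROR
bestCP <- fullTree$cptable[which.min(fullTree$cptable[, "xerror"]), "CP"]
bestPrune <- prune(fullTree, cp = bestCP)

The lesson proceeds with the rounded value, cp = 0.02.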
pruneTree <- prune(fullTree, cp=0.02) #prune the tree with cp=0.02
printcp(pruneTree)
##
## Regression tree:
## rpart(formula = bmd ~ age, data = bmd.data, method = "anova",
## control = rpart.control(minsplit = 1, minbucket = 1, cp = 0))
##
## Variables actually used in tree construction:
## [1] age
##
## Root node error: 4.659/169 = 0.027568
##
## n= 169
##
## CP nsplit rel error xerror xstd
## 1 0.150111 0 1.00000 1.01969 0.11357
## 2 0.022857 1 0.84989 0.92695 0.11965
## 3 0.021625 2 0.82703 1.19597 0.16941
## 4 0.020621 8 0.69728 1.22676 0.17072
## 5 0.020000 11 0.62374 1.30664 0.18115
rpart.plot(pruneTree)
What do the partitions look like?
ggplot( bmd.data, aes(x=age, y=bmd))+
geom_point()+
geom_vline(xintercept=c(71, 84, 85, 45, 54, 53, 63, 69, 70),
color="blue", lty=2)
Predict for individual observations (here, a 70-year-old patient):
predict(pruneTree, #prediction using the tree
newdata = data.frame(age=70))
## 1
## 0.8070857
predict(lmMod, #prediction using the linear model
newdata = data.frame(age=70))
## 1
## 0.7567922
Predict over a sequence of ages to trace the tree’s step function:
pred.tree <- predict(pruneTree,
newdata = data.frame(age=seq(40,90,1)))
treePred<-data.frame(age=seq(40,90,1),
tree=pred.tree)
ggplot( bmd.data, aes(x=age, y=bmd))+
geom_point()+
geom_vline(xintercept=c(71, 84, 85, 45, 54, 53, 63, 69, 70),
color="blue", lty=2)+
geom_line(data=treePred,aes(x=age, y=tree),
color="red", lwd=2)+
theme_bw()
Compare LM to Tree:
pred.lm <-predict(lmMod,
newdata = data.frame(age=seq(40,90,1)))
treeDF<-data.frame(age=seq(40,90,1),
tree=pred.tree,
lm=pred.lm)%>%
gather(type, y, -age)
ggplot(treeDF, aes(x=age, y=y, color=type))+
geom_point()+
geom_line()
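Beyond the visual comparison, a quick in-sample check is possible (a sketch, not part of the original lesson). Note that in-sample error flatters the more flexible tree, which is exactly why the cross-validated comparison below is the honest one:

## IN-SAMPLE RMSE FOR EACH MODEL
sqrt(mean((bmd.data$bmd - predict(pruneTree, newdata = bmd.data))^2)) # tree
sqrt(mean((bmd.data$bmd - predict(lmMod, newdata = bmd.data))^2))     # linear model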
The `caret` package (short for Classification And REgression Training) is a popular machine learning library that streamlines and standardizes output for fitting machine learning methods and performs cross-validation.
Fit a model for `bmd` with `age`, `sex`, and `bmi`.
#install.packages("caret")
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
caretTree <- train(bmd ~ age + sex + bmi,
data = bmd.data,
method = "rpart",
trControl=trctrl,
tuneGrid = expand.grid(cp=seq(0.001, 0.1, 0.001))
)
#Plot the RMSE versus the CP
plot(caretTree)
#Plot the tree with the optimal CP
rpart.plot(caretTree$finalModel)
caretTree$finalModel$cptable
## CP nsplit rel error
## 1 0.26960125 0 1.0000000
## 2 0.07577601 1 0.7303988
## 3 0.06448418 2 0.6546227
## 4 0.05400352 3 0.5901386
## 5 0.03300000 4 0.5361350
trctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
caretLM<- train(bmd ~ age + sex + bmi,
data = bmd.data,
method = "lm",
trControl=trctrl
)
summary(caretLM)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38207 -0.07669 -0.00654 0.07888 0.51256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6063945 0.0834051 7.270 1.36e-11 ***
## age -0.0041579 0.0008625 -4.821 3.23e-06 ***
## sexM 0.0949602 0.0213314 4.452 1.56e-05 ***
## bmi 0.0155913 0.0024239 6.432 1.30e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.138 on 165 degrees of freedom
## Multiple R-squared: 0.3254, Adjusted R-squared: 0.3131
## F-statistic: 26.53 on 3 and 165 DF, p-value: 4.677e-14
caretLM$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.1385106 0.3302763 0.1044492 0.01718041 0.123744 0.0111556
#extracts the row with the RMSE and R2 from the table of results
#corresponding to the cp with lowest RMSE (best tune)
caretTree$results[caretTree$results$cp==caretTree$bestTune[1,1], ]
## cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 33 0.033 0.1336586 0.3852993 0.1034391 0.02688769 0.1822124 0.01969716
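The tree’s cross-validated RMSE (about 0.134) edges out the linear model’s (about 0.139), though the two fits above used different fold counts. One way to put them on equal footing is caret’s `resamples()` helper, which collects paired resampling metrics and requires both models to share the same resampling scheme. A sketch, refitting the linear model under the tree’s 10-fold control (`trctrl10` and `caretLM10` are names introduced here):

## REFIT THE LM UNDER THE SAME 10-FOLD REPEATED CV AS THE TREE
trctrl10 <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
caretLM10 <- train(bmd ~ age + sex + bmi,
                   data = bmd.data,
                   method = "lm",
                   trControl = trctrl10)

## COLLECT AND SUMMARIZE THE PAIRED RESAMPLING RESULTS
summary(resamples(list(lm = caretLM10, tree = caretTree)))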