Students will learn how to use R to generate random variables and become acquainted with the trade-offs inherent in model building.
## using functions built into R
## generating random variables
?rnorm()
## parameters
# mean
mu<-0
# sd
sigma<-1
## arguments
# n
rnorm(n=50, mean=0, sd=1)
## [1] -1.898900576 1.408376258 -0.120659148 -0.481476202 -1.066574410
## [6] -0.233780106 1.172258616 -1.518097982 0.646387204 0.919133520
## [11] 0.629635351 0.101691199 0.843749632 -0.471649244 -2.043506057
## [16] 0.479996696 0.594708032 -1.022418629 -0.671545747 1.604899285
## [21] 0.846667205 1.042462688 -0.486645648 0.454915846 -0.351388294
## [26] 0.373048194 -0.735789975 -0.667169710 -0.315879755 1.044997482
## [31] 1.271434362 -0.001348166 -0.077827324 0.485810048 0.394420296
## [36] -1.353386777 -0.821398263 0.535602045 -2.163637344 -1.552826658
## [41] -0.048124829 -0.036504472 -0.374310098 2.845298192 -1.286169550
## [46] -1.173176923 -0.608060437 0.346164843 0.039372202 -0.412197631
## whoa, that's a lot...
## we should store it as a variable
a<-rnorm(n=100, mean=0, sd=1)
## can we look at this?
hist(a)
## sample statistics
mean(a)
## [1] -0.1364287
sd(a)
## [1] 1.055486
## bias = difference between observed and expected
mean(a)-mu
## [1] -0.1364287
Notice that when we use a random number generator in R, we get different results each time we run the code. If we want our results to be reproducible, we should set a seed.
set.seed(314)
b<-rnorm(n=100, mean=0, sd=1)
head(b)
## [1] -1.2882358 0.7274610 -0.8329249 -0.7036765 0.1272481 -0.3349414
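Resetting the same seed before generating reproduces the exact same draws. As a quick check (reusing seed 314 from above; b2 is just an illustrative name, not part of the original code):
set.seed(314)
b2<-rnorm(n=100, mean=0, sd=1)
## returns TRUE because the generator was reset to the same state
identical(b, b2)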
Broadly, the class of models used to predict relationships with numeric outputs is known as regression models.
library(tidyverse)
this_df<-read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/class1B.csv")
## TRAINING SET
train_dat<-this_df%>%
filter(test==FALSE)
## TESTING SET
test_dat<-this_df%>%
filter(test==TRUE)
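As a quick sanity check on the split (a small addition, not in the original code), we can count how many observations landed in each set:
## observations used for fitting vs. held out for testing
nrow(train_dat)
nrow(test_dat)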
ggplot(train_dat, aes(x, y))+
geom_point(size=3)+
theme_bw()+
coord_fixed()
Sketch a model by hand. What would it look like? What would be your strategy?
ggplot(train_dat, aes(x, y))+
geom_point(size=3)+
geom_smooth(method="lm", se=FALSE)+
theme_bw()+
coord_fixed()
ggplot(train_dat, aes(x, y))+
geom_point(size=3)+
geom_smooth(se=FALSE, span=.2)+
theme_bw()+
coord_fixed()
The fit of a model is determined by the error between what is observed and what the fitted model predicts. We refer to this difference as the residual.
## LINEAR MODEL
lm_mod<-lm(y~x, data=train_dat)
lm_pred<-predict(lm_mod)
## FORMAT DATA FOR PLOT
lm_dat<-train_dat%>%
cbind(lm_pred)%>%
gather(key=y_type, value=ys, -c(x,test))
## PLOT with Residuals
ggplot(data=train_dat, aes(x, y))+
geom_point(size=3)+
geom_smooth(method="lm", se=FALSE)+
geom_line(data=lm_dat, aes(x, ys, group=x))+
theme_bw()+
coord_fixed()
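We can also extract the residuals directly from the fitted object; averaging their squares gives the training MSE (a brief sketch, not part of the original code):
## residuals = observed y minus fitted y
head(residuals(lm_mod))
## training MSE = average squared residual
mean(residuals(lm_mod)^2)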
When creating a model, it is often our goal to use that model to predict future observations. We can use the predict() function in R to get a point estimate for a new observation given a set of inputs.
lm_mod<-lm(y~x, data=train_dat)
lm_pred<-predict(lm_mod, test_dat)
## FORMAT DATA FOR PLOT
lm_dat<-test_dat%>%
cbind(lm_pred)%>%
gather(key=y_type, value=ys, -c(x,test))
ggplot(train_dat, aes(x, y))+
geom_point(size=3)+
geom_smooth(method="lm", se=FALSE)+
geom_point(data=test_dat, aes(x, y), size=3, color="red")+
geom_line(data=lm_dat, aes(x, ys, group=x))+
theme_bw()+
coord_fixed()
## TEST MSE
mean((test_dat$y-lm_pred)^2)
## [1] 1.834881
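Since we will compute the same quantity for the LOESS model below, one option is to wrap the calculation in a small helper (mse() is a hypothetical name, not part of the original code):
## mean squared error between observed and predicted values
mse<-function(observed, predicted){
  mean((observed-predicted)^2)
}
mse(test_dat$y, lm_pred)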
## LOESS MODEL
loess_mod<-loess(y~x, data=train_dat, span=.2)
loess_pred<-predict(loess_mod, test_dat)
## FORMAT DATA FOR PLOT
loess_dat<-test_dat%>%
cbind(loess_pred)%>%
gather(key=y_type, value=ys, -c(x, test))
## PLOT with Residuals
ggplot(data=train_dat, aes(x, y))+
geom_point(size=3)+
geom_smooth(se=FALSE, span=0.2)+
geom_point(data=test_dat, aes(x, y), size=3, color="red")+
geom_line(data=loess_dat, aes(x, ys, group=x))+
theme_bw()+
coord_fixed()
## TEST MSE
mean((test_dat$y-loess_pred)^2)
## [1] 1.96736
LOESS (locally estimated scatterplot smoothing) is a smoothing technique that builds a local model from the data points closest to the value in question. When using LOESS we must specify how much of the data around a given point to use; this fraction is known as the span.
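For example, span = 0.2 asks loess() to build each local fit from roughly 20% of the data, while a larger span uses more of the data and gives a smoother curve. A small illustration on the training data (the object names here are just for this sketch):
## small span: wiggly local fits; large span: smoother fit
wiggly<-loess(y~x, data=train_dat, span=.2)
smoother<-loess(y~x, data=train_dat, span=.9)
## the smaller span typically gives a smaller training error
mean(residuals(wiggly)^2)
mean(residuals(smoother)^2)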
In the following code, we perform a grid search for the span to decide the appropriate amount of “wiggle” to give to the line.
In this simulation we use 5 different random splits (rows) to show how the models may look when fit to different data. Note how different the resulting LOESS lines can be given different random splits.
Do you observe any patterns?
span_seq<-seq(.2, 1, .2)
plotlist<-list()
for(j in 1:5){
that_df<-this_df%>%
select(x, y)%>%
mutate(test=FALSE)
this_test_ind<-sample(1:19, 7)
that_df$test[this_test_ind]<-TRUE
this_train_dat<-that_df[-this_test_ind,]
this_test_dat<-that_df[this_test_ind,]
for(i in 1:length(span_seq)){
p<-ggplot()+
geom_point(data=this_train_dat, aes(x, y))+
geom_point(data=this_test_dat, aes(x, y), color="gray")+
geom_smooth(data=this_train_dat, aes(x, y), se=FALSE, span=span_seq[i])+
theme_bw()+
coord_fixed()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y =element_blank(),
axis.ticks.y=element_blank(),
plot.margin = rep(unit(0,"null"),4),
panel.spacing = unit(0,"null"),
axis.ticks.length = unit(0,"null"),
axis.ticks.margin = unit(0,"null"))
#+
#ggtitle(paste("Split:", j, ", Span = ", span_seq[i]))+
#print(p)
#p
plotlist <- c(plotlist, list(p))
#ggsave(paste("split", j, "Span", i, ".pdf", sep=""),
# width=6, height=4)
}
}
library(gridExtra)
do.call(grid.arrange, c(plotlist, list(ncol = 5)))
If we only use the data that was used to fit the model to assess model fit, we will get an overly optimistic estimate of how well the model will perform when faced with new data. Therefore, we split our data into training and testing sets. The training set is used to fit the model and the testing set is used as a proxy for new data.
This process is called cross-validation. Cross-validation can be used to protect against over- or underfitting.
See if you can find the “Goldilocks Zone” in the plot below to pick the appropriate span for our LOESS model.
### DF
split_lab<-c()
span_lab<-c()
train_mse<-c()
test_mse<-c()
for(j in 1:50){
set.seed(j*314)
that_df<-this_df%>%
select(x, y)%>%
mutate(test=FALSE)
this_test_ind<-sample(1:19, 7)
that_df$test[this_test_ind]<-TRUE
this_train_dat<-that_df[-this_test_ind,]
this_test_dat<-that_df[this_test_ind,]
for(i in 1:length(span_seq)){
### STORE
split_lab<-c(split_lab, j)
span_lab<-c(span_lab, span_seq[i])
## TRAIN MSE
thisLOESS<-loess(y ~ x, data=this_train_dat, span=span_seq[i])
trainMSE<-mean((thisLOESS$res)^2)
train_mse<-c(train_mse, trainMSE)
## TEST MSE
testPred<-predict(thisLOESS, this_test_dat)
testMSE<-mean((this_test_dat$y-testPred)^2)
test_mse<-c(test_mse, testMSE)
}
}
simDat<-data.frame(split_lab,
span_lab,
train_mse,
test_mse)%>%
gather(key=mse_type, value=mse, -c(split_lab, span_lab))
#View(simDat)
##
ggplot(simDat, aes(x=as.factor(span_lab), y=mse, fill=mse_type))+
#geom_point()+
geom_boxplot()
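To read the plot numerically, we can also summarize the typical MSE at each span (a small follow-up sketch using the simDat frame built above):
## median train and test MSE for each candidate span
simDat%>%
group_by(span_lab, mse_type)%>%
summarise(med_mse=median(mse, na.rm=TRUE))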
## feature (vector)
x<-seq(1, 20, 1)
true<-1.5*sin(.7*x)+.5*x
# outcome has some noise
set.seed(3.14)
y<-true+rnorm(n=length(x), sd=1)
this_df<-data.frame(x, true, y)%>%
mutate(test=FALSE)
# 1/3 test
test_ind<-sample(1:19, 7)
this_df$test[test_ind]<-TRUE
train_dat<-this_df[-test_ind,]
test_dat<-this_df[test_ind,]
library(tidyverse)
ggplot()+
geom_point(data=this_df, aes(x, y, color=test), size=3)+
geom_line(data=this_df, aes(x, true))+
theme_bw()+
coord_fixed()