Question 11.1 Using the crime data set uscrime.txt from Questions 8.2, 9.1, and 10.1, build a regression model using: 1. Stepwise regression 2. Lasso 3. Elastic net
set.seed(123)
data <- read.table("uscrime.txt", stringsAsFactors = FALSE, header = TRUE)
head(data)
s_data = cbind(as.data.frame(scale(data[,1])),
as.data.frame(data[,2]),
as.data.frame(scale(data[,c(3,4,5,6,7,8,9,10,11,12,13,14,15)]))
,
as.data.frame(data[,16]))
colnames(s_data) = colnames(data)
model.lm = lm(Crime~., data=data)
model.step.aic = step(model.lm, direction='backward', trace=F)
summary(model.step.aic)
model.lm2 = lm(formula = Crime ~ M + Ed + Po1 + U2 + Ineq + Prob,
data = data)
summary(model.lm2)
library(sme)
model.step.bic = step(model.lm, direction='backward', k= log(47), trace=F)
summary(model.step.bic)
coefficients(model.step.bic)
library(DAAG)
# do 5-fold cross-validation
c.lm2 = cv.lm(data,model.lm2,m=5)
c.step.aic = cv.lm(data,model.step.aic,m=5)
# total sum of squared differences between data and its mean
SStot = sum((data$Crime - mean(data$Crime))^2)
SSres_model.lm2 = sum(model.lm2$residuals^2)
SSres_model.step.aic = sum(model.step.aic$residuals^2)
SSres_c.lm2 = attr(c.lm2,"ms")*nrow(data)
SSres_c.step.aic = attr(c.step.aic,"ms")*nrow(data)
# Calculate R-squareds for models and cross-validation
r2 = c()
r2[1] = 1 - SSres_model.lm2/SStot
r2[2] = 1 - SSres_model.step.aic/SStot
r2[3] = 1 - SSres_c.lm2/SStot # cross-validated lm model
r2[4] = 1 - SSres_c.step.aic/SStot #cross-validation aic model
r2
#Lasso
rm(list = ls())
set.seed(123)
data <- read.table("uscrime.txt", stringsAsFactors = FALSE, header = TRUE)
head(data)
#scale the data; does not scaled col #2 as it is binary and col #16 as it is
the response
s_data = cbind(as.data.frame(scale(data[,1])),
as.data.frame(data[,2]),
as.data.frame(scale(data[,c(3,4,5,6,7,8,9,10,11,12,13,14,15)]))
,
as.data.frame(data[,16]))
#fix the column names
colnames(s_data) = colnames(data)
#display scaled data
head(s_data)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
#get lasso model
model.lasso=cv.glmnet(x=as.matrix(s_data[,-16]),
y=as.matrix(s_data$Crime),alpha=1,
nfolds = 5,
type.measure="mse",
family="gaussian")
model.lasso$lambda
model.lasso$cvm
plot(model.lasso$lambda, model.lasso$cvm, main='Cross-validated MSE vs lamba'
)
abline (v = model.lasso$lambda.min, col = 'red', lty =2)
##find the lamba with the smallest cvm
x = model.lasso$cvm
which(x == min(x)) #41
## [1] 41
#lamba for the smallest cvm
model.lasso$lambda[which(x == min(x))]
## [1] 6.367246
#or
model.lasso$lambda.min
## [1] 6.367246
plot(model.lasso$lambda, model.lasso$nzero, main =' Number of predictor varia
bles vs. lambda')
abline (v = model.lasso$lambda.min, col = 'red', lty =2)
coefficients(model.lasso, s=model.lasso$lambda.min)
model.lasso.lm = lm(Crime ~M+So+Ed+Po1+M.F+Pop+NW+U1+U2+Wealth+Ineq+Prob, dat
a = s_data)
summary(model.lasso.lm)
#remove factor which p>0.05
model.lasso.lm2 = lm(Crime ~M+ Ed+ Po1+ U2+ Ineq+Prob, data = s_data)
summary(model.lasso.lm2)
library(DAAG)
## Loading required package: lattice
# do 5-fold cross-validation
c.lasso.lm = cv.lm(s_data,model.lasso.lm, m=5)
c.lasso.lm2 = cv.lm(data,model.lasso.lm2, m=5)
# total sum of squared differences between data and its mean
SStot_s = sum((s_data$Crime - mean(s_data$Crime))^2)
SSres_model.lasso.lm = sum(model.lasso.lm$residuals^2)
SSres_model.lasso.lm2 = sum(model.lasso.lm2$residuals^2)
SSres_c.lasso.lm = attr(c.lasso.lm,"ms")*nrow(s_data)
SSres_c.lasso.lm2 = attr(c.lasso.lm2,"ms")*nrow(s_data)
# Calculate R-squareds for models and cross-validation
r2 = c()
r2[1] = 1 - SSres_model.lasso.lm/SStot_s
r2[2] = 1 - SSres_model.lasso.lm2/SStot_s
r2[3] = 1 - SSres_c.lasso.lm/SStot_s # cross-validated lm model
r2[4] = 1 - SSres_c.lasso.lm2/SStot_s #cross-validation lm2 model
r2
## [1] 0.797 0.766 0.532 0.638
ar2=c()
ar2[1] = get_adj_R2(r2[1], 47, 12)
ar2[2] = get_adj_R2(r2[2], 47, 6)
ar2[3] = get_adj_R2(r2[3], 47, 12)
ar2[4] = get_adj_R2(r2[4], 47, 6)
ar2
## [1] 0.726 0.731 0.367 0.584
#Elastic Net
#clean & setup the environment
rm(list = ls())
set.seed(123)
data <- read.table("uscrime.txt", stringsAsFactors = FALSE, header = TRUE)
head(data)
#scale the data; does not scaled col #2 as it is binary and col #16 as it is
the response
s_data = cbind(as.data.frame(scale(data[,1])),
as.data.frame(data[,2]),
as.data.frame(scale(data[,c(3,4,5,6,7,8,9,10,11,12,13,14,15)]))
,
as.data.frame(data[,16]))
#fix the column names
colnames(s_data) = colnames(data)
#display scaled data
head(s_data)
r2=c()
for (i in 0:100) {
model.elastic = cv.glmnet(x=as.matrix(s_data[,-16]),
y=as.matrix(s_data$Crime),
alpha=i/100,
nfolds = 5,
type.measure="mse",
family="gaussian")
#dev.ratio is the percentage of deviance explained
#min index for the dev.ratio of the model
m = which(model.elastic$glmnet.fit$lambda == model.elastic$lambda.min)
r2 = cbind(r2, model.elastic$glmnet.fit$dev.ratio[m])
}
r2
#get the best alpha
#Best value of alpha
alpha = (which.max(r2)-1)/100
alpha
#get model with alphas = 0.05
model.elastic = cv.glmnet(x=as.matrix(s_data[,-16]),
y=as.matrix(s_data$Crime),
alpha=0.05,
nfolds = 5,
type.measure="mse",
family="gaussian")
model.elastic.lm = lm(Crime ~M+So+Ed+Po1+Po2+LF+M.F+NW+U1+U2+Wealth+Ineq+Prob
+Time, data = s_data)
summary(model.elastic.lm)
model.elastic.lm2 = lm(Crime ~M+Ed+U2+Ineq+Prob, data = s_data)
summary(model.elastic.lm2)
# do 5-fold cross-validation
c.elastic.lm = cv.lm(s_data,model.elastic.lm, m=5)
c.elastic.lm2 = cv.lm(data,model.elastic.lm2, m=5)
get_adj_R2 <- function(r2, n, p) {
return (r2 - (1 - r2) * p / (n - p - 1))
}
Hide
# total sum of squared differences between data and its mean
SStot_s = sum((s_data$Crime - mean(s_data$Crime))^2)
SSres_model.elastic.lm = sum(model.elastic.lm$residuals^2)
SSres_model.elastic.lm2 = sum(model.elastic.lm2$residuals^2)
SSres_c.elastic.lm = attr(c.elastic.lm,"ms")*nrow(s_data)
SSres_c.elastic.lm2 = attr(c.elastic.lm2,"ms")*nrow(s_data)
Hide
# Calculate R-squareds for models and cross-validation
r2 = c()
r2[1] = 1 - SSres_model.elastic.lm/SStot_s
r2[2] = 1 - SSres_model.elastic.lm2/SStot_s
r2[3] = 1 - SSres_c.elastic.lm/SStot_s # cross-validated lm model
r2[4] = 1 - SSres_c.elastic.lm2/SStot_s #cross-validation lm2 model
r2
[1] 0.801 0.356 0.423 0.174
Hide
ar2 = c()
ar2[1] = get_adj_R2(r2[1], 47, 14)
ar2[2] = get_adj_R2(r2[2], 47, 5)
ar2[3] = get_adj_R2(r2[3], 47, 14)
ar2[4] = get_adj_R2(r2[4], 47, 5)
ar2
[1] 0.7140 0.2780 0.1703 0.0732
Question 12.1 Describe a situation or problem from your job, everyday life, current events, etc., for which a design of experiments approach would be appropriate.
My tech company is always aggressively hiring new people. They need to find the best candidates for the job in order to reduce turnover. One approach would be a design of experiments for determining which LinkedIn profiles or resumes they should consider for interviews.
Question 12.2 To determine the value of 10 different yes/no features to the market value of a house (large yard, solar roof, etc.), a real estate agent plans to survey 50 potential buyers, showing a fictitious house with different combinations of features. To reduce the survey size, the agent wants to show just 16 fictitious houses. Use R’s FrF2 function (in the FrF2 package) to find a fractional factorial design for this experiment: what set of features should each of the 16 fictitious houses have? Note: the output of FrF2 is “1” (include) or “-1” (don’t include) for each feature.
library(FrF2)
## Loading required package: DoE.base
## Loading required package: grid
## Loading required package: conf.design
## Registered S3 method overwritten by 'DoE.base':
## method from
## factorize.factor conf.design
##
## Attaching package: 'DoE.base'
## The following objects are masked from 'package:stats':
##
## aov, lm
## The following object is masked from 'package:graphics':
##
## plot.design
## The following object is masked from 'package:base':
##
## lengths
FrF2(16, 10)
## A B C D E F G H J K
## 1 -1 -1 -1 -1 1 1 1 1 -1 1
## 2 -1 -1 1 1 1 -1 -1 -1 -1 1
## 3 1 1 1 1 1 1 1 1 1 1
## 4 -1 1 -1 1 -1 1 -1 -1 -1 1
## 5 -1 1 -1 -1 -1 1 -1 1 1 -1
## 6 -1 1 1 -1 -1 -1 1 1 -1 1
## 7 1 -1 -1 -1 -1 -1 1 -1 -1 -1
## 8 1 1 -1 1 1 -1 -1 1 -1 -1
## 9 1 1 1 -1 1 1 1 -1 -1 -1
## 10 -1 -1 -1 1 1 1 1 -1 1 -1
## 11 -1 -1 1 -1 1 -1 -1 1 1 -1
## 12 1 1 -1 -1 1 -1 -1 -1 1 1
## 13 1 -1 1 1 -1 1 -1 1 -1 -1
## 14 1 -1 1 -1 -1 1 -1 -1 1 1
## 15 -1 1 1 1 -1 -1 1 -1 1 -1
## 16 1 -1 -1 1 -1 -1 1 1 1 1
## class=design, type= FrF2
Question 13.1 For each of the following distributions, give an example of data that you would expect to follow this distribution (besides the examples already discussed in class). a. Binomial An example of a binomial distribution could be students taking a class pass or fail. Out of n students, the ones who pass versus fail could follow this distribution,
Geometric An example of a geometric distribution could be the number of houses a couple looks at before they make an offer on one of them.
Poisson The expected number of suicides daily might follow the poisson distribution
Exponential The time between suicides might follow the exponential distribution
Weibull Back to the example of a couple looking at house, with k<1 the time between looking at houses could follow a Weibull distribution with the more time passes the more likely they are to buy.