college<-read.csv("College.csv")
#view(college)
#fix(college)
rownames(college)=college[,1] # column 1 becomes the row names;
# R will not perform operations on this column, as it is not a data column.
college=college[,-1]
#i.
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
#ii.
pairs(college[,1:10])
#iii.
plot(college$Private,college$Outstate)
#iv.
Elite=rep("No",nrow(college))
Elite[college$Top10perc>50]="Yes"
Elite=as.factor(Elite)
college=data.frame(college,Elite)
# Delete Top10perc column? use dplyr
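# A possible dplyr approach (a sketch; not run here, so Top10perc still appears in the summary below):
# college <- dplyr::select(college, -Top10perc)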
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate Elite
## Min. : 10.00 No :699
## 1st Qu.: 53.00 Yes: 78
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
str(college)
## 'data.frame': 777 obs. of 19 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : int 1660 2186 1428 417 193 587 353 1899 1038 582 ...
## $ Accept : int 1232 1924 1097 349 146 479 340 1720 839 498 ...
## $ Enroll : int 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : int 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : int 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: int 2885 2683 1036 510 249 678 416 1594 973 799 ...
## $ P.Undergrad: int 537 1227 99 63 869 41 230 32 306 78 ...
## $ Outstate : int 7440 12280 11250 12960 7560 13500 13290 13868 15595 10468 ...
## $ Room.Board : int 3300 6450 3750 5450 4120 3335 5720 4826 4400 3380 ...
## $ Books : int 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : int 2200 1500 1165 875 1500 675 1500 850 500 1800 ...
## $ PhD : int 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : int 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: int 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : int 7041 10527 8735 19016 10922 9727 8861 11487 11644 8991 ...
## $ Grad.Rate : int 60 56 54 59 15 55 63 73 80 52 ...
## $ Elite : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
plot(college$Elite,college$Outstate)
# v.
par(mfrow=c(2,2))
hist(college$Accept)
hist(college$Enroll)
hist(college$Personal)
hist(college$Expend)
# vi.
#...Continue exploring the data, and provide a brief summary of what you discover.
auto<-read.csv("Auto.csv")
#fix(auto)
summary(auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 150 : 22 Min. :1613
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.0 90 : 20 1st Qu.:2223
## Median :23.00 Median :4.000 Median :146.0 88 : 19 Median :2800
## Mean :23.52 Mean :5.458 Mean :193.5 110 : 18 Mean :2970
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0 100 : 17 3rd Qu.:3609
## Max. :46.60 Max. :8.000 Max. :455.0 75 : 14 Max. :5140
## (Other):287
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 ford pinto : 6
## 1st Qu.:13.80 1st Qu.:73.00 1st Qu.:1.000 amc matador : 5
## Median :15.50 Median :76.00 Median :1.000 ford maverick : 5
## Mean :15.56 Mean :75.99 Mean :1.574 toyota corolla: 5
## 3rd Qu.:17.10 3rd Qu.:79.00 3rd Qu.:2.000 amc gremlin : 4
## Max. :24.80 Max. :82.00 Max. :3.000 amc hornet : 4
## (Other) :368
str(auto)
## 'data.frame': 397 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
sum(complete.cases(auto)) # All the rows are 'complete cases'
## [1] 397
#However I can see that "?" is a value in the data set for column "horsepower".
# It does not make much sense to have 'horsepower' as a factor.
auto$horsepower<-as.numeric(as.character(auto$horsepower))
## Warning: NAs introduced by coercion
#as.numeric(levels(f))[f] is more efficient, as the number of levels < number of obs.
sum(is.na(auto$horsepower)) #the same can now be seen with summary() function.
## [1] 5
#auto[rowSums(is.na(auto))>0,] would work too.
auto<-na.omit(auto)
Which of the predictors are quantitative, and which are qualitative?
str(auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int 33 127 331 337 355
## ..- attr(*, "names")= chr "33" "127" "331" "337" ...
unique(auto$origin)
## [1] 1 3 2
unique(auto$cylinders)
## [1] 8 4 6 3 5
auto$cylinders<-as.factor(auto$cylinders)
auto$origin<-factor(auto$origin, levels = c(1, 2, 3), labels = c('American', 'European', 'Japanese'))
Using the function ‘str()’ we can see that ‘name’ is the only qualitative variable, but the description of the data set (?Auto) shows that ‘origin’ and ‘cylinders’ should also be treated as factors, since they can take on only a fixed set of discrete values.
What is the range of each quantitative predictor? You can answer this using the range() function.
sum_num<-auto %>% select_if(is.numeric) %>% summary()
sum_num[c(1,6),] #select only Min and Max
## mpg displacement horsepower weight acceleration
## Min. : 9.00 Min. : 68.0 Min. : 46.0 Min. :1613 Min. : 8.00
## Max. :46.60 Max. :455.0 Max. :230.0 Max. :5140 Max. :24.80
## year
## Min. :70.00
## Max. :82.00
What is the mean and standard deviation of each quantitative predictor?
sum_num[4,]
## mpg displacement horsepower weight
## "Mean :23.45 " "Mean :194.4 " "Mean :104.5 " "Mean :2978 "
## acceleration year
## "Mean :15.54 " "Mean :75.98 "
#or
auto %>% select_if(is.numeric) %>% sapply(mean)
## mpg displacement horsepower weight acceleration year
## 23.44592 194.41199 104.46939 2977.58418 15.54133 75.97959
auto %>% select_if(is.numeric) %>% sapply(sd) #standard deviation
## mpg displacement horsepower weight acceleration year
## 7.805007 104.644004 38.491160 849.402560 2.758864 3.683737
Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
auto_minus<-auto[-c(10:85),]
sum_num<-auto_minus %>% select_if(is.numeric) %>% summary()
sum_num[c(1,6),]
## mpg displacement horsepower weight acceleration
## Min. :11.00 Min. : 68.0 Min. : 46.0 Min. :1649 Min. : 8.50
## Max. :46.60 Max. :455.0 Max. :230.0 Max. :4997 Max. :24.80
## year
## Min. :70.00
## Max. :82.00
auto_minus %>% select_if(is.numeric) %>% sapply(mean)
## mpg displacement horsepower weight acceleration year
## 24.40443 187.24051 100.72152 2935.97152 15.72690 77.14557
auto_minus %>% select_if(is.numeric) %>% sapply(sd)
## mpg displacement horsepower weight acceleration year
## 7.867283 99.678367 35.708853 811.300208 2.693721 3.106217
Using the full data set, investigate the predictors graphically, using scatter plots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
auto %>% dplyr::select(-name) %>% pairs()
ggcorr(auto,label=T)
## Warning in ggcorr(auto, label = T): data in column(s) 'cylinders', 'origin',
## 'name' are not numeric and were ignored
hw<-ggplot(auto,aes(x=horsepower,y=weight, col=cylinders)) +
geom_point()
ca<-ggplot(auto,aes(x=cylinders,y=acceleration, col=origin)) +
geom_boxplot() + labs(x="cylinders",colour = "origin")
wc<-ggplot(auto,aes(x=(year+1900),y=weight,col=cylinders)) +
geom_point() +
scale_x_continuous(breaks= scales::pretty_breaks()) +
labs(x="year")
mw<-ggplot(auto,aes(x=weight,y=mpg, col=origin)) +
geom_point()
grid.arrange(hw,ca,wc,mw)
The correlation matrix and the scatterplot matrix are the first tools used to study the relationships among the predictors. Horsepower, displacement and cylinders are highly correlated with each other and negatively correlated with mpg. We then use scatterplots and boxplots to highlight the relationships between three variables at a time (x axis, y axis and color).
Weight-Horsepower, color=Cylinders
From the first plot, we can see that weight increases as horsepower increases, although the relationship is not linear. In addition, more powerful cars tend to have a higher number of cylinders.
Cylinders-Acceleration, color=Origin
The second plot tells us that cars with 3, 5 and 8 cylinders are each specific to a single region (Japan, Europe and America, respectively). Cars with 5 cylinders seem to score higher than the others in acceleration.
Year-Weight, color=Cylinders
Cars with fewer cylinders tend to weigh less. Car weight seems to decrease over the years, as does the number of cars with 8 cylinders.
Weight-Mpg, color=Origin
Not surprisingly, mpg decreases as weight increases (although not linearly). American cars tend to be heavier than Japanese and European ones.
Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.
From the scatterplot matrix and the correlation matrix we see that mpg is strongly correlated with cylinders, displacement, horsepower and weight. The relationship is negative: mpg decreases as these variables increase. That said, these variables are also strongly correlated with each other, and we need to take this collinearity into consideration when building a model to predict mpg.
auto<-read.csv("Auto.csv")
dim(auto)
## [1] 397 9
pairs(auto)
#View(auto)
Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.
#auto<-na.omit(auto)
data(Auto)
Auto %>% dplyr::select(-name) %>% ggcorr(label=T)
auto_num<-select_if(auto,is.numeric)
auto_fit<-lm(mpg~.,auto_num)
summary(auto_fit)
##
## Call:
## lm(formula = mpg ~ ., data = auto_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5573 -2.1745 -0.0456 1.8454 12.9946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.014e+01 4.145e+00 -4.858 1.72e-06 ***
## cylinders -4.198e-01 3.203e-01 -1.311 0.1908
## displacement 1.742e-02 7.189e-03 2.423 0.0158 *
## weight -6.928e-03 5.781e-04 -11.983 < 2e-16 ***
## acceleration 1.591e-01 7.741e-02 2.055 0.0405 *
## year 7.703e-01 4.934e-02 15.613 < 2e-16 ***
## origin 1.356e+00 2.691e-01 5.040 7.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.333 on 390 degrees of freedom
## Multiple R-squared: 0.8214, Adjusted R-squared: 0.8186
## F-statistic: 298.9 on 6 and 390 DF, p-value: < 2.2e-16
Variables like cylinders and weight have a negative coefficient (horsepower does not appear in this fit because it is read in as a factor here and excluded by select_if(is.numeric)). A negative coefficient means that increasing that variable by 1 unit, holding the others fixed, corresponds to a decrease in the response mpg equal to the coefficient; e.g., adding 1 cylinder to a car while keeping all the other variables constant decreases the predicted mpg by about 0.42 (see the sketch below). Another thing to note is that the p-value for cylinders is quite high, which might indicate that there is no significant relationship between cylinders and mpg once the other predictors are included. The F-statistic is large and its p-value is very small, therefore we can reject the null hypothesis that all the coefficients are equal to 0. R-squared and adjusted R-squared are fairly high, which suggests a reasonably clear linear relationship between Y and the X’s.
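A minimal numeric sketch of this interpretation, using the coefficients printed above:
coef(auto_fit)["cylinders"]       # change in predicted mpg for one extra cylinder, other predictors held fixed
coef(auto_fit)["weight"] * 100    # change in predicted mpg for 100 extra lbs of weight, roughly -0.69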
Use the plot() function to produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?
par(mfrow=c(2,2))
plot(auto_fit)
The relationship between Y and the predictors might not be linear, as the residuals plotted against the fitted values scatter around a curved line rather than a horizontal one. In addition, the variance of the error terms does not appear to be constant.
According to Cook’s distance there are no influential outliers (a quick numeric check is sketched below).
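A quick numeric check of the diagnostic plots (a sketch; the 4/n and 2p/n cutoffs are common rules of thumb, not part of the original analysis):
cd <- cooks.distance(auto_fit)
which(cd > 4 / nrow(auto_num))                            # observations with large Cook's distance
lev <- hatvalues(auto_fit)
which(lev > 2 * length(coef(auto_fit)) / nrow(auto_num))  # observations with unusually high leverage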
Use the * and : symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?
auto_fit2<-lm(mpg~ (. -name)^2, data=Auto) #This creates all combinations of two-way interactions between the predictors (name excluded)
summary(auto_fit2)$coefficients %>%
data.frame() %>%
subset(Pr...t..<=0.05) %>%
dplyr::select(everything(),p_value=Pr...t..)
## Estimate Std..Error t.value p_value
## displacement -0.478538689 0.189353429 -2.527225 0.011920695
## acceleration -5.859173212 2.173621188 -2.695582 0.007353578
## origin -20.895570401 7.097090511 -2.944245 0.003445892
## displacement:year 0.005933802 0.002390716 2.482019 0.013515633
## acceleration:year 0.055621508 0.025581747 2.174265 0.030330641
## acceleration:origin 0.458316099 0.156659694 2.925552 0.003654670
#subset keeps row names
Displacement:year, acceleration:year and acceleration:origin appear to be statistically significant according to the p-value criterion.
Try a few different transformations of the variables, such as \(\log(x), \sqrt{x}, x^2\), Comment on your findings.
lm.fit<-lm(mpg ~ displacement + horsepower + weight, data=Auto)
lm.fit.log<-lm(mpg ~ log(displacement) + log(horsepower) + log(weight), data=Auto)
lm.fit.sqrt<-lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight), data=Auto)
lm.fit.p2<-lm(mpg ~ poly(displacement,2) + poly(horsepower,2) + poly(weight,2), data=Auto)
summary(lm.fit)$adj.r.squared
## [1] 0.7046897
summary(lm.fit.log)$adj.r.squared
## [1] 0.7401928
summary(lm.fit.sqrt)$adj.r.squared
## [1] 0.7254835
summary(lm.fit.p2)$adj.r.squared
## [1] 0.7494586
par(mfrow=c(2,2))
plot(lm.fit)
par(mfrow=c(2,2))
plot(lm.fit.log)
par(mfrow=c(2,2))
plot(lm.fit.sqrt)
par(mfrow=c(2,2))
plot(lm.fit.p2)
For simplicity’s sake we consider three variables, displacement, horsepower and weight, and apply the transformations to them.
The adjusted R^2 values of the different models are all quite similar. The plain linear model performs slightly worse than the models with transformed predictors.
The residual plots are also quite similar: all the models suffer to some extent from non-constant variance of the residuals, and the error term might not be normally distributed (right-skewed). These two facts suggest that a transformation of the response might lead to better results. We can try log(y), which is a ‘variance-stabilizing’ transformation.
lm.logy<-lm(log(mpg) ~ displacement + horsepower + weight, data=Auto)
summary(lm.logy)
##
## Call:
## lm(formula = log(mpg) ~ displacement + horsepower + weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47436 -0.10001 -0.00689 0.09827 0.52821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.063e+00 4.420e-02 91.922 < 2e-16 ***
## displacement -3.506e-04 2.433e-04 -1.441 0.15
## horsepower -2.215e-03 4.736e-04 -4.677 4.02e-06 ***
## weight -2.235e-04 2.633e-05 -8.487 4.54e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1568 on 388 degrees of freedom
## Multiple R-squared: 0.7891, Adjusted R-squared: 0.7874
## F-statistic: 483.8 on 3 and 388 DF, p-value: < 2.2e-16
This question should be answered using the Carseats data set.
data("Carseats")
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
lm.fit<-lm(Sales~Price + Urban + US, data=Carseats)
summary(lm.fit)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!
PRICE: the low p-value suggests that there is a significant relationship between Price and Sales, so we reject the underlying null hypothesis beta=0. The estimated coefficient is negative, meaning that Sales decrease as Price increases: each unit increase in Price reduces Sales by about 0.054 (thousand units), holding the other predictors constant.
URBANYES: the p-value is very high. We fail to reject the underlying null hypothesis (beta=0) and conclude that there is no evidence of a relationship between Urban and Sales.
USYES: the low p-value suggests that there is a significant relationship between US and Sales. We reject the underlying null hypothesis beta=0 and conclude that a store located in the US sells about 1.20 (thousand units) more, holding the other predictors fixed.
Write out the model in equation form, being careful to handle the qualitative variables properly.
\(Sales = 13.043 + 1.2X_{us} - 0.022X_{urban} - 0.054X_{price}\)
with \(\cases{X_{us} =1 &\text{in US} \\X_{us}=0 &\text{outside US}}\) and \(\cases{X_{urban} =1 &\text{urban} \\X_{urban}=0 &\text{non-urban}}\)
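As a sanity check of the equation, we can plug in a hypothetical urban US store with Price = 100 (a sketch using the coefficients above):
coef(lm.fit)["(Intercept)"] + coef(lm.fit)["Price"]*100 + coef(lm.fit)["UrbanYes"]*1 + coef(lm.fit)["USYes"]*1
# equivalently: predict(lm.fit, data.frame(Price = 100, Urban = "Yes", US = "Yes"))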
For which of the predictors can you reject the null hypothesis \(H_0 : \beta_{j} = 0\) ?
USYes and Price: the related p-values are lower than any standard level of significance.
On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.
lm.fit2<-lm(Sales~Price + US, data=Carseats)
summary(lm.fit2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
How well do the models in (a) and (e) fit the data?
Neither model fits the data very well; the adjusted R^2 is quite low (about 0.23) in both cases.
Using the model from (e), obtain 95% confidence intervals for the coefficient(s).
confint(lm.fit2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
Is there evidence of outliers or high leverage observations in the model from (e)?
par(mfrow=c(2,2))
plot(lm.fit2)
Observations 51, 69 and 377 might be outliers, as the corresponding residuals are large. However, there is no influential case according to Cook’s distance, since no observation lies beyond the dashed line.
PS: with par(mfrow=c(2,2)) the appearance of the plots changes, and it then looks as though some observations lie beyond the dashed Cook’s distance line; I trust the plot produced without par(mfrow=c(2,2)).
This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010
Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
data("Weekly")
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
pairs(Weekly)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
Weekly %>% select_if(is.numeric) %>% ggcorr()
According to the scatterplot matrix there seems to be a positive relationship between Volume and Year: as the years pass, trading volume increases. At this point we cannot spot any other clear pattern. The correlation matrix confirms that Volume and Year are the only pair of variables with a noticeable correlation.
ggplot(Weekly,mapping=aes(x=Year,y=Volume)) + geom_point()
The average number of shares traded daily increases steadily over the years.
Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
levels(Weekly$Direction)
## [1] "Down" "Up"
W.fit<-glm(Direction~Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, Weekly, family = binomial)
summary(W.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only the variable ‘Lag2’ can be considered statistically significant at a significance level alpha = 0.05 (its p-value is below 0.05). All the other p-values are higher than the usual significance levels, so for every predictor except ‘Lag2’ we fail to reject the null hypothesis (beta=0).
Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
W.fit.pred<-predict(W.fit,type="response")
contrasts(Weekly$Direction) #to check how R has coded "up" and "down".
## Up
## Down 0
## Up 1
Weekly$fit_pred<-W.fit.pred
Weekly$fit_dir<-ifelse(Weekly$fit_pred>0.5,"Up","Down")#we have set 0.5 as threshold.
conf<-table(Weekly$fit_dir,Weekly$Direction)
conf
##
## Down Up
## Down 54 48
## Up 430 557
#fp<-conf["Up","Down"]/colSums(conf)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
table(Weekly$Direction)
##
## Down Up
## 484 605
#sum(diag(conf))/sum(conf)
correct<-mean(Weekly$Direction == Weekly$fit_dir)
correct
## [1] 0.5610652
paste('Correct prediction:', correct)
## [1] "Correct prediction: 0.561065197428834"
The confusion matrix compares the predictions with the actual values. Here the model was built on the full data set (used as training data), and the predicted values are compared to the actual responses in that same data.
On the main diagonal we have the correct predictions: responses predicted Up or Down whose actual value is Up or Down respectively.
On the off-diagonal we have the errors: responses predicted Up or Down whose actual value is the opposite.
From the confusion matrix we can also calculate the false positive rate: the number of responses predicted as positive/Up whose real value is negative/Down, divided by the total number of real negatives (a short computation is sketched below).
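A short computation of these quantities from the confusion matrix above (a sketch):
sum(diag(conf)) / sum(conf)               # overall fraction of correct predictions
conf["Up","Down"] / sum(conf[,"Down"])    # false positive rate: predicted Up, actually Down
conf["Up","Up"] / sum(conf[,"Up"])        # true positive rate: predicted Up, actually Up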
Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
Weekly_tr<-filter(Weekly,Year<=2008) %>% dplyr::select(-fit_dir,-fit_pred) ## MASS package clashes with dplyr
#summary(Weekly_tr)
w.fit.tr<-glm(Direction~Lag2,Weekly_tr,family=binomial) #Model built using training data set.
#summary(w.fit.tr)
Weekly_test<-filter(Weekly,Year>2008) %>% dplyr::select(-fit_dir,-fit_pred)
W.fit.pred.test<-predict.glm(w.fit.tr, newdata = Weekly_test, type="response") # vector of probabilities: the model fitted on the training data is applied to the test data set
Weekly_test$fit_pred<-W.fit.pred.test #add a column to the data set
Weekly_test$fit_dir<-ifelse(Weekly_test$fit_pred>0.5,"Up","Down")
#we have set 0.5 as threshold.
conf_t<-table(Weekly_test$fit_dir,Weekly_test$Direction)
conf_t
##
## Down Up
## Down 9 5
## Up 34 56
fp_t<-conf_t["Up","Down"]/colSums(conf_t)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t
## Down
## 0.7906977
correct_t<-mean(Weekly_test$Direction == Weekly_test$fit_dir)
#same as:sum(diag(conf_t))/sum(conf_t)
paste('Correct prediction:', correct_t)
## [1] "Correct prediction: 0.625"
The fraction of correct predictions increased and the false positive rate decreased compared to the fit on the full data set.
Repeat (d) using LDA.
#alternative approach for the test and training data sets:
#add a column indicating train/test instead of duplicating the data set, and use the argument 'subset=train' in glm(), lda(), etc.
w.fit.tr.lda<-lda(Direction~Lag2,Weekly_tr) #Model built using training data set.
#summary(w.fit.tr.lda)
lda.predict<-predict(w.fit.tr.lda, newdata = Weekly_test) #predict applied to LDA returns a list with 3 elements:
#"class", "posterior", "X".
# "class" contains the class [up,down].
# "posterior" contains the posterior probability
# "X" contains the linear discriminant
Weekly_test$fit_dir.lda<-lda.predict$class
conf_t.lda<-table(Weekly_test$fit_dir.lda,Weekly_test$Direction)
conf_t.lda
##
## Down Up
## Down 9 5
## Up 34 56
fp_t.lda<-conf_t.lda["Up","Down"]/colSums(conf_t.lda)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t.lda
## Down
## 0.7906977
correct_t.lda<-mean(Weekly_test$Direction == Weekly_test$fit_dir.lda)
#same as: sum(diag(conf_t.lda))/sum(conf_t.lda)
paste('Correct prediction:', correct_t.lda)
## [1] "Correct prediction: 0.625"
LDA and logistic regression return the same results
Repeat (d) using QDA.
w.fit.tr.qda<-qda(Direction~Lag2,Weekly_tr) #Model built using training data set.
#summary(w.fit.tr.qda)
qda.predict<-predict(w.fit.tr.qda, newdata = Weekly_test) #predict applied to QDA returns a list with 2 elements:
#"class" and "posterior".
# "class" contains the predicted class [up,down].
# "posterior" contains the posterior probabilities.
Weekly_test$fit_dir.qda<-qda.predict$class
conf_t.qda<-table(Weekly_test$fit_dir.qda, Weekly_test$Direction)
conf_t.qda
##
## Down Up
## Down 0 0
## Up 43 61
fp_t.qda<-conf_t.qda["Up","Down"]/colSums(conf_t.qda)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t.qda
## Down
## 1
correct_t.qda<-mean(Weekly_test$Direction == Weekly_test$fit_dir.qda)
#same as: sum(diag(conf_t.qda))/sum(conf_t.qda)
paste('Correct prediction:', correct_t.qda)
## [1] "Correct prediction: 0.586538461538462"
Repeat (d) using KNN with K = 1.
library(class)
set.seed(1)
knn.pred<-knn(as.data.frame(Weekly_tr$Lag2), as.data.frame(Weekly_test$Lag2), Weekly_tr$Direction, 1)
# knn wants as input train and test as matrix/df.
# train = matrix/df of training set
# test = " " of test set
# cl = factor or true classification of training set
conf_t.knn<-table(knn.pred, Weekly_test$Direction)
conf_t.knn
##
## knn.pred Down Up
## Down 21 30
## Up 22 31
fp_t.knn<-conf_t.knn["Up","Down"]/colSums(conf_t.knn)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t.knn
## Down
## 0.5116279
correct_t.knn<-mean(Weekly_test$Direction == knn.pred)
#same as: sum(diag(conf_t.knn))/sum(conf_t.knn)
paste('Correct prediction:', correct_t.knn)
## [1] "Correct prediction: 0.5"
Which of these methods appears to provide the best results on this data?
Logistic regression and linear discriminant analysis seem to give the highest accuracy; however, the false positive rate of the KNN classifier is lower than that of the other methods.
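A minimal sketch collecting the test-set quantities computed above side by side:
data.frame(method = c("Logistic regression", "LDA", "QDA", "KNN (K=1)"),
           accuracy = c(correct_t, correct_t.lda, correct_t.qda, correct_t.knn),
           false_positive_rate = c(fp_t, fp_t.lda, fp_t.qda, fp_t.knn))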
Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
k.val<-rep(NA,15)
knn.correct<-k.val
set.seed(1)
for( i in seq_along(k.val)) {
knn.pred<-knn(as.data.frame(Weekly_tr$Lag2), as.data.frame(Weekly_test$Lag2), Weekly_tr$Direction, k=i)
knn.correct[i]<-mean(Weekly_test$Direction == knn.pred)
}
data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
ggplot(aes(k, cor_per)) +
geom_col() +
labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
scale_x_continuous(breaks = 1:length(k.val)) +
coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
max(knn.correct)
## [1] 0.6057692
which.max(knn.correct)
## [1] 12
lda.mod <- lda(Direction ~ Lag2 + Lag3, data = Weekly_tr)
pred<- predict(lda.mod, newdata = Weekly_test)
pred.val <- pred$class
paste('Correct prediction:', mean(pred.val == Weekly_test$Direction))
## [1] "Correct prediction: 0.625"
table(pred.val, Weekly_test$Direction)
##
## pred.val Down Up
## Down 8 4
## Up 35 57
lda.mod <- lda(Direction ~ Lag2:Lag5, data = Weekly_tr)
pred<- predict(lda.mod, newdata = Weekly_test)
pred.val <- pred$class
paste('Correct prediction:', mean(pred.val == Weekly_test$Direction))
## [1] "Correct prediction: 0.586538461538462"
table(pred.val, Weekly_test$Direction)
##
## pred.val Down Up
## Down 0 0
## Up 43 61
qda.mod <- qda(Direction ~ Lag2 + log(abs(Lag2)), data = Weekly_tr)
pred<- predict(qda.mod, newdata = Weekly_test)
pred.val <- pred$class
paste('Correct prediction:', mean(pred.val == Weekly_test$Direction))
## [1] "Correct prediction: 0.586538461538462"
table(pred.val, Weekly_test$Direction)
##
## pred.val Down Up
## Down 11 11
## Up 32 50
l.mod = glm(Direction ~ Lag2:Lag1, data = Weekly_tr, family = binomial)
l.probs = predict(l.mod, newdata = Weekly_test, type = "response")
l.pred<-ifelse(l.probs >= 0.5, 'Up', 'Down')
paste('Correct prediction:', mean(l.pred == Weekly_test$Direction))
## [1] "Correct prediction: 0.586538461538462"
table(l.pred, Weekly_test$Direction)
##
## l.pred Down Up
## Down 1 1
## Up 42 60
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
data(Auto)
Auto$mpg01<-ifelse(Auto$mpg>median(Auto$mpg),1,0)
mean(Auto$mpg01) # it should return 0.5
## [1] 0.5
Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
Auto %>% dplyr::select(-name, -mpg) %>% plot()
str(Auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
Auto %>% dplyr::select(-name,-mpg) %>% ggcorr(palette = "RdBu", label = TRUE)
Cylinders, displacement, horsepower and weight seem to have the strongest relationship with mpg01.
#ggplot(Auto,aes(x=as.factor(mpg01), y=cylinders)) + geom_boxplot()
base<-ggplot(Auto) + labs(x = "mpg01")
d<-base + geom_boxplot(aes(x=as.factor(mpg01), y=displacement)) + coord_flip()
h<-base + geom_boxplot(aes(x=as.factor(mpg01), y=horsepower)) + coord_flip()
w<-base + geom_boxplot(aes(x=as.factor(mpg01), y=weight)) + coord_flip()
grid.arrange(d, h, w, nrow=3)
The boxplots confirm what we found with the correlation matrix and the scatterplot matrix.
Split the data into a training set and a test set.
set.seed(1)
train<-sample(nrow(Auto), nrow(Auto)/2)
train.set<-Auto[train,]
test.set<-Auto[-train,]
Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
lda.fit<-lda(mpg01~ cylinders + displacement + horsepower + weight, data = train.set) #Model built using training data set.
#summary(w.fit.tr.lda)
lda.predict<-predict(lda.fit, newdata = test.set) #predict applied to LDA returns a list with 3 elements:
#"class", "posterior", "X".
# "class" contains the predicted class.
# "posterior" contains the posterior probability
# "X" contains the linear discriminant
correct_t.lda<-mean(test.set$mpg01 == lda.predict$class)
paste('Correct prediction:', correct_t.lda)
## [1] "Correct prediction: 0.872448979591837"
paste('test error:', 1-correct_t.lda)
## [1] "test error: 0.127551020408163"
Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
qda.fit<-qda(mpg01~ cylinders + displacement + horsepower + weight, data = train.set) #Model built using training data set.
qda.predict<-predict(qda.fit, newdata = test.set) #predict applied to QDA returns a list with 2 elements:
#"class" and "posterior".
# "class" contains the predicted class.
# "posterior" contains the posterior probabilities
correct_t.qda<-mean(test.set$mpg01 == qda.predict$class)
paste('Correct prediction:', correct_t.qda)
## [1] "Correct prediction: 0.88265306122449"
paste('test error:', 1-correct_t.qda)
## [1] "test error: 0.11734693877551"
QDA actually does slightly better than LDA here (test error 0.117 vs 0.128).
Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
log.reg = glm(mpg01~ cylinders + displacement + horsepower + weight, data = train.set, family = binomial)
log.probs = predict(log.reg, newdata = test.set, type = "response")
log.pred<-ifelse(log.probs > 0.5, 1, 0)
correct_t.log<-mean(log.pred == test.set$mpg01)
paste('Correct prediction:', correct_t.log)
## [1] "Correct prediction: 0.877551020408163"
paste('test error:', 1-correct_t.log)
## [1] "test error: 0.122448979591837"
Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?
k.val<-rep(NA,20)
knn.correct<-k.val
var_sel<-c('cylinders', 'displacement', 'horsepower', 'weight')
x_train<-train.set %>% dplyr::select(var_sel)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(var_sel)` instead of `var_sel` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
y_train<-train.set$mpg01
x_test<-test.set %>% dplyr::select(var_sel)
set.seed(1)
for( i in seq_along(k.val)) {
knn.pred<-knn(train= x_train, test = x_test, cl = as.factor(y_train), k=i)
knn.correct[i]<-mean(test.set$mpg01 == knn.pred)
}
data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
ggplot(aes(k, cor_per)) +
geom_col() +
labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
scale_x_continuous(breaks = 1:length(k.val)) +
coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))
paste('Best K-value:', which.max(knn.correct))
## [1] "Best K-value: 3"
paste('Correct prediction:', max(knn.correct))
## [1] "Correct prediction: 0.877551020408163"
paste('Test error:', 1-max(knn.correct))
## [1] "Test error: 0.122448979591837"
This problem involves writing functions.
Write a function, Power(), that prints out the result of raising 2 to the 3rd power. In other words, your function should compute \(2^3\) and print out the results.
Hint: Recall that x^a raises x to the power a. Use the print() function to output the result.
Power<-function(a = 2, b = 3) print(a^b)
Power()
## [1] 8
Create a new function, Power2(), that allows you to pass any two numbers, x and a, and prints out the value of x^a. You can do this by beginning your function with the line
Power2 =function (x,a){
You should be able to call your function by entering, for instance,
Power2 (3,8)
on the command line. This should output the value of \(3^8\), namely, 6,561.
Power2 <- function(x, a) print(x^a)
Power2(3,8)
## [1] 6561
Using the Power2() function that you just wrote, compute \(10^3\), \(8^{17}\), and \(131^3\)
Power2(10,3)
## [1] 1000
Power2(8,17)
## [1] 2.2518e+15
Power2(131,3)
## [1] 2248091
Now create a new function, Power3(), that actually returns the result x^a as an R object, rather than simply printing it to the screen. That is, if you store the value x^a in an object called result within your function, then you can simply return() this result, using the following line:
return (result )
The line above should be the last line in your function, before the } symbol.
Power3 = function(x, a) {
result = x^a
return(result)
}
Now using the Power3() function, create a plot of \(f(x) = x^2\). The x-axis should display a range of integers from 1 to 10, and the y-axis should display \(x^2\). Label the axes appropriately, and use an appropriate title for the figure. Consider displaying either the x-axis, the y-axis, or both on the log-scale. You can do this by using log=‘‘x’’, log=‘‘y’’, or log=‘‘xy’’ as arguments to the plot() function.
x = 1:10
plot(x, Power3(x, 2), log = "xy", ylab = "Log of y = x^2", xlab = "Log of x",
main = "Log of x^2 versus Log of x")
Create a function, PlotPower(), that allows you to create a plot of x against x^a for a fixed a and for a range of values of x. For instance, if you call
PlotPower (1:10 ,3)
then a plot should be created with an x-axis taking on values 1, 2, . . . , 10, and a y-axis taking on values \(1^3, 2^3, . . . , 10^3\).
PlotPower = function(x, a) {
plot(x, Power3(x, a))
}
PlotPower(1:10, 3)
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
#plot(Boston)
Boston %>% ggcorr(palette = "RdBu", label = TRUE)
Boston$crim01<-ifelse(Boston$crim>median(Boston$crim),1,0)
set.seed(1)
train<-sample(nrow(Boston),nrow(Boston)/2)
train.set<-Boston[train,]
test.set<-Boston[-train,]
LOGISTIC REGRESSION: all variables
log_reg<-glm(crim01 ~. -crim, data = train.set, family = binomial)
log_prob<-predict(log_reg, newdata = test.set, type="response")
log_pred<-ifelse(log_prob>0.5, 1, 0)
paste("test error:", mean(log_pred != test.set$crim01))
## [1] "test error: 0.0988142292490119"
LOGISTIC REGRESSION: subset (highest correlation value with crim01)
log_reg.sub<-glm(crim01 ~ tax + rad + dis + age + nox + indus, data = train.set, family = binomial)
log_prob.sub<-predict(log_reg.sub, newdata = test.set, type="response")
log_pred.sub<-ifelse(log_prob.sub>0.5, 1, 0)
paste("test error:", mean(log_pred.sub != test.set$crim01))
## [1] "test error: 0.114624505928854"
LDA: all variables
lda.fit<-lda(crim01~. -crim, data = train.set) #Model built using training data set.
lda.predict<-predict(lda.fit, newdata = test.set) #predict applied to LDA returns a list with 3 elements:
#"class", "posterior", "X".
# "class" contains the predicted class.
# "posterior" contains the posterior probability
# "X" contains the linear discriminant
paste('test error:', mean(test.set$crim01 != lda.predict$class))
## [1] "test error: 0.162055335968379"
LDA subset (highest correlation value with crim01)
lda.fit<-lda(crim01~tax + rad + dis + age + nox + indus, data = train.set) #Model built using training data set.
lda.predict<-predict(lda.fit, newdata = test.set)
paste('test error:', mean(test.set$crim01 != lda.predict$class))
## [1] "test error: 0.158102766798419"
QDA: all variables
qda.fit<-qda(crim01~. -crim, data = train.set) #Model built using training data set.
qda.predict<-predict(qda.fit, newdata = test.set)
paste('test error:', mean(test.set$crim01 != qda.predict$class))
## [1] "test error: 0.0988142292490119"
QDA subset (highest correlation value with crim01)
qda.fit<-qda(crim01~tax + rad + dis + age + nox + indus, data = train.set) #Model built using training data set.
qda.predict<-predict(qda.fit, newdata = test.set)
paste('test error:', mean(test.set$crim01 != qda.predict$class))
## [1] "test error: 0.0988142292490119"
KNN: all variables
k.val<-rep(NA,20)
knn.correct<-k.val
x_train<-train.set %>% dplyr::select(-crim)
y_train<-train.set$crim01
x_test<-test.set %>% dplyr::select(-crim)
set.seed(1)
for( i in seq_along(k.val)) {
knn.pred<-knn(train= x_train, test = x_test, cl = as.factor(y_train), k=i)
knn.correct[i]<-mean(test.set$crim01 == knn.pred)
}
data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
ggplot(aes(k, cor_per)) +
geom_col() +
labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
scale_x_continuous(breaks = 1:length(k.val)) +
coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))
paste('Best K-value:', which.max(knn.correct))
## [1] "Best K-value: 1"
paste('Test error:', 1-max(knn.correct))
## [1] "Test error: 0.0948616600790514"
KNN: subset
k.val<-rep(NA,20)
knn.correct<-k.val
var_sel<-c("tax","rad","dis","age","nox","indus")
x_train<-train.set %>% dplyr::select(var_sel)
y_train<-train.set$crim01
x_test<-test.set %>% dplyr::select(var_sel)
set.seed(1)
for( i in seq_along(k.val)) {
knn.pred<-knn(train= x_train, test = x_test, cl = as.factor(y_train), k=i)
knn.correct[i]<-mean(test.set$crim01 == knn.pred)
}
data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
ggplot(aes(k, cor_per)) +
geom_col() +
labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
scale_x_continuous(breaks = 1:length(k.val)) +
coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))
paste('Best K-value:', which.max(knn.correct))
## [1] "Best K-value: 3"
paste('Test error:', 1-max(knn.correct))
## [1] "Test error: 0.0869565217391305"
KNN seems to return the most accurate results. The best test error was achieved either with K=3 and the subset of predictors “tax”, “rad”, “dis”, “age”, “nox”, “indus”, or with K=1 and all the predictors.
In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.
data(Default)
Fit a logistic regression model that uses income and balance to predict default.
log.fit<-glm(default~ income + balance, data=Default, family = binomial)
Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:
set.seed(10)
train=sample(nrow(Default) , nrow(Default)/2) #Generates a vector containing the indexes for train set. Remaining indexes in 'Default' data set are for testing.
log.fit_TR<-glm(default ~ income + balance, data=Default, family = binomial, subset = train)
test_set= Default[-train,]
test_set$log.pred.test<-predict(log.fit_TR, type="response", newdata = test_set)
test_set$log.prediction<-ifelse(test_set$log.pred.test>0.5, "Yes", "No")
test_set$log.prediction[1:5] #example
## [1] "No" "No" "No" "No" "No"
#attributes(Default$default)
#Default$log.prediction<-factor(Default$log.prediction, levels = attributes(Default$default)$levels)
VSE<-mean(test_set$log.prediction!=test_set$default)
VSE
## [1] 0.0288
Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.
The three different splits are handled in a single loop:
set.seed(1)
val_err<-rep(NA,3)
split<-c(0.5,0.7,0.9)
for (i in seq_along(val_err)) {
train_size<-nrow(Default) * split[i]
train<-sample(1:nrow(Default), size = train_size)
train_set<-Default[train,]
test_set<-Default[-train,]
log_train<-glm(default ~ income + balance, data = train_set, family = 'binomial')
pred.prb<-predict(log_train, test_set, type = 'response')
pred_values<-ifelse(pred.prb > 0.5, 'Yes', 'No')
val_err[i]<-mean(pred_values != test_set$default)
}
ss.err<-data.frame(split,val_err)
ss.err
## split val_err
## 1 0.5 0.0254
## 2 0.7 0.0290
## 3 0.9 0.0160
bestsplit<-ss.err %>% filter(val_err==min(val_err))
bestsplit
## split val_err
## 1 0.9 0.016
We obtain the lowest estimated test error with a split of 0.9, meaning that 90% of the observations were used to train the model while the remaining 10% were used to estimate the test error.
Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.
set.seed(1)
train_size<-nrow(Default) * bestsplit$split
train<-sample(1:nrow(Default), size = train_size)
train_set<-Default[train,]
test_set<-Default[-train,]
log_train.S<-glm(default ~ income + balance + student, data = train_set, family = 'binomial')
pred.prb.S<-predict(log_train.S, test_set, type = 'response')
pred_values.S<-ifelse(pred.prb.S > 0.5, 'Yes', 'No')
mean(pred_values.S != test_set$default)
## [1] 0.031
Including student in the model does not lead to any improvement in accuracy.
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
data(Default)
data(Boston)
Using the summary() and glm() functions, determine the estimated standard errors for the coefficients associated with income and balance in a multiple logistic regression model that uses both predictors.
log.fit<-glm(default~ income + balance, data=Default, family = binomial)
log.coef<-summary(log.fit)$coefficients # [,'Std. Error']
Write a function, boot.fn(), that takes as input the Default data set as well as an index of the observations, and that outputs the coefficient estimates for income and balance in the multiple logistic regression model.
#The bootstrap approach can be used to assess the variability of the coefficient estimates and predictions from a statistical learning method.
#Function 'boot.fn' returns the coefficient estimates for income and balance in the multiple logistic regression model using the index specified
boot.fn<-function(data,index){
return(coef(glm(default~income + balance, data=data, subset = index, family = binomial)))
}
Use the boot() function together with your boot.fn() function to estimate the standard errors of the logistic regression coefficients for income and balance.
n<-nrow(Default)
set.seed (1) # Running set.seed() before boot.fn() gives the same results every time, whereas running boot.fn() alone gives different results each run; set.seed() makes the results reproducible.
#boot.fn(Default ,sample (n ,n , replace =T))
#fits the logistic model on a sample drawn from the Default data frame, of the same size as the Default data frame; the sample is drawn with replacement, so the same observation can appear more than once.
boot.coef<-boot(Default ,boot.fn ,100)
boot.coef
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 8.556378e-03 4.122015e-01
## t2* 2.080898e-05 -3.993598e-07 4.186088e-06
## t3* 5.647103e-03 -4.116657e-06 2.226242e-04
#The boot command executes the re-sampling of the data set and calculation of the statistic(s) of interest on these samples (coefficient in this case). Before calling boot, it's necessary to define a function that will return the statistic(s) that you would like to bootstrap.
Comment on the estimated standard errors obtained using the glm() function and using your bootstrap function.
log.coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
boot.coef
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 8.556378e-03 4.122015e-01
## t2* 2.080898e-05 -3.993598e-07 4.186088e-06
## t3* 5.647103e-03 -4.116657e-06 2.226242e-04
The standard errors obtained from the “regular” glm() function and from the bootstrap are very similar, which suggests that the logistic regression model provides a reasonable fit to the data. In the linear-regression setting, \(SE(\hat{\beta}_j)^2\) depends on \(\sigma^2\), the variance of the error term. \(\sigma^2\) is an unknown parameter, since we do not observe the whole population, and it is estimated by the residual standard error \(RSE = \sqrt{RSS/(n-2)}\) (for a single predictor), where \(RSS = \sum (y_i - \hat{y}_i)^2\) is the residual sum of squares. So if the \(y_i\) and \(\hat{y}_i\) are close to each other, the RSE will be small (a quick numeric check is sketched below).
RSS is also called SSE, while the RSE is essentially \(\sqrt{MSE}\) apart from the degrees-of-freedom correction.
We will now perform cross-validation on a simulated data set.
Generate a simulated data set as follows:
set.seed (1)
x = rnorm (100)
y = x - 2*x^2 + rnorm (100)
In this data set, what is n and what is p? Write out the model used to generate the data in equation form.
x is a vector containing 100 random values from a normal distribution with mu = 0 and sigma = 1, so n = 100.
There are p = 2 predictors (X and X^2). The data-generating code y = x - 2*x^2 + rnorm(100) can be written in equation form as: Y = X - 2X^2 + epsilon, with epsilon ~ N(0, 1).
In this model epsilon (the random error term) accounts for the variation in Y that is not explained by X and X^2.
Create a scatterplot of X against Y . Comment on what you find.
plot(x,y)
#min(y)
#max(y)
The plot shows a clear quadratic shape: x ranges roughly between -2 and 2, while y ranges from about -12 to 2.
Set a random seed, and then compute the LOOCV errors that result from fitting the following four models using least squares:
Y = β0 + β1X + epsilon
Y = β0 + β1X + β2X2 + epsilon
Y = β0 + β1X + β2X2 + β3X3 + epsilon
Y = β0 + β1X + β2X2 + β3X3 + β4X4 + epsilon.
Note you may find it helpful to use the data.frame() function to create a single data set containing both X and Y.
DF = data.frame(x, y)
set.seed(1)
# i.
glm.fit = glm(y ~ x)
cv.glm(DF, glm.fit)$delta
## [1] 7.288162 7.284744
# cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models; if K is not specified it defaults to n, i.e. leave-one-out cross-validation (LOOCV), as used here.
## ii.
glm.fit = glm(y ~ poly(x,2))
cv.glm(DF, glm.fit)$delta
## [1] 0.9374236 0.9371789
## iii.
glm.fit = glm(y ~ poly(x,3))
cv.glm(DF, glm.fit)$delta
## [1] 0.9566218 0.9562538
## iv.
glm.fit = glm(y ~ poly(x,4))
cv.glm(DF, glm.fit)$delta
## [1] 0.9539049 0.9534453
Alternatively
set.seed(1)
ip = 4 #iteration/polynomial degree
cv.error=rep (0,ip)
for (i in 1:ip){
glm.fit<-glm(y~poly(x ,i))
cv.error[i]<-cv.glm(DF ,glm.fit)$delta[1]
}
cv.error
## [1] 7.2881616 0.9374236 0.9566218 0.9539049
Repeat (c) using another random seed, and report your results. Are your results the same as what you got in (c)? Why?
set.seed(3)
ip = 4 #iteration/polynomial degree
cv.error=rep (0,ip)
for (i in 1:ip){
glm.fit<-glm(y~poly(x ,i))
cv.error[i]<-cv.glm(DF ,glm.fit)$delta[1]
}
cv.error
## [1] 7.2881616 0.9374236 0.9566218 0.9539049
Setting a new seed does not change the results because there is no randomness in the training/validation splits: LOOCV fits the statistical learning method (here, linear regression) n times, each time on a training set of n-1 observations, so the result is deterministic.
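As a sanity check, the LOOCV error for the linear model in (i.) can also be computed by hand. This is a sketch that refits the model n times and should reproduce the first delta value reported by cv.glm() up to rounding:
loocv.manual <- mean(sapply(1:nrow(DF), function(i) {
fit <- glm(y ~ x, data = DF[-i, ]) # fit on all observations except i
(DF$y[i] - predict(fit, DF[i, ]))^2 # squared prediction error on the held-out observation
}))
loocv.manual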
Which of the models in (c) had the smallest LOOCV error? Is this what you expected? Explain your answer.
The second-degree polynomial has the lowest cross-validation (estimated prediction) error. This is expected, since the true underlying relationship between y and x is quadratic: y = x - 2x^2 + epsilon.
Comment on the statistical significance of the coefficient estimates that results from fitting each of the models in (c) using least squares. Do these results agree with the conclusions drawn based on the cross-validation results?
glm(y~poly(x ,2)) %>% summary()
##
## Call:
## glm(formula = y ~ poly(x, 2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9650 -0.6254 -0.1288 0.5803 2.2700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.5500 0.0958 -16.18 < 2e-16 ***
## poly(x, 2)1 6.1888 0.9580 6.46 4.18e-09 ***
## poly(x, 2)2 -23.9483 0.9580 -25.00 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.9178258)
##
## Null deviance: 700.852 on 99 degrees of freedom
## Residual deviance: 89.029 on 97 degrees of freedom
## AIC: 280.17
##
## Number of Fisher Scoring iterations: 2
The p-values are all far below the usual significance levels (0.05, 0.001, ...). We reject the null hypothesis that the corresponding betas are 0 and conclude that both terms (x and x^2) are statistically significant. This is in line with the cross-validation results.
We will now consider the Boston housing data set, from the MASS library.
Based on this data set, provide an estimate for the population mean of medv. Call this estimate ˆμ.
mu_hat<-mean(Boston$medv)
mu_hat
## [1] 22.53281
Provide an estimate of the standard error of ˆμ. Interpret this result. Hint: We can compute the standard error of the sample mean by dividing the sample standard deviation by the square root of the number of observations.
n<-nrow(Boston)
se_hat<-sd(Boston$medv)/sqrt(n)
se_hat
## [1] 0.4088611
The standard error tells us the average amount that the estimate ˆμ differs from the actual value of μ.
Now estimate the standard error of ˆμ using the bootstrap. How does this compare to your answer from (b)?
boot.fn<-function(data,index){
return(mean(data[index]))
}
n<-nrow(Boston)
#boot.fn(Boston$medv, sample(n,n, replace = TRUE)) # this returns a single bootstrap replicate of the statistic
bstrap = boot(Boston$medv, boot.fn, 1000)
bstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.01381542 0.4181613
#plot(bstrap)
#It seems that the function boot.fn() needs to have arguments (data,index) as they are needed by the function boot() in order to perform bootstrapping.
The standard errors from parts (b) and (c) are very similar.
Based on your bootstrap estimate from (c), provide a 95% confidence interval for the mean of medv. Compare it to the results obtained using t.test(Boston$medv). Hint: You can approximate a 95% confidence interval using the formula [ˆμ − 2SE(ˆμ), ˆμ + 2SE(ˆμ)].
#to get the sd from the output of the boot() function:
# sd(bstrap$t), since bstrap$t is a vector containing all R = 1000 bootstrap estimates of the statistic (the mean of medv)
# se <- summary(bstrap)
ci<-c(mu_hat - 2*sd(bstrap$t), mu_hat + 2*sd(bstrap$t))
ci
## [1] 21.69648 23.36913
t.test(Boston$medv) # $conf.int
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The confidence intervals calculated with t.test() and with the bootstrap are very similar; the small difference can be attributed to the slightly different standard-error estimates.
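An alternative to the mu_hat ± 2*SE approximation is a percentile interval computed directly from the bootstrap replicates. A small sketch using boot.ci() from the boot package on the bstrap object above:
boot.ci(bstrap, type = "perc") # percentile 95% confidence interval for the mean of medv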
Based on this data set, provide an estimate, ˆμmed, for the median value of medv in the population.
medv.med<-median(Boston$medv)
medv.med
## [1] 21.2
We now would like to estimate the standard error of ˆμmed. Unfortunately, there is no simple formula for computing the standard error of the median. Instead, estimate the standard error of the median using the bootstrap. Comment on your findings.
boot.fn<-function(data,index){
return(median(data[index]))
}
bstrap <- boot(Boston$medv, boot.fn, 1000)
bstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0308 0.3661426
We obtain a median of 21.2 with a bootstrap standard error of about 0.37, which is small relative to the median value.
Based on this data set, provide an estimate for the tenth percentile of medv in Boston suburbs. Call this quantity ˆμ0.1. (You can use the quantile() function.)
medv.tenth = quantile(Boston$medv, 0.1)
medv.tenth
## 10%
## 12.75
Use the bootstrap to estimate the standard error of ˆμ0.1. Comment on your findings.
boot.fn<-function(data,index){
return(quantile(data[index],0.1))
}
bstrap <- boot(Boston$medv, boot.fn, 1000)
bstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 -0.0016 0.5138935
The standard error is still quite small compared to the tenth-percentile value.
Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector \(\epsilon\) of length n = 100.
x=rnorm(100)
e = rnorm(100)
Generate a response vector Y of length n = 100 according to the model:
\(Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \epsilon\), where \(\beta_0,\beta_1,\beta_2\) and \(\beta_3\) are constants of your choice
b0=3
b1=1
b2=-2
b3=4
y=b0 + b1*x + b2*x^2 + b3*x^3 # note: the noise vector e generated above is not actually added here, so the response is an exact cubic in x; this explains the essentially zero higher-order coefficients in the fits below.
Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors \(X,X^2, . . .,X^{10}\). What is the best model obtained according to Cp, BIC, and adjusted R2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.
data<-data.frame(x,y)
regfit.full=regsubsets(y~poly(x,10),data,nvmax = 10)
regfit.adjr2<-summary(regfit.full)$adjr2
regfit.cp<-summary(regfit.full)$cp
regfit.bic<-summary(regfit.full)$bic
#The summary() command outputs the best set of variables for each model size.
# best model according to adjusted R2
paste("Polynimial degree:", which.max(regfit.adjr2)) %>% print()
## [1] "Polynimial degree: 3"
# best model according to Cp
paste("Polynimial degree:", which.min(regfit.cp)) %>% print()
## [1] "Polynimial degree: 5"
# best model according to BIC
paste("Polynimial degree:", which.min(regfit.bic)) %>% print()
## [1] "Polynimial degree: 4"
#ggplot(mapping=aes(x=seq_along(regfit.adjr2), y=regfit.adjr2)) + geom_line() + labs(x="Poly degree", y="adjR2")
#ggplot(mapping=aes(x=seq_along(regfit.bic), y=regfit.adjr2)) + geom_line() + labs(x="Poly degree", y="BIC")
#ggplot(mapping=aes(x=seq_along(regfit.cp), y=regfit.adjr2)) + geom_line() + labs(x="Poly degree", y="Cp")
plot(regfit.adjr2, xlab = "Subset Size", ylab = "Adjr2", type = "l")
points(which.max(regfit.adjr2), regfit.adjr2[which.max(regfit.adjr2)], pch = 4, col = "red", lwd = 3)
plot(regfit.cp, xlab = "Subset Size", ylab = "Cp", type = "l")
points(which.min(regfit.cp), regfit.cp[which.min(regfit.cp)], pch = 4, col = "red", lwd = 3)
plot(regfit.bic, xlab = "Subset Size", ylab = "BIC", type = "l")
points(which.min(regfit.bic), regfit.bic[which.min(regfit.bic)], pch = 4, col = "red", lwd = 3)
# cofficients for the models selected.
coefficients(regfit.full, which.max(regfit.adjr2))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3
## 1.577715 100.570909 -7.287776 64.353564
coefficients(regfit.full, which.min(regfit.cp))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)5
## 1.577715e+00 1.005709e+02 -7.287776e+00 6.435356e+01 -1.352911e-14
## poly(x, 10)6
## 2.035174e-14
coefficients(regfit.full, which.min(regfit.bic))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)6
## 1.577715e+00 1.005709e+02 -7.287776e+00 6.435356e+01 2.035174e-14
# the object regfit.full contains 10 models, one for each model size (number of predictors) from 1 to 10.
# We then have one best overall model according to adjusted R^2, one according to Cp and one according to BIC.
# coefficients() extracts the coefficients of a specific model, so we need to specify which of the models in regfit.full we want.
# The same coefficients are found by fitting a linear model directly, e.g.
# lm(y~poly(x,which.max(regfit.adjr2))).
Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?
Forward stepwise selection
regfit.fw=regsubsets(y~poly(x,10),data, nvmax = 10, method = "forward")
regfitfw.adjr2<-summary(regfit.fw)$adjr2
regfitfw.cp<-summary(regfit.fw)$cp
regfitfw.bic<-summary(regfit.fw)$bic
#The summary() command outputs the best set of variables for each model size.
# best model according to adjusted R2
paste("Polynimial degree:", which.max(regfitfw.adjr2)) %>% print()
## [1] "Polynimial degree: 3"
# best model according to Cp
paste("Polynimial degree:", which.min(regfitfw.cp)) %>% print()
## [1] "Polynimial degree: 5"
# best model according to BIC
paste("Polynimial degree:", which.min(regfitfw.bic)) %>% print()
## [1] "Polynimial degree: 4"
plot(regfitfw.adjr2, xlab = "Subset Size", ylab = "Forward Adjr2", type = "l")
points(which.max(regfitfw.adjr2), regfitfw.adjr2[which.max(regfitfw.adjr2)], pch = 4, col = "red", lwd = 3)
plot(regfitfw.cp, xlab = "Subset Size", ylab = "Forward Cp", type = "l")
points(which.min(regfitfw.cp), regfitfw.cp[which.min(regfitfw.cp)], pch = 4, col = "red", lwd = 3)
plot(regfitfw.bic, xlab = "Subset Size", ylab = "Forward BIC", type = "l")
points(which.min(regfitfw.bic), regfitfw.bic[which.min(regfitfw.bic)], pch = 4, col = "red", lwd = 3)
coefficients(regfit.fw, which.max(regfitfw.adjr2))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3
## 1.577715 100.570909 -7.287776 64.353564
coefficients(regfit.fw, which.min(regfitfw.cp))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)5
## 1.577715e+00 1.005709e+02 -7.287776e+00 6.435356e+01 -1.352911e-14
## poly(x, 10)6
## 2.035174e-14
coefficients(regfit.fw, which.min(regfitfw.bic))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)6
## 1.577715e+00 1.005709e+02 -7.287776e+00 6.435356e+01 2.035174e-14
Backward stepwise selection
regfit.bw=regsubsets(y~poly(x,10),data, nvmax = 10, method = "backward")
regfitbw.adjr2<-summary(regfit.bw)$adjr2
regfitbw.cp<-summary(regfit.bw)$cp
regfitbw.bic<-summary(regfit.bw)$bic
#The summary() command outputs the best set of variables for each model size.
# best model according to adjusted R2
paste("Polynimial degree:", which.max(regfitbw.adjr2)) %>% print()
## [1] "Polynimial degree: 3"
# best model according to Cp
paste("Polynimial degree:", which.min(regfitbw.cp)) %>% print()
## [1] "Polynimial degree: 5"
# best model according to BIC
paste("Polynimial degree:", which.min(regfitbw.bic)) %>% print()
## [1] "Polynimial degree: 4"
plot(regfitbw.adjr2, xlab = "Subset Size", ylab = "Backward Adjr2", type = "l")
points(which.max(regfitbw.adjr2), regfitbw.adjr2[which.max(regfitbw.adjr2)], pch = 4, col = "red", lwd = 3)
plot(regfitbw.cp, xlab = "Subset Size", ylab = "Backward Cp", type = "l")
points(which.min(regfitbw.cp), regfitbw.cp[which.min(regfitbw.cp)], pch = 4, col = "red", lwd = 3)
plot(regfitbw.bic, xlab = "Subset Size", ylab = "Backward BIC", type = "l")
points(which.min(regfitbw.bic), regfitbw.bic[which.min(regfitbw.bic)], pch = 4, col = "red", lwd = 3)
coefficients(regfit.bw, which.max(regfitbw.adjr2))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3
## 1.577715 100.570909 -7.287776 64.353564
coefficients(regfit.bw, which.min(regfitbw.cp))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)5
## 1.577715e+00 1.005709e+02 -7.287776e+00 6.435356e+01 -1.352911e-14
## poly(x, 10)6
## 2.035174e-14
coefficients(regfit.bw, which.min(regfitbw.bic))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)6
## 1.577715e+00 1.005709e+02 -7.287776e+00 6.435356e+01 2.035174e-14
Best subset selection, forward stepwise selection and backward stepwise selection all select the same models; consequently the coefficients are identical.
Now fit a lasso model to the simulated data, again using \(X,X^2, . . . , X^{10}\) as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.
set.seed (1)
train = sample (c(TRUE ,FALSE), nrow(data),rep=TRUE) # here the number of T and F is more or less equal
#sum(train)
test = (!train)
# train=sample(nrow(data) , nrow(data)/2) I can set the size of the training set.
xm = model.matrix(y~poly(x,10), data) # creates the design matrix (n x p); qualitative variables are turned into dummy variables. Note the intercept column is not dropped here (unlike later), which is why the coefficient output below shows two (Intercept) rows.
y = data$y
Lcv.out = cv.glmnet (xm[train,], y[train], alpha =1) #Does k-fold cross-validation for glmnet, produces a plot, and returns a value for lambda
plot(Lcv.out)
bestlam.L = Lcv.out$lambda.min
bestlam.L
## [1] 0.2066705
#By default the glmnet() function performs ridge/lasso regression over an automatically selected range of λ values, and it standardizes the variables so that they are on the same scale; to turn this off, use the argument standardize=FALSE.
lasso.mod = glmnet(xm, y, alpha = 1) # fit the lasso on the full data over the whole grid of lambda values; the chosen lambda is supplied later in predict()
#coefficients(lasso.mod) # Matrix (p+1) x lambda n. For each lambda value (column) I get the value of each coefficient (some are set to 0). plot(lasso.mod, "lambda") show this. The plot however doesn't tell me which is the best.
predict(lasso.mod, s = bestlam.L, type = "coefficients")
## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.577715
## (Intercept) .
## poly(x, 10)1 98.504205
## poly(x, 10)2 -5.221072
## poly(x, 10)3 62.286859
## poly(x, 10)4 .
## poly(x, 10)5 .
## poly(x, 10)6 .
## poly(x, 10)7 .
## poly(x, 10)8 .
## poly(x, 10)9 .
## poly(x, 10)10 .
#coef(Lcv.out, s = "lambda.min") # return slightly different values, as I used the train set only.
The lasso selects a model with 3 predictors: \(x, x^2, x^3\). The "best" lambda, i.e. the one yielding the lowest estimated test error, is close to 0, which suggests that the lasso fit is close to the ordinary (unpenalized) least-squares fit.
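As a quick check of that remark, the unpenalized least-squares coefficients on the same basis can be compared with the lasso estimates above (a sketch; only the intercept and the first three basis terms are shown):
coef(lm(y ~ poly(x, 10), data = data))[1:4] # OLS coefficients for comparison with the lasso output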
Now generate a response vector Y according to the model \(Y = \beta_0 + \beta_7X^7 + \epsilon\), and perform best subset selection and the lasso. Discuss the results obtained.
b7=4
x=rnorm(100)
y=b0 + b7*x^7 # note: as in (a), the noise term epsilon is not added, so the response is an exact degree-7 polynomial in x
data<-data.frame(x,y)
regfit.full=regsubsets(y~poly(x,10), data, nvmax = 10)
regfit.adjr2<-summary(regfit.full)$adjr2
regfit.cp<-summary(regfit.full)$cp
regfit.bic<-summary(regfit.full)$bic
#The summary() command outputs the best set of variables for each model size.
# best model according to adjusted R2
paste("Polynimial degree:", which.max(regfit.adjr2)) %>% print()
## [1] "Polynimial degree: 7"
# best model according to Cp
paste("Polynimial degree:", which.min(regfit.cp)) %>% print()
## [1] "Polynimial degree: 7"
# best model according to BIC
paste("Polynimial degree:", which.min(regfit.bic)) %>% print()
## [1] "Polynimial degree: 7"
coefficients(regfit.full, which.max(regfit.adjr2))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)4 poly(x, 10)5
## 39.06375 1220.02639 939.47276 1185.84734 398.00786 458.45997
## poly(x, 10)6 poly(x, 10)7
## 84.63802 66.36999
coefficients(regfit.full, which.min(regfit.cp))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)4 poly(x, 10)5
## 39.06375 1220.02639 939.47276 1185.84734 398.00786 458.45997
## poly(x, 10)6 poly(x, 10)7
## 84.63802 66.36999
coefficients(regfit.full, which.min(regfit.bic))
## (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)4 poly(x, 10)5
## 39.06375 1220.02639 939.47276 1185.84734 398.00786 458.45997
## poly(x, 10)6 poly(x, 10)7
## 84.63802 66.36999
set.seed (1)
train = sample (c(TRUE ,FALSE), nrow(data),rep=TRUE) # here the number of T and F is more or less equal
#sum(train)
test = (!train)
# train=sample(nrow(data) , nrow(data)/2) I can set the size of the training set.
xm = model.matrix(y~poly(x,10), data) [,-1]# creates design matrix (n x p). qualitative variable are turned into dummy variables.
y = data$y
Lcv.out = cv.glmnet (xm[train,], y[train], alpha =1)
plot(Lcv.out)
bestlam.L = Lcv.out$lambda.min
bestlam.L
## [1] 15.94335
#By default the glmnet() function performs ridge/lasso regression for an automatically selected range of λ values.
lasso.mod = glmnet(xm, y, alpha = 1) # fit the lasso on the full data; for each lambda value there is a corresponding set of coefficients.
#coefficients(lasso.mod) # Matrix (p+1) x lambda n. For each lambda value (column) I get the value of each coefficient (some are set to 0). plot(lasso.mod, "lambda") show this. The plot however doesn't tell me which is the best.
predict(lasso.mod, s = bestlam.L, type = "coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 39.06375
## poly(x, 10)1 1060.59286
## poly(x, 10)2 780.03922
## poly(x, 10)3 1026.41380
## poly(x, 10)4 238.57432
## poly(x, 10)5 299.02644
## poly(x, 10)6 .
## poly(x, 10)7 .
## poly(x, 10)8 .
## poly(x, 10)9 .
## poly(x, 10)10 .
# extract the coefficients from 'lasso.mod' at the specific lambda value that yielded the lowest cross-validated error.
#coef(Lcv.out, s = "lambda.min") # return slightly different values, as I used the train set only.
Best subset selection and the lasso select different models: best subset selection picks a 7-variable model (terms up to \(X^7\)), while the lasso keeps only 5 non-zero coefficients (terms up to \(X^5\)) and shrinks the remaining ones to zero.
In this exercise, we will predict the number of applications received using the other variables in the College data set.
Split the data set into a training set and a test set.
data(College)
set.seed (2)
train = sample (c(TRUE ,FALSE), nrow(College),rep=TRUE) # here the number of T and F is more or less equal
#train=sample(nrow(College) , nrow(College)/2)
test = (!train)
y.test<-College$Apps[test]
y.train<-College$Apps[train]
#sum(train) + sum(test) - nrow(College)
Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit<-lm(Apps~. ,data=College[train,], na.action = na.omit)
lm.pred<-predict(lm.fit, newdata = College[test,])
# Test error
mean((lm.pred-y.test)^2) #MSE for standard linear regression
## [1] 1556804
Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
xm = model.matrix(Apps~., College)[,-1] # creates design matrix (n x p). qualitative variable are turned into dummy variables.
ridge.cv.out = cv.glmnet (xm[train,], y.train, alpha =0)
#plot(ridge.cv.out)
bestlam.ridge = ridge.cv.out$lambda.min #value of lambda that gives minimum cvm.
bestlam.ridge
## [1] 292.5324
#By default the glmnet() function performs ridge/lasso regression for an automatically selected range of λ values.
ridge.mod = glmnet(xm[train,], y.train, alpha = 0) # fit ridge on the training data over the whole grid of lambda values
#coefficients(lasso.mod) # Matrix (p+1) x lambda n. For each lambda value (column) I get the value of each coefficient (some are set to 0). plot(lasso.mod, "lambda") show this. The plot however doesn't tell me which is the best.
#coef(Lcv.out, s = "lambda.min") # return slightly different values, as I used the train set only.
ridge.pred<-predict(ridge.mod, s=bestlam.ridge, newx=xm[test ,])
# Test error
mean((ridge.pred - y.test)^2) # MSE with Ridge regression
## [1] 2451335
The test MSE with ridge regression is higher than the test MSE of the standard OLS linear regression.
Fit a lasso model on the training set, with λ chosen by cross validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
lasso.cv.out = cv.glmnet (xm[train,], y.train, alpha = 1)
#plot(ridge.cv.out)
bestlam.lasso = lasso.cv.out$lambda.min #value of lambda that gives minimum cvm.
bestlam.lasso
## [1] 10.03524
lasso.mod = glmnet(xm[train,], y.train, alpha = 1)
lasso.pred<-predict(lasso.mod, s=bestlam.lasso, newx=xm[test ,])
# Test error
mean((lasso.pred - y.test)^2) # MSE with Lasso regression
## [1] 1619912
predict(lasso.mod, s = bestlam.lasso, type = "coefficients")
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -9.383909e+02
## PrivateYes -8.180112e+01
## Accept 1.318719e+00
## Enroll .
## Top10perc 3.388912e+01
## Top25perc -6.056578e+00
## F.Undergrad .
## P.Undergrad 1.424121e-01
## Outstate -6.118679e-02
## Room.Board 8.404844e-02
## Books 7.773932e-02
## Personal -9.714371e-03
## PhD .
## Terminal -7.264470e+00
## S.F.Ratio 1.030567e+01
## perc.alumni -1.773488e+00
## Expend 1.045115e-01
## Grad.Rate 4.731908e+00
lasso.coef<-predict(lasso.mod, s = bestlam.lasso, type = "coefficients") %>% as.data.frame.matrix()
colnames(lasso.coef)="Coef_Value"
# number of non-zero coefficient
lasso.coef %>% subset(Coef_Value !=0) %>% nrow()
## [1] 15
The test MSE with the lasso is slightly higher than the one obtained with OLS.
Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.
pcr.fit=pcr(Apps~., data=College[train,] ,scale=TRUE , validation ="CV")
# values of the M selected by cross-validation.
summary(pcr.fit)
## Data: X dimension: 384 17
## Y dimension: 384 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3155 3124 1489 1494 1419 1204 1176
## adjCV 3155 3124 1487 1496 1418 1192 1173
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1141 1133 1094 1083 1083 1093 1097
## adjCV 1136 1131 1092 1079 1080 1090 1093
## 14 comps 15 comps 16 comps 17 comps
## CV 1089 1094 945.9 940.2
## adjCV 1086 1092 941.7 935.6
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.418 58.16 65.51 71.62 76.78 81.32 85.13 88.55
## Apps 3.652 78.23 78.59 81.35 86.69 86.88 87.80 87.87
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 91.16 93.38 95.40 97.30 98.25 99.09 99.54
## Apps 88.77 89.13 89.13 89.14 89.25 89.37 89.38
## 16 comps 17 comps
## X 99.89 100.00
## Apps 92.39 92.62
validationplot(pcr.fit, val.type="MSEP")
pcr.pred = predict(pcr.fit, College[test, ], ncomp=17) # ncomp could be set to a lower value: the validation plot shows that the cross-validated MSE changes little beyond roughly ncomp = 5. The same conclusion can be drawn from summary(), which reports the root mean squared error of prediction (RMSEP).
# if ncomp = p, PCR is equivalent to a linear regression fit with OLS.
mean((pcr.pred - y.test)^2)
## [1] 1556804
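As a quick check of the remark about ncomp above, the test MSE can be recomputed with fewer components (a sketch; ncomp = 10 is an arbitrary smaller choice, not a value selected by cross-validation):
pcr.pred10 = predict(pcr.fit, College[test, ], ncomp = 10)
mean((pcr.pred10 - y.test)^2) # test MSE of the reduced-dimension PCR fit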
Consider the following method as well.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
## The following object is masked from 'package:purrr':
##
## lift
model <- train(
Apps~., data = College[train,], method = "pcr",
scale = TRUE,
trControl = trainControl("cv", number = 17),
tuneLength = 17
)
# Plot model RMSE vs different values of components
plot(model)
# Print the best tuning parameter ncomp that
# minimize the cross-validation error, RMSE
model$bestTune
## ncomp
## 16 16
Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
pls.fit=plsr(Apps~., data=College[train,] ,scale=TRUE , validation ="CV")
# values of the M selected by cross-validation.
summary(pls.fit)
## Data: X dimension: 384 17
## Y dimension: 384 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3155 1374 1173 1066 1049 1015 979.7
## adjCV 3155 1371 1177 1063 1044 1009 973.3
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 962.6 952.6 943.4 945.7 949.3 945.7 942.7
## adjCV 956.7 947.6 939.0 941.1 944.5 940.6 937.9
## 14 comps 15 comps 16 comps 17 comps
## CV 942.1 942.2 942.1 942.1
## adjCV 937.4 937.5 937.4 937.4
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.11 46.54 63.46 67.66 71.77 75.06 79.36 82.90
## Apps 81.91 86.46 89.46 90.46 91.36 92.18 92.44 92.51
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 84.68 87.67 90.57 91.55 94.49 96.01 97.47
## Apps 92.55 92.57 92.58 92.62 92.62 92.62 92.62
## 16 comps 17 comps
## X 98.00 100.00
## Apps 92.62 92.62
validationplot(pls.fit, val.type="MSEP")
pls.pred = predict(pls.fit, College[test, ], ncomp=17)
mean((pls.pred - y.test)^2)
## [1] 1556804
# Same considerations here as for PCR
Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
lm<-mean((lm.pred-y.test)^2)
ridge<-mean((ridge.pred - y.test)^2)
lasso<-mean((lasso.pred - y.test)^2)
pcr<-mean((pcr.pred - y.test)^2)
pls<-mean((pls.pred - y.test)^2)
MSE.comp<-c(lm,ridge,lasso,pcr,pls)
barplot(MSE.comp, names.arg = c("OLS", "Ridge", "Lasso", "PCR", "PLS"), ylab="MSE")
With ncomp = 17 = p, PCR and PLS use all the predictors, so they are equivalent to OLS; this is why their test MSEs coincide with the OLS one. Imposing a smaller number of components (based on an acceptable threshold) would give different MSEs. The lasso returns a model with a test MSE very close to the OLS one, while ridge regression performs noticeably worse on this split.
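To make the comparison easier to read, the test MSEs can be expressed relative to the OLS one (a small sketch; MSE.comp and lm are the quantities computed just above, where lm is the numeric OLS test MSE, not the lm() function):
setNames(round(MSE.comp / lm, 3), c("OLS", "Ridge", "Lasso", "PCR", "PLS")) # a value of 1 means identical to OLS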
We have seen that as the number of features used in a model increases, the training error will necessarily decrease, but the test error may not. We will now explore this in a simulated data set.
Generate a data set with p = 20 features, n = 1,000 observations, and an associated quantitative response vector generated according to the model \(Y = X\beta + \epsilon\),
where \(\beta\) has some elements that are exactly equal to zero.
set.seed(1)
p = 20
n = 1000
x = matrix(rnorm(n * p), n, p)
B = rnorm(p)
zeros<-rbinom(p,1,0.2) %>% as.logical() # set the position of the zeros
B[zeros]=0 # set the zeros
B
## [1] 0.2353485 0.2448250 -0.6421869 -1.9348085 1.0386957 0.0000000
## [7] 0.0000000 0.7231804 2.0310355 0.0000000 0.8791534 0.0000000
## [13] -0.2845811 -0.6746580 -0.7154889 -0.2705279 0.3129646 1.6698068
## [19] 0.8922593 -1.0154889
eps = rnorm(n) # noise vector of length n (one error per observation)
y = x %*% B + eps
Split your data set into a training set containing 100 observations and a test set containing 900 observations.
set.seed(1)
train<-sample(n, 100)
test<-(-train)
x.train<-x[train,]
y.train<-y[train]
x.test<-x[test,]
y.test<-y[test]
Perform best subset selection on the training set, and plot the training set MSE associated with the best model of each size.
regfit.train=regsubsets(y ~. ,data = data.frame(x = x.train, y = y.train), nvmax = p)
MSE.train<-(summary(regfit.train)$rss)/length(train)
plot(MSE.train,ylab = "Training MSE", pch = 19, type = "b")
Plot the test set MSE associated with the best model of each size.
predict.regsubsets = function(object,newdata,id,...){
form = as.formula(object$call[[2]]) # Extract the formula used when we called regsubsets()
mat = model.matrix(form,newdata) # Build the model matrix
coefi = coef(object,id=id) # Extract the coefficients of the ith model
xvars = names(coefi) # Pull out the names of the predictors used in the ith model
mat[,xvars]%*%coefi # Make predictions using matrix multiplication
}
# Iterates over each size i
pred<-matrix(NA, length(y.test), p) %>% as.data.frame()
for(i in 1:p){
pred[,i]<-predict.regsubsets(regfit.train, data.frame(x = x.test, y = y.test),i)
}
MSE.test<-rep(NA,p)
for(i in 1:p){
MSE.test[i]<-mean((pred[,i]-y.test)^2)
}
MSE.test
## [1] 12.981049 9.303255 7.071847 6.591699 6.057322 4.527023 3.801184
## [8] 3.196142 2.447528 1.957794 1.476790 1.366107 1.223271 1.276057
## [15] 1.222040 1.182931 1.207938 1.213847 1.222304 1.198599
plot(MSE.test,ylab = "Test MSE", pch = 15, type = "b")
"
for(i in 1:19){
# Extract the vector of predictors in the best fit model on i predictors
coefi = coef(regfit_best_train, id = i)
# Make predictions using matrix multiplication of the test matirx and the coefficients vector
pred = test_mat[,names(coefi)]%*%coefi
# Calculate the MSE
val_errors[i] = mean((test$Salary-pred)^2)
}
for(i in 1:p){
coefi = coef(regfit.train,id=i)
xvars = names(coefi)
pred = df.test[,xvars]%*%coefi
val_errors[i] = mean((y.test-pred)^2)
}
"
## [1] "\nfor(i in 1:19){\n \n # Extract the vector of predictors in the best fit model on i predictors\n coefi = coef(regfit_best_train, id = i)\n \n # Make predictions using matrix multiplication of the test matirx and the coefficients vector\n pred = test_mat[,names(coefi)]%*%coefi\n \n # Calculate the MSE\n val_errors[i] = mean((test$Salary-pred)^2)\n}\n\nfor(i in 1:p){\n coefi = coef(regfit.train,id=i)\n xvars = names(coefi)\n pred = df.test[,xvars]%*%coefi\n val_errors[i] = mean((y.test-pred)^2)\n\n}\n"
For which model size does the test set MSE take on its minimum value? Comment on your results. If it takes on its minimum value for a model containing only an intercept or a model containing all of the features, then play around with the way that you are generating the data in (a) until you come up with a scenario in which the test set MSE is minimized for an intermediate model size.
min.t<-which.min(MSE.test)
min.t
## [1] 16
Model size 16 has the minimum test-set MSE, although the test MSE is almost constant after about 15 predictors. Changing which coefficients are set to 0 in (a) would lead to a different best model size.
How does the model at which the test set MSE is minimized compare to the true model used to generate the data? Comment on the coefficient values.
addname<-coef(regfit.train, id = p) %>% names()
c<-coef(regfit.train, id = which.min(MSE.test))[-1]
names(B)<-addname[-1]
#list(coef(regfit.train, id = which.min(MSE.test))[-1], B)
coef.best <- rep(0, length(B))
names(coef.best)<-addname[-1]
where<- match(names(B), names(c)) %>% is.na() #locate the NAs
coef.best[!where]<- c # overwrite the vector full of zeros with corresponding value in c where there is no NA.
data.frame(coef.best, B)
## coef.best B
## x.1 0.1663089 0.2353485
## x.2 0.1457268 0.2448250
## x.3 -0.6649714 -0.6421869
## x.4 -1.8301688 -1.9348085
## x.5 1.0105789 1.0386957
## x.6 0.0000000 0.0000000
## x.7 0.2137181 0.0000000
## x.8 0.6085251 0.7231804
## x.9 2.1001306 2.0310355
## x.10 0.0000000 0.0000000
## x.11 0.7196519 0.8791534
## x.12 0.0000000 0.0000000
## x.13 -0.4935891 -0.2845811
## x.14 -0.7317653 -0.6746580
## x.15 -0.8505321 -0.7154889
## x.16 0.0000000 -0.2705279
## x.17 0.2679864 0.3129646
## x.18 1.7239110 1.6698068
## x.19 0.8473220 0.8922593
## x.20 -0.8907173 -1.0154889
The estimates are generally close to the true values. However, x.7 is 0 in B but non-zero in the best-subset model, while x.16 is non-zero in B but set to 0 in the best-subset model.
Create a plot displaying \(\sqrt{\sum_{j=1}^p(\beta_j-\hat{\beta}_j^r)^2}\) for a range of values of r, where \(\hat{\beta}_j^r\) is the jth coefficient estimate for the best model containing r coefficients. Comment on what you observe. How does this compare to the test MSE plot from (d)?
#names(B)<-c(paste("x",1:(length(B)),sep="")) # B has "true" parameter values
par.error<-vector()
for(i in 1:length(B)){
var<-names(coef(regfit.train,id=i)[-1]) # names of the variables in the best i-variable model
par.error[i]<-sum((B[names(B)%in%var]-coef(regfit.train,id=i)[-1])^2) # squared distance between true and estimated coefficients for the selected variables
}
# note: the square root in the formula is omitted here (this does not change where the minimum occurs), and the true betas of variables excluded from a given model are not counted in the sum.
par.error
## [1] 0.03079925 0.16889075 0.39947322 0.50245575 0.30303245 0.69388259
## [7] 0.64594896 0.37105825 0.27345053 0.19534057 0.14378940 0.15281194
## [13] 0.14243178 0.20004695 0.18809258 0.20360212 0.23513938 0.24222532
## [19] 0.24600460 0.29255324
plot(par.error, xlab = "Number of coefficients", ylab = "Error between estimated and true coefficients")
The coefficient error takes its smallest values at a different model size than the test MSE from (d), which is minimized at 16 predictors (partly because only the selected coefficients enter the sum): a model whose retained coefficient estimates are close to the true values is not necessarily the model with the lowest test MSE.
We will now try to predict percapita crime rate in the Boston dataset.
Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.
Best Subset Selection
data(Boston)
# I am comparing models of different sizes and need to choose the "best" one according to a specific criterion. I'll use cross-validation, so that the selected model can be compared directly with the lasso/ridge results without additional calculation.
set.seed(1)
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
# For each fold
for (i in 1:k) {
# Fit the model with each subset of predictors on the training part of the fold
# first iteration k=1, row labeled with k=1 form the test set. The others the training set.
best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
# For each subset
for (j in 1:p) {
# Predict on the hold out part of the fold for that subset
pred = predict(best.fit, Boston[folds == i, ], id = j)
# Get the mean squared error for the model trained on the fold with the subset
cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
}
}
rmse.cv = apply(cv.errors, 2, mean) %>% sqrt()
min(rmse.cv)
## [1] 6.543281
which.min(rmse.cv)
## [1] 9
plot(rmse.cv, pch = 9, type = "b")
# coefficients of the best subset: re-run regsubsets() on the full data set and extract the coefficients of the model size identified by cross-validation (9 variables).
# Note that the variables selected during cross-validation (regsubsets applied to the training folds) might not be the same as the ones selected by applying regsubsets() to the full data set; even if the selected variables coincide, the coefficient values will very likely differ.
regsubsets(crim ~ ., data = Boston, nvmax = p) %>% coefficients(id=which.min(rmse.cv))
## (Intercept) zn indus nox dis
## 19.124636156 0.042788127 -0.099385948 -10.466490364 -1.002597606
## rad ptratio black lstat medv
## 0.539503547 -0.270835584 -0.008003761 0.117805932 -0.180593877
Lasso Regression
X = model.matrix(crim~., Boston)[,-1] # creates design matrix (n x p). qualitative variable are turned into dummy variables.
lasso.cv = cv.glmnet (X, Boston$crim, alpha = 1, type.measure="mse")
#plot(lasso.cv)
bestlam.lasso = lasso.cv$lambda.min #value of lambda that gives minimum cvm.
bestlam.lasso
## [1] 0.2071268
#lasso.cv$cvm #The mean cross-validated error - a vector of length lasso.cv$lambda
min(lasso.cv$cvm) %>% sqrt()
## [1] 6.696065
coef(lasso.cv)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.176491
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.150484
## tax .
## ptratio .
## black .
## lstat .
## medv .
Ridge Regression
ridge.cv = cv.glmnet (X, Boston$crim, alpha = 0, type.measure="mse")
#plot(ridge.cv)
bestlam.ridge = ridge.cv$lambda.min #value of lambda that gives minimum cvm.
bestlam.ridge
## [1] 0.5374992
#ridge.cv$cvm #The mean cross-validated error - a vector of length ridge.cv$lambda
min(ridge.cv$cvm) %>% sqrt()
## [1] 6.582841
coef(ridge.cv)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.523899548
## zn -0.002949852
## indus 0.029276741
## chas -0.166526006
## nox 1.874769661
## rm -0.142852604
## age 0.006207995
## dis -0.094547258
## rad 0.045932737
## tax 0.002086668
## ptratio 0.071258052
## black -0.002605281
## lstat 0.035745604
## medv -0.023480540
Principal Component Regression
pcr.fit=pcr(crim~., data=Boston ,scale=TRUE , validation ="CV")
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.175 7.180 6.724 6.731 6.727 6.727
## adjCV 8.61 7.174 7.179 6.721 6.725 6.724 6.724
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.722 6.614 6.618 6.607 6.598 6.553 6.488
## adjCV 6.718 6.609 6.613 6.602 6.592 6.546 6.481
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
The 13-component PCR fit has the lowest CV/adjCV RMSEP; since 13 equals the number of predictors, this is equivalent to ordinary least squares, so PCR offers no real dimension-reduction benefit here.
Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross validation, or some other reasonable alternative, as opposed to using training error.
The model found with best subset selection seems preferable: it yields the lowest cross-validated RMSE and selects a subset of 9 predictors, whereas ridge performs no variable selection at all and the lasso keeps only 1 variable (rad) at the lambda that coef() reports by default.
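To back up that claim, the cross-validated RMSEs of the three approaches can be collected side by side (a small sketch re-using the objects computed above):
c(best.subset = min(rmse.cv), lasso = sqrt(min(lasso.cv$cvm)), ridge = sqrt(min(ridge.cv$cvm))) # cross-validated RMSE of each approach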
Since p is not large, best subset selection is not computationally expensive either:
system.time(regsubsets(crim ~ ., data = Boston, nvmax = p)) # wrap the call in system.time() so the fit itself is timed
## user system elapsed
## 0 0 0
Does your chosen model involve all of the features in the data set? Why or why not?
The model does not involve all of the features: best subset selection keeps 9 of the 13 predictors, since the 9-variable model yields the lowest cross-validated RMSE.
In this exercise, you will further analyze the Wage data set considered throughout this chapter.
Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
set.seed(1)
data(Wage)
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
# cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models; if K is not specified it defaults to n (LOOCV). Here K = 10 is used.
cv.pe_poly<-rep(NA,5) # try polynomials up to degree 5
for(i in seq_along(cv.pe_poly)){
poly.fit<-glm(wage ~ poly(age,i), data = Wage) #fit the model with poly degree i
cv.pe_poly[i]<-cv.glm(Wage, poly.fit, K=10)$delta[1]
#save the prediction error estimate in vector cv.pe
}
plot(seq_along(cv.pe_poly), cv.pe_poly, xlab="Degree", ylab="CV error", type = "l")
points(which.min(cv.pe_poly),min(cv.pe_poly), col="red", pch=19)
A degree-5 polynomial yields the minimum estimated cross-validation error. However, the plot shows that the curve flattens out after degree 3, so we might prefer the cubic polynomial as a better trade-off between accuracy and simplicity.
pfit.1 = lm(wage~poly(age, 1), data=Wage)
pfit.2 = lm(wage~poly(age, 2), data=Wage)
pfit.3 = lm(wage~poly(age, 3), data=Wage)
pfit.4 = lm(wage~poly(age, 4), data=Wage)
pfit.5 = lm(wage~poly(age, 5), data=Wage)
anova(pfit.1,pfit.2,pfit.3,pfit.4,pfit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA broadly confirms the cross-validation results: the p-value for the degree-4 term is borderline (about 0.05) and the residual sum of squares barely decreases from degree 3 to degree 4, so a cubic fit is adequate.
plot(wage~age, data=Wage, col="lightblue")
age.grid = seq_range(Wage$age, by=1)
lm.fit3 = lm(wage~poly(age, 3), data=Wage)
lm.pred3 = predict(lm.fit3, data.frame(age=age.grid))
lines(age.grid, lm.pred3, col="blue", lwd=2)
legend("topright", legend="3rd poly degree",col="blue", lty=1, cex=0.8)
Fit a step function to predict wage using age, and perform cross validation to choose the optimal number of cuts. Make a plot of the fit obtained.
cv.pe_step<-rep(NA,10) # try from 2 up to 10 cuts
for(i in 2:length(cv.pe_step)){
Wage$ageCut <- cut(Wage$age,i) # add a factor column to the data frame: for each observation it gives the corresponding "bucket".
step.fit<-glm(wage ~ ageCut, data = Wage) # fit the model with i cuts
cv.pe_step[i]<-cv.glm(Wage, step.fit, K=10)$delta[1] # store at index i so the index equals the number of cuts (avoids an off-by-one when which.min() is re-used below)
}
plot(seq_along(cv.pe_step), cv.pe_step, xlab="Number of cuts", ylab="CV error", type = "l", xlim=c(2,length(cv.pe_step)))
points(which.min(cv.pe_step),min(cv.pe_step, na.rm = TRUE), col="red", pch=19)
plot(wage~age, data=Wage, col="lightblue")
age.grid<-seq_range(Wage$age,by=1)
age.grid.cut = cut(age.grid,which.min(cv.pe_step))
#Wage$ageCut <- cut(Wage$age, which.min(cv.pe_step))
Wage$ageCut <- cut(Wage$age, which.min(cv.pe_step))
lm.fit_step = glm(wage ~ ageCut, data=Wage)
lm.pred_step = predict(lm.fit_step, data.frame(ageCut=age.grid.cut)) # predict() needs a data frame whose column names match the predictors used in glm(); ageCut must appear in both calls.
lines(age.grid, lm.pred_step, col="red", lwd=2)
legend("topright", legend=paste(which.min(cv.pe_step), "cuts"),col="red", lty=1, cex=0.8)
Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.
data(Auto)
#str(Auto)
#summary(Auto) to check data structure
Auto %>% select_if(is.numeric) %>% plot() # exclude variable name from the plot
Variable mpg appears negatively related to cylinders, displacement, horsepower and weight, and the relationships between mpg and displacement, horsepower and weight (taken separately) do not look linear. mpg appears positively related to year: as year increases, the expected mpg increases as well, and that relationship looks roughly linear.
POLYNOMIAL
poly.fit = list()
poly.deg<-rep(NA,5) # I am gonna try polynomial up to 5th degree.
for(i in seq_along(poly.deg)){
poly.fit[[i]]<-lm(mpg ~ poly(horsepower,i), data = Auto) # fit the model with poly degree i; [[ ]] is needed to store the object itself, since [ ] would return a sub-list
# anova() below needs lm objects (or similar) as inputs
}
#poly.fit[[1]] %>% class()
anova(poly.fit[[1]],poly.fit[[2]],poly.fit[[3]],poly.fit[[4]],poly.fit[[5]])
## Analysis of Variance Table
##
## Model 1: mpg ~ poly(horsepower, i)
## Model 2: mpg ~ poly(horsepower, i)
## Model 3: mpg ~ poly(horsepower, i)
## Model 4: mpg ~ poly(horsepower, i)
## Model 5: mpg ~ poly(horsepower, i)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 390 9385.9
## 2 389 7442.0 1 1943.89 103.8767 < 2.2e-16 ***
## 3 388 7426.4 1 15.59 0.8333 0.361897
## 4 387 7399.5 1 26.91 1.4382 0.231169
## 5 386 7223.4 1 176.15 9.4131 0.002306 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mpg~horsepower, data=Auto, col="lightblue")
hp.grid = seq_range(Auto$horsepower, by=1)
poly.anova=2
lm.fit = lm(mpg~poly(horsepower, poly.anova), data=Auto)
lm.pred = predict(lm.fit, data.frame(horsepower=hp.grid))
lines(hp.grid, lm.pred, col="blue", lwd=2)
legend("topright", legend=paste("poly degree",poly.anova),col="blue", lty=1, cex=0.8)
A quadratic polynomial seems to provide a good fit, and the ANOVA confirms this: the p-values for the 3rd- and 4th-degree terms are higher than the usual significance levels (the 5th-degree term has a small p-value, but the cross-validation results below show it brings only a marginal improvement).
set.seed(1)
cv.poly<-rep(NA,5) # I am gonna try polynomial up to 5th degree.
for(i in seq_along(cv.poly)){
fitp<-glm(mpg ~ poly(horsepower,i), data = Auto)
cv.poly[i]<-cv.glm(Auto, fitp, K=10)$delta[1]
#save the prediction error estimate in vector cv.poly
}
cv.poly
## [1] 24.21538 19.28327 19.12998 19.29201 19.09471
which.min(cv.poly)
## [1] 5
A degree-5 polynomial gives the minimum estimated prediction error under cross-validation. However, the values from degree 2 onwards are very similar, so a quadratic polynomial is a good compromise between accuracy and simplicity.
STEP FUNCTION
set.seed(1)
cv.step<-rep(NA,10) # try from 2 up to 10 cuts
for(i in 2:length(cv.step)){
Auto$hpCut <- cut(Auto$horsepower,i) # add a factor column to the data frame. for each observation it tells me the corresponding "bucket".
fits<-glm(mpg ~ hpCut, data = Auto) #fit the model with i cuts/knots
#print(i)
cv.step[i]<-cv.glm(Auto, fits, K=10)$delta[1]
#save the prediction error estimate in vector cv.step
}
plot(seq_along(cv.step), cv.step, xlab="Number of cuts", ylab="CV error", type = "l", xlim=c(2,length(cv.step)))
points(which.min(cv.step),min(cv.step, na.rm = T), col="red", pch=19)
SPLINES
set.seed(1)
cv.sp = rep(NA, 10)
for (i in 2:length(cv.sp)) {
fitsp = glm(mpg ~ ns(horsepower, df = i), data = Auto)
cv.sp[i] = cv.glm(Auto, fitsp, K = 10)$delta[1]
}
which.min(cv.sp)
## [1] 9
fitsp<-glm(mpg ~ ns(horsepower, df = which.min(cv.sp)), data = Auto)
#Plotting the Regression Line to the scatterplot
plot(mpg~horsepower,data=Auto,col="lightblue")
pred<-predict(fitsp ,newdata =data.frame(horsepower=hp.grid),se=T)
lines(hp.grid , pred$fit ,col ="red",lwd =2)
GAM
fitgam1 = lm(mpg ~ ns(displacement, 1) + ns(horsepower, 1), data = Auto)
#summary(fitgam)
fitgam2 = lm(mpg ~ ns(displacement, 2) + ns(horsepower, 2), data = Auto)
anova(fitgam1,fitgam2)
## Analysis of Variance Table
##
## Model 1: mpg ~ ns(displacement, 1) + ns(horsepower, 1)
## Model 2: mpg ~ ns(displacement, 2) + ns(horsepower, 2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 389 7995.3
## 2 387 6091.0 2 1904.4 60.499 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
All the models we tried indicate that at least a quadratic term plays a significant role; the scatterplot matrix already suggested that the relationships between some of the variables are not linear.
This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.
Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.
data(Boston)
range(Boston$dis)
## [1] 1.1296 12.1265
dis.grid<-seq_range(Boston$dis,n=100) #package modelr
fit.p3<-lm(nox~poly(dis,3), data=Boston)
summary(fit.p3)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
pred.p3<-predict(fit.p3, newdata=data.frame(dis=dis.grid))
plot(nox~dis, data=Boston, col="lightblue")
lines(dis.grid, pred.p3, col="red", lwd = 2)
Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.
rss.poly<-rep(NA,10) # try polynomials up to degree 10
poly.fit = list()
pred.pp = list()
for(i in seq_along(rss.poly)){
poly.fit[[i]]<-lm(nox ~ poly(dis,i), data = Boston)
#rss.poly[i]<-deviance(poly.fit)
rss.poly[i]<-sum(poly.fit[[i]]$residuals^2)
pred.pp[[i]]<-predict(poly.fit[[i]], newdata=data.frame(dis=dis.grid))
}
rss.poly
## [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
## [9] 1.833331 1.832171
#ggplot(Boston, aes(x=dis,y=nox)) + geom_point() + stat_function(data = data.frame(), fun = )
cols <- rainbow(10)
plot(nox~dis, data=Boston, col="lightblue")
for(i in seq_along(rss.poly)){
lines(dis.grid, pred.pp[[i]], col= cols[i], lwd = 2)
}
legend("topright", legend=paste("degree",1:10), lwd=2, col=cols, cex = 0.5) # add the legend once, outside the loop
Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.
set.seed(1)
cv.poly<-rep(NA,10)
for(i in seq_along(cv.poly)){
fitp<-glm(nox ~ poly(dis,i), data = Boston)
cv.poly[i]<-cv.glm(Boston, fitp, K=10)$delta[1]
}
cv.poly
## [1] 0.005558263 0.004085706 0.003876521 0.003863342 0.004237452 0.005686862
## [7] 0.010278897 0.006810868 0.033308607 0.004075599
which.min(cv.poly)
## [1] 4
According to 10-fold cross-validation, a degree-4 polynomial yields the lowest estimated prediction error, with degree 3 giving a very similar value. Most higher degrees give a noticeably larger estimated prediction error.
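A quick plot of the estimated prediction errors (a sketch mirroring the plotting style used elsewhere in this document) makes the comparison easier to see:
plot(seq_along(cv.poly), cv.poly, type = "b", xlab = "Polynomial degree", ylab = "CV error")
points(which.min(cv.poly), min(cv.poly), col = "red", pch = 19) # highlight the CV-selected degree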
Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.
fitsp4 = lm(nox ~ bs(dis, df = 4), data = Boston)
summary(fitsp4)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16 ***
## bs(dis, df = 4)1 -0.05810 0.02186 -2.658 0.00812 **
## bs(dis, df = 4)2 -0.46356 0.02366 -19.596 < 2e-16 ***
## bs(dis, df = 4)3 -0.19979 0.04311 -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881 0.04551 -8.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
# The bs() function generates the entire matrix of basis functions for splines with the specified set of knots.
# df option to produce a spline with knots at uniform quantiles of the data.
# fitting a cubic spline with K knots uses K+4 degrees of freedom.
attr(bs(Boston$dis, df = 4), "knots")
## 50%
## 3.20745
plot(nox~dis,data=Boston,col="lightblue")
pred<-predict(fitsp4 ,newdata =data.frame(dis=dis.grid))
lines(dis.grid , pred ,col ="red",lwd =2)
The knots were chosen by setting the degrees of freedom: when knots are not given explicitly, bs() places them at uniform quantiles of the data, and by default it fits cubic splines. A cubic spline with K knots uses K + 4 degrees of freedom when the intercept is counted; since the df argument of bs() excludes the intercept, df = K + 3, which is why df = 4 above yields a single knot (at the 50% quantile).
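This can be checked directly: with one additional degree of freedom, bs() should place one additional interior knot (a sketch; the knots are expected at roughly the 33% and 67% quantiles of dis):
attr(bs(Boston$dis, df = 5), "knots") # two interior knots for df = 5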
Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.
deg<-rep(NA,20)
rss.sp<-rep(NA,length(deg))
#fitsp<-list()
degree<-3 # standard value in bs
for(i in 3:length(deg)){
fitsp = lm(nox ~ bs(dis, df = i), data = Boston)
#fitsp[[i]] = lm(nox ~ bs(dis, df = i, degree = degree), data = Boston)
rss.sp[i] = sum(fitsp$residuals^2)
}
which.min(rss.sp)
## [1] 19
min(rss.sp, na.rm = T)
## [1] 1.774487
plot(seq_along(rss.sp), rss.sp, xlab="Degree of freedom", ylab="RSS", type = "l", xlim = c(degree,length(rss.sp)))
points(which.min(rss.sp),min(rss.sp, na.rm = T), col="red", pch=19)
plot(nox~dis, data = Boston, col="lightblue")
fitsp = lm(nox ~ bs(dis, df = which.min(rss.sp)), data = Boston)
predsp<-predict(fitsp, newdata = data.frame(dis=dis.grid))
lines(dis.grid,predsp, col ="red",lwd =2)
Nineteen degrees of freedom yields the lowest residual sum of squares. That said, the RSS plot shows that the curve flattens out after roughly df = 14.
Perform cross-validation or another approach in order to select the best degrees of freedom for a regression spline on this data. Describe your results.
set.seed(1)
cv.df<-rep(NA,20)
for(i in 3:length(cv.df)){
fitsp = glm(nox ~ bs(dis, df = i), data = Boston)
cv.df[i]<-cv.glm(Boston, fitsp, K=10)$delta[1]
}
cv.df
## [1] NA NA 0.003897402 0.003890178 0.003724890 0.003697010
## [7] 0.003740823 0.003709286 0.003727194 0.003715941 0.003708342 0.003701810
## [13] 0.003723050 0.003740607 0.003762967 0.003860353 0.003799018 0.003721269
## [19] 0.003861497 0.003830194
which.min(cv.df)
## [1] 6
plot(seq_along(cv.df), cv.df, lwd = 2, type = "l", xlab = "df", ylab = "CV error",xlim = c(3,length(cv.df)))
points(which.min(cv.df),min(cv.df,na.rm = T), col="red", pch=19)
According to cross-validation, 6 degrees of freedom yield the lowest estimated prediction error. The CV curve is more irregular than the RSS curve, and the estimated prediction error can increase considerably as the degrees of freedom grow.
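For completeness, the spline with the CV-selected degrees of freedom can be plotted against the data (a sketch re-using dis.grid from above; fitsp.cv is just an illustrative name):
fitsp.cv = lm(nox ~ bs(dis, df = which.min(cv.df)), data = Boston)
plot(nox ~ dis, data = Boston, col = "lightblue")
lines(dis.grid, predict(fitsp.cv, newdata = data.frame(dis = dis.grid)), col = "red", lwd = 2) # fitted spline with the CV-chosen df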
This question relates to the College data set.
Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
data(College)
set.seed(1)
train = sample(length(College$Outstate), length(College$Outstate)/2)
test = -train
College.train = College[train, ]
College.test = College[test, ]
reg = regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
reg.summary = summary(reg)
reg.summary$adjr2 %>% which.max()
## [1] 14
reg.summary$cp %>% which.min()
## [1] 14
reg.summary$bic %>% which.min()
## [1] 6
par(mfrow = c(1, 3))
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "adjr2", type = "l")
points(which.max(reg.summary$adjr2), max(reg.summary$adjr2), col="red", pch=19)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
points(which.min(reg.summary$cp), min(reg.summary$cp), col="red", pch=19)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), min(reg.summary$bic), col="red", pch=19)
Adjusted R2 is maximized at 14 variables, Cp is minimized at 14, and BIC is minimized at 6. However, all three criteria are essentially flat beyond 6 variables (and BIC already selects the 6-variable model as the best one), so we will use a model with 6 variables.
Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
# Extract the names of the predictors in the 6-variable forward-selection model.
reg<- regsubsets(Outstate ~ ., data = College, method = "forward")
coef(reg, id = 6) %>% names()
## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
fit.gam<- lm(Outstate ~ Private + ns(Room.Board, df = 3) + ns(PhD, df = 3) + ns(perc.alumni, df = 3) + ns(Expend, df = 3) + ns(Grad.Rate, df = 3), data = College.train)
par(mfrow = c(2, 3))
plot.Gam(fit.gam, se=T, col = "blue")
plot.Gam shows the relationship between each predictor and the response. Each panel displays the fitted function together with pointwise standard error bands.
Some interpretations from the plot.Gam output:
Out-of-state tuition increases approximately linearly with perc.alumni (percentage of alumni who donate), holding the other variables constant.
Out-of-state tuition increases with Room.Board (room and board costs), holding the other variables constant; the rate of increase is not constant across the range.
Out-of-state tuition is higher for private universities, holding the other variables constant.
Similar considerations apply to the other panels.
Evaluate the model obtained on the test set, and explain the results obtained.
pred.test<-predict(fit.gam, newdata=College.test)
err.test<-mean((College.test$Outstate - pred.test)^2)
err.test
## [1] 3460135
mean(fit.gam$residuals^2)
## [1] 3660175
Here the test MSE (about 3.46 million) is actually slightly lower than the training MSE (about 3.66 million). Although the test error would normally be expected to be at least as large as the training error, the two values are close, which suggests the model generalizes well and is not overfitting the training data.
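To put the test MSE on a more interpretable scale, the test-set R2 can be computed from the same quantities (a sketch):
# Sketch: share of the test-set variance in Outstate explained by the model.
tss.test <- mean((College.test$Outstate - mean(College.test$Outstate))^2)
1 - err.test / tss.test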
For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(fit.gam)
##
## Call:
## lm(formula = Outstate ~ Private + ns(Room.Board, df = 3) + ns(PhD,
## df = 3) + ns(perc.alumni, df = 3) + ns(Expend, df = 3) +
## ns(Grad.Rate, df = 3), data = College.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6998.1 -1154.5 -89.8 1280.9 7579.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2365.4 1470.5 1.609 0.108562
## PrivateYes 2497.9 295.7 8.446 6.90e-16 ***
## ns(Room.Board, df = 3)1 3089.2 539.0 5.731 2.07e-08 ***
## ns(Room.Board, df = 3)2 7146.3 1634.6 4.372 1.60e-05 ***
## ns(Room.Board, df = 3)3 3966.8 770.3 5.149 4.25e-07 ***
## ns(PhD, df = 3)1 438.2 605.5 0.724 0.469690
## ns(PhD, df = 3)2 -1512.5 2106.5 -0.718 0.473198
## ns(PhD, df = 3)3 582.6 681.9 0.854 0.393450
## ns(perc.alumni, df = 3)1 1492.8 537.9 2.775 0.005792 **
## ns(perc.alumni, df = 3)2 2658.1 1300.0 2.045 0.041595 *
## ns(perc.alumni, df = 3)3 2496.4 910.3 2.742 0.006396 **
## ns(Expend, df = 3)1 10793.9 1103.8 9.779 < 2e-16 ***
## ns(Expend, df = 3)2 6577.3 1556.6 4.226 3.00e-05 ***
## ns(Expend, df = 3)3 3718.5 1468.2 2.533 0.011729 *
## ns(Grad.Rate, df = 3)1 2045.5 557.3 3.671 0.000277 ***
## ns(Grad.Rate, df = 3)2 2467.2 1670.8 1.477 0.140612
## ns(Grad.Rate, df = 3)3 1859.7 561.2 3.313 0.001012 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1956 on 371 degrees of freedom
## Multiple R-squared: 0.7968, Adjusted R-squared: 0.7881
## F-statistic: 90.94 on 16 and 371 DF, p-value: < 2.2e-16
fit.lm<-lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate, data = College.train)
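Since a linear term is nested within its natural-spline expansion, the linear fit above can serve as a baseline for a formal check: an F-test via anova() compares the fully linear model against the spline model, and the same idea applied to a single predictor tests non-linearity for that variable alone (a sketch; fit.expend.lin is an illustrative refit with Expend entered linearly):
anova(fit.lm, fit.gam)  # joint test of all non-linear (spline) terms
fit.expend.lin <- lm(Outstate ~ Private + ns(Room.Board, df = 3) + ns(PhD, df = 3) +
  ns(perc.alumni, df = 3) + Expend + ns(Grad.Rate, df = 3), data = College.train)
anova(fit.expend.lin, fit.gam)  # a small p-value points to a non-linear effect of Expend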
Looking at the p-values in the summary table, there is evidence that the relationship between Expend and Outstate is non-linear. Similar conclusions can be drawn from the plot.Gam output.