Chapter 2

Exercise 2.8

college<-read.csv("College.csv")
#view(college)
#fix(college)
rownames(college)=college[,1] #column 1 becomes the row names;
#R will not perform operations on this column, as it is no longer a data column.
college=college[,-1]
#i.
summary(college)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
#ii.
pairs(college[,1:10])

#iii.
plot(college$Private,college$Outstate)

#iv.
Elite=rep("No",nrow(college))
Elite[college$Top10perc>50]="Yes"
Elite=as.factor(Elite)
college=data.frame(college,Elite)
# Delete Top10perc column? use dplyr
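One option with dplyr is sketched below; it is left commented out so that the summary that follows still shows the original columns (assumes dplyr is loaded):

#college <- dplyr::select(college, -Top10perc)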

summary(college)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate      Elite    
##  Min.   : 10.00   No :699  
##  1st Qu.: 53.00   Yes: 78  
##  Median : 65.00            
##  Mean   : 65.46            
##  3rd Qu.: 78.00            
##  Max.   :118.00
str(college)
## 'data.frame':    777 obs. of  19 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : int  1660 2186 1428 417 193 587 353 1899 1038 582 ...
##  $ Accept     : int  1232 1924 1097 349 146 479 340 1720 839 498 ...
##  $ Enroll     : int  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : int  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : int  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: int  2885 2683 1036 510 249 678 416 1594 973 799 ...
##  $ P.Undergrad: int  537 1227 99 63 869 41 230 32 306 78 ...
##  $ Outstate   : int  7440 12280 11250 12960 7560 13500 13290 13868 15595 10468 ...
##  $ Room.Board : int  3300 6450 3750 5450 4120 3335 5720 4826 4400 3380 ...
##  $ Books      : int  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : int  2200 1500 1165 875 1500 675 1500 850 500 1800 ...
##  $ PhD        : int  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : int  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: int  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : int  7041 10527 8735 19016 10922 9727 8861 11487 11644 8991 ...
##  $ Grad.Rate  : int  60 56 54 59 15 55 63 73 80 52 ...
##  $ Elite      : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
plot(college$Elite,college$Outstate)

# v.
par(mfrow=c(2,2))
hist(college$Accept)
hist(college$Enroll)
hist(college$Personal)
hist(college$Expend)

# vi.
#...Continue exploring the data, and provide a brief summary of what you discover.

Exercise 2.9

auto<-read.csv("Auto.csv")
#fix(auto)
summary(auto)
##       mpg          cylinders      displacement     horsepower      weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   150    : 22   Min.   :1613  
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.0   90     : 20   1st Qu.:2223  
##  Median :23.00   Median :4.000   Median :146.0   88     : 19   Median :2800  
##  Mean   :23.52   Mean   :5.458   Mean   :193.5   110    : 18   Mean   :2970  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0   100    : 17   3rd Qu.:3609  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   75     : 14   Max.   :5140  
##                                                  (Other):287                 
##   acceleration        year           origin                  name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   ford pinto    :  6  
##  1st Qu.:13.80   1st Qu.:73.00   1st Qu.:1.000   amc matador   :  5  
##  Median :15.50   Median :76.00   Median :1.000   ford maverick :  5  
##  Mean   :15.56   Mean   :75.99   Mean   :1.574   toyota corolla:  5  
##  3rd Qu.:17.10   3rd Qu.:79.00   3rd Qu.:2.000   amc gremlin   :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   amc hornet    :  4  
##                                                  (Other)       :368
str(auto)
## 'data.frame':    397 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
sum(complete.cases(auto)) # All the rows are 'complete cases'
## [1] 397
#However I can see that "?" is a value in the data set for column "horsepower". 

# It does not make much sense to have 'horsepower' as a factor.
auto$horsepower<-as.numeric(as.character(auto$horsepower))
## Warning: NAs introduced by coercion
#as.numeric(levels(f))[f] more efficient as the number of levels<number of obs.
sum(is.na(auto$horsepower)) #the same can now be seen with summary() function.
## [1] 5
#auto[rowSums(is.na(auto))>0,] would work too.
auto<-na.omit(auto)

Which of the predictors are quantitative, and which are qualitative?

str(auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int  33 127 331 337 355
##   ..- attr(*, "names")= chr  "33" "127" "331" "337" ...
unique(auto$origin)
## [1] 1 3 2
unique(auto$cylinders)
## [1] 8 4 6 3 5
auto$cylinders<-as.factor(auto$cylinders)
auto$origin<-factor(auto$origin, levels = c(1, 2, 3), labels = c('American', 'European', 'Japanese'))

Using the function ‘str()’ we can see that ‘name’ is the only qualitative variable, but looking at the description of the data set (?Auto) we can see that origin and cylinders should also be treated as factors, since they take on only a fixed set of discrete values.

What is the range of each quantitative predictor? You can answer this using the range() function.

sum_num<-auto %>% select_if(is.numeric) %>% summary()
sum_num[c(1,6),] #select only Min and Max
##       mpg         displacement     horsepower        weight      acceleration  
##  Min.   : 9.00   Min.   : 68.0   Min.   : 46.0   Min.   :1613   Min.   : 8.00  
##  Max.   :46.60   Max.   :455.0   Max.   :230.0   Max.   :5140   Max.   :24.80  
##       year      
##  Min.   :70.00  
##  Max.   :82.00
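As the question suggests, range() can also be applied directly to each quantitative column; an equivalent minimal sketch (assuming dplyr is loaded, output omitted):

auto %>% select_if(is.numeric) %>% sapply(range) # a 2-row matrix: min and max of each column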

What is the mean and standard deviation of each quantitative predictor?

sum_num[4,]
##               mpg      displacement        horsepower            weight 
## "Mean   :23.45  " "Mean   :194.4  " "Mean   :104.5  "  "Mean   :2978  " 
##      acceleration              year 
## "Mean   :15.54  " "Mean   :75.98  "
#or
auto %>% select_if(is.numeric) %>% sapply(mean)
##          mpg displacement   horsepower       weight acceleration         year 
##     23.44592    194.41199    104.46939   2977.58418     15.54133     75.97959
auto %>% select_if(is.numeric) %>% sapply(sd) #standard deviation
##          mpg displacement   horsepower       weight acceleration         year 
##     7.805007   104.644004    38.491160   849.402560     2.758864     3.683737

Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

auto_minus<-auto[-c(10:85),]

sum_num<-auto_minus %>% select_if(is.numeric) %>% summary()
sum_num[c(1,6),]
##       mpg         displacement     horsepower        weight      acceleration  
##  Min.   :11.00   Min.   : 68.0   Min.   : 46.0   Min.   :1649   Min.   : 8.50  
##  Max.   :46.60   Max.   :455.0   Max.   :230.0   Max.   :4997   Max.   :24.80  
##       year      
##  Min.   :70.00  
##  Max.   :82.00
auto_minus %>% select_if(is.numeric) %>% sapply(mean)
##          mpg displacement   horsepower       weight acceleration         year 
##     24.40443    187.24051    100.72152   2935.97152     15.72690     77.14557
auto_minus %>% select_if(is.numeric) %>% sapply(sd)
##          mpg displacement   horsepower       weight acceleration         year 
##     7.867283    99.678367    35.708853   811.300208     2.693721     3.106217

Using the full data set, investigate the predictors graphically, using scatter plots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.

auto %>% dplyr::select(-name) %>% pairs()

ggcorr(auto,label=T)
## Warning in ggcorr(auto, label = T): data in column(s) 'cylinders', 'origin',
## 'name' are not numeric and were ignored

hw<-ggplot(auto,aes(x=horsepower,y=weight, col=cylinders)) +
  geom_point()

ca<-ggplot(auto,aes(x=cylinders,y=acceleration, col=origin)) +
  geom_boxplot() + labs(x="cylinders",colour = "origin")

wc<-ggplot(auto,aes(x=(year+1900),y=weight,col=cylinders)) +
  geom_point() +
  scale_x_continuous(breaks= scales::pretty_breaks()) + 
  labs(x="year")

mw<-ggplot(auto,aes(x=weight,y=mpg, col=origin)) +
  geom_point()

grid.arrange(hw,ca,wc,mw)

The correlation matrix and the scatterplot matrix are the first tools used to study the relationships among the predictors. Horsepower, displacement and cylinders are highly correlated with each other and negatively correlated with mpg. We then use scatterplots and boxplots to highlight the relationship between three variables at a time (x axis, y axis and colour).

Weight-Horsepower, color=Cylinders

From the first plot, we can see that as horsepower increases, weight increases as well, although the relationship is not linear. In addition, more powerful cars tend to have a higher number of cylinders.

Cylinders-Acceleration, color=Origin

The second plot tells us that cars with 3, 5 and 8 cylinders come almost exclusively from a single region (Japan, Europe and America respectively). The 5-cylinder cars seem to score higher than the others in acceleration.

Year-Weight, color=Cylinders

Cars with fewer cylinders tend to weigh less. Car weight seems to decrease over the years, as does the number of cars with 8 cylinders.

Weight-Mpg, color=Origin

Not surprisingly, mpg decreases as weight increases (though not linearly). American cars tend to be heavier than Japanese and European ones.

Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.

From the scatterplot matrix and the correlation matrix we see that mpg is strongly correlated with cylinders, displacement, horsepower and weight. The relationship is negative: mpg decreases as these variables increase. That being said, these variables are also strongly correlated with each other, and we need to take this into consideration when building a model to predict mpg.
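One way to quantify this collinearity is through variance inflation factors; a small sketch, assuming the car package is installed (output not reproduced here):

library(car)
vif(lm(mpg ~ displacement + horsepower + weight, data = auto)) # values well above 5-10 would indicate strong collinearity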

Chapter 3

Exercise 3.9

  1. Produce a scatterplot matrix which includes all of the variables in the data set.
auto<-read.csv("Auto.csv")
dim(auto)
## [1] 397   9
pairs(auto)

#View(auto)

Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.

#auto<-na.omit(auto)
data(Auto)
Auto %>% dplyr::select(-name) %>% ggcorr(label=T)

  1. Use the lm() function to perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Use the summary() function to print the results. Comment on the output.
auto_num<-select_if(auto,is.numeric)
auto_fit<-lm(mpg~.,auto_num)
summary(auto_fit)
## 
## Call:
## lm(formula = mpg ~ ., data = auto_num)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5573 -2.1745 -0.0456  1.8454 12.9946 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.014e+01  4.145e+00  -4.858 1.72e-06 ***
## cylinders    -4.198e-01  3.203e-01  -1.311   0.1908    
## displacement  1.742e-02  7.189e-03   2.423   0.0158 *  
## weight       -6.928e-03  5.781e-04 -11.983  < 2e-16 ***
## acceleration  1.591e-01  7.741e-02   2.055   0.0405 *  
## year          7.703e-01  4.934e-02  15.613  < 2e-16 ***
## origin        1.356e+00  2.691e-01   5.040 7.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.333 on 390 degrees of freedom
## Multiple R-squared:  0.8214, Adjusted R-squared:  0.8186 
## F-statistic: 298.9 on 6 and 390 DF,  p-value: < 2.2e-16

Variables like cylinders and weight have negative coefficients. This means that increasing one of them by 1 unit, holding the others fixed, corresponds to a decrease of the response mpg equal to the corresponding coefficient: e.g. if we add 1 cylinder to a car (increase the variable cylinders by 1 unit) and keep all the other variables constant, mpg is expected to drop by about 0.42. Another thing we can note is that the p-value for cylinders is quite high, which might indicate that there is no significant relationship between cylinders and mpg once the other predictors are in the model. The F-statistic is large and its p-value is essentially zero, therefore we can reject the null hypothesis that all the coefficients are equal to 0. R-squared and Adjusted R-squared are fairly high, which suggests a reasonably clear linear relationship between Y and the X's. (Note that horsepower does not appear here: it was read from the csv as a factor and was therefore dropped by select_if(is.numeric).)

Use the plot() function to produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?

par(mfrow=c(2,2))
plot(auto_fit)

The relationship between Y and the predictors might not be linear, as the residuals plotted against the fitted values seem to scatter around a curved line. In addition, the variance of the error terms does not appear to be constant.

According to Cook's distance there are no influential outliers.
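To complement the leverage plot, the hat values can also be inspected numerically; a minimal sketch (output omitted):

hv <- hatvalues(auto_fit)
head(sort(hv, decreasing = TRUE)) # observations with the largest leverage
mean(hv)                          # average leverage (p+1)/n, for comparison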

Use the * and : symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?

auto_fit2<-lm(mpg~ (. -name)^2, data=Auto) #This creates all combinations of two-way interactions between the predictors (name excluded)
summary(auto_fit2)$coefficients %>% 
  data.frame() %>%
  subset(Pr...t..<=0.05) %>% 
  dplyr::select(everything(),p_value=Pr...t..)
##                          Estimate  Std..Error   t.value     p_value
## displacement         -0.478538689 0.189353429 -2.527225 0.011920695
## acceleration         -5.859173212 2.173621188 -2.695582 0.007353578
## origin              -20.895570401 7.097090511 -2.944245 0.003445892
## displacement:year     0.005933802 0.002390716  2.482019 0.013515633
## acceleration:year     0.055621508 0.025581747  2.174265 0.030330641
## acceleration:origin   0.458316099 0.156659694  2.925552 0.003654670
#subset keeps row names

The interactions displacement:year, acceleration:year and acceleration:origin appear to be statistically significant according to the p-value criterion.

Try a few different transformations of the variables, such as \(\log(x), \sqrt{x}, x^2\), Comment on your findings.

lm.fit<-lm(mpg ~ displacement + horsepower + weight, data=Auto)
lm.fit.log<-lm(mpg ~ log(displacement) + log(horsepower) + log(weight), data=Auto)
lm.fit.sqrt<-lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight), data=Auto)
lm.fit.p2<-lm(mpg ~ poly(displacement,2) + poly(horsepower,2) + poly(weight,2), data=Auto)

summary(lm.fit)$adj.r.squared
## [1] 0.7046897
summary(lm.fit.log)$adj.r.squared
## [1] 0.7401928
summary(lm.fit.sqrt)$adj.r.squared
## [1] 0.7254835
summary(lm.fit.p2)$adj.r.squared
## [1] 0.7494586
par(mfrow=c(2,2))
plot(lm.fit)

par(mfrow=c(2,2))
plot(lm.fit.log)

par(mfrow=c(2,2))
plot(lm.fit.sqrt)

par(mfrow=c(2,2))
plot(lm.fit.p2)

For simplicity's sake we consider 3 variables (displacement, horsepower and weight) and apply the transformations to them.

The adjusted R^2 values for the different models are all quite similar. The plain linear model performs a little worse than the models with transformed predictors.

The residual plots are all quite similar: all the models suffer to some extent from non-constant variance of the residuals, and the error term might not be normally distributed (right-skewed). These two facts suggest that a transformation of the response might lead to better results. We can try log(y), which is a 'variance stabilizing' transformation.

lm.logy<-lm(log(mpg) ~ displacement + horsepower + weight, data=Auto)
summary(lm.logy)
## 
## Call:
## lm(formula = log(mpg) ~ displacement + horsepower + weight, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47436 -0.10001 -0.00689  0.09827  0.52821 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.063e+00  4.420e-02  91.922  < 2e-16 ***
## displacement -3.506e-04  2.433e-04  -1.441     0.15    
## horsepower   -2.215e-03  4.736e-04  -4.677 4.02e-06 ***
## weight       -2.235e-04  2.633e-05  -8.487 4.54e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1568 on 388 degrees of freedom
## Multiple R-squared:  0.7891, Adjusted R-squared:  0.7874 
## F-statistic: 483.8 on 3 and 388 DF,  p-value: < 2.2e-16

Exercise 3.10

This question should be answered using the Carseats data set.

  1. Fit a multiple regression model to predict Sales using Price, Urban, and US.
data("Carseats")
str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
lm.fit<-lm(Sales~Price + Urban + US, data=Carseats)
summary(lm.fit)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!

PRICE: the low p-value suggests that there is a significant relationship between Price and Sales, so I reject the null hypothesis beta=0. The estimated coefficient is negative, meaning that Sales decrease as Price increases: every unit increase in Price is associated with a decrease in Sales of about 0.054, holding the other predictors constant.

URBANYES: the p-value is very high. We fail to reject the null hypothesis (beta=0) and conclude that there is no evidence of a relationship between Urban and Sales.

USYES: the low p-value suggests that there is a significant relationship between the variable US and Sales. I reject the null hypothesis beta=0 and conclude that, holding Price and Urban fixed, a store in the US has Sales about 1.2 higher than one outside the US.

Write out the model in equation form, being careful to handle the qualitative variables properly.

\(Sales = 13.043 + 1.2X_{us} - 0.022X_{urban} - 0.054X_{price}\)

with \(\cases{X_{us} =1 &\text{in US} \\X_{us}=0 &\text{outside US}}\) and \(\cases{X_{urban} =1 &\text{urban} \\X_{urban}=0 &\text{non-urban}}\)

For which of the predictors can you reject the null hypothesis \(H_0 : \beta_{j} = 0\) ?

USYes and Price. The related p-values are lower than any standard level of significance.

On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.

lm.fit2<-lm(Sales~Price + US, data=Carseats)
summary(lm.fit2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

How well do the models in (a) and (e) fit the data?

Neither model fits the data very well; the adjusted R^2 is quite low (about 0.23) in both cases.
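The two fits can also be compared directly on adjusted R^2:

summary(lm.fit)$adj.r.squared  # model with Price, Urban and US: about 0.234
summary(lm.fit2)$adj.r.squared # reduced model with Price and US: about 0.235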

Using the model from (e), obtain 95% confidence intervals for the coefficient(s).

confint(lm.fit2, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

Is there evidence of outliers or high leverage observations in the model from (e)?

par(mfrow=c(2,2))
plot(lm.fit2)

Observations 51, 69 and 377 might be outliers, as the corresponding residuals are large. However, there is no influential case according to Cook's distance, as no observation falls beyond the dotted line.

PS: with par(mfrow=c(2,2)) the appearance of the plots changes and it seems that there are observations beyond the dotted line for Cook's distance; I trust the plot produced without par(mfrow=c(2,2)).
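As a numeric cross-check of the same diagnostics (a sketch, output omitted):

head(sort(abs(rstandard(lm.fit2)), decreasing = TRUE)) # largest standardized residuals
head(sort(cooks.distance(lm.fit2), decreasing = TRUE)) # largest Cook's distances
head(sort(hatvalues(lm.fit2), decreasing = TRUE))      # largest leverage values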

Chapter 4

Exercise 4.10

This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010

Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?

data("Weekly")
head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down
pairs(Weekly)

head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down
Weekly %>% select_if(is.numeric) %>% ggcorr()

According to the scatterplot matrix there seems to be a positive relationship between ‘Volume’ and ‘Year’: as the years pass, the volume increases. At this point we cannot spot any other pattern. The correlation matrix confirms that only Volume and Year show a substantial correlation.

ggplot(Weekly,mapping=aes(x=Year,y=Volume)) + geom_point()

The plot confirms that the average number of shares traded increases markedly over the years covered by the data set.

Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?

levels(Weekly$Direction)
## [1] "Down" "Up"
W.fit<-glm(Direction~Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, Weekly, family = binomial)

summary(W.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Only the variable ‘Lag2’ can be considered statistically significant at a significance level alpha = 0.05 (its p-value is below 0.05). All the other p-values are higher than the usual significance levels, which means that for every predictor except ‘Lag2’ we fail to reject the null hypothesis (beta=0).

Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

W.fit.pred<-predict(W.fit,type="response")
contrasts(Weekly$Direction) #to check how R has coded "up" and "down".
##      Up
## Down  0
## Up    1
Weekly$fit_pred<-W.fit.pred
Weekly$fit_dir<-ifelse(Weekly$fit_pred>0.5,"Up","Down")#we have set 0.5 as threshold.
conf<-table(Weekly$fit_dir,Weekly$Direction)
conf
##       
##        Down  Up
##   Down   54  48
##   Up    430 557
#fp<-conf["Up","Down"]/colSums(conf)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
table(Weekly$Direction)
## 
## Down   Up 
##  484  605
#sum(diag(conf))/sum(conf)
correct<-mean(Weekly$Direction == Weekly$fit_dir)
correct
## [1] 0.5610652
paste('Correct prediction:', correct)
## [1] "Correct prediction: 0.561065197428834"

The confusion matrix compares my predictions with the actual values. Here the model was fit on the full data set, so the predicted values are compared with the actual responses for the very same observations that were used to fit the model (the training data).

On the main diagonal I have the correct prediction: Responses predicted UP or DOWN and the actual response is UP or DOWN respectively.

On the off-diagonal, I have the errors: values predicted UP or DOWN but the actual value is opposite.

With the confusion matrix we can also calculate the False Positive Rate: the number of responses predicted as Positive/Up whose real value is Negative/Down, divided by the total number of real negatives.
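As a small illustration, these quantities can be computed from the confusion matrix 'conf' built above (rows are predictions, columns are the actual Direction):

accuracy <- sum(diag(conf)) / sum(conf)
fpr <- conf["Up", "Down"] / sum(conf[, "Down"]) # predicted Up, actually Down
tpr <- conf["Up", "Up"] / sum(conf[, "Up"])     # predicted Up, actually Up
c(accuracy = accuracy, FPR = fpr, TPR = tpr)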

Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

Weekly_tr<-filter(Weekly,Year<=2008)  %>% dplyr::select(-fit_dir,-fit_pred) ## MASS package clashes with dplyr

#summary(Weekly_tr)

w.fit.tr<-glm(Direction~Lag2,Weekly_tr,family=binomial) #Model built using training data set.
#summary(w.fit.tr)

Weekly_test<-filter(Weekly,Year>2008) %>% dplyr::select(-fit_dir,-fit_pred)


W.fit.pred.test<-predict.glm(w.fit.tr, newdata = Weekly_test, type="response") # vector of probabilities. I am applying the model (function) found fitting a glm on training data on the test dataset

Weekly_test$fit_pred<-W.fit.pred.test #add a column to the data set
Weekly_test$fit_dir<-ifelse(Weekly_test$fit_pred>0.5,"Up","Down")
#we have set 0.5 as threshold.


conf_t<-table(Weekly_test$fit_dir,Weekly_test$Direction)
conf_t
##       
##        Down Up
##   Down    9  5
##   Up     34 56
fp_t<-conf_t["Up","Down"]/colSums(conf_t)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t
##      Down 
## 0.7906977
correct_t<-mean(Weekly_test$Direction == Weekly_test$fit_dir)
#same as:sum(diag(conf_t))/sum(conf_t)

paste('Correct prediction:', correct_t)
## [1] "Correct prediction: 0.625"

The fraction of correct predictions increased (from about 0.56 to 0.625) and the false positive rate decreased.

Repeat (d) using LDA.

#alternative approach for test and training data set:
#add a column with value [train,test] instead of duplicating the dataset and use argument 'subset=train' in glm, lda etc etc functions.

w.fit.tr.lda<-lda(Direction~Lag2,Weekly_tr) #Model built using training data set.
#summary(w.fit.tr.lda)


lda.predict<-predict(w.fit.tr.lda, newdata = Weekly_test) #predict applied to LDA returns a list with 3 element.
#"class", "posterior", "X".
# "clasS" contains the class [up,down].
# "posterior" contains the posterior probability
# "X" contains the linear discriminant


Weekly_test$fit_dir.lda<-lda.predict$class


conf_t.lda<-table(Weekly_test$fit_dir.lda,Weekly_test$Direction)
conf_t.lda
##       
##        Down Up
##   Down    9  5
##   Up     34 56
fp_t.lda<-conf_t.lda["Up","Down"]/colSums(conf_t.lda)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t.lda
##      Down 
## 0.7906977
correct_t.lda<-mean(Weekly_test$Direction == Weekly_test$fit_dir.lda)
#same as: sum(diag(conf_t.lda))/sum(conf_t.lda)

paste('Correct prediction:', correct_t.lda)
## [1] "Correct prediction: 0.625"

LDA and logistic regression return exactly the same confusion matrix and overall accuracy here.

Repeat (d) using QDA.

w.fit.tr.qda<-qda(Direction~Lag2,Weekly_tr) #Model built using training data set.
#summary(w.fit.tr.qda)


qda.predict<-predict(w.fit.tr.qda, newdata = Weekly_test) #predict applied to QDA returns a list with 3 element.
#"class", "posterior", "X".
# "clasS" contains the class [up,down].
# "posterior" contains the posterior probability
# "X" contains the linear discriminant


Weekly_test$fit_dir.qda<-qda.predict$class


conf_t.qda<-table(Weekly_test$fit_dir.qda, Weekly_test$Direction)
conf_t.qda
##       
##        Down Up
##   Down    0  0
##   Up     43 61
fp_t.qda<-conf_t.qda["Up","Down"]/colSums(conf_t.qda)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t.qda
## Down 
##    1
correct_t.qda<-mean(Weekly_test$Direction == Weekly_test$fit_dir.qda)
#same as: sum(diag(conf_t.qda))/sum(conf_t.qda)

paste('Correct prediction:', correct_t.qda)
## [1] "Correct prediction: 0.586538461538462"

Repeat (d) using KNN with K = 1.

library(class)
set.seed(1)
knn.pred<-knn(as.data.frame(Weekly_tr$Lag2), as.data.frame(Weekly_test$Lag2), Weekly_tr$Direction, 1)

# knn wants as input train and test as matrix/df.
# train = matrix/df of training set
# test = " " of test set
# cl = factor or true classification of training set

conf_t.knn<-table(knn.pred, Weekly_test$Direction)
conf_t.knn
##         
## knn.pred Down Up
##     Down   21 30
##     Up     22 31
fp_t.knn<-conf_t.knn["Up","Down"]/colSums(conf_t.knn)["Down"] #FalsePositive (values that I have predicted as UP but in reality are DOWN)/Negative (Real number of Negative)
fp_t.knn
##      Down 
## 0.5116279
correct_t.knn<-mean(Weekly_test$Direction == knn.pred)
#same as: sum(diag(conf_t.knn))/sum(conf_t.knn)
paste('Correct prediction:', correct_t.knn)
## [1] "Correct prediction: 0.5"

Which of these methods appears to provide the best results on this data?

Logistic regression and LDA seem to give the highest overall accuracy (62.5% of the test weeks classified correctly); however, the false positive rate of the KNN classifier is lower than that of the other methods.
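Collecting the test-set accuracies computed above into one small table (output omitted):

data.frame(method = c("Logistic", "LDA", "QDA", "KNN (k=1)"),
           accuracy = c(correct_t, correct_t.lda, correct_t.qda, correct_t.knn))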

Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.

k.val<-rep(NA,15)
knn.correct<-k.val
set.seed(1)
for( i in seq_along(k.val)) {
  knn.pred<-knn(as.data.frame(Weekly_tr$Lag2), as.data.frame(Weekly_test$Lag2), Weekly_tr$Direction, k=i)

  knn.correct[i]<-mean(Weekly_test$Direction == knn.pred)
}

data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
    ggplot(aes(k, cor_per)) +
    geom_col() +
    labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
    scale_x_continuous(breaks = 1:length(k.val)) +
    coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

max(knn.correct)
## [1] 0.6057692
which.max(knn.correct)
## [1] 12
lda.mod <- lda(Direction ~ Lag2 + Lag3, data = Weekly_tr)

pred<- predict(lda.mod, newdata = Weekly_test)
pred.val <- pred$class

paste('Correct prediction:', mean(pred.val == Weekly_test$Direction))
## [1] "Correct prediction: 0.625"
table(pred.val, Weekly_test$Direction)
##         
## pred.val Down Up
##     Down    8  4
##     Up     35 57
lda.mod <- lda(Direction ~ Lag2:Lag5, data = Weekly_tr)

pred<- predict(lda.mod, newdata = Weekly_test)
pred.val <- pred$class

paste('Correct prediction:', mean(pred.val == Weekly_test$Direction))
## [1] "Correct prediction: 0.586538461538462"
table(pred.val, Weekly_test$Direction)
##         
## pred.val Down Up
##     Down    0  0
##     Up     43 61
qda.mod <- qda(Direction ~ Lag2 + log(abs(Lag2)), data = Weekly_tr)

pred<- predict(qda.mod, newdata = Weekly_test)
pred.val <- pred$class

paste('Correct prediction:', mean(pred.val == Weekly_test$Direction))
## [1] "Correct prediction: 0.586538461538462"
table(pred.val, Weekly_test$Direction)
##         
## pred.val Down Up
##     Down   11 11
##     Up     32 50
l.mod = glm(Direction ~ Lag2:Lag1, data = Weekly_tr, family = binomial)
l.probs = predict(l.mod, newdata = Weekly_test, type = "response")
l.pred<-ifelse(l.probs >= 0.5, 'Up', 'Down')

paste('Correct prediction:', mean(l.pred == Weekly_test$Direction))
## [1] "Correct prediction: 0.586538461538462"
table(l.pred, Weekly_test$Direction)
##       
## l.pred Down Up
##   Down    1  1
##   Up     42 60

Exercise 4.11

In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.

  1. Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
data(Auto)
Auto$mpg01<-ifelse(Auto$mpg>median(Auto$mpg),1,0)
mean(Auto$mpg01) # it should return 0.5
## [1] 0.5

Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.

Auto %>% dplyr::select(-name, -mpg) %>% plot()

str(Auto)
## 'data.frame':    392 obs. of  10 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  $ mpg01       : num  0 0 0 0 0 0 0 0 0 0 ...
Auto %>% dplyr::select(-name,-mpg) %>% ggcorr(palette = "RdBu", label = TRUE) 

Cylinders, displacement, horsepower and weight seem to have the strongest (negative) relationships with mpg01.

#ggplot(Auto,aes(x=as.factor(mpg01), y=cylinders)) + geom_boxplot()
base<-ggplot(Auto) + labs(x = "mpg01")

d<-base + geom_boxplot(aes(x=as.factor(mpg01), y=displacement)) + coord_flip()

h<-base + geom_boxplot(aes(x=as.factor(mpg01), y=horsepower)) + coord_flip()

w<-base + geom_boxplot(aes(x=as.factor(mpg01), y=weight)) + coord_flip()


grid.arrange(d, h, w, nrow=3)

The boxplots confirm what we found with the correlation matrix and the scatterplot matrix.

Split the data into a training set and a test set.

set.seed(1)
train<-sample(nrow(Auto), nrow(Auto)/2)

train.set<-Auto[train,]
test.set<-Auto[-train,]

Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

lda.fit<-lda(mpg01~ cylinders + displacement + horsepower + weight, data = train.set) #Model built using training data set.
#summary(w.fit.tr.lda)


lda.predict<-predict(lda.fit, newdata = test.set) #predict applied to LDA returns a list with 3 element.
#"class", "posterior", "X".
# "class" contains the class [up,down].
# "posterior" contains the posterior probability
# "X" contains the linear discriminant

correct_t.lda<-mean(test.set$mpg01 == lda.predict$class)

paste('Correct prediction:', correct_t.lda)
## [1] "Correct prediction: 0.872448979591837"
paste('test error:', 1-correct_t.lda)
## [1] "test error: 0.127551020408163"

Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

qda.fit<-qda(mpg01~ cylinders + displacement + horsepower + weight, data = train.set) #Model built using training data set.


qda.predict<-predict(qda.fit, newdata = test.set) #predict applied to QDA returns a list with 3 element.
#"class", "posterior", "X".
# "class" contains the class [up,down].
# "posterior" contains the posterior probability
# "X" contains the linear discriminant

correct_t.qda<-mean(test.set$mpg01 == qda.predict$class)

paste('Correct prediction:', correct_t.qda)
## [1] "Correct prediction: 0.88265306122449"
paste('test error:', 1-correct_t.qda)
## [1] "test error: 0.11734693877551"

QDA actually does a little better than LDA here (test error 0.117 vs 0.128).

Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

log.reg = glm(mpg01~ cylinders + displacement + horsepower + weight, data = train.set, family = binomial)
log.probs = predict(log.reg, newdata = test.set, type = "response")
log.pred<-ifelse(log.probs > 0.5, 1, 0)

correct_t.log<-mean(log.pred == test.set$mpg01)
paste('Correct prediction:', correct_t.log)
## [1] "Correct prediction: 0.877551020408163"
paste('test error:', 1-correct_t.log)
## [1] "test error: 0.122448979591837"

Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?

k.val<-rep(NA,20)
knn.correct<-k.val
var_sel<-c('cylinders', 'displacement', 'horsepower', 'weight')
x_train<-train.set %>% dplyr::select(var_sel)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(var_sel)` instead of `var_sel` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
y_train<-train.set$mpg01 
x_test<-test.set %>% dplyr::select(var_sel)

set.seed(1)
for( i in seq_along(k.val)) {
  knn.pred<-knn(train= x_train, test = x_test, cl = as.factor(y_train), k=i)

  knn.correct[i]<-mean(test.set$mpg01 == knn.pred)
}

data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
    ggplot(aes(k, cor_per)) +
    geom_col() +
    labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
    scale_x_continuous(breaks = 1:length(k.val)) +
    coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))

paste('Best K-value:', which.max(knn.correct))
## [1] "Best K-value: 3"
paste('Correct prediction:', max(knn.correct))
## [1] "Correct prediction: 0.877551020408163"
paste('Test error:', 1-max(knn.correct))
## [1] "Test error: 0.122448979591837"

Exercise 4.12

This problem involves writing functions.

Write a function, Power(), that prints out the result of raising 2 to the 3rd power. In other words, your function should compute \(2^3\) and print out the results.

Hint: Recall that x^a raises x to the power a. Use the print() function to output the result.

Power<-function(a = 2, b = 3) print(a^b)

Power()
## [1] 8

Create a new function, Power2(), that allows you to pass any two numbers, x and a, and prints out the value of x^a. You can do this by beginning your function with the line

Power2 =function (x,a){

You should be able to call your function by entering, for instance,

Power2 (3,8)

on the command line. This should output the value of \(3^8\), namely, 6,561.

Power2 <- function(x, a) print(x^a)
Power2(3,8)
## [1] 6561

Using the Power2() function that you just wrote, compute \(10^3\), \(8^{17}\), and \(131^3\)

Power2(10,3)
## [1] 1000
Power2(8,17)
## [1] 2.2518e+15
Power2(131,3)
## [1] 2248091

Now create a new function, Power3(), that actually returns the result x^a as an R object, rather than simply printing it to the screen. That is, if you store the value x^a in an object called result within your function, then you can simply return() this result, using the following line:

return (result )

The line above should be the last line in your function, before the } symbol.

Power3 = function(x, a) {
    result = x^a
    return(result)
}

Now using the Power3() function, create a plot of \(f(x) = x^2\). The x-axis should display a range of integers from 1 to 10, and the y-axis should display \(x^2\). Label the axes appropriately, and use an appropriate title for the figure. Consider displaying either the x-axis, the y-axis, or both on the log-scale. You can do this by using log=‘‘x’’, log=‘‘y’’, or log=‘‘xy’’ as arguments to the plot() function.

x = 1:10
plot(x, Power3(x, 2), log = "xy", ylab = "Log of y = x^2", xlab = "Log of x", 
    main = "Log of x^2 versus Log of x")

Create a function, PlotPower(), that allows you to create a plot of x against x^a for a fixed a and for a range of values of x. For instance, if you call

PlotPower (1:10 ,3)

then a plot should be created with an x-axis taking on values 1, 2, . . . , 10, and a y-axis taking on values \(1^3, 2^3, . . . , 10^3\).

PlotPower = function(x, a) {
    plot(x, Power3(x, a))
}
PlotPower(1:10, 3)

Exercise 4.13

Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.

data(Boston)
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
#plot(Boston)
Boston %>% ggcorr(palette = "RdBu", label = TRUE) 

Boston$crim01<-ifelse(Boston$crim>median(Boston$crim),1,0)

set.seed(1)
train<-sample(nrow(Boston),nrow(Boston)/2)


train.set<-Boston[train,]
test.set<-Boston[-train,]

LOGISTIC REGRESSION: all variables

log_reg<-glm(crim01 ~. -crim, data = train.set, family = binomial)
log_prob<-predict(log_reg, newdata = test.set, type="response")
log_pred<-ifelse(log_prob>0.5, 1, 0) 

paste("test error:", mean(log_pred != test.set$crim01))
## [1] "test error: 0.0988142292490119"

LOGISTIC REGRESSION: subset (highest correlation value with crim01)

log_reg.sub<-glm(crim01 ~ tax + rad + dis + age + nox + indus, data = train.set, family = binomial)
log_prob.sub<-predict(log_reg.sub, newdata = test.set, type="response")
log_pred.sub<-ifelse(log_prob.sub>0.5, 1, 0) 

paste("test error:", mean(log_pred.sub != test.set$crim01))
## [1] "test error: 0.114624505928854"

LDA: all variables

lda.fit<-lda(crim01~. -crim, data = train.set) #Model built using training data set.

lda.predict<-predict(lda.fit, newdata = test.set) #predict applied to LDA returns a list with 3 element.
#"class", "posterior", "X".
# "class" contains the class [up,down].
# "posterior" contains the posterior probability
# "X" contains the linear discriminant

paste('test error:', mean(test.set$crim01 != lda.predict$class))
## [1] "test error: 0.162055335968379"

LDA subset (highest correlation value with crim01)

lda.fit<-lda(crim01~tax + rad + dis + age + nox + indus, data = train.set) #Model built using training data set.

lda.predict<-predict(lda.fit, newdata = test.set)

paste('test error:', mean(test.set$crim01 != lda.predict$class))
## [1] "test error: 0.158102766798419"

QDA: all variables

qda.fit<-qda(crim01~. -crim, data = train.set) #Model built using training data set.

qda.predict<-predict(qda.fit, newdata = test.set) 

paste('test error:', mean(test.set$crim01 != qda.predict$class))
## [1] "test error: 0.0988142292490119"

QDA subset (highest correlation value with crim01)

qda.fit<-qda(crim01~tax + rad + dis + age + nox + indus, data = train.set) #Model built using training data set.

qda.predict<-predict(qda.fit, newdata = test.set)

paste('test error:', mean(test.set$crim01 != qda.predict$class))
## [1] "test error: 0.0988142292490119"

KNN: all variables

k.val<-rep(NA,20)
knn.correct<-k.val

x_train<-train.set %>% dplyr::select(-crim)
y_train<-train.set$crim01 
x_test<-test.set %>% dplyr::select(-crim)

set.seed(1)
for( i in seq_along(k.val)) {
  knn.pred<-knn(train= x_train, test = x_test, cl = as.factor(y_train), k=i)

  knn.correct[i]<-mean(test.set$crim01 == knn.pred)
}

data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
    ggplot(aes(k, cor_per)) +
    geom_col() +
    labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
    scale_x_continuous(breaks = 1:length(k.val)) +
    coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))

paste('Best K-value:', which.max(knn.correct))
## [1] "Best K-value: 1"
paste('Test error:', 1-max(knn.correct))
## [1] "Test error: 0.0948616600790514"

KNN: subset

k.val<-rep(NA,20)
knn.correct<-k.val
var_sel<-c("tax","rad","dis","age","nox","indus")
x_train<-train.set %>% dplyr::select(var_sel)
y_train<-train.set$crim01 
x_test<-test.set %>% dplyr::select(var_sel)

set.seed(1)
for( i in seq_along(k.val)) {
  knn.pred<-knn(train= x_train, test = x_test, cl = as.factor(y_train), k=i)

  knn.correct[i]<-mean(test.set$crim01 == knn.pred)
}

data_frame(k = seq_along(knn.correct), cor_per = knn.correct) %>%
    ggplot(aes(k, cor_per)) +
    geom_col() +
    labs(x = 'K', y = 'Correct prediction', title = 'KNN Accuracy for different values of K') +
    scale_x_continuous(breaks = 1:length(k.val)) +
    coord_cartesian(ylim = c(min(knn.correct), max(knn.correct)))

paste('Best K-value:', which.max(knn.correct))
## [1] "Best K-value: 3"
paste('Test error:', 1-max(knn.correct))
## [1] "Test error: 0.0869565217391305"

KNN seems to return the most accurate results. The best test error was achieved with K=3 and the subset of predictors “tax”, “rad”, “dis”, “age”, “nox”, “indus” (test error 0.087), or with K=1 and all the predictors (test error 0.095).

Chapter 5

Exercise 5.5

In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.

data(Default)

Fit a logistic regression model that uses income and balance to predict default.

log.fit<-glm(default~ income + balance, data=Default, family = binomial)

Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:

  1. Split the sample set into a training set and a validation set.
set.seed(10)
train=sample(nrow(Default) , nrow(Default)/2) #Generates a vector containing the indexes for train set. Remaining indexes in 'Default' data set are for testing.
  1. Fit a multiple logistic regression model using only the training observations.
log.fit_TR<-glm(default ~ income + balance, data=Default, family = binomial, subset = train)
  1. Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.
test_set= Default[-train,]
test_set$log.pred.test<-predict(log.fit_TR, type="response", newdata = test_set)

test_set$log.prediction<-ifelse(test_set$log.pred.test>0.5, "Yes", "No") 
test_set$log.prediction[1:5] #example
## [1] "No" "No" "No" "No" "No"
  1. Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
#attributes(Default$default)
#Default$log.prediction<-factor(Default$log.prediction, levels = attributes(Default$default)$levels)
VSE<-mean(test_set$log.prediction!=test_set$default)
VSE
## [1] 0.0288

Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.

Three different training/validation splits

set.seed(1)
val_err<-rep(NA,3)
split<-c(0.5,0.7,0.9)
for (i in seq_along(val_err)) {
    train_size<-nrow(Default) * split[i]
    train<-sample(1:nrow(Default), size = train_size)
    train_set<-Default[train,]
    test_set<-Default[-train,]
    
    log_train<-glm(default ~ income + balance, data = train_set, family = 'binomial')
    pred.prb<-predict(log_train, test_set, type = 'response')
    pred_values<-ifelse(pred.prb > 0.5, 'Yes', 'No')
    
    val_err[i]<-mean(pred_values != test_set$default)
}

ss.err<-data.frame(split,val_err)
ss.err
##   split val_err
## 1   0.5  0.0254
## 2   0.7  0.0290
## 3   0.9  0.0160
bestsplit<-ss.err %>% filter(val_err==min(val_err))
bestsplit
##   split val_err
## 1   0.9   0.016

The lowest estimated test error is obtained with a split of 0.9, meaning that 90% of the observations were used to train the model while the remaining 10% were used to estimate the test error.

Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.

set.seed(1)
train_size<-nrow(Default) * bestsplit$split
train<-sample(1:nrow(Default), size = train_size)
train_set<-Default[train,]
test_set<-Default[-train,]
    
log_train.S<-glm(default ~ income + balance + student, data = train_set, family = 'binomial')

pred.prb.S<-predict(log_train.S, test_set, type = 'response')
pred_values.S<-ifelse(pred.prb.S > 0.5, 'Yes', 'No')
    
mean(pred_values.S != test_set$default)
## [1] 0.031

Including a dummy variable for student in the model does not lead to a reduction in the test error rate.

Exercise 5.6

We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.

data(Default)
data(Boston)

Using the summary() and glm() functions, determine the estimated standard errors for the coefficients associated with income and balance in a multiple logistic regression model that uses both predictors.

log.fit<-glm(default~ income + balance, data=Default, family = binomial)

log.coef<-summary(log.fit)$coefficients # [,'Std. Error']

Write a function, boot.fn(), that takes as input the Default data set as well as an index of the observations, and that outputs the coefficient estimates for income and balance in the multiple logistic regression model.

#The bootstrap approach can be used to assess the variability of the coefficient estimates and predictions from a statistical learning method.

#Function 'boot.fn' returns the coefficient estimates for income and balance in the multiple logistic regression model using the index specified
boot.fn<-function(data,index){
 return(coef(glm(default~income + balance, data=data, subset = index, family = binomial)))
  
}

Use the boot() function together with your boot.fn() function to estimate the standard errors of the logistic regression coefficients for income and balance.

n<-nrow(Default)
set.seed (1) # If I run set.seed and boot.fn over and over I always get the same results but if I run only boot.fn always get different results. Set.seed provide reproducible results.

#boot.fn(Default ,sample (n ,n , replace =T))
#fit log model over a sample from Default df and as "big" as Default df. the sample is draw with replacement, therefore the same obs can appear more than once.


boot.coef<-boot(Default ,boot.fn ,100)
boot.coef
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01  8.556378e-03 4.122015e-01
## t2*  2.080898e-05 -3.993598e-07 4.186088e-06
## t3*  5.647103e-03 -4.116657e-06 2.226242e-04
#The boot command executes the re-sampling of the data set and calculation of the statistic(s) of interest on these samples (coefficient in this case). Before calling boot, it's necessary to define a function that will return the statistic(s) that you would like to bootstrap. 

Comment on the estimated standard errors obtained using the glm() function and using your bootstrap function.

log.coef
##                  Estimate   Std. Error    z value      Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income       2.080898e-05 4.985167e-06   4.174178  2.990638e-05
## balance      5.647103e-03 2.273731e-04  24.836280 3.638120e-136
boot.coef
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01  8.556378e-03 4.122015e-01
## t2*  2.080898e-05 -3.993598e-07 4.186088e-06
## t3*  5.647103e-03 -4.116657e-06 2.226242e-04

The standard errors obtained from the standard glm() output and from the bootstrap are very similar. This suggests that the formula-based estimates, which rely on the model assumptions, are reliable for these data. In general, SE(beta_hat)^2 depends on sigma^2, the variance of the error term. sigma^2 is an unknown population quantity, so in linear regression it is estimated with the RSE (residual standard error) = sqrt(RSS/(n-2)), where RSS is the residual sum of squares, sum((y - yhat)^2). If y and yhat are close to each other, the RSE is small.

RSS is also known as SSE (the sum of squared errors), and the RSE is essentially sqrt(MSE); they differ only in the degrees of freedom used in the denominator (n - 2 versus n).
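
As a quick numerical check of these definitions (a sketch on the built-in cars data set, unrelated to this exercise):

fit <- lm(dist ~ speed, data = cars)
rss <- sum(residuals(fit)^2)       # residual sum of squares (SSE)
n_cars <- nrow(cars)
c(RSE = sqrt(rss / (n_cars - 2)),  # residual standard error, identical to summary(fit)$sigma
  sqrt_MSE = sqrt(rss / n_cars))   # differs only in the denominator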

Exercise 5.8

We will now perform cross-validation on a simulated data set.

Generate a simulated data set as follows:

set.seed (1)
x = rnorm (100)
y = x - 2*x^2 + rnorm (100)

In this data set, what is n and what is p? Write out the model used to generate the data in equation form.

x is a vector containing 100 random values drawn from a normal distribution with mu = 0 and sigma = 1, so n = 100.

There are two predictors, p = 2: X and X^2. The data-generating code y = x - 2*x^2 + rnorm(100) can be written in equation form as Y = X - 2X^2 + epsilon.

Here epsilon is the random error term, accounting for the variation in Y that is not explained by X.

Create a scatterplot of X against Y . Comment on what you find.

plot(x,y)

#min(y)
#max(y)

The plot shows a clear quadratic (inverted-U) shape: x ranges roughly from -2 to 2, while y ranges roughly from -12 to 2.

Set a random seed, and then compute the LOOCV errors that result from fitting the following four models using least squares:

  1. Y = β0 + β1X + epsilon

  2. Y = β0 + β1X + β2X2 + epsilon

  3. Y = β0 + β1X + β2X2 + β3X3 + epsilon

  4. Y = β0 + β1X + β2X2 + β3X3 + β4X4 + epsilon.

Note you may find it helpful to use the data.frame() function to create a single data set containing both X and Y.

DF = data.frame(x, y)
set.seed(1)

# i.
glm.fit = glm(y ~ x)
cv.glm(DF, glm.fit)$delta
## [1] 7.288162 7.284744
# cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models; if K is not specified it defaults to n, i.e. LOOCV.
# ii.
glm.fit = glm(y ~ poly(x,2))
cv.glm(DF, glm.fit)$delta
## [1] 0.9374236 0.9371789
# iii.
glm.fit = glm(y ~ poly(x,3))
cv.glm(DF, glm.fit)$delta
## [1] 0.9566218 0.9562538
# iv.
glm.fit = glm(y ~ poly(x,4))
cv.glm(DF, glm.fit)$delta
## [1] 0.9539049 0.9534453

Alternatively

set.seed(1)
ip = 4 #iteration/polynomial degree
cv.error=rep (0,ip)
for (i in 1:ip){
  glm.fit<-glm(y~poly(x ,i))
  cv.error[i]<-cv.glm(DF ,glm.fit)$delta[1]
  }
cv.error
## [1] 7.2881616 0.9374236 0.9566218 0.9539049

Repeat (c) using another random seed, and report your results. Are your results the same as what you got in (c)? Why?

set.seed(3)
ip = 4 #iteration/polynomial degree
cv.error=rep (0,ip)
for (i in 1:ip){
  glm.fit<-glm(y~poly(x ,i))
  cv.error[i]<-cv.glm(DF ,glm.fit)$delta[1]
  }
cv.error
## [1] 7.2881616 0.9374236 0.9566218 0.9539049

Setting a new seed does not change the results, because there is no randomness in the training/validation splits: LOOCV fits the statistical learning method (here, linear regression) n times, each time on a training set of n - 1 observations, leaving out each observation exactly once.
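
As a quick illustration (a sketch, not part of the original solution), the LOOCV estimate for the degree-1 model can be computed with an explicit loop; every observation is held out exactly once, so there is nothing for a seed to change:

loo.err <- rep(NA, nrow(DF))
for (i in seq_len(nrow(DF))) {
  fit_i <- lm(y ~ x, data = DF[-i, ])                  # fit on the n - 1 remaining observations
  loo.err[i] <- (DF$y[i] - predict(fit_i, DF[i, ]))^2  # squared error on the held-out point
}
mean(loo.err) # should match cv.glm(DF, glm(y ~ x, data = DF))$delta[1]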

Which of the models in (c) had the smallest LOOCV error? Is this what you expected? Explain your answer.

The second-degree polynomial has the lowest LOOCV estimate of the test error, as the true relationship between y and x is quadratic: y = x - 2x^2 + epsilon.
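
To visualise this (a sketch; the grid size and colour are arbitrary), the degree-2 fit can be overlaid on the scatterplot:

plot(y ~ x, data = DF)
xs <- seq(min(DF$x), max(DF$x), length.out = 100)
fit2 <- glm(y ~ poly(x, 2), data = DF)
lines(xs, predict(fit2, newdata = data.frame(x = xs)), col = "red", lwd = 2)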

Comment on the statistical significance of the coefficient estimates that results from fitting each of the models in (c) using least squares. Do these results agree with the conclusions drawn based on the cross-validation results?

glm(y~poly(x ,2)) %>% summary()
## 
## Call:
## glm(formula = y ~ poly(x, 2))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9650  -0.6254  -0.1288   0.5803   2.2700  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.5500     0.0958  -16.18  < 2e-16 ***
## poly(x, 2)1   6.1888     0.9580    6.46 4.18e-09 ***
## poly(x, 2)2 -23.9483     0.9580  -25.00  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.9178258)
## 
##     Null deviance: 700.852  on 99  degrees of freedom
## Residual deviance:  89.029  on 97  degrees of freedom
## AIC: 280.17
## 
## Number of Fisher Scoring iterations: 2

The p-values are all much lower than the usual significance levels (0.05, 0.01, ...). We reject the null hypothesis that the corresponding betas are equal to 0 and conclude that both terms (x and x^2) are statistically significant. This agrees with the cross-validation results.

Exercise 5.9

We will now consider the Boston housing data set, from the MASS library.

Based on this data set, provide an estimate for the population mean of medv. Call this estimate ˆμ.

mu_hat<-mean(Boston$medv)
mu_hat
## [1] 22.53281

Provide an estimate of the standard error of ˆμ. Interpret this result. Hint: We can compute the standard error of the sample mean by dividing the sample standard deviation by the square root of the number of observations.

n<-nrow(Boston)
se_hat<-sd(Boston$medv)/sqrt(n)
se_hat
## [1] 0.4088611

The standard error tells us the average amount that the estimate ˆμ differs from the actual value of μ.

Now estimate the standard error of ˆμ using the bootstrap. How does this compare to your answer from (b)?

boot.fn<-function(data,index){
  return(mean(data[index]))
}
n<-nrow(Boston)
#boot.fn(Boston$medv, sample(n,n, replace = TRUE)) this return one iteration of the bootstrap method
bstrap = boot(Boston$medv, boot.fn, 1000)
bstrap
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original     bias    std. error
## t1* 22.53281 0.01381542   0.4181613
#plot(bstrap)
#It seems that the function boot.fn() needs to have arguments (data,index) as they are needed by the function boot() in order to perform bootstrapping.

The standard errors from (b) and (c) are very similar (0.409 from the formula versus 0.418 from the bootstrap).

Based on your bootstrap estimate from (c), provide a 95% confidence interval for the mean of medv. Compare it to the results obtained using t.test(Boston$medv). Hint: You can approximate a 95% confidence interval using the formula [ˆμ − 2SE(ˆμ), ˆμ + 2SE(ˆμ)].

#to get sd from output of boot() function:
# sd(bstrap$t) as t contains all the estimates of t (medv). I am applying sd() function to bstrap$t which is a vector with R=1000 values
# se <- summary(bstrap) 

ci<-c(mu_hat - 2*sd(bstrap$t), mu_hat + 2*sd(bstrap$t))
ci
## [1] 21.69648 23.36913
t.test(Boston$medv) # $conf.int
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

The confidence intervals calculated with t.test() and with the bootstrap are very similar. The small difference comes from the slightly different standard error estimates and from using the factor 2 rather than a t quantile.
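
For comparison (a sketch), the t.test() interval can be reproduced by hand using the formula-based standard error from (b) and a t quantile in place of the factor 2:

mu_hat + c(-1, 1) * qt(0.975, df = n - 1) * se_hat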

Based on this data set, provide an estimate, ˆμmed, for the median value of medv in the population.

medv.med<-median(Boston$medv)
medv.med
## [1] 21.2

We now would like to estimate the standard error of ˆμmed. Unfortunately, there is no simple formula for computing the standard error of the median. Instead, estimate the standard error of the median using the bootstrap. Comment on your findings.

boot.fn<-function(data,index){
  return(median(data[index]))
}

bstrap <- boot(Boston$medv, boot.fn, 1000)
bstrap
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0308   0.3661426

The estimated median is 21.2, with a bootstrap standard error of about 0.37, which is small relative to the median value.

Based on this data set, provide an estimate for the tenth percentile of medv in Boston suburbs. Call this quantity ˆμ0.1. (You can use the quantile() function.)

medv.tenth = quantile(Boston$medv, 0.1)
medv.tenth
##   10% 
## 12.75

Use the bootstrap to estimate the standard error of ˆμ0.1. Comment on your findings.

boot.fn<-function(data,index){
  return(quantile(data[index],0.1))
}

bstrap <- boot(Boston$medv, boot.fn, 1000)
bstrap
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75 -0.0016   0.5138935

The standard error (about 0.51) is still quite small compared to the tenth-percentile estimate of 12.75.

Chapter 6

Exercise 6.8

Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector \(\epsilon\) of length n = 100.

x=rnorm(100)
e = rnorm(100)

Generate a response vector Y of length n = 100 according to the model:

\(Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \epsilon\), where \(\beta_0,\beta_1,\beta_2\) and \(\beta_3\) are constants of your choice

b0=3
b1=1
b2=-2
b3=4

y=b0 + b1*x + b2*x^2 + b3*x^3 # note: the noise vector e generated above is not added here, so y is an exact cubic in x; this is why some of the coefficients reported below are numerically zero

Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors \(X,X^2, . . .,X^{10}\). What is the best model obtained according to Cp, BIC, and adjusted R2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.

data<-data.frame(x,y)
regfit.full=regsubsets(y~poly(x,10),data,nvmax = 10)
regfit.adjr2<-summary(regfit.full)$adjr2
regfit.cp<-summary(regfit.full)$cp
regfit.bic<-summary(regfit.full)$bic

#The summary() command outputs the best set of variables for each model size.

# best model according to adjusted R2
paste("Number of selected terms:", which.max(regfit.adjr2)) %>% print()
## [1] "Number of selected terms: 3"
# best model according to Cp
paste("Number of selected terms:", which.min(regfit.cp)) %>% print()
## [1] "Number of selected terms: 5"
# best model according to BIC
paste("Number of selected terms:", which.min(regfit.bic)) %>% print()
## [1] "Number of selected terms: 4"
#ggplot(mapping=aes(x=seq_along(regfit.adjr2), y=regfit.adjr2)) + geom_line() + labs(x="Poly degree", y="adjR2")
#ggplot(mapping=aes(x=seq_along(regfit.bic), y=regfit.adjr2)) + geom_line() + labs(x="Poly degree", y="BIC")
#ggplot(mapping=aes(x=seq_along(regfit.cp), y=regfit.adjr2)) + geom_line() + labs(x="Poly degree", y="Cp")


plot(regfit.adjr2, xlab = "Subset Size", ylab = "Adjr2", type = "l")
points(which.max(regfit.adjr2), regfit.adjr2[which.max(regfit.adjr2)], pch = 4, col = "red", lwd = 3)

plot(regfit.cp, xlab = "Subset Size", ylab = "Cp", type = "l")
points(which.min(regfit.cp), regfit.cp[which.min(regfit.cp)], pch = 4, col = "red", lwd = 3)

plot(regfit.bic, xlab = "Subset Size", ylab = "BIC", type = "l")
points(which.min(regfit.bic), regfit.bic[which.min(regfit.bic)], pch = 4, col = "red", lwd = 3)

# cofficients for the models selected. 
coefficients(regfit.full, which.max(regfit.adjr2))
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##     1.577715   100.570909    -7.287776    64.353564
coefficients(regfit.full, which.min(regfit.cp))
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)5 
##  1.577715e+00  1.005709e+02 -7.287776e+00  6.435356e+01 -1.352911e-14 
##  poly(x, 10)6 
##  2.035174e-14
coefficients(regfit.full, which.min(regfit.bic))
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)6 
##  1.577715e+00  1.005709e+02 -7.287776e+00  6.435356e+01  2.035174e-14
# The object regfit.full contains 10 models, one for each model size (number of terms included).
# We then have one best overall model according to adjusted R^2, one according to Cp and one according to BIC.
# coefficients() extracts the coefficients of a specific model, so we specify which of the models in regfit.full we want.
# The same coefficients are obtained by refitting a linear model, e.g.
# lm(y~poly(x,which.max(regfit.adjr2))).

Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?

Forward stepwise selection

regfit.fw=regsubsets(y~poly(x,10),data, nvmax = 10, method = "forward")
regfitfw.adjr2<-summary(regfit.fw)$adjr2
regfitfw.cp<-summary(regfit.fw)$cp
regfitfw.bic<-summary(regfit.fw)$bic

#The summary() command outputs the best set of variables for each model size.

# best model according to adjusted R2
paste("Number of selected terms:", which.max(regfitfw.adjr2)) %>% print()
## [1] "Number of selected terms: 3"
# best model according to Cp
paste("Number of selected terms:", which.min(regfitfw.cp)) %>% print()
## [1] "Number of selected terms: 5"
# best model according to BIC
paste("Number of selected terms:", which.min(regfitfw.bic)) %>% print()
## [1] "Number of selected terms: 4"
plot(regfitfw.adjr2, xlab = "Subset Size", ylab = "Forward Adjr2", type = "l")
points(which.max(regfitfw.adjr2), regfitfw.adjr2[which.max(regfitfw.adjr2)], pch = 4, col = "red", lwd = 3)

plot(regfitfw.cp, xlab = "Subset Size", ylab = "Forward Cp", type = "l")
points(which.min(regfitfw.cp), regfitfw.cp[which.min(regfitfw.cp)], pch = 4, col = "red", lwd = 3)

plot(regfitfw.bic, xlab = "Subset Size", ylab = "Forward BIC", type = "l")
points(which.min(regfitfw.bic), regfitfw.bic[which.min(regfitfw.bic)], pch = 4, col = "red", lwd = 3)

coefficients(regfit.fw, which.max(regfitfw.adjr2))
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##     1.577715   100.570909    -7.287776    64.353564
coefficients(regfit.fw, which.min(regfitfw.cp))
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)5 
##  1.577715e+00  1.005709e+02 -7.287776e+00  6.435356e+01 -1.352911e-14 
##  poly(x, 10)6 
##  2.035174e-14
coefficients(regfit.fw, which.min(regfitfw.bic))
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)6 
##  1.577715e+00  1.005709e+02 -7.287776e+00  6.435356e+01  2.035174e-14

Backward stepwise selection

regfit.bw=regsubsets(y~poly(x,10),data, nvmax = 10, method = "backward")
regfitbw.adjr2<-summary(regfit.bw)$adjr2
regfitbw.cp<-summary(regfit.bw)$cp
regfitbw.bic<-summary(regfit.bw)$bic

#The summary() command outputs the best set of variables for each model size.

# best model according to adjusted R2
paste("Number of selected terms:", which.max(regfitbw.adjr2)) %>% print()
## [1] "Number of selected terms: 3"
# best model according to Cp
paste("Number of selected terms:", which.min(regfitbw.cp)) %>% print()
## [1] "Number of selected terms: 5"
# best model according to BIC
paste("Number of selected terms:", which.min(regfitbw.bic)) %>% print()
## [1] "Number of selected terms: 4"
plot(regfitbw.adjr2, xlab = "Subset Size", ylab = "Backward Adjr2", type = "l")
points(which.max(regfitbw.adjr2), regfitbw.adjr2[which.max(regfitbw.adjr2)], pch = 4, col = "red", lwd = 3)

plot(regfitbw.cp, xlab = "Subset Size", ylab = "Backward Cp", type = "l")
points(which.min(regfitbw.cp), regfitbw.cp[which.min(regfitbw.cp)], pch = 4, col = "red", lwd = 3)

plot(regfitbw.bic, xlab = "Subset Size", ylab = "Backward BIC", type = "l")
points(which.min(regfitbw.bic), regfitbw.bic[which.min(regfitbw.bic)], pch = 4, col = "red", lwd = 3)

coefficients(regfit.bw, which.max(regfitbw.adjr2))
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##     1.577715   100.570909    -7.287776    64.353564
coefficients(regfit.bw, which.min(regfitbw.cp))
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)5 
##  1.577715e+00  1.005709e+02 -7.287776e+00  6.435356e+01 -1.352911e-14 
##  poly(x, 10)6 
##  2.035174e-14
coefficients(regfit.bw, which.min(regfitbw.bic))
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)6 
##  1.577715e+00  1.005709e+02 -7.287776e+00  6.435356e+01  2.035174e-14

Best subset selection, forward stepwise selection and backward stepwise selection all select the same models; consequently, the coefficients are the same.

Now fit a lasso model to the simulated data, again using \(X,X^2, . . . , X^{10}\) as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.

set.seed (1)
train = sample (c(TRUE ,FALSE), nrow(data),rep=TRUE) # here the number of T and F is more or less equal 
#sum(train)
test = (!train)
# train=sample(nrow(data) , nrow(data)/2) I can set the size of the training set.

xm = model.matrix(y~poly(x,10), data) # creates the design matrix; qualitative variables would be turned into dummy variables. Note: the intercept column is not dropped here (no [,-1]), which is why a duplicate (Intercept) row appears in the coefficient output below.

y = data$y


Lcv.out = cv.glmnet (xm[train,], y[train], alpha =1) #Does k-fold cross-validation for glmnet, produces a plot, and returns a value for lambda
plot(Lcv.out)

bestlam.L = Lcv.out$lambda.min
bestlam.L
## [1] 0.2066705
#By default the glmnet() function performs ridge/lasso regression for an automatically selected range of λ values. glmnet() also standardizes the variables so that they are on the same scale; to turn off this default, use the argument standardize=FALSE.

lasso.mod = glmnet(xm, y, alpha = 1) # Fit the lasso path on the entire data set; the best lambda is supplied below in predict()
#coefficients(lasso.mod) # Matrix (p+1) x lambda n. For each lambda value (column) I get the value of each coefficient (some are set to 0). plot(lasso.mod, "lambda") show this. The plot however doesn't tell me which is the best.
predict(lasso.mod, s = bestlam.L, type = "coefficients")
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept)    1.577715
## (Intercept)    .       
## poly(x, 10)1  98.504205
## poly(x, 10)2  -5.221072
## poly(x, 10)3  62.286859
## poly(x, 10)4   .       
## poly(x, 10)5   .       
## poly(x, 10)6   .       
## poly(x, 10)7   .       
## poly(x, 10)8   .       
## poly(x, 10)9   .       
## poly(x, 10)10  .
#coef(Lcv.out, s = "lambda.min") # return slightly different values, as I used the train set only.

The lasso selects a model with 3 predictors: x, x^2, x^3. The best λ (the value yielding the lowest estimated test error) is close to 0, which indicates that the lasso fit is similar to the standard least squares fit that minimises the RSS.
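
One way to check this (a sketch) is to compare the non-zero lasso coefficients above with the ordinary least squares fit on the same polynomial basis:

coef(lm(y ~ poly(x, 10), data = data))[1:4] # intercept and the first three orthogonal-polynomial coefficients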

Now generate a response vector Y according to the model \(Y = \beta_0 + \beta_7X^7 + \epsilon\), and perform best subset selection and the lasso. Discuss the results obtained.

b7=4
x=rnorm(100)
y=b0 + b7*x^7 # note: the error term epsilon in the exercise is omitted here, so y is an exact function of x

data<-data.frame(x,y)

regfit.full=regsubsets(y~poly(x,10), data, nvmax = 10)
regfit.adjr2<-summary(regfit.full)$adjr2
regfit.cp<-summary(regfit.full)$cp
regfit.bic<-summary(regfit.full)$bic

#The summary() command outputs the best set of variables for each model size.

# best model according to adjusted R2
paste("Number of selected terms:", which.max(regfit.adjr2)) %>% print()
## [1] "Number of selected terms: 7"
# best model according to Cp
paste("Number of selected terms:", which.min(regfit.cp)) %>% print()
## [1] "Number of selected terms: 7"
# best model according to BIC
paste("Number of selected terms:", which.min(regfit.bic)) %>% print()
## [1] "Number of selected terms: 7"
coefficients(regfit.full, which.max(regfit.adjr2))
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)4 poly(x, 10)5 
##     39.06375   1220.02639    939.47276   1185.84734    398.00786    458.45997 
## poly(x, 10)6 poly(x, 10)7 
##     84.63802     66.36999
coefficients(regfit.full, which.min(regfit.cp))
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)4 poly(x, 10)5 
##     39.06375   1220.02639    939.47276   1185.84734    398.00786    458.45997 
## poly(x, 10)6 poly(x, 10)7 
##     84.63802     66.36999
coefficients(regfit.full, which.min(regfit.bic))
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 poly(x, 10)4 poly(x, 10)5 
##     39.06375   1220.02639    939.47276   1185.84734    398.00786    458.45997 
## poly(x, 10)6 poly(x, 10)7 
##     84.63802     66.36999
set.seed (1)
train = sample (c(TRUE ,FALSE), nrow(data),rep=TRUE) # here the number of T and F is more or less equal 
#sum(train)
test = (!train)
# train=sample(nrow(data) , nrow(data)/2) I can set the size of the training set.

xm = model.matrix(y~poly(x,10), data) [,-1]# creates design matrix (n x p). qualitative variable are turned into dummy variables.
y = data$y


Lcv.out = cv.glmnet (xm[train,], y[train], alpha =1)
plot(Lcv.out)

bestlam.L = Lcv.out$lambda.min
bestlam.L
## [1] 15.94335
#By default the glmnet() function performs ridge/lasso regression for an automatically selected range of λ values.

lasso.mod = glmnet(xm, y, alpha = 1) # Fit the model on entire data. For each lambda values I have a set of coefficients.

#coefficients(lasso.mod) # Matrix (p+1) x lambda n. For each lambda value (column) I get the value of each coefficient (some are set to 0). plot(lasso.mod, "lambda") show this. The plot however doesn't tell me which is the best.


predict(lasso.mod, s = bestlam.L, type = "coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)     39.06375
## poly(x, 10)1  1060.59286
## poly(x, 10)2   780.03922
## poly(x, 10)3  1026.41380
## poly(x, 10)4   238.57432
## poly(x, 10)5   299.02644
## poly(x, 10)6     .      
## poly(x, 10)7     .      
## poly(x, 10)8     .      
## poly(x, 10)9     .      
## poly(x, 10)10    .
# make a prediction using the model in 'lasso.mod' with a specific lambda values which is the lambda value that yields the lowest test error.

#coef(Lcv.out, s = "lambda.min") # return slightly different values, as I used the train set only.

Best subset selection and the lasso select different models: best subset keeps 7 of the polynomial terms (by all three criteria), while the lasso keeps 5 non-zero coefficients. Neither isolates a single X^7 term, because poly() builds an orthogonal basis in which X^7 is a combination of the first seven basis functions.
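
As a sketch (using raw polynomial terms instead of the orthogonal basis, which is an assumption about what one might try next), best subset selection on raw powers of x makes the single X^7 term much easier to recover:

regfit.raw <- regsubsets(y ~ poly(x, 10, raw = TRUE), data = data, nvmax = 10)
coef(regfit.raw, id = which.min(summary(regfit.raw)$bic)) # BIC should favour the model containing only the X^7 term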

Exercise 6.9

In this exercise, we will predict the number of applications received using the other variables in the College data set.

Split the data set into a training set and a test set.

data(College)
set.seed (2)
train = sample (c(TRUE ,FALSE), nrow(College),rep=TRUE) # here the number of T and F is more or less equal 
#train=sample(nrow(College) , nrow(College)/2)
test = (!train)
y.test<-College$Apps[test]
y.train<-College$Apps[train]
#sum(train) + sum(test) - nrow(College)

Fit a linear model using least squares on the training set, and report the test error obtained.

lm.fit<-lm(Apps~. ,data=College[train,], na.action = na.omit)
lm.pred<-predict(lm.fit, newdata = College[test,])

# Test error
mean((lm.pred-y.test)^2) #MSE for standard linear regression
## [1] 1556804

Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

xm = model.matrix(Apps~., College)[,-1] # creates design matrix (n x p). qualitative variable are turned into dummy variables.


ridge.cv.out = cv.glmnet (xm[train,], y.train, alpha =0)
#plot(ridge.cv.out)
bestlam.ridge = ridge.cv.out$lambda.min #value of lambda that gives minimum cvm.
bestlam.ridge
## [1] 292.5324
#By default the glmnet() function performs ridge/lasso regression for an automatically selected range of λ values.

ridge.mod = glmnet(xm[train,], y.train, alpha = 0) # Fit the ridge path on the training data
#coefficients(lasso.mod) # Matrix (p+1) x lambda n. For each lambda value (column) I get the value of each coefficient (some are set to 0). plot(lasso.mod, "lambda") show this. The plot however doesn't tell me which is the best.

#coef(Lcv.out, s = "lambda.min") # return slightly different values, as I used the train set only.

ridge.pred<-predict(ridge.mod, s=bestlam.ridge, newx=xm[test ,])
# Test error
mean((ridge.pred - y.test)^2) # MSE with Ridge regression
## [1] 2451335

The test MSE with ridge regression is higher than the test MSE obtained with standard linear regression (OLS).

Fit a lasso model on the training set, with λ chosen by cross validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

lasso.cv.out = cv.glmnet (xm[train,], y.train, alpha = 1)
#plot(ridge.cv.out)
bestlam.lasso = lasso.cv.out$lambda.min #value of lambda that gives minimum cvm.
bestlam.lasso
## [1] 10.03524
lasso.mod = glmnet(xm[train,], y.train, alpha = 1) 

lasso.pred<-predict(lasso.mod, s=bestlam.lasso, newx=xm[test ,])
# Test error
mean((lasso.pred - y.test)^2) # MSE with Lasso regression
## [1] 1619912
predict(lasso.mod, s = bestlam.L, type = "coefficients") # note: bestlam.L is left over from the previous exercise; bestlam.lasso (the lambda selected by CV above) should be used as s here
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept) -9.383909e+02
## PrivateYes  -8.180112e+01
## Accept       1.318719e+00
## Enroll       .           
## Top10perc    3.388912e+01
## Top25perc   -6.056578e+00
## F.Undergrad  .           
## P.Undergrad  1.424121e-01
## Outstate    -6.118679e-02
## Room.Board   8.404844e-02
## Books        7.773932e-02
## Personal    -9.714371e-03
## PhD          .           
## Terminal    -7.264470e+00
## S.F.Ratio    1.030567e+01
## perc.alumni -1.773488e+00
## Expend       1.045115e-01
## Grad.Rate    4.731908e+00
lasso.coef<-predict(lasso.mod, s = bestlam.L, type = "coefficients") %>% as.data.frame.matrix() # same note as above: s should be bestlam.lasso, not bestlam.L
colnames(lasso.coef)="Coef_Value"

# number of non-zero coefficients (including the intercept)
lasso.coef %>% subset(Coef_Value !=0) %>% nrow()
## [1] 15

The test MSE with the lasso is slightly higher than the one obtained with OLS.

Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.

pcr.fit=pcr(Apps~., data=College[train,] ,scale=TRUE , validation ="CV")

# values of the M selected by cross-validation.
summary(pcr.fit)
## Data:    X dimension: 384 17 
##  Y dimension: 384 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3155     3124     1489     1494     1419     1204     1176
## adjCV         3155     3124     1487     1496     1418     1192     1173
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1141     1133     1094      1083      1083      1093      1097
## adjCV     1136     1131     1092      1079      1080      1090      1093
##        14 comps  15 comps  16 comps  17 comps
## CV         1089      1094     945.9     940.2
## adjCV      1086      1092     941.7     935.6
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      32.418    58.16    65.51    71.62    76.78    81.32    85.13    88.55
## Apps    3.652    78.23    78.59    81.35    86.69    86.88    87.80    87.87
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       91.16     93.38     95.40     97.30     98.25     99.09     99.54
## Apps    88.77     89.13     89.13     89.14     89.25     89.37     89.38
##       16 comps  17 comps
## X        99.89    100.00
## Apps     92.39     92.62
validationplot(pcr.fit, val.type="MSEP")

pcr.pred = predict(pcr.fit, College[test, ], ncomp=17) # ncomp could be set to a lower value: from the plot, the cross-validation MSE changes little beyond ncomp = 5. The same conclusion can be drawn from summary(), which reports the root mean squared error (sqrt(MSE)).

# if ncomp=p then I am applying a linear regression with OLS.

mean((pcr.pred - y.test)^2)
## [1] 1556804

The same PCR model can also be fit with the caret package:

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
## 
##     R2
## The following object is masked from 'package:purrr':
## 
##     lift
model <- train(
  Apps~., data = College[train,], method = "pcr",
  scale = TRUE,
  trControl = trainControl("cv", number = 17),
  tuneLength = 17
  )
# Plot model RMSE vs different values of components
plot(model)

# Print the best tuning parameter ncomp that
# minimize the cross-validation error, RMSE
model$bestTune
##    ncomp
## 16    16

Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

pls.fit=plsr(Apps~., data=College[train,] ,scale=TRUE , validation ="CV")

# values of the M selected by cross-validation.
summary(pls.fit)
## Data:    X dimension: 384 17 
##  Y dimension: 384 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3155     1374     1173     1066     1049     1015    979.7
## adjCV         3155     1371     1177     1063     1044     1009    973.3
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       962.6    952.6    943.4     945.7     949.3     945.7     942.7
## adjCV    956.7    947.6    939.0     941.1     944.5     940.6     937.9
##        14 comps  15 comps  16 comps  17 comps
## CV        942.1     942.2     942.1     942.1
## adjCV     937.4     937.5     937.4     937.4
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       26.11    46.54    63.46    67.66    71.77    75.06    79.36    82.90
## Apps    81.91    86.46    89.46    90.46    91.36    92.18    92.44    92.51
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       84.68     87.67     90.57     91.55     94.49     96.01     97.47
## Apps    92.55     92.57     92.58     92.62     92.62     92.62     92.62
##       16 comps  17 comps
## X        98.00    100.00
## Apps     92.62     92.62
validationplot(pls.fit, val.type="MSEP")

pls.pred = predict(pls.fit, College[test, ], ncomp=17) 
mean((pls.pred - y.test)^2)
## [1] 1556804
# Same considerations here as for PCR

Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

lm<-mean((lm.pred-y.test)^2)
ridge<-mean((ridge.pred - y.test)^2)
lasso<-mean((lasso.pred - y.test)^2)
pcr<-mean((pcr.pred - y.test)^2)
pls<-mean((pls.pred - y.test)^2)

MSE.comp<-c(lm,ridge,lasso,pcr,pls)

barplot(MSE.comp, names.arg = c("OLS", "Ridge", "Lasso", "PCR", "PLS"), ylab="MSE")

PCR and PLS are used here with all 17 components, so they are equivalent to OLS (hence the identical test MSEs). If we imposed a smaller number of components (based on an acceptable threshold from the validation plots), we would get a different MSE. The lasso returns a model with a test MSE very close to OLS, while ridge regression performs noticeably worse on this split.
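
For example (a sketch; 10 components is an arbitrary illustrative choice), the PCR test MSE with a reduced number of components can be computed as follows:

pcr.pred10 <- predict(pcr.fit, College[test, ], ncomp = 10)
mean((pcr.pred10 - y.test)^2)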

Exercise 6.10

We have seen that as the number of features used in a model increases, the training error will necessarily decrease, but the test error may not. We will now explore this in a simulated data set.

Generate a data set with p = 20 features, n = 1,000 observations, and an associated quantitative response vector generated according to the model \(Y = X\beta + \epsilon\),

where \(\beta\) has some elements that are exactly equal to zero.

set.seed(1)
p = 20
n = 1000
x = matrix(rnorm(n * p), n, p)
B = rnorm(p)
zeros<-rbinom(p,1,0.2) %>% as.logical() # set the position of the zeros
B[zeros]=0 # set the zeros
B
##  [1]  0.2353485  0.2448250 -0.6421869 -1.9348085  1.0386957  0.0000000
##  [7]  0.0000000  0.7231804  2.0310355  0.0000000  0.8791534  0.0000000
## [13] -0.2845811 -0.6746580 -0.7154889 -0.2705279  0.3129646  1.6698068
## [19]  0.8922593 -1.0154889
eps = rnorm(p) # note: this should be rnorm(n), so that the error vector has length n = 1000; as written, the 20 values are recycled across the observations
y = x %*% B + eps

Split your data set into a training set containing 100 observations and a test set containing 900 observations.

set.seed(1)
train<-sample(n, 100)
test<-(-train)

x.train<-x[train,]
y.train<-y[train]
x.test<-x[test,]
y.test<-y[test]

Perform best subset selection on the training set, and plot the training set MSE associated with the best model of each size.

regfit.train=regsubsets(y ~. ,data = data.frame(x = x.train, y = y.train), nvmax = p)
MSE.train<-(summary(regfit.train)$rss)/length(train)

plot(MSE.train,ylab = "Training MSE", pch = 19, type = "b")

Plot the test set MSE associated with the best model of each size.

predict.regsubsets = function(object,newdata,id,...){
  form = as.formula(object$call[[2]]) # Extract the formula used when we called regsubsets()
  mat = model.matrix(form,newdata)    # Build the model matrix
  coefi = coef(object,id=id)          # Extract the coefficients of the ith model
  xvars = names(coefi)                # Pull out the names of the predictors used in the ith model
  mat[,xvars]%*%coefi               # Make predictions using matrix multiplication
  }
# Iterates over each size i
pred<-matrix(NA, length(y.test), p) %>% as.data.frame()

for(i in 1:p){
  pred[,i]<-predict.regsubsets(regfit.train, data.frame(x = x.test, y = y.test),i)
}

MSE.test<-rep(NA,p)
for(i in 1:p){
  MSE.test[i]<-mean((pred[,i]-y.test)^2)
}
MSE.test
##  [1] 12.981049  9.303255  7.071847  6.591699  6.057322  4.527023  3.801184
##  [8]  3.196142  2.447528  1.957794  1.476790  1.366107  1.223271  1.276057
## [15]  1.222040  1.182931  1.207938  1.213847  1.222304  1.198599
plot(MSE.test,ylab = "Test MSE", pch = 15, type = "b")

# Alternative approaches, kept as commented-out notes (the first loop appears to be
# copied from the ISLR lab on the Hitters data, hence test$Salary):
#
# for(i in 1:19){
#     # Extract the vector of predictors in the best fit model on i predictors
#     coefi = coef(regfit_best_train, id = i)
#     # Make predictions using matrix multiplication of the test matrix and the coefficients vector
#     pred = test_mat[,names(coefi)]%*%coefi
#     # Calculate the MSE
#     val_errors[i] = mean((test$Salary-pred)^2)
# }
#
# for(i in 1:p){
#   coefi = coef(regfit.train,id=i)
#   xvars = names(coefi)
#   pred = df.test[,xvars]%*%coefi
#   val_errors[i] = mean((y.test-pred)^2)
# }

For which model size does the test set MSE take on its minimum value? Comment on your results. If it takes on its minimum value for a model containing only an intercept or a model containing all of the features, then play around with the way that you are generating the data in (a) until you come up with a scenario in which the test set MSE is minimized for an intermediate model size.

min.t<-which.min(MSE.test)
min.t
## [1] 16

Model size 16 gives the minimum test set MSE, which matches the number of non-zero coefficients in the true model. Note, however, that the test MSE is almost constant after size 15. Changing which coefficients are set to 0 in (a) would lead to a different best model size.
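
As a quick check (a sketch), the number of non-zero coefficients in the true model is:

sum(B != 0) # 16 of the 20 true coefficients are non-zero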

How does the model at which the test set MSE is minimized compare to the true model used to generate the data? Comment on the coefficient values.

addname<-coef(regfit.train, id = p) %>% names()
c<-coef(regfit.train, id = which.min(MSE.test))[-1]
names(B)<-addname[-1]
#list(coef(regfit.train, id = which.min(MSE.test))[-1], B)

coef.best <- rep(0, length(B))
names(coef.best)<-addname[-1]
where<- match(names(B), names(c)) %>% is.na() #locate the NAs
coef.best[!where]<- c # overwrite the vector full of zeros with corresponding value in c where there is no NA. 
data.frame(coef.best, B)
##       coef.best          B
## x.1   0.1663089  0.2353485
## x.2   0.1457268  0.2448250
## x.3  -0.6649714 -0.6421869
## x.4  -1.8301688 -1.9348085
## x.5   1.0105789  1.0386957
## x.6   0.0000000  0.0000000
## x.7   0.2137181  0.0000000
## x.8   0.6085251  0.7231804
## x.9   2.1001306  2.0310355
## x.10  0.0000000  0.0000000
## x.11  0.7196519  0.8791534
## x.12  0.0000000  0.0000000
## x.13 -0.4935891 -0.2845811
## x.14 -0.7317653 -0.6746580
## x.15 -0.8505321 -0.7154889
## x.16  0.0000000 -0.2705279
## x.17  0.2679864  0.3129646
## x.18  1.7239110  1.6698068
## x.19  0.8473220  0.8922593
## x.20 -0.8907173 -1.0154889

x.7 is 0 in B but not in the best subset model. x.16 is 0 in the best subset model but not in B.

Create a plot displaying \(\sqrt{\sum_{j=1}^p(\beta-\hat{\beta}_j^r)^2}\) for a range of values of r, where \(\hat{\beta}_j^r\) j is the jth coefficient estimate for the best model containing r coefficients. Comment on what you observe. How does this compare to the test MSE plot from (d)?

#names(B)<-c(paste("x",1:(length(B)),sep="")) # B has "true" parameter values
par.error<-vector()
for(i in 1:length(B)){
  var<-names(coef(regfit.train,id=i)[-1])
  # note: predictors left out of the i-variable model are ignored here; strictly, their estimates are 0,
  # so (B_j - 0)^2 should also be added for them, and the exercise asks for the square root of the sum.
  par.error[i]<-sum((B[names(B)%in%var]-coef(regfit.train,id=i)[-1])^2)
}
par.error
##  [1] 0.03079925 0.16889075 0.39947322 0.50245575 0.30303245 0.69388259
##  [7] 0.64594896 0.37105825 0.27345053 0.19534057 0.14378940 0.15281194
## [13] 0.14243178 0.20004695 0.18809258 0.20360212 0.23513938 0.24222532
## [19] 0.24600460 0.29255324
plot(par.error, xlab = "Number of coefficients", ylab = "Error between estimated and true coefficients")

Exercise 6.11

We will now try to predict percapita crime rate in the Boston dataset.

Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.

Best Subset Selection

data(Boston)

# I am comparing model of different size and I need to choose the "best" according to a specific criterion. I'll use cross validation as I can compare the selected model with the one from the lasso/Ridge without additional calculation 
set.seed(1)
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)

# For each fold
for (i in 1:k) {
    # Fit the model with each subset of predictors on the training part of the fold
    # first iteration k=1, row labeled with k=1 form the test set. The others the training set.
    best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
    # For each subset
    for (j in 1:p) {
        # Predict on the hold out part of the fold for that subset
        pred = predict(best.fit, Boston[folds == i, ], id = j)
        # Get the mean squared error for the model trained on the fold with the subset
        cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
    }
}
rmse.cv = apply(cv.errors, 2, mean) %>% sqrt()
min(rmse.cv)
## [1] 6.543281
which.min(rmse.cv)
## [1] 9
plot(rmse.cv, pch = 9, type = "b")

#coefficients of the best subset: re-run regsubsets() on the full data set and extract the coefficients of the model size identified by cross-validation (9 variables).
# Note that the variables selected during cross-validation (regsubsets applied to each training fold) might not be the same as the ones selected by applying regsubsets() to the full data set. Even if the selected variables are the same, the coefficient values will generally differ.
regsubsets(crim ~ ., data = Boston, nvmax = p) %>% coefficients(id=which.min(rmse.cv))
##   (Intercept)            zn         indus           nox           dis 
##  19.124636156   0.042788127  -0.099385948 -10.466490364  -1.002597606 
##           rad       ptratio         black         lstat          medv 
##   0.539503547  -0.270835584  -0.008003761   0.117805932  -0.180593877

Lasso Regression

X = model.matrix(crim~., Boston)[,-1] # creates design matrix (n x p). qualitative variable are turned into dummy variables.
lasso.cv = cv.glmnet (X, Boston$crim, alpha = 1, type.measure="mse")
#plot(lasso.cv)
bestlam.lasso = lasso.cv$lambda.min #value of lambda that gives minimum cvm.
bestlam.lasso
## [1] 0.2071268
#lasso.cv$cvm #The mean cross-validated error - a vector of length lasso.cv$lambda

min(lasso.cv$cvm) %>% sqrt()
## [1] 6.696065
coef(lasso.cv)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                    1
## (Intercept) 2.176491
## zn          .       
## indus       .       
## chas        .       
## nox         .       
## rm          .       
## age         .       
## dis         .       
## rad         0.150484
## tax         .       
## ptratio     .       
## black       .       
## lstat       .       
## medv        .

Ridge Regression

ridge.cv = cv.glmnet (X, Boston$crim, alpha = 0, type.measure="mse")
#plot(ridge.cv)
bestlam.ridge = ridge.cv$lambda.min #value of lambda that gives minimum cvm.
bestlam.ridge
## [1] 0.5374992
#ridge.cv$cvm #The mean cross-validated error - a vector of length ridge.cv$lambda

min(ridge.cv$cvm) %>% sqrt()
## [1] 6.582841
coef(ridge.cv)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  1.523899548
## zn          -0.002949852
## indus        0.029276741
## chas        -0.166526006
## nox          1.874769661
## rm          -0.142852604
## age          0.006207995
## dis         -0.094547258
## rad          0.045932737
## tax          0.002086668
## ptratio      0.071258052
## black       -0.002605281
## lstat        0.035745604
## medv        -0.023480540

Principal Component Regression

pcr.fit=pcr(crim~., data=Boston ,scale=TRUE , validation ="CV")
summary(pcr.fit)
## Data:    X dimension: 506 13 
##  Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            8.61    7.175    7.180    6.724    6.731    6.727    6.727
## adjCV         8.61    7.174    7.179    6.721    6.725    6.724    6.724
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       6.722    6.614    6.618     6.607     6.598     6.553     6.488
## adjCV    6.718    6.609    6.613     6.602     6.592     6.546     6.481
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       47.70    60.36    69.67    76.45    82.99    88.00    91.14    93.45
## crim    30.69    30.87    39.27    39.61    39.61    39.86    40.14    42.47
##       9 comps  10 comps  11 comps  12 comps  13 comps
## X       95.40     97.04     98.46     99.52     100.0
## crim    42.55     42.78     43.04     44.13      45.4

The 13-component PCR fit has the lowest CV/adjCV RMSEP; with all 13 components, PCR is equivalent to ordinary least squares.

Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross validation, or some other reasonable alternative, as opposed to using training error.

The model found with best subset selection seems preferable: its cross-validated RMSE is among the lowest and it uses a subset of only 9 predictors, whereas ridge regression performs no variable selection and the lasso (at the lambda.1se value used by default in coef()) keeps only one variable (rad).

Since p is not large, best subset selection is not very computationally expensive:

regsubsets(crim ~ ., data = Boston, nvmax = p) %>% system.time()
##    user  system elapsed 
##       0       0       0

Does your chosen model involve all of the features in the data set? Why or why not?

The chosen model does not involve all of the features: best subset selection keeps 9 of them, since the model with 9 predictors yields the lowest cross-validated RMSE.

Chapter 7

Exercise 7.6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

set.seed(1)
data(Wage)
head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
# cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models; if K is not specified it defaults to n (LOOCV).

cv.pe_poly<-rep(NA,5) # I am gonna try polynomial up to 5th degree.
for(i in seq_along(cv.pe_poly)){
  poly.fit<-glm(wage ~ poly(age,i), data = Wage) #fit the model with poly degree i
  cv.pe_poly[i]<-cv.glm(Wage, poly.fit, K=10)$delta[1]
  #save the prediction error estimate in vector cv.pe
}

plot(seq_along(cv.pe_poly), cv.pe_poly, xlab="Degree", ylab="CV error", type = "l")
points(which.min(cv.pe_poly),min(cv.pe_poly), col="red", pch=19)

The 5th-degree polynomial yields the minimum estimated cross-validation error. However, the plot shows that the curve flattens out after degree 3, so we might go for the cubic polynomial as a better trade-off between accuracy and simplicity.

pfit.1 = lm(wage~poly(age, 1), data=Wage)
pfit.2 = lm(wage~poly(age, 2), data=Wage)
pfit.3 = lm(wage~poly(age, 3), data=Wage)
pfit.4 = lm(wage~poly(age, 4), data=Wage)
pfit.5 = lm(wage~poly(age, 5), data=Wage)
anova(pfit.1,pfit.2,pfit.3,pfit.4,pfit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA analysis broadly confirms the results obtained from cross-validation: the p-value for the 4th-degree term is borderline (about 0.05) and the residual sum of squares decreases very little from degree 3 to degree 4.

plot(wage~age, data=Wage, col="lightblue")
age.grid = seq_range(Wage$age, by=1)
lm.fit3 = lm(wage~poly(age, 3), data=Wage)
lm.pred3 = predict(lm.fit3, data.frame(age=age.grid))
lines(age.grid, lm.pred3, col="blue", lwd=2)
legend("topright", legend="3rd poly degree",col="blue", lty=1, cex=0.8)

Fit a step function to predict wage using age, and perform cross validation to choose the optimal number of cuts. Make a plot of the fit obtained.

cv.pe_step<-rep(NA,10) # try up to 10 cuts; note that element i-1 stores the CV error for i intervals, so the optimal number of intervals is which.min(cv.pe_step) + 1
for(i in 2:length(cv.pe_step)){
  Wage$ageCut <- cut(Wage$age,i) # add a factor column to the data frame. for each observation it tells me the corresponding "bucket".
  step.fit<-glm(wage ~ ageCut, data = Wage) # fit the model with i cuts/knots
  #print(i)
  cv.pe_step[i-1]<-cv.glm(Wage, step.fit, K=10)$delta[1]
  # save the prediction error estimate in vector cv.pe_step
}

cv.pe_step<-na.omit(cv.pe_step)

plot(seq_along(cv.pe_step), cv.pe_step, xlab="Number of cuts", ylab="CV error", type = "l")
points(which.min(cv.pe_step),min(cv.pe_step), col="red", pch=19)

plot(wage~age, data=Wage, col="lightblue")
age.grid<-seq_range(Wage$age,by=1)
age.grid.cut = cut(age.grid,which.min(cv.pe_step))
Wage$ageCut <- cut(Wage$age, which.min(cv.pe_step))
lm.fit_step = glm(wage ~ ageCut, data=Wage)
lm.pred_step = predict(lm.fit_step, data.frame(ageCut=age.grid.cut)) # predict function needs a data frame as input with the same predictor names as glm. ageCut appears in glm() and predict().
lines(age.grid, lm.pred_step, col="red", lwd=2)

legend("topright", legend=paste(which.min(cv.pe_step), "cuts"),col="red", lty=1, cex=0.8)

Exercise 7.8

Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.

data(Auto)
#str(Auto)
#summary(Auto) to check data structure
Auto %>% select_if(is.numeric) %>% plot() # exclude variable name from the plot

The variable mpg decreases as cylinders, displacement, horsepower and weight increase, and the relationships with displacement, horsepower and weight (taken separately) do not appear linear. mpg increases with year: as year increases, the expected mpg increases as well, and this relationship looks roughly linear.

POLYNOMIAL

poly.fit = list()
poly.deg<-rep(NA,5) # I am gonna try polynomial up to 5th degree.
for(i in seq_along(poly.deg)){
  poly.fit[[i]]<-lm(mpg ~ poly(horsepower,i), data = Auto) #fit the model with poly degree i. I need [[]] because I need to access the object itself. [] extract a list. 
  
  #I have to feed anova function with a lm object or similar.
}
#poly.fit[[1]] %>% class()
anova(poly.fit[[1]],poly.fit[[2]],poly.fit[[3]],poly.fit[[4]],poly.fit[[5]])
## Analysis of Variance Table
## 
## Model 1: mpg ~ poly(horsepower, i)
## Model 2: mpg ~ poly(horsepower, i)
## Model 3: mpg ~ poly(horsepower, i)
## Model 4: mpg ~ poly(horsepower, i)
## Model 5: mpg ~ poly(horsepower, i)
##   Res.Df    RSS Df Sum of Sq        F    Pr(>F)    
## 1    390 9385.9                                    
## 2    389 7442.0  1   1943.89 103.8767 < 2.2e-16 ***
## 3    388 7426.4  1     15.59   0.8333  0.361897    
## 4    387 7399.5  1     26.91   1.4382  0.231169    
## 5    386 7223.4  1    176.15   9.4131  0.002306 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mpg~horsepower, data=Auto, col="lightblue")

hp.grid = seq_range(Auto$horsepower, by=1)
poly.anova=2
lm.fit = lm(mpg~poly(horsepower, poly.anova), data=Auto)
lm.pred = predict(lm.fit, data.frame(horsepower=hp.grid))
lines(hp.grid, lm.pred, col="blue", lwd=2)
legend("topright", legend=paste("poly degree",poly.anova),col="blue", lty=1, cex=0.8)

A quadratic polynomial seems to provide a good fit, and the ANOVA supports this: the p-values for the 3rd- and 4th-degree terms are well above the usual significance levels (the 5th-degree term is significant, but the reduction in RSS it brings is small).

set.seed(1)
cv.poly<-rep(NA,5) # I am gonna try polynomial up to 5th degree.
for(i in seq_along(cv.poly)){
  fitp<-glm(mpg ~ poly(horsepower,i), data = Auto) 
  cv.poly[i]<-cv.glm(Auto, fitp, K=10)$delta[1]
  #save the prediction error estimate in vector cv.poly
}

cv.poly
## [1] 24.21538 19.28327 19.12998 19.29201 19.09471
which.min(cv.poly)
## [1] 5

The 5th-degree polynomial gives the minimum cross-validation estimate of the prediction error. However, the values from the 2nd degree onward are very similar, so a quadratic polynomial is a good compromise between accuracy and simplicity.

STEP FUNCTION

set.seed(1)
cv.step<-rep(NA,10) # try up to 10 cuts (element i stores the CV error for i intervals; element 1 stays NA)
for(i in 2:length(cv.step)){
  Auto$hpCut <- cut(Auto$horsepower,i) # add a factor column to the data frame. for each observation it tells me the corresponding "bucket".
  fits<-glm(mpg ~ hpCut, data = Auto) #fit the model with i cuts/knots
  #print(i)
  cv.step[i]<-cv.glm(Auto, fits, K=10)$delta[1]
  #save the prediction error estimate in vector cv.step
}
plot(seq_along(cv.step), cv.step, xlab="Number of cuts", ylab="CV error", type = "l", xlim=c(2,length(cv.step)))
points(which.min(cv.step),min(cv.step, na.rm = T), col="red", pch=19)
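
For completeness, the step function selected by cross-validation can be overlaid on the data, as was done for Wage above; a minimal sketch (k, fit.best_step and ord are helper names introduced here, fitted() and order() are base R):

k <- which.min(cv.step)               # number of intervals chosen by CV
Auto$hpCut <- cut(Auto$horsepower, k) # re-cut horsepower into k intervals
fit.best_step <- glm(mpg ~ hpCut, data = Auto)
plot(mpg ~ horsepower, data = Auto, col = "lightblue")
ord <- order(Auto$horsepower)         # draw the fitted step function over the data
lines(Auto$horsepower[ord], fitted(fit.best_step)[ord], col = "red", lwd = 2)
legend("topright", legend = paste(k, "intervals"), col = "red", lty = 1, cex = 0.8)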

SPLINES

set.seed(1)
cv.sp = rep(NA, 10)
for (i in 2:length(cv.sp)) {
    fitsp = glm(mpg ~ ns(horsepower, df = i), data = Auto)
    cv.sp[i] = cv.glm(Auto, fitsp, K = 10)$delta[1]
}
which.min(cv.sp)
## [1] 9
fitsp<-glm(mpg ~ ns(horsepower, df = which.min(cv.sp)), data = Auto)
#Plotting the Regression Line to the scatterplot   
plot(mpg~horsepower,data=Auto,col="lightblue")
pred<-predict(fitsp ,newdata =data.frame(horsepower=hp.grid),se=T)
lines(hp.grid , pred$fit ,col ="red",lwd =2)
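
Since se=T was requested in predict(), the pointwise standard errors are available and approximate 95% bands can be added around the spline; a small sketch:

# approximate 95% pointwise confidence bands around the natural-spline fit
lines(hp.grid, pred$fit + 2 * pred$se.fit, col = "red", lty = "dashed")
lines(hp.grid, pred$fit - 2 * pred$se.fit, col = "red", lty = "dashed")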

GAM

fitgam1 = lm(mpg ~ ns(displacement, 1) + ns(horsepower, 1), data = Auto)
#summary(fitgam)
fitgam2 = lm(mpg ~ ns(displacement, 2) + ns(horsepower, 2), data = Auto)

anova(fitgam1,fitgam2)
## Analysis of Variance Table
## 
## Model 1: mpg ~ ns(displacement, 1) + ns(horsepower, 1)
## Model 2: mpg ~ ns(displacement, 2) + ns(horsepower, 2)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    389 7995.3                                  
## 2    387 6091.0  2    1904.4 60.499 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

All the models we have tried indicate that at least a quadratic term plays a significant role. The matrix scatterplot already suggested that the relationships between some of the variables are not linear.
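
As a cross-check, the same additive model could be fitted with smoothing splines via the gam package (assumed available here, since plot.Gam is used in Exercise 7.10 below); a sketch, with the choice of 4 df per term being arbitrary:

library(gam)
fitgam.s <- gam(mpg ~ s(displacement, 4) + s(horsepower, 4), data = Auto) # smoothing-spline terms, 4 df each
par(mfrow = c(1, 2))
plot(fitgam.s, se = TRUE, col = "blue") # both panels show clearly non-linear fitted functions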

Exercise 7.9

This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.

data(Boston)
range(Boston$dis)
## [1]  1.1296 12.1265
dis.grid<-seq_range(Boston$dis,n=100) #package modelr

fit.p3<-lm(nox~poly(dis,3), data=Boston)
summary(fit.p3)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
pred.p3<-predict(fit.p3, newdata=data.frame(dis=dis.grid))

plot(nox~dis, data=Boston, col="lightblue")
lines(dis.grid, pred.p3, col="red", lwd = 2)

Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.

rss.poly<-rep(NA,10) # try polynomials up to degree 10
poly.fit = list()
pred.pp = list()
for(i in seq_along(rss.poly)){
  poly.fit[[i]]<-lm(nox ~ poly(dis,i), data = Boston) 
  #rss.poly[i]<-deviance(poly.fit)
  rss.poly[i]<-sum(poly.fit[[i]]$residuals^2)
  pred.pp[[i]]<-predict(poly.fit[[i]], newdata=data.frame(dis=dis.grid))
}
rss.poly
##  [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
##  [9] 1.833331 1.832171

cols <- rainbow(10)
plot(nox~dis, data=Boston, col="lightblue")
for(i in seq_along(rss.poly)){
  lines(dis.grid, pred.pp[[i]], col= cols[i], lwd = 2)
}
legend("topright", legend=paste("degree",1:10), lwd=2, col=cols, cex = 0.5)

Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.

set.seed(1)
cv.poly<-rep(NA,10) 
for(i in seq_along(cv.poly)){
  fitp<-glm(nox ~ poly(dis,i), data = Boston) 
  cv.poly[i]<-cv.glm(Boston, fitp, K=10)$delta[1]
  
}
cv.poly
##  [1] 0.005558263 0.004085706 0.003876521 0.003863342 0.004237452 0.005686862
##  [7] 0.010278897 0.006810868 0.033308607 0.004075599
which.min(cv.poly)
## [1] 4

According to 10-fold cross-validation, a degree-4 polynomial yields the lowest estimated prediction error. A degree-3 polynomial yields a very similar estimate, while for most higher degrees the estimated error is considerably larger.
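
To see how little the two fits differ, the CV-selected degree-4 polynomial can be drawn next to the cubic fit from above; a minimal sketch reusing dis.grid and pred.p3 (fit.p4 and pred.p4 are helper names introduced here):

fit.p4 <- lm(nox ~ poly(dis, which.min(cv.poly)), data = Boston) # degree selected by CV
pred.p4 <- predict(fit.p4, newdata = data.frame(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "lightblue")
lines(dis.grid, pred.p3, col = "red", lwd = 2)   # cubic fit
lines(dis.grid, pred.p4, col = "blue", lwd = 2)  # CV-selected fit
legend("topright", legend = c("degree 3", "degree 4"), col = c("red", "blue"), lty = 1, cex = 0.8)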

Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.

fitsp4 = lm(nox ~ bs(dis, df = 4), data = Boston)
summary(fitsp4)
## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.124622 -0.039259 -0.008514  0.020850  0.193891 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.73447    0.01460  50.306  < 2e-16 ***
## bs(dis, df = 4)1 -0.05810    0.02186  -2.658  0.00812 ** 
## bs(dis, df = 4)2 -0.46356    0.02366 -19.596  < 2e-16 ***
## bs(dis, df = 4)3 -0.19979    0.04311  -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881    0.04551  -8.544  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared:  0.7164, Adjusted R-squared:  0.7142 
## F-statistic: 316.5 on 4 and 501 DF,  p-value: < 2.2e-16
# The bs() function generates the entire matrix of basis functions for splines with the specified set of knots.
# df option to produce a spline with knots at uniform quantiles of the data.
# fitting a cubic spline with K knots uses K+4 degrees of freedom.
attr(bs(Boston$dis, df = 4), "knots")
##     50% 
## 3.20745
plot(nox~dis,data=Boston,col="lightblue")
pred<-predict(fitsp4 ,newdata =data.frame(dis=dis.grid))
lines(dis.grid , pred ,col ="red",lwd =2)

The knots were chosen by setting the degrees of freedom: when knots are not specified explicitly, bs() places them at uniform quantiles of the data. By default bs() fits cubic splines, and the number of knots follows from the spline degree and the degrees of freedom. Counting the intercept, a cubic spline with K knots uses

df = K + 4

degrees of freedom, where K is the number of knots. The df argument of bs() excludes the intercept, so it equals K + 3; that is why df = 4 produces the single knot (at the median of dis) shown above.
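
This relationship can be verified by counting the interior knots bs() selects for a few values of df; a quick check:

# number of interior knots chosen by bs() for df = 4..7 (cubic spline, no intercept)
sapply(4:7, function(d) length(attr(bs(Boston$dis, df = d), "knots")))
# should return 1 2 3 4, i.e. df = K + 3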

Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.

deg<-rep(NA,20) # try up to 20 degrees of freedom
rss.sp<-rep(NA,length(deg))
#fitsp<-list()
degree<-3 # default degree in bs (cubic spline)
for(i in 3:length(deg)){ # bs() needs df >= degree, so start at 3
  fitsp = lm(nox ~ bs(dis, df = i), data = Boston)
  #fitsp[[i]] = lm(nox ~ bs(dis, df = i, degree = degree), data = Boston)
  rss.sp[i] = sum(fitsp$residuals^2)
}

which.min(rss.sp)
## [1] 19
min(rss.sp, na.rm = T)
## [1] 1.774487
plot(seq_along(rss.sp), rss.sp, xlab="Degree of freedom", ylab="RSS", type = "l", xlim = c(degree,length(rss.sp)))
points(which.min(rss.sp),min(rss.sp, na.rm = T), col="red", pch=19)

plot(nox~dis, data = Boston, col="lightblue")
fitsp = lm(nox ~ bs(dis, df = which.min(rss.sp)), data = Boston)
predsp<-predict(fitsp, newdata = data.frame(dis=dis.grid))
lines(dis.grid,predsp, col ="red",lwd =2)

A model with 19 degrees of freedom yields the lowest residual sum of squares. That said, the RSS plot shows that the curve flattens out after roughly df = 14.

Perform cross-validation or another approach in order to select the best degrees of freedom for a regression spline on this data. Describe your results.

set.seed(1)
cv.df<-rep(NA,20) 
for(i in 3:length(cv.df)){
  
  fitsp = glm(nox ~ bs(dis, df = i), data = Boston) 
  cv.df[i]<-cv.glm(Boston, fitsp, K=10)$delta[1]
  
}
cv.df
##  [1]          NA          NA 0.003897402 0.003890178 0.003724890 0.003697010
##  [7] 0.003740823 0.003709286 0.003727194 0.003715941 0.003708342 0.003701810
## [13] 0.003723050 0.003740607 0.003762967 0.003860353 0.003799018 0.003721269
## [19] 0.003861497 0.003830194
which.min(cv.df)
## [1] 6
plot(seq_along(cv.df), cv.df, lwd = 2, type = "l", xlab = "df", ylab = "CV error",xlim = c(3,length(cv.df)))
points(which.min(cv.df),min(cv.df,na.rm = T), col="red", pch=19)

According to cross-validation, 6 degrees of freedom yield the lowest estimated prediction error. The curve is more irregular than the one obtained from the RSS, and the estimated prediction error can increase considerably as the degrees of freedom grow.
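
For comparison with the df = 19 fit above, the CV-selected spline can be plotted as well; a minimal sketch (fitsp.cv is a helper name introduced here):

fitsp.cv <- lm(nox ~ bs(dis, df = which.min(cv.df)), data = Boston) # df = 6 selected by CV
plot(nox ~ dis, data = Boston, col = "lightblue")
lines(dis.grid, predict(fitsp.cv, newdata = data.frame(dis = dis.grid)), col = "red", lwd = 2)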

Exercise 7.10

This question relates to the College data set.

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

data(College)
set.seed(1)
train = sample(length(College$Outstate), length(College$Outstate)/2)
test = -train
College.train = College[train, ]
College.test = College[test, ]
reg = regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
reg.summary = summary(reg)

reg.summary$adjr2 %>% which.max()
## [1] 14
reg.summary$cp %>% which.min()
## [1] 14
reg.summary$bic %>% which.min()
## [1] 6
par(mfrow = c(1, 3))
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "adjr2", type = "l")
points(which.max(reg.summary$adjr2), max(reg.summary$adjr2), col="red", pch=19)

plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
points(which.min(reg.summary$cp), min(reg.summary$cp), col="red", pch=19)

plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), min(reg.summary$bic), col="red", pch=19)

BIC, Cp and adjusted R2 reach their optimal values at different numbers of variables. However, all three criteria show that beyond 6 variables the scores remain approximately constant (BIC already selects the 6-variable model as the best one). We will therefore use a model with 6 variables.
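
The coefficients of the 6-variable model chosen on the training set can be inspected directly from reg; the selected variables may or may not coincide with the full-data refit used in the next part:

coef(reg, id = 6) # 6-variable forward-stepwise model fitted on the training set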

Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

reg<- regsubsets(Outstate ~ ., data = College, method = "forward") # refit on the full data set just to extract the names of the 6 selected variables
coef(reg, id = 6) %>% names()
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "PhD"         "perc.alumni"
## [6] "Expend"      "Grad.Rate"
fit.gam<- lm(Outstate ~ Private + ns(Room.Board, df = 3) + ns(PhD, df = 3) + ns(perc.alumni, df = 3) + ns(Expend, df = 3) + ns(Grad.Rate, df = 3), data = College.train)
par(mfrow = c(2, 3))
plot.Gam(fit.gam, se=T, col = "blue")

plot.Gam shows the relationship between each predictor and the response; each panel displays the fitted function together with pointwise standard error bands.

Some interpretations from the plot.Gam:

Out-of-state tuition increases roughly linearly with perc.alumni (percentage of alumni who donate), holding the other variables constant.

Out-of-state tuition increases with Room.Board (room and board costs), holding the other variables constant; the increase is not constant across the range.

Out-of-state tuition is higher for private universities, holding the other variables constant.

Similar considerations can be made for the other plots.

Evaluate the model obtained on the test set, and explain the results obtained.

pred.test<-predict(fit.gam, newdata=College.test)
err.test<-mean((College.test$Outstate - pred.test)^2)
err.test
## [1] 3460135
mean(fit.gam$residuals^2)
## [1] 3660175

For this particular split the test MSE (about 3.46 million) is actually slightly lower than the training MSE (about 3.66 million); the two are close, which suggests the GAM is not overfitting the training data.
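
To put the test MSE on an interpretable scale, we can also compute the test-set R2 (tss.test is a helper name introduced here):

# proportion of out-of-state tuition variance explained on the test set
tss.test <- mean((College.test$Outstate - mean(College.test$Outstate))^2)
1 - err.test / tss.test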

For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(fit.gam)
## 
## Call:
## lm(formula = Outstate ~ Private + ns(Room.Board, df = 3) + ns(PhD, 
##     df = 3) + ns(perc.alumni, df = 3) + ns(Expend, df = 3) + 
##     ns(Grad.Rate, df = 3), data = College.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6998.1 -1154.5   -89.8  1280.9  7579.4 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2365.4     1470.5   1.609 0.108562    
## PrivateYes                 2497.9      295.7   8.446 6.90e-16 ***
## ns(Room.Board, df = 3)1    3089.2      539.0   5.731 2.07e-08 ***
## ns(Room.Board, df = 3)2    7146.3     1634.6   4.372 1.60e-05 ***
## ns(Room.Board, df = 3)3    3966.8      770.3   5.149 4.25e-07 ***
## ns(PhD, df = 3)1            438.2      605.5   0.724 0.469690    
## ns(PhD, df = 3)2          -1512.5     2106.5  -0.718 0.473198    
## ns(PhD, df = 3)3            582.6      681.9   0.854 0.393450    
## ns(perc.alumni, df = 3)1   1492.8      537.9   2.775 0.005792 ** 
## ns(perc.alumni, df = 3)2   2658.1     1300.0   2.045 0.041595 *  
## ns(perc.alumni, df = 3)3   2496.4      910.3   2.742 0.006396 ** 
## ns(Expend, df = 3)1       10793.9     1103.8   9.779  < 2e-16 ***
## ns(Expend, df = 3)2        6577.3     1556.6   4.226 3.00e-05 ***
## ns(Expend, df = 3)3        3718.5     1468.2   2.533 0.011729 *  
## ns(Grad.Rate, df = 3)1     2045.5      557.3   3.671 0.000277 ***
## ns(Grad.Rate, df = 3)2     2467.2     1670.8   1.477 0.140612    
## ns(Grad.Rate, df = 3)3     1859.7      561.2   3.313 0.001012 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1956 on 371 degrees of freedom
## Multiple R-squared:  0.7968, Adjusted R-squared:  0.7881 
## F-statistic: 90.94 on 16 and 371 DF,  p-value: < 2.2e-16
fit.lm<-lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate, data = College.train) # purely linear fit with the same predictors, used for comparison below

Looking at the p-values in the summary table, the relationship between Expend and Outstate does not appear to be linear. Similar conclusions were drawn from the plot.Gam output above.
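
Since a linear function of each predictor lies in the span of the corresponding ns() basis, fit.lm is nested in fit.gam and the two fits can be compared formally with an F-test; a sketch:

anova(fit.lm, fit.gam) # tests whether the spline terms improve significantly on the purely linear model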