Week 6: Class 14/03/2023

Exercise on overfitting

Using the data “Credit”

Analyze, graphically, the relationship between Education and Income
Suggest which model you could train to better fit the relationship
Fit the following models and analyze their \(\mathrm{R}^2\)

\[ Income_i=\beta_0+\beta_1 Education_i+u_i \]

\[ Income_i=\beta_0+\beta_1 Education_i+\beta_2 Education_i^2+u_i \]

\[ Income_i=\beta_0+\beta_1 Education_i+\beta_2 Education_i^2+\beta_3 Education_i^3+u_i \]

\[ Income_i=\beta_0+\beta_1 Education_i+\beta_2 Education_i^2+\beta_3 Education_i^3+\beta_4 Education_i^4+u_i \]

\[ Income_i=\beta_0+\beta_1 Education_i+\beta_2 Education_i^2+\beta_3 Education_i^3+\beta_4 Education_i^4+...+\beta_7 Education_i^7+u_i \]

\[ Income_i=\beta_0+\beta_1 Education_i+\beta_2 Education_i^2+\beta_3 Education_i^3+\beta_4 Education_i^4+...+\beta_7 Education_i^7+...+\beta_{11} Education_i^{11}+u_i \]

library(ISLR)
attach(Credit)

# Scatter plot of Income against Age.
# NOTE(review): the exercise text above asks about Education, but this worked
# solution uses Age as the predictor throughout.
plot(Income ~ Age, data = Credit)

# Polynomial regressions of Income on Age with degrees 1, 2, 3, 4, 7 and 11.
# `data = Credit` is passed explicitly so the fits do not depend on which
# objects happen to be attached to the search path.
mod1 <- lm(Income ~ Age, data = Credit)
mod2 <- lm(Income ~ Age + I(Age^2), data = Credit)
mod3 <- lm(Income ~ Age + I(Age^2) + I(Age^3), data = Credit)
mod4 <- lm(Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4), data = Credit)
mod5 <- lm(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
    I(Age^7),
  data = Credit
)
mod6 <- lm(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
    I(Age^7) + I(Age^8) + I(Age^9) + I(Age^10) + I(Age^11),
  data = Credit
)

# Coefficient table and fit statistics for the linear (degree-1) model.
summary(mod1)

## 
## Call:
## lm(formula = Income ~ Age)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.24 -23.49 -10.69  10.29 146.67 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.2762     5.8755   4.302 2.13e-05 ***
## Age           0.3582     0.1008   3.553 0.000426 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.74 on 398 degrees of freedom
## Multiple R-squared:  0.03074,    Adjusted R-squared:  0.02831 
## F-statistic: 12.62 on 1 and 398 DF,  p-value: 0.0004265

# Coefficient table and fit statistics for the quadratic model.
summary(mod2)

## 
## Call:
## lm(formula = Income ~ Age + I(Age^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44.61 -23.29 -11.60  11.82 147.76 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 64.228025  17.935851   3.581 0.000385 ***
## Age         -1.186126   0.679755  -1.745 0.081771 .  
## I(Age^2)     0.013847   0.006028   2.297 0.022133 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.56 on 397 degrees of freedom
## Multiple R-squared:  0.04346,    Adjusted R-squared:  0.03864 
## F-statistic: 9.018 on 2 and 397 DF,  p-value: 0.0001479

# Coefficient table and fit statistics for the cubic model.
summary(mod3)

## 
## Call:
## lm(formula = Income ~ Age + I(Age^2) + I(Age^3))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45.48 -22.55 -10.02  13.28 141.90 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.880e+02  4.835e+01  -3.888 0.000119 ***
## Age          1.429e+01  2.847e+00   5.019 7.86e-07 ***
## I(Age^2)    -2.795e-01  5.283e-02  -5.290 2.03e-07 ***
## I(Age^3)     1.741e-03  3.116e-04   5.586 4.33e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.31 on 396 degrees of freedom
## Multiple R-squared:  0.1133, Adjusted R-squared:  0.1066 
## F-statistic: 16.87 on 3 and 396 DF,  p-value: 2.484e-10

# Coefficient table and fit statistics for the degree-4 model.
summary(mod4)

## 
## Call:
## lm(formula = Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43.27 -21.99 -10.35  12.17 141.01 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.732e+01  1.307e+02   0.133   0.8946  
## Age         -2.574e+00  1.038e+01  -0.248   0.8042  
## I(Age^2)     2.079e-01  2.932e-01   0.709   0.4787  
## I(Age^3)    -4.177e-03  3.516e-03  -1.188   0.2355  
## I(Age^4)     2.568e-05  1.520e-05   1.690   0.0919 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.24 on 395 degrees of freedom
## Multiple R-squared:  0.1197, Adjusted R-squared:  0.1108 
## F-statistic: 13.43 on 4 and 395 DF,  p-value: 2.87e-10

# Coefficient table and fit statistics for the degree-7 model.
summary(mod5)

## 
## Call:
## lm(formula = Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + 
##     I(Age^5) + I(Age^6) + I(Age^7))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.27 -22.00 -10.20  12.76 138.54 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.236e+03  3.112e+03   1.683   0.0933 .
## Age         -7.789e+02  4.446e+02  -1.752   0.0806 .
## I(Age^2)     4.772e+01  2.624e+01   1.819   0.0697 .
## I(Age^3)    -1.557e+00  8.306e-01  -1.874   0.0616 .
## I(Age^4)     2.933e-02  1.526e-02   1.922   0.0553 .
## I(Age^5)    -3.204e-04  1.631e-04  -1.964   0.0502 .
## I(Age^6)     1.882e-06  9.407e-07   2.001   0.0461 *
## I(Age^7)    -4.593e-09  2.263e-09  -2.030   0.0431 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.17 on 392 degrees of freedom
## Multiple R-squared:  0.1299, Adjusted R-squared:  0.1144 
## F-statistic: 8.364 on 7 and 392 DF,  p-value: 1.553e-09

# Draw the six fitted polynomial curves, one per panel, over the Age/Income
# scatter. The previous graphics settings are saved and restored afterwards so
# that later plots are not forced into the 3 x 2 layout.
old_par <- par(mfrow = c(3, 2))

# Fitted values follow the row order of Credit; reorder them by Age so that
# lines() draws each curve from left to right.
age_order <- order(Credit$Age)
fits <- list(mod1, mod2, mod3, mod4, mod5, mod6)
fit_cols <- c("green", "red", "orange", "purple", "pink", "yellow")

for (i in seq_along(fits)) {
  plot(Credit$Age, Credit$Income, xlab = "Age", ylab = "Income")
  lines(
    Credit$Age[age_order],
    fitted(fits[[i]])[age_order],
    col = fit_cols[i]
  )
}

par(old_par)

As you can see, the \(R^2\) increases with each additional polynomial term. But:

How do you interpret models 1, 2, and 3? Which of these models makes sense in economic and social terms?

Perform a cross-validation scheme. Which model predicts better?

library(ISLR)
# Credit was already attached earlier in this document; attaching it a second
# time only stacks another copy on the search path and triggers the "objects
# are masked" message. The train() calls below pass data = Credit explicitly.

## The following objects are masked from Credit (pos = 3):
## 
##     Age, Balance, Cards, Education, Ethnicity, Gender, ID, Income,
##     Limit, Married, Rating, Student

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(ggplot2)
library(lattice)


# Same six polynomial specifications (degrees 1, 2, 3, 4, 7, 11) as the
# in-sample fits earlier in the document. `data = Credit` is passed explicitly
# so the fits do not rely on attached variables.
mod1 <- lm(Income ~ Age, data = Credit)
mod2 <- lm(Income ~ Age + I(Age^2), data = Credit)
mod3 <- lm(Income ~ Age + I(Age^2) + I(Age^3), data = Credit)
mod4 <- lm(Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4), data = Credit)
mod5 <- lm(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
    I(Age^7),
  data = Credit
)
mod6 <- lm(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
    I(Age^7) + I(Age^8) + I(Age^9) + I(Age^10) + I(Age^11),
  data = Credit
)


# 20-fold cross-validation settings shared by every train() call below.
# The folds are drawn once from a fixed seed and handed over through `index`,
# so all candidate models are scored on identical train/hold-out splits and
# the RMSE comparison is reproducible from run to run.
# The former `p = 0.75` argument is dropped: trainControl() only uses `p` for
# leave-group-out cross-validation, not for method = "cv".
set.seed(20230314)
cv_folds <- createFolds(Credit$Income, k = 20, returnTrain = TRUE)
train_control <- trainControl(
  method = "cv",
  number = 20,
  index = cv_folds,
  savePredictions = TRUE
)

# Degree-1 model: resampled error of Income ~ Age under the settings stored in
# train_control.
model1_cv <- train(
  Income ~ Age,
  data = Credit,
  method = "lm",
  trControl = train_control
)
print(model1_cv)

## Linear Regression 
## 
## 400 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 380, 380, 380, 380, 380, 380, ... 
## Resampling results:
## 
##   RMSE      Rsquared    MAE     
##   34.25382  0.06069146  25.61557
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Degree-2 model: resampled error of the quadratic specification under the
# settings stored in train_control.
model2_cv <- train(
  Income ~ Age + I(Age^2),
  data = Credit,
  method = "lm",
  trControl = train_control
)
print(model2_cv)

## Linear Regression 
## 
## 400 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 380, 380, 380, 380, 380, 380, ... 
## Resampling results:
## 
##   RMSE      Rsquared    MAE    
##   34.01322  0.08911568  25.6452
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Degree-3 model: resampled error of the cubic specification under the
# settings stored in train_control.
model3_cv <- train(
  Income ~ Age + I(Age^2) + I(Age^3),
  data = Credit,
  method = "lm",
  trControl = train_control
)
print(model3_cv)

## Linear Regression 
## 
## 400 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 380, 380, 380, 380, 380, 380, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   32.37296  0.1702583  24.75844
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Degree-4 model: resampled error of the quartic specification under the
# settings stored in train_control.
model4_cv <- train(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4),
  data = Credit,
  method = "lm",
  trControl = train_control
)
print(model4_cv)

## Linear Regression 
## 
## 400 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 380, 380, 380, 380, 380, 380, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   32.68155  0.143395  24.70603
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Cross-validated counterpart of mod5. mod5 (and the exercise statement) is a
# degree-7 polynomial, so the formula goes up to Age^7; the earlier version
# stopped at Age^5 and therefore scored a different model from the one whose
# in-sample R-squared is reported above.
model5_cv <- train(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
    I(Age^7),
  data = Credit,
  method = "lm",
  trControl = train_control
)
model5_cv

## Linear Regression 
## 
## 400 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 380, 380, 380, 380, 380, 380, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   33.20629  0.177188  24.92997
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Cross-validated counterpart of mod6. mod6 (and the exercise statement) is a
# degree-11 polynomial, so the formula goes up to Age^11; the earlier version
# stopped at Age^6 and therefore scored a different model from mod6.
# NOTE(review): raw powers this high are strongly collinear; if lm() warns
# about a rank-deficient fit, poly(Age, 11) is a steadier way to write the
# same specification.
model6_cv <- train(
  Income ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
    I(Age^7) + I(Age^8) + I(Age^9) + I(Age^10) + I(Age^11),
  data = Credit,
  method = "lm",
  trControl = train_control
)
model6_cv

## Linear Regression 
## 
## 400 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 380, 380, 380, 380, 380, 380, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   33.89708  0.1335464  25.22238
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# One-row table collecting the cross-validated RMSE of the six candidate
# models, one column per model.
Errors <- data.frame(
  model1_cv.results.RMSE = model1_cv$results$RMSE,
  model2_cv.results.RMSE = model2_cv$results$RMSE,
  model3_cv.results.RMSE = model3_cv$results$RMSE,
  model4_cv.results.RMSE = model4_cv$results$RMSE,
  model5_cv.results.RMSE = model5_cv$results$RMSE,
  model6_cv.results.RMSE = model6_cv$results$RMSE
)

print(Errors)

##   model1_cv.results.RMSE model2_cv.results.RMSE model3_cv.results.RMSE
## 1               34.25382               34.01322               32.37296
##   model4_cv.results.RMSE model5_cv.results.RMSE model6_cv.results.RMSE
## 1               32.68155               33.20629               33.89708

Lesson 1: a better in-sample fit doesn’t mean a better forecasting performance

Lesson 2: Even with a good forecasting performance, you have to check the meaning of the model (sometimes you’ll prefer a more interpretable model)

Question of today’s class: how do I choose data transformation?

One interesting question is: when we have a predictive question and data, how should we transform the data to get a better model? There are different approaches. Here we’ll follow an interesting one: use graphical analysis whenever you feel it is possible. Let’s see different examples.

Idea 1 Keep a linear model if you don’t find strong evidence of a non-linear relationship (KISS principle: keep it as simple as possible)

Idea 2

If you need to estimate elasticity, use logarithms (because you have a purpose to do it)

Idea 3

Use graphical analysis. Some examples with the advertising data set:

library(readr)
# Load the advertising data: columns TV, radio, newspaper and sales, plus an
# unnamed first column that readr renames to `...1` (see the message below).
Advertising <- read_csv("Advertising.csv")

## New names:
## Rows: 200 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (5): ...1, TV, radio, newspaper, sales
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# View() opens an interactive data viewer, so it is only called when the code
# is run interactively; scripted or batch runs skip it.
if (interactive()) {
  View(Advertising)
}

# Scatter plot of sales against TV advertising.
plot(Advertising$TV, Advertising$sales)

As you can see, it looks like a logarithmic relationship

\[ sales_i=\beta_0+\beta_1 \log TV_i+u_i, \]

However, as we can see, the error grows with the values of advertising. This is called “Heteroskedasticity”, meaning that the variance of the error is not constant and it is “heterogeneous”

In this case, it usually works well to transform both sides using logarithms:

\[ \log sales_i=\beta_0+\beta_1 \log TV_i+u_i, \]

library(ISLR)
# attach(Credit) is not repeated here: the code that follows only uses the
# Advertising data, and re-attaching Credit just adds another copy to the
# search path and prints the "objects are masked" messages.

## The following objects are masked from Credit (pos = 7):
## 
##     Age, Balance, Cards, Education, Ethnicity, Gender, ID, Income,
##     Limit, Married, Rating, Student

## The following objects are masked from Credit (pos = 8):
## 
##     Age, Balance, Cards, Education, Ethnicity, Gender, ID, Income,
##     Limit, Married, Rating, Student

# Re-read the same Advertising.csv file into `Advertising` (read_csv comes
# from readr, loaded earlier in the document).
Advertising <- read_csv("Advertising.csv")

## New names:
## • `` -> `...1`

## Rows: 200 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): ...1, TV, radio, newspaper, sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Log-log scatter: log(sales) against log(TV). Axis labels are set to the
# same expressions plot() would derive on its own.
log_tv <- log(Advertising$TV)
log_sales <- log(Advertising$sales)
plot(
  log_tv,
  log_sales,
  xlab = "log(Advertising$TV)",
  ylab = "log(Advertising$sales)"
)

As you can see, this transformation allows us to use a linear model.

Week 6: Class 16/03/2023

Exercise on overfitting

Using the data “Advertising”

Run the model \[ \log(sales_i)=\beta_0+\beta_1 \log(TV_i)+\beta_2 \log(radio_i+1)+\beta_3 \log(newspaper_i)+u_i \]

How do you interpret all the elements of the output? What should we do, in your opinion, with a non-significant variable?

Use the “predict” function on the estimation sample. Compare the result with the fitted values.
Use the “predict” function to forecast sales if we invest \(TV=100\), \(radio=40\) and \(newspaper=24.2\). How do you interpret the output?

#a) 

library(readr)
# Part a): load Advertising.csv (columns TV, radio, newspaper, sales and an
# unnamed index column that readr renames to `...1`).
Advertising <- read_csv("Advertising.csv")

## New names:
## Rows: 200 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (5): ...1, TV, radio, newspaper, sales
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# View() opens an interactive data viewer, so it is only called when the code
# is run interactively; scripted or batch runs skip it.
if (interactive()) {
  View(Advertising)
}

# Log-log regression of sales on the three advertising budgets. radio enters
# as log(radio + 1) (presumably because radio contains zeros; confirm against
# the data).
m1 <- lm(
  log(sales) ~ log(TV) + log(radio + 1) + log(newspaper),
  data = Advertising
)
summary(m1)

## 
## Call:
## lm(formula = log(sales) ~ log(TV) + log(radio + 1) + log(newspaper), 
##     data = Advertising)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47874 -0.06376 -0.01975  0.04522  0.35822 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.297808   0.043143   6.903 6.91e-11 ***
## log(TV)        0.351882   0.006977  50.436  < 2e-16 ***
## log(radio + 1) 0.202757   0.007885  25.714  < 2e-16 ***
## log(newspaper) 0.011780   0.007350   1.603    0.111    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09881 on 196 degrees of freedom
## Multiple R-squared:  0.944,  Adjusted R-squared:  0.9431 
## F-statistic:  1101 on 3 and 196 DF,  p-value: < 2.2e-16

We should be cautious with non-significant variables. A variable can be non-significant because its standard error is high, even if its effect is relevant. We recommend running cross-validation with and without the “non-significant” variable and then making a decision.

#b) 

library(readr)
# Part b): re-read the same Advertising.csv file into `Advertising`.
Advertising <- read_csv("Advertising.csv")

## New names:
## Rows: 200 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (5): ...1, TV, radio, newspaper, sales
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Rebuild the regressors of the estimation sample as a plain data frame and
# predict from m1 on them; the values are on the log(sales) scale.
predictors <- with(
  Advertising,
  data.frame(TV = TV, radio = radio, newspaper = newspaper)
)
predict(m1, newdata = predictors)

##         1         2         3         4         5         6         7         8 
## 3.0032021 2.4276993 2.1290192 2.8716735 2.6750109 1.9026841 2.4745431 2.6252954 
##         9        10        11        12        13        14        15        16 
## 1.2843731 2.4575302 2.1988071 2.8561252 2.1896429 2.3689129 2.9288265 2.9885959 
##        17        18        19        20        21        22        23        24 
## 2.5727301 3.0806822 2.4450423 2.7411503 2.9206498 2.6263432 1.8249841 2.8321409 
##        25        26        27        28        29        30        31        32 
## 2.3152216 2.5983658 2.7653759 2.8460100 2.9522501 2.4139196 3.0256263 2.5945111 
##        33        34        35        36        37        38        39        40 
## 2.1341392 2.8651240 2.1039013 2.6493327 3.0536093 2.6554711 2.3374806 2.9903674 
##        41        42        43        44        45        46        47        48 
## 2.8456079 2.8796304 2.9848345 2.6670027 2.1422520 2.7961461 2.4064860 3.0208061 
##        49        50        51        52        53        54        55        56 
## 2.8251714 2.3346357 2.4896697 2.4134562 2.9944463 2.9596379 2.9789918 3.0032836 
##        57        58        59        60        61        62        63        64 
## 1.7246040 2.6695152 3.0190598 2.8997648 1.9570221 3.0692893 2.8326715 2.6463591 
##        65        66        67        68        69        70        71        72 
## 2.8195410 2.2593296 2.1785410 2.6179989 2.9299743 3.0008578 2.9038180 2.5449859 
##        73        74        75        76        77        78        79        80 
## 2.2047994 2.4352066 2.8727713 2.1160862 1.6934399 2.7013646 1.6132251 2.4461243 
##        81        82        83        84        85        86        87        88 
## 2.5335753 2.5988946 2.4796396 2.6007880 2.9939141 2.8005456 2.5349764 2.7587934 
##        89        90        91        92        93        94        95        96 
## 2.5895668 2.7858525 2.4082078 1.7047824 2.9580148 3.0272605 2.5206186 2.8440674 
##        97        98        99       100       101       102       103       104 
## 2.4838112 2.7977071 3.1029519 2.8306696 2.5837191 3.0887455 2.8049491 2.7624942 
##       105       106       107       108       109       110       111       112 
## 2.9659423 2.8617577 1.9742500 1.9730024 1.3094819 2.9432121 2.7023645 3.0086778 
##       113       114       115       116       117       118       119       120 
## 2.6940871 2.8296169 2.6575327 2.5908070 2.6259527 1.9744806 2.7872802 1.9522578 
##       121       122       123       124       125       126       127       128 
## 2.7592592 2.0094358 2.4825581 2.7453889 2.9721111 2.4253389 1.8142776 1.8667831 
##       129       130       131       132       133       134       135       136 
## 3.0030975 2.3005735 0.9487481 2.5817315 1.7325039 2.9582282 2.3626608 2.4723271 
##       137       138       139       140       141       142       143       144 
## 2.2130314 3.0096899 2.3243864 2.9121919 2.4256311 2.9307025 2.9555274 2.3614510 
##       145       146       147       148       149       150       151       152 
## 2.5073873 2.2791948 2.6810598 3.0688516 2.3614113 2.3373322 2.8717211 2.4854504 
##       153       154       155       156       157       158       159       160 
## 2.8360862 2.9019017 2.7942107 1.3285349 2.7118970 2.2669503 1.9451971 2.6581510 
##       161       162       163       164       165       166       167       168 
## 2.7485449 2.6409375 2.7774304 2.8513357 2.5523197 2.5708977 2.0898339 2.5788240 
##       169       170       171       172       173       174       175       176 
## 2.8854195 2.8047749 2.2224083 2.7646887 1.9964778 2.5558455 2.5302549 3.1134259 
##       177       178       179       180       181       182       183       184 
## 2.9714824 2.5883092 2.5557799 2.6157468 2.3607577 2.6086938 2.1411221 3.1076267 
##       185       186       187       188       189       190       191       192 
## 2.9155588 2.9826523 2.3034703 2.8679274 2.8511793 1.8870548 2.3704583 2.3409216 
##       193       194       195       196       197       198       199       200 
## 1.6698963 2.8760129 2.8112953 1.9243518 2.2817834 2.6139252 3.0970805 2.6986382

# In-sample fitted values of m1, for comparison with the predictions above.
fitted(m1)

##         1         2         3         4         5         6         7         8 
## 3.0032021 2.4276993 2.1290192 2.8716735 2.6750109 1.9026841 2.4745431 2.6252954 
##         9        10        11        12        13        14        15        16 
## 1.2843731 2.4575302 2.1988071 2.8561252 2.1896429 2.3689129 2.9288265 2.9885959 
##        17        18        19        20        21        22        23        24 
## 2.5727301 3.0806822 2.4450423 2.7411503 2.9206498 2.6263432 1.8249841 2.8321409 
##        25        26        27        28        29        30        31        32 
## 2.3152216 2.5983658 2.7653759 2.8460100 2.9522501 2.4139196 3.0256263 2.5945111 
##        33        34        35        36        37        38        39        40 
## 2.1341392 2.8651240 2.1039013 2.6493327 3.0536093 2.6554711 2.3374806 2.9903674 
##        41        42        43        44        45        46        47        48 
## 2.8456079 2.8796304 2.9848345 2.6670027 2.1422520 2.7961461 2.4064860 3.0208061 
##        49        50        51        52        53        54        55        56 
## 2.8251714 2.3346357 2.4896697 2.4134562 2.9944463 2.9596379 2.9789918 3.0032836 
##        57        58        59        60        61        62        63        64 
## 1.7246040 2.6695152 3.0190598 2.8997648 1.9570221 3.0692893 2.8326715 2.6463591 
##        65        66        67        68        69        70        71        72 
## 2.8195410 2.2593296 2.1785410 2.6179989 2.9299743 3.0008578 2.9038180 2.5449859 
##        73        74        75        76        77        78        79        80 
## 2.2047994 2.4352066 2.8727713 2.1160862 1.6934399 2.7013646 1.6132251 2.4461243 
##        81        82        83        84        85        86        87        88 
## 2.5335753 2.5988946 2.4796396 2.6007880 2.9939141 2.8005456 2.5349764 2.7587934 
##        89        90        91        92        93        94        95        96 
## 2.5895668 2.7858525 2.4082078 1.7047824 2.9580148 3.0272605 2.5206186 2.8440674 
##        97        98        99       100       101       102       103       104 
## 2.4838112 2.7977071 3.1029519 2.8306696 2.5837191 3.0887455 2.8049491 2.7624942 
##       105       106       107       108       109       110       111       112 
## 2.9659423 2.8617577 1.9742500 1.9730024 1.3094819 2.9432121 2.7023645 3.0086778 
##       113       114       115       116       117       118       119       120 
## 2.6940871 2.8296169 2.6575327 2.5908070 2.6259527 1.9744806 2.7872802 1.9522578 
##       121       122       123       124       125       126       127       128 
## 2.7592592 2.0094358 2.4825581 2.7453889 2.9721111 2.4253389 1.8142776 1.8667831 
##       129       130       131       132       133       134       135       136 
## 3.0030975 2.3005735 0.9487481 2.5817315 1.7325039 2.9582282 2.3626608 2.4723271 
##       137       138       139       140       141       142       143       144 
## 2.2130314 3.0096899 2.3243864 2.9121919 2.4256311 2.9307025 2.9555274 2.3614510 
##       145       146       147       148       149       150       151       152 
## 2.5073873 2.2791948 2.6810598 3.0688516 2.3614113 2.3373322 2.8717211 2.4854504 
##       153       154       155       156       157       158       159       160 
## 2.8360862 2.9019017 2.7942107 1.3285349 2.7118970 2.2669503 1.9451971 2.6581510 
##       161       162       163       164       165       166       167       168 
## 2.7485449 2.6409375 2.7774304 2.8513357 2.5523197 2.5708977 2.0898339 2.5788240 
##       169       170       171       172       173       174       175       176 
## 2.8854195 2.8047749 2.2224083 2.7646887 1.9964778 2.5558455 2.5302549 3.1134259 
##       177       178       179       180       181       182       183       184 
## 2.9714824 2.5883092 2.5557799 2.6157468 2.3607577 2.6086938 2.1411221 3.1076267 
##       185       186       187       188       189       190       191       192 
## 2.9155588 2.9826523 2.3034703 2.8679274 2.8511793 1.8870548 2.3704583 2.3409216 
##       193       194       195       196       197       198       199       200 
## 1.6698963 2.8760129 2.8112953 1.9243518 2.2817834 2.6139252 3.0970805 2.6986382

#c) 

library(readr)
# Part c): re-read the same Advertising.csv file into `Advertising`.
Advertising <- read_csv("Advertising.csv")

## New names:
## Rows: 200 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (5): ...1, TV, radio, newspaper, sales
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Part c): forecast for a single spending scenario.
predictors <- data.frame(TV = 100, radio = 40, newspaper = 24.2)

# m1 models log(sales), so the point forecast and its prediction interval
# (95% by default) come back on the log scale. `interval` is spelled out in
# full instead of relying on partial matching of "predict".
log_forecast <- predict(m1, newdata = predictors, interval = "prediction")
log_forecast

# Back-transform to sales units, which is what the exercise asks for.
# NOTE(review): exp() of a log-scale forecast is not the conditional mean of
# sales; read it as an approximate (median-type) forecast.
exp(log_forecast)

##       fit      lwr     upr
## 1 2.70877 2.512999 2.90454

Predictive6

Jorge Herrera

2023-03-14

Week 6: Class 14/03/2023

Exercise on overfitting

Lesson 1: a better in-sample fit doesn’t mean a better forecasting performance

Lesson 2: Even with a good forecasting performance, you have to check the meaning of the model (sometimes you’ll prefer a more interpretable model)

Question of today’s class: how do I choose data transformation?

Week 6: Class 16/03/2023

Exercise on overfitting