Parsed with column specification:
cols(
X1 = col_integer(),
TV = col_double(),
Radio = col_double(),
Newspaper = col_double(),
Sales = col_double()
)
Usaremos el dataset Advertising
library(readr)
Advertising <- read_csv("~/Dropbox/Cursos/ISL Chapters/Advertising.csv")
Advertising <- Advertising[,-1]
names(Advertising)
[1] "TV" "Radio" "Newspaper" "Sales"
summary(Advertising)
TV Radio Newspaper Sales
Min. : 0.70 Min. : 0.000 Min. : 0.30 Min. : 1.60
1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75 1st Qu.:10.38
Median :149.75 Median :22.900 Median : 25.75 Median :12.90
Mean :147.04 Mean :23.264 Mean : 30.55 Mean :14.02
3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10 3rd Qu.:17.40
Max. :296.40 Max. :49.600 Max. :114.00 Max. :27.00
Es de la forma \[Y = \beta_0+\beta_1X +\epsilon_i.\] \(Y\) es la respuesta y \(X\) es el predictor. En el caso de nuestra base de datos de publicidad seria de la forma \[\text{sales} \approx \beta_o + \beta_1 \text{TV}\] \(\beta_0\) y \(\beta_1\) son los coefficientonces de la regresion.
Advertising %>% select(Sales,TV) %>%
ggplot(aes(x=TV,y=Sales)) +
geom_point()
Advertising %>% select(Sales,TV) %>%
ggplot(aes(x=TV,y=Sales)) +
geom_point()+
stat_quantile(quantiles = seq(0.05,0.95,by=0.05))
Para encontrar la recta de mejor ajuste basta con resolver \[\text{min } RSS=\sum e^2_i = \sum \left ( Y_i-\hat Y_i \right )^2=\sum \left ( Y_i-\hat \beta_0-\hat \beta_1X_i \right )^2 \]
La solucion de este problema de optimizacion la pueden encontra en el siguiente video.
Ahora hagamolo usando la funcion lm() de R.
fit1<-lm(Sales~TV,data = Advertising)
fit1
Call:
lm(formula = Sales ~ TV, data = Advertising)
Coefficients:
(Intercept) TV
7.03259 0.04754
str(fit1)
List of 12
$ coefficients : Named num [1:2] 7.0326 0.0475
..- attr(*, "names")= chr [1:2] "(Intercept)" "TV"
$ residuals : Named num [1:200] 4.13 1.25 1.45 4.27 -2.73 ...
..- attr(*, "names")= chr [1:200] "1" "2" "3" "4" ...
$ effects : Named num [1:200] -198.31 57.57 1.08 3.99 -2.98 ...
..- attr(*, "names")= chr [1:200] "(Intercept)" "TV" "" "" ...
$ rank : int 2
$ fitted.values: Named num [1:200] 17.97 9.15 7.85 14.23 15.63 ...
..- attr(*, "names")= chr [1:200] "1" "2" "3" "4" ...
$ assign : int [1:2] 0 1
$ qr :List of 5
..$ qr : num [1:200, 1:2] -14.1421 0.0707 0.0707 0.0707 0.0707 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:200] "1" "2" "3" "4" ...
.. .. ..$ : chr [1:2] "(Intercept)" "TV"
.. ..- attr(*, "assign")= int [1:2] 0 1
..$ qraux: num [1:2] 1.07 1.09
..$ pivot: int [1:2] 1 2
..$ tol : num 1e-07
..$ rank : int 2
..- attr(*, "class")= chr "qr"
$ df.residual : int 198
$ xlevels : Named list()
$ call : language lm(formula = Sales ~ TV, data = Advertising)
$ terms :Classes 'terms', 'formula' language Sales ~ TV
.. ..- attr(*, "variables")= language list(Sales, TV)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "Sales" "TV"
.. .. .. ..$ : chr "TV"
.. ..- attr(*, "term.labels")= chr "TV"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(Sales, TV)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "Sales" "TV"
$ model :'data.frame': 200 obs. of 2 variables:
..$ Sales: num [1:200] 22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...
..$ TV : num [1:200] 230.1 44.5 17.2 151.5 180.8 ...
..- attr(*, "terms")=Classes 'terms', 'formula' language Sales ~ TV
.. .. ..- attr(*, "variables")= language list(Sales, TV)
.. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. ..$ : chr [1:2] "Sales" "TV"
.. .. .. .. ..$ : chr "TV"
.. .. ..- attr(*, "term.labels")= chr "TV"
.. .. ..- attr(*, "order")= int 1
.. .. ..- attr(*, "intercept")= int 1
.. .. ..- attr(*, "response")= int 1
.. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. .. ..- attr(*, "predvars")= language list(Sales, TV)
.. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. .. ..- attr(*, "names")= chr [1:2] "Sales" "TV"
- attr(*, "class")= chr "lm"
fit1$coefficients
(Intercept) TV
7.03259355 0.04753664
fit1$residuals[1:10]
1 2 3 4 5 6 7
4.1292255 1.2520260 1.4497762 4.2656054 -2.7272181 -0.2461623 2.0340496
8 9 10
0.4535023 -2.6414087 -5.9304143
fit1$fitted.values[1:10]
1 2 3 4 5 6 7
17.970775 9.147974 7.850224 14.234395 15.627218 7.446162 9.765950
8 9 10
12.746498 7.441409 16.530414
summary(fit1)
Call:
lm(formula = Sales ~ TV, data = Advertising)
Residuals:
Min 1Q Median 3Q Max
-8.3860 -1.9545 -0.1913 2.0671 7.2124
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.032594 0.457843 15.36 <2e-16 ***
TV 0.047537 0.002691 17.67 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
sim_data<-function(){
x<-runif(30,min = -3, max = 3)
epsilon<-rnorm(n=length(x),mean = 0,sd = 1)
y<-2+3*x+epsilon
return(data.frame(x,y))
}
dataset1<-sim_data()
dataset2<-sim_data()
dataset3<-sim_data()
dataset4<-sim_data()
sim_fit_1<-lm(y~x,dataset1)
sim_fit_2<-lm(y~x,dataset2)
sim_fit_3<-lm(y~x,dataset3)
sim_fit_4<-lm(y~x,dataset4)
| Dataset | \(\beta_0\) | \(\beta_1\) |
|---|---|---|
| Dataset1 | 2.0404619 | 3.0366712 |
| Dataset2 | 2.0478985 | 2.9865992 |
| Dataset3 | 2.2121774 | 2.9710174 |
| Dataset4 | 1.6032715 | 3.0603625 |
fit_summary <- summary(fit1)
fit_summary$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
TV 0.04753664 0.002690607 17.66763 1.46739e-42
\[\left [ \hat \beta_i - 2 \cdot \text{SE}(\hat \beta_i) , \hat \beta_i + 2 \cdot \text{SE}(\hat \beta_i) \right]\]
| Parametro | Intervalo |
|---|---|
| \(\hat \beta_0\) | [6.1169077,7.9482794] |
| \(\hat \beta_1\) | [0.0421554,0.0529179] |
Se hace una prueba de hipotesis asociada al parametro que relaciona las dos variables. En nuestro caso seria Sales y TV \(H_0\) es la hipotesis nula y \(H_a\) es la hipotesis alternativa
Sales y TVSales y TVEsto se traduce a,
| Parametro | valor t |
|---|---|
| \(\hat \beta_0\) | 15.3602752 |
| \(\hat \beta_1\) | 17.6676256 |
fit_summary$coefficients[,3]
(Intercept) TV
15.36028 17.66763
| Parametro | valor t | codigo |
|---|---|---|
| RSE | 3.2586564 | fit_summary$sigma |
| \(R^2\) | 0.6118751 | fit_summary$r.squared |
| Estadistivo F | 312.1449944 | fit_summary$fstatistic[1] |
fit_summary$sigma/mean(Advertising$Sales)*100 %>% round()
[1] 23.23877
plot(Advertising)
fit2<-lm(Sales ~ Radio + TV + Newspaper, data = Advertising)
summary(fit2)
Call:
lm(formula = Sales ~ Radio + TV + Newspaper, data = Advertising)
Residuals:
Min 1Q Median 3Q Max
-8.8277 -0.8908 0.2418 1.1893 2.8292
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.938889 0.311908 9.422 <2e-16 ***
Radio 0.188530 0.008611 21.893 <2e-16 ***
TV 0.045765 0.001395 32.809 <2e-16 ***
Newspaper -0.001037 0.005871 -0.177 0.86
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
cor(Advertising)
TV Radio Newspaper Sales
TV 1.00000000 0.05480866 0.05664787 0.7822244
Radio 0.05480866 1.00000000 0.35410375 0.5762226
Newspaper 0.05664787 0.35410375 1.00000000 0.2282990
Sales 0.78222442 0.57622257 0.22829903 1.0000000
\[\text{sales}=\beta_0+\beta_1\cdot\text{TV}+\beta_2\cdot\text{Radio}\]
En este caso se quiere verificar si un incremento en la adquisicion de anuncios en radio aumenta o disminuye el \(\beta_1\) se propone el siguiente modelo
\[\text{sales}=\beta_0+\beta_1\cdot\text{TV}+\beta_2\cdot\text{Radio}+\beta_3\cdot\text{Radio}\times\text{TV}\]
lo cual se puede simplificar como
\[\begin{align*} \text{sales} &= \beta_0+\beta_1\cdot\text{TV}+\beta_2\cdot\text{Radio}+\beta_3\cdot\text{Radio}\times\text{TV} \\ &= \beta_0 + \left(\beta_1 + \beta_3 \cdot \text{Radio}\right)\cdot\text{TV}+\beta_2 \cdot \text{Radio} \\ &= \beta_0 + \tilde \beta_1 \cdot \text{TV} + \beta_3 \cdot \text{Radio} \end{align*}\]
fit3 <- lm(Sales ~ TV + Radio + TV * Radio,data = Advertising)
summary(fit3)$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.750220203 0.2478713699 27.232755 1.541461e-68
TV 0.019101074 0.0015041455 12.698953 2.363605e-27
Radio 0.028860340 0.0089052729 3.240815 1.400461e-03
TV:Radio 0.001086495 0.0000524204 20.726564 2.757681e-51
credit <- read_csv("~/OneDrive/Dropbox/Cursos/ISL Chapters/Credit.csv")
Parsed with column specification:
cols(
X1 = col_integer(),
Income = col_double(),
Limit = col_integer(),
Rating = col_integer(),
Cards = col_integer(),
Age = col_integer(),
Education = col_integer(),
Gender = col_character(),
Student = col_character(),
Married = col_character(),
Ethnicity = col_character(),
Balance = col_integer()
)
names(credit)<-tolower(names(credit))
credit<-credit[,-1]
head(credit)
fit4 <- lm(balance ~ gender , data = credit)
summary(fit4)$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 529.53623 31.98819 16.5541153 3.312981e-47
genderMale -19.73312 46.05121 -0.4285039 6.685161e-01
\[gender_i = \left\{\begin{matrix} 0 & Female\\ 1 & Male \end{matrix}\right.\]
\[ balance = \beta_0 + \beta_1 \cdot gender=\left\{\begin{matrix} \beta_0 + \beta_1 & male\\ \beta_0 & female \end{matrix}\right. \]
\[gender_i = \left\{\begin{matrix} -1 & Female\\ 1 & Male \end{matrix}\right.\]
\[ balance = \beta_0 + \beta_1 \cdot gender=\left\{\begin{matrix} \beta_0 + \beta_1 & male\\ \beta_0 - \beta_1 & female \end{matrix}\right. \]
credit$coded_gender <- ifelse(credit$gender=="Female",-1,1)
fit5 <- lm(balance ~ coded_gender , data = credit)
summary(fit5)$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 519.669670 23.02561 22.5692080 3.191322e-73
coded_gender -9.866562 23.02561 -0.4285039 6.685161e-01
unique(credit$ethnicity)
[1] "Caucasian" "Asian" "African American"
\[asian_i = \left\{\begin{matrix} 0 & no\\ 1 & yes \end{matrix}\right.\]
\[ caucasian_i = \left\{\begin{matrix} 0 & no\\ 1 & yes \end{matrix}\right. \]
credit$caucasian <- ifelse( credit$ethnicity == "Caucasian", 1, 0)
credit$asian <- ifelse( credit$ethnicity == "Asian", 1, 0)
\[balance = \beta_0 + \beta_1 \cdot asian + \beta_2 \cdot caucasian =\left\{\begin{matrix} \beta_0 + \beta_1 & asian\\ \beta_0 + \beta_2 & caucasian \\ \beta_0 & \text{african american} \end{matrix}\right.\]
fit6 <- lm(balance ~ asian + caucasian, data=credit)
summary(fit6)$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 531.00000 46.31868 11.4640565 1.774117e-26
asian -18.68627 65.02107 -0.2873880 7.739652e-01
caucasian -12.50251 56.68104 -0.2205766 8.255355e-01