\(~\)
First we start by loading the libraries and clean the dataset :
#Load libraries
library(readr)
library(haven)
library(tidyverse)
library(ggplot2)
library(hrbrthemes)
library(stargazer)
library(sandwich)
library(lmtest)
library(lindia)
library(gplots)
library(kableExtra)
# Import data
PBset_1_data <- read_dta("/Users/bastienpatras/Downloads/Pbset 1 Econometrics/ee2002ext.dta")
# Cleaning dataset
PBset_1_data <- PBset_1_data %>%
rename(Education = ddipl1, Monthly_Income = salfr) %>%
na.omit()
# Summarize the dataset
summary(PBset_1_data)
## s agd noi fi
## Min. :1.000 Min. :16.00 Min. : 1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:30.00 1st Qu.: 1.000 1st Qu.:1.000
## Median :1.000 Median :39.00 Median : 1.000 Median :1.000
## Mean :1.484 Mean :39.13 Mean : 1.755 Mean :1.019
## 3rd Qu.:2.000 3rd Qu.:48.00 3rd Qu.: 2.000 3rd Qu.:1.000
## Max. :2.000 Max. :75.00 Max. :13.000 Max. :8.000
## pub hh Monthly_Income cspp
## Min. :1.000 Min. : 1.00 Min. : 0 Min. :10.00
## 1st Qu.:4.000 1st Qu.:35.00 1st Qu.: 6300 1st Qu.:38.00
## Median :5.000 Median :35.00 Median : 8000 Median :54.00
## Mean :4.142 Mean :35.42 Mean : 9319 Mean :49.37
## 3rd Qu.:5.000 3rd Qu.:39.00 3rd Qu.: 11000 3rd Qu.:63.00
## Max. :5.000 Max. :99.00 Max. :338894 Max. :69.00
## extri nafg4 csei dipl1
## Min. :155.0 Length:22447 Min. :10.00 Min. : 1.000
## 1st Qu.:279.0 Class :character 1st Qu.:46.00 1st Qu.: 3.000
## Median :367.0 Mode :character Median :54.00 Median : 5.000
## Mean :354.2 Mean :51.29 Mean : 6.576
## 3rd Qu.:419.0 3rd Qu.:61.00 3rd Qu.:12.000
## Max. :841.0 Max. :69.00 Max. :16.000
## Education adfe enf3 enf6
## Min. :1.000 Min. : 0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:16.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3.000 Median :18.00 Median :0.0000 Median :0.0000
## Mean :3.357 Mean :20.47 Mean :0.1076 Mean :0.2432
## 3rd Qu.:5.000 3rd Qu.:21.00 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :7.000 Max. :99.00 Max. :3.0000 Max. :4.0000
## enf18 np tymen90r nomen
## Min. :0.0000 Min. : 1.000 Min. :1.000 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 2.000 1st Qu.:4.000 1st Qu.: 7436
## Median :0.0000 Median : 3.000 Median :5.000 Median :15173
## Mean :0.8284 Mean : 3.055 Mean :4.097 Mean :15759
## 3rd Qu.:2.0000 3rd Qu.: 4.000 3rd Qu.:5.000 3rd Qu.:23859
## Max. :9.0000 Max. :16.000 Max. :5.000 Max. :32739
\(~\)
\(~\)
# Select the wanted vector
Monthly_Income_data <- PBset_1_data %>%
select(Monthly_Income)
# Compute the summary statistics
summary(Monthly_Income_data)
## Monthly_Income
## Min. : 0
## 1st Qu.: 6300
## Median : 8000
## Mean : 9319
## 3rd Qu.: 11000
## Max. :338894
\(~\)
\(~\)
Education_data <- PBset_1_data %>%
select(Education) %>%
arrange(Education)
head(Education_data) %>% kbl() %>%
kable_material(c("striped", "hover"))
Education |
---|
1 |
1 |
1 |
1 |
1 |
1 |
\(~\)
\(~\)
PBset_1_data_Education <- PBset_1_data %>%
mutate(Label_Education = case_when(Education == 1 ~ "No degree",
Education == 2 ~ "Middle school degree (brevet)",
Education == 3 ~ "High school vocational degree (eg CAP)",
Education == 4 ~ "High school degree (baccalaureat)",
Education == 5 ~ "Some college (bac +2)",
Education == 6 ~ "College plus (> licence)",
Education == 7 ~ "Still in school")) %>%
select(Label_Education, Education)
head(PBset_1_data_Education) %>% kbl() %>%
kable_material(c("striped", "hover"))
Label_Education | Education |
---|---|
High school vocational degree (eg CAP) | 3 |
Still in school | 7 |
Some college (bac +2) | 5 |
High school vocational degree (eg CAP) | 3 |
Middle school degree (brevet) | 2 |
High school vocational degree (eg CAP) | 3 |
\(~\)
\(~\)
Mean_Education_Group <- PBset_1_data %>%
mutate(Label_Education = case_when(Education == 1 ~ "No degree",
Education == 2 ~ "Middle school degree (brevet)",
Education == 3 ~ "High school vocational degree (eg CAP)",
Education == 4 ~ "High school degree (baccalaureat)",
Education == 5 ~ "Some college (bac +2)",
Education == 6 ~ "College plus (> licence)",
Education == 7 ~ "Still in school")) %>%
group_by(Label_Education) %>%
summarise(Income_mean = mean(Monthly_Income),
Income_Standar_dev = sd(Monthly_Income),
Income_var = var(Monthly_Income)) %>%
arrange(Income_mean)
Mean_Education_Group %>% kbl() %>%
kable_material(c("striped", "hover"))
Label_Education | Income_mean | Income_Standar_dev | Income_var |
---|---|---|---|
Still in school | 4702.048 | 3454.259 | 11931905 |
No degree | 7028.577 | 4473.792 | 20014811 |
High school vocational degree (eg CAP) | 8390.705 | 6182.135 | 38218798 |
Middle school degree (brevet) | 8648.555 | 4634.186 | 21475677 |
High school degree (baccalaureat) | 9284.632 | 6500.892 | 42261600 |
Some college (bac +2) | 10932.263 | 5889.771 | 34689397 |
College plus (> licence) | 15381.187 | 9979.600 | 99592422 |
\(~\)
\(~\)
Education_table <- PBset_1_data %>%
rename(Graduation_age = adfe) %>%
select(Graduation_age, Education)
head(Education_table) %>% kbl() %>%
kable_material(c("striped", "hover"))
Graduation_age | Education |
---|---|
17 | 3 |
99 | 7 |
22 | 5 |
16 | 3 |
16 | 2 |
18 | 3 |
\(~\)
\(~\)
Mean_Graduation_age_by_Education <- Education_table %>%
mutate(Label_Education = case_when(Education == 1 ~ "No degree",
Education == 2 ~ "Middle school degree (brevet)",
Education == 3 ~ "High school vocational degree (eg CAP)",
Education == 4 ~ "High school degree (baccalaureat)",
Education == 5 ~ "Some college (bac +2)",
Education == 6 ~ "College plus (> licence)",
Education == 7 ~ "Still in school")) %>%
group_by(Label_Education) %>%
summarise(Graduation_age_mean = mean(Graduation_age)) %>%
arrange(Graduation_age_mean)
Mean_Graduation_age_by_Education %>% kbl() %>%
kable_material(c("striped", "hover"))
Label_Education | Graduation_age_mean |
---|---|
No degree | 15.30157 |
High school vocational degree (eg CAP) | 17.89495 |
Middle school degree (brevet) | 17.90432 |
High school degree (baccalaureat) | 20.52958 |
Some college (bac +2) | 21.98057 |
College plus (> licence) | 24.42862 |
Still in school | 68.18587 |
\(~\)
\(~\)
Scatter_plot_dataset <- PBset_1_data %>%
rename(Graduation_age = adfe) %>%
select(Monthly_Income, Graduation_age) %>%
filter(Graduation_age < 99,Graduation_age > 0)
Scatter_plot_1 <- ggplot(Scatter_plot_dataset, aes(x=Graduation_age, y=Monthly_Income,col= Graduation_age)) +
geom_point()
show(Scatter_plot_1)
\(~\)
\(~\)
Scatter_plot_dataset <- Scatter_plot_dataset %>%
mutate(Log_Monthly_Income = log(Monthly_Income)) %>%
filter(Log_Monthly_Income>0)
head(Scatter_plot_dataset) %>% kbl() %>%
kable_material(c("striped", "hover"))
Monthly_Income | Graduation_age | Log_Monthly_Income |
---|---|---|
5523 | 17 | 8.616677 |
7500 | 22 | 8.922658 |
14000 | 16 | 9.546813 |
10800 | 16 | 9.287301 |
7800 | 18 | 8.961879 |
8800 | 17 | 9.082507 |
Scatter_plot_2 <- ggplot(Scatter_plot_dataset, aes(x=Graduation_age, y=Log_Monthly_Income,col= Graduation_age)) +
geom_point()
show(Scatter_plot_2)
\(~\)
\(~\)
quantile(Scatter_plot_dataset$Log_Monthly_Income, probs = seq(0,1, by=0.01))
## 0% 1% 2% 3% 4% 5% 6% 7%
## 1.945910 7.090077 7.518278 7.783224 7.937375 8.006368 8.085782 8.185350
## 8% 9% 10% 11% 12% 13% 14% 15%
## 8.294050 8.342840 8.411833 8.476371 8.517193 8.575462 8.612503 8.630522
## 16% 17% 18% 19% 20% 21% 22% 23%
## 8.665613 8.677204 8.699515 8.699515 8.699515 8.716044 8.732305 8.748305
## 24% 25% 26% 27% 28% 29% 30% 31%
## 8.764053 8.779557 8.779557 8.779557 8.794825 8.809863 8.824678 8.853665
## 32% 33% 34% 35% 36% 37% 38% 39%
## 8.853665 8.853665 8.853665 8.867850 8.881836 8.894689 8.909235 8.922658
## 40% 41% 42% 43% 44% 45% 46% 47%
## 8.922658 8.922658 8.935904 8.961879 8.970940 8.987197 8.987197 8.987197
## 48% 49% 50% 51% 52% 53% 54% 55%
## 8.987197 8.999002 9.011889 9.029298 9.047821 9.047821 9.047821 9.071078
## 56% 57% 58% 59% 60% 61% 62% 63%
## 9.090534 9.104980 9.104980 9.104980 9.106423 9.126959 9.159047 9.159047
## 64% 65% 66% 67% 68% 69% 70% 71%
## 9.169518 9.190138 9.210340 9.210340 9.210340 9.210340 9.210340 9.230143
## 72% 73% 74% 75% 76% 77% 78% 79%
## 9.259131 9.268609 9.305651 9.305651 9.305651 9.319284 9.350102 9.376448
## 80% 81% 82% 83% 84% 85% 86% 87%
## 9.392662 9.392662 9.392662 9.433484 9.472705 9.472705 9.487972 9.539211
## 88% 89% 90% 91% 92% 93% 94% 95%
## 9.546813 9.581904 9.615805 9.621589 9.680344 9.740969 9.798127 9.852194
## 96% 97% 98% 99% 100%
## 9.903488 10.017832 10.126631 10.373491 12.733443
First_percentile = 7.090077
Ninety_nine_th_percentile = 10.373491
\(~\)
\(~\)
quantile(Scatter_plot_dataset$Log_Monthly_Income, probs = seq(0,1, by=0.005))
## 0.0% 0.5% 1.0% 1.5% 2.0% 2.5% 3.0% 3.5%
## 1.945910 6.684612 7.090077 7.313220 7.518278 7.610878 7.783224 7.863267
## 4.0% 4.5% 5.0% 5.5% 6.0% 6.5% 7.0% 7.5%
## 7.937375 8.006368 8.006368 8.070906 8.085782 8.160518 8.185350 8.242756
## 8.0% 8.5% 9.0% 9.5% 10.0% 10.5% 11.0% 11.5%
## 8.294050 8.294050 8.342840 8.389360 8.411833 8.455318 8.476371 8.517193
## 12.0% 12.5% 13.0% 13.5% 14.0% 14.5% 15.0% 15.5%
## 8.517193 8.556414 8.575462 8.602269 8.612503 8.630522 8.630522 8.648221
## 16.0% 16.5% 17.0% 17.5% 18.0% 18.5% 19.0% 19.5%
## 8.665613 8.665613 8.677204 8.683385 8.699515 8.699515 8.699515 8.699515
## 20.0% 20.5% 21.0% 21.5% 22.0% 22.5% 23.0% 23.5%
## 8.699515 8.699515 8.716044 8.732143 8.732305 8.732305 8.748305 8.748305
## 24.0% 24.5% 25.0% 25.5% 26.0% 26.5% 27.0% 27.5%
## 8.764053 8.770405 8.779557 8.779557 8.779557 8.779557 8.779557 8.788746
## 28.0% 28.5% 29.0% 29.5% 30.0% 30.5% 31.0% 31.5%
## 8.794825 8.809863 8.809863 8.824678 8.824678 8.839277 8.853665 8.853665
## 32.0% 32.5% 33.0% 33.5% 34.0% 34.5% 35.0% 35.5%
## 8.853665 8.853665 8.853665 8.853665 8.853665 8.853665 8.867850 8.881836
## 36.0% 36.5% 37.0% 37.5% 38.0% 38.5% 39.0% 39.5%
## 8.881836 8.884056 8.894689 8.895630 8.909235 8.917303 8.922658 8.922658
## 40.0% 40.5% 41.0% 41.5% 42.0% 42.5% 43.0% 43.5%
## 8.922658 8.922658 8.922658 8.922658 8.935904 8.948976 8.961879 8.961879
## 44.0% 44.5% 45.0% 45.5% 46.0% 46.5% 47.0% 47.5%
## 8.970940 8.974618 8.987197 8.987197 8.987197 8.987197 8.987197 8.987197
## 48.0% 48.5% 49.0% 49.5% 50.0% 50.5% 51.0% 51.5%
## 8.987197 8.987197 8.999002 9.011767 9.011889 9.024011 9.029298 9.040264
## 52.0% 52.5% 53.0% 53.5% 54.0% 54.5% 55.0% 55.5%
## 9.047821 9.047821 9.047821 9.047821 9.047821 9.054855 9.071078 9.081370
## 56.0% 56.5% 57.0% 57.5% 58.0% 58.5% 59.0% 59.5%
## 9.090534 9.104155 9.104980 9.104980 9.104980 9.104980 9.104980 9.104980
## 60.0% 60.5% 61.0% 61.5% 62.0% 62.5% 63.0% 63.5%
## 9.106423 9.125109 9.126959 9.137773 9.159047 9.159047 9.159047 9.159047
## 64.0% 64.5% 65.0% 65.5% 66.0% 66.5% 67.0% 67.5%
## 9.169518 9.184722 9.190138 9.194109 9.210340 9.210340 9.210340 9.210340
## 68.0% 68.5% 69.0% 69.5% 70.0% 70.5% 71.0% 71.5%
## 9.210340 9.210340 9.210340 9.210340 9.210340 9.210640 9.230143 9.253171
## 72.0% 72.5% 73.0% 73.5% 74.0% 74.5% 75.0% 75.5%
## 9.259131 9.259131 9.268609 9.289246 9.305651 9.305651 9.305651 9.305651
## 76.0% 76.5% 77.0% 77.5% 78.0% 78.5% 79.0% 79.5%
## 9.305651 9.305651 9.319284 9.348079 9.350102 9.364177 9.376448 9.392662
## 80.0% 80.5% 81.0% 81.5% 82.0% 82.5% 83.0% 83.5%
## 9.392662 9.392662 9.392662 9.392662 9.392662 9.421270 9.433484 9.456497
## 84.0% 84.5% 85.0% 85.5% 86.0% 86.5% 87.0% 87.5%
## 9.472705 9.472705 9.472705 9.474726 9.487972 9.510445 9.539211 9.546813
## 88.0% 88.5% 89.0% 89.5% 90.0% 90.5% 91.0% 91.5%
## 9.546813 9.577069 9.581904 9.615805 9.615805 9.615805 9.621589 9.664151
## 92.0% 92.5% 93.0% 93.5% 94.0% 94.5% 95.0% 95.5%
## 9.680344 9.704976 9.740969 9.741166 9.798127 9.798127 9.852194 9.903488
## 96.0% 96.5% 97.0% 97.5% 98.0% 98.5% 99.0% 99.5%
## 9.903488 9.952278 10.017832 10.085809 10.126631 10.250405 10.373491 10.596635
## 100.0%
## 12.733443
Zero.five_th_percentile = 6.684612
Ninety_nine.five_th_percentile = 10.596635
\(~\)
\(~\)
Notice that Scatter_plot_dataset
is already filtered to consider only adfe
in between 0 and 99
Variance_Covariance_table <- Scatter_plot_dataset %>%
filter(Log_Monthly_Income > Zero.five_th_percentile,
Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
summarise(Log_Monthly_Income_var = var(Log_Monthly_Income),
Graduation_age_var = var(Graduation_age),
Covariance = cov(Log_Monthly_Income, Graduation_age))
Variance_Covariance_table %>% kbl() %>%
kable_material(c("striped", "hover"))
Log_Monthly_Income_var | Graduation_age_var | Covariance |
---|---|---|
0.2649288 | 12.31437 | 0.5994552 |
\(~\)
\(~\)
We know that the total sum of square is equal to : \[\text{TSS} = \sum_{i=1}^{n}\left(y_i-\bar{y}\right)^2\] with :
Log_Monthly_Income
variableSST <- Scatter_plot_dataset %>%
filter(Log_Monthly_Income > Zero.five_th_percentile,
Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
summarise(SST = sum((Log_Monthly_Income-mean(Log_Monthly_Income))^2))
SST %>% kbl() %>%
kable_material(c("striped", "hover"))
SST |
---|
5698.619 |
\(~\)
\(~\)
OLS_Estimator <- Variance_Covariance_table %>%
summarise(OLS_Estimator = Covariance/Graduation_age_var)
OLS_Estimator %>% kbl() %>%
kable_material(c("striped", "hover"))
OLS_Estimator |
---|
0.0486793 |
\(~\)
\(~\)
## Build the table with filter
Scatter_plot_dataset_with <- Scatter_plot_dataset %>%
filter(Log_Monthly_Income > Zero.five_th_percentile,
Log_Monthly_Income < Ninety_nine.five_th_percentile)
head(Scatter_plot_dataset_with) %>% kbl() %>%
kable_material(c("striped", "hover"))
Monthly_Income | Graduation_age | Log_Monthly_Income |
---|---|---|
5523 | 17 | 8.616677 |
7500 | 22 | 8.922658 |
14000 | 16 | 9.546813 |
10800 | 16 | 9.287301 |
7800 | 18 | 8.961879 |
8800 | 17 | 9.082507 |
model_filtered = lm(Log_Monthly_Income ~ Graduation_age, data=Scatter_plot_dataset_with)
summary(model_filtered)
##
## Call:
## lm(formula = Log_Monthly_Income ~ Graduation_age, data = Scatter_plot_dataset_with)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.61619 -0.22280 0.02423 0.28821 1.80473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.0925318 0.0181406 446.1 <2e-16 ***
## Graduation_age 0.0486793 0.0009434 51.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4855 on 21509 degrees of freedom
## Multiple R-squared: 0.1101, Adjusted R-squared: 0.1101
## F-statistic: 2662 on 1 and 21509 DF, p-value: < 2.2e-16
Note that the r lm
comand produce the same result as the OLS estimator computed using the cov()
and var()
functions
## Build the table without filter
head(Scatter_plot_dataset) %>% kbl() %>%
kable_material(c("striped", "hover"))
Monthly_Income | Graduation_age | Log_Monthly_Income |
---|---|---|
5523 | 17 | 8.616677 |
7500 | 22 | 8.922658 |
14000 | 16 | 9.546813 |
10800 | 16 | 9.287301 |
7800 | 18 | 8.961879 |
8800 | 17 | 9.082507 |
## Regress on the filtered dataset
model_without_filtered = lm(Log_Monthly_Income ~ Graduation_age, data=Scatter_plot_dataset)
summary(model_without_filtered)
##
## Call:
## lm(formula = Log_Monthly_Income ~ Graduation_age, data = Scatter_plot_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1146 -0.2218 0.0341 0.3044 3.9902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.002904 0.020920 382.55 <2e-16 ***
## Graduation_age 0.052879 0.001088 48.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5641 on 21721 degrees of freedom
## Multiple R-squared: 0.09814, Adjusted R-squared: 0.0981
## F-statistic: 2364 on 1 and 21721 DF, p-value: < 2.2e-16
\(~\)
model_with_filtered_plot <- ggplot(Scatter_plot_dataset_with, aes(x=Graduation_age, y=Log_Monthly_Income, col = Graduation_age)) + geom_point() + geom_smooth(method=lm , color="black", fill="#69b3a2", se=TRUE) + ggtitle("With trimming") + theme(plot.title = element_text(hjust = 0.5))
model_without_filtered_plot <- ggplot(Scatter_plot_dataset, aes(x=Graduation_age, y=Log_Monthly_Income, col = Graduation_age)) + geom_point() + geom_smooth(method=lm , color="black", fill="#69b3a2", se=TRUE)+ ggtitle("Without trimming") + theme(plot.title = element_text(hjust = 0.5))
show(model_with_filtered_plot)
show(model_without_filtered_plot)
\(~\)
Adding this filter allows to suppress the extreme values (outliers) and make the coefficient consistent. The filter decreases slightly the coefficient in front of Log_Monthly_Income
, which means that, the filter allows for less variation and makes the coefficient more accurate.
\(~\)
\(~\)
In the log-linear model, the literal interpretation of the estimated coefficient in front of Graduation_age
is that a one-unit increase in Graduation_age
(additionnal year of studying) will produce an expected increase in Log_Monthly_Income
of 0.049 units. Thus each 1-unit increase in Graduation_age
multiplies the expected value of Monthly_Income
by \[e^{(0,049)}-1 = 0,05022\] .
We know that the SST, SSE and SSR are related by the following relation :
\[ \text{SST} = \text{SSE} + \text{SSR} \Longleftrightarrow \sum_{i=1}^{n}\left(y_i-\bar{y}\right)^2 = \sum_{i=1}^{n}\left(\hat{y}_i-\bar{y}\right)^2 + \sum_{i=1}^{n}\left(\hat{y}_i-y_i\right)^2\]
SST <- Scatter_plot_dataset %>%
filter(Log_Monthly_Income > Zero.five_th_percentile,
Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
summarise(SST = sum((Log_Monthly_Income-mean(Log_Monthly_Income))^2),
SSE = sum((fitted(model_filtered)-mean(Log_Monthly_Income))^2),
SSR = SST - SSE)
SST %>% kbl() %>%
kable_material(c("striped", "hover"))
SST | SSE | SSR |
---|---|---|
5698.619 | 627.6849 | 5070.934 |
\(~\)
\(~\)
We then compute the \(R^2\) such that :
\[ R^2 = 1 - \frac{\text{SSE}}{\text{SST}} = 1-\frac{\sum_{i=1}^{n}\left(\hat{y}_i-\bar{y}\right)^2}{\sum_{i=1}^{n}\left(\hat{y}_i-y_i\right)^2}\]
SST <- Scatter_plot_dataset %>%
filter(Log_Monthly_Income > Zero.five_th_percentile,
Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
summarise(SST = sum((Log_Monthly_Income-mean(Log_Monthly_Income))^2),
SSE = sum((fitted(model_filtered)-mean(Log_Monthly_Income))^2),
SSR = SST - SSE,
R_2 = 1-(SSR/SST))
SST %>% kbl() %>%
kable_material(c("striped", "hover"))
SST | SSE | SSR | R_2 |
---|---|---|---|
5698.619 | 627.6849 | 5070.934 | 0.1101469 |
\(~\)
\(~\)
Residual = resid(model_filtered)
Residual_data = data.frame(Residual)
Fitted_1 = fitted(model_filtered)
Residual_plt <- ggplot(Residual_data, aes(x=Residual, fill=cond)) +
geom_histogram(aes(y=..density..),
binwidth=.5,
colour="#3D78B2", fill="#F3F8FC") +
geom_density(alpha=.2, colour="#3D78B2", fill="#3D78B2")
show(Residual_plt)
\(~\)
\(~\)
We want to estimate the relationship between the GPA and the ACT using OLS on the following model :
\[\hat{\text{GPA}} = \hat{\beta}_0+\hat{\beta}_1\text{ACT}\]
# Read the data
library(readxl)
data_pbset_1_1 <- read_excel("/Users/bastienpatras/Downloads/Pbset 1 Econometrics/data_pbset_1_1.xlsx")
data_pbset_1_1 %>% kbl() %>%
kable_material(c("striped", "hover"))
Student | GPA | ACT |
---|---|---|
1 | 2.8 | 21 |
2 | 3.4 | 24 |
3 | 3.0 | 26 |
4 | 3.5 | 27 |
5 | 3.6 | 29 |
6 | 3.0 | 25 |
7 | 2.7 | 25 |
8 | 3.7 | 30 |
As it is a simple regression estimation, we know that the \(\hat{\beta}_1\) that minimizes the sum of squared deviations from the fitted line corresponds to :
\[ \hat{\beta}_1 = \frac{cov(\text{GPA},\text{ACT})}{var(\text{ACT})} = 0.10220 \]
# Lm model
regressor <- data_pbset_1_1 %>%
summarise(regressor = cov(GPA,ACT)/var(ACT))
regressor %>% kbl() %>%
kable_material(c("striped", "hover"))
regressor |
---|
0.1021978 |
Once we have found the estimate of \(\hat{\beta}_1\) we can find the intercept by rewriting the model as follows :
\[ \hat{\beta}_0 = \bar{\text{GPA}}-\hat{\beta}_1\bar{\text{ACT}} = 0.568 \]
We can also use the lm
function to get the summary of our regression :
lm_GPA <- lm(GPA ~ ACT, data=data_pbset_1_1)
summary(lm_GPA)
##
## Call:
## lm(formula = GPA ~ ACT, data = data_pbset_1_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42308 -0.14863 0.06703 0.10742 0.37912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.56813 0.92842 0.612 0.5630
## ACT 0.10220 0.03569 2.863 0.0287 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2692 on 6 degrees of freedom
## Multiple R-squared: 0.5774, Adjusted R-squared: 0.507
## F-statistic: 8.199 on 1 and 6 DF, p-value: 0.02868
\(~\)
\(~\)
Once we get the estimator of \(\hat{\beta}_1\) and \(\hat{\beta}_0\) we can use them to compute the fitted values by implementing the value of into the model in order to get the values of \(\hat{\text{GPA}}\) :
table_residual_fitted <- data_pbset_1_1 %>%
mutate(Residual_2 = resid(lm_GPA),
Fitted_2 = fitted(lm_GPA)) %>%
select(ACT, Residual_2, Fitted_2, GPA) %>%
rename(Actual_Value_GPA = GPA)
table_residual_fitted %>% kbl() %>%
kable_material(c("striped", "hover"))
ACT | Residual_2 | Fitted_2 | Actual_Value_GPA |
---|---|---|---|
21 | 0.0857143 | 2.714286 | 2.8 |
24 | 0.3791209 | 3.020879 | 3.4 |
26 | -0.2252747 | 3.225275 | 3.0 |
27 | 0.1725275 | 3.327472 | 3.5 |
29 | 0.0681319 | 3.531868 | 3.6 |
25 | -0.1230769 | 3.123077 | 3.0 |
25 | -0.4230769 | 3.123077 | 2.7 |
30 | 0.0659341 | 3.634066 | 3.7 |
Finally we can compute the sum of residuals to verify if it is equal to zero :
\[ \text{Residuals} = \sum_{i=0}^{n=8} \left(\hat{\text{GPA}}-\text{GPA}_i\right) = -2.775558e^{-17} \approx 0 \]
Residual_2 = resid(lm_GPA)
sum(Residual_2)
## [1] -2.775558e-17
\(~\)
\(~\)
To get the estimated value of GPA when ATC is equal to 20 we simply have to run our model when ATC is equal to 20 such that :
\[ \text{GPA}_{20} = \hat{\beta}_0+\hat{\beta}_1\left(\text{ACT=20}\right) = 2.61213 \]
\(~\)
\(~\)
To know how much of the variation in GPA is explained by ACT we simply have to compute the \(R^2\) statistic :
\[ R^2 = 1 - \frac{\text{SSE}}{\text{SST}} = 1-\frac{\sum_{i=1}^{n}\left(\hat{y}_i-\bar{y}\right)^2}{\sum_{i=1}^{n}\left(\hat{y}_i-y_i\right)^2} = 1 - \frac{0.5940247}{1.02875} = 0.5774238 \]
Residual_2 = resid(lm_GPA)
Residual_data_2 = data.frame(Residual)
Fitted_2 = fitted(lm_GPA)
Regressor_table <- data_pbset_1_1 %>%
summarise(regressor = cov(GPA,ACT)/var(ACT),
SST = sum((GPA-mean(GPA))^2),
SSE = sum((Fitted_2-mean(GPA))^2),
SSR = SST - SSE,
R_2 = 1-(SSR/SST))
Regressor_table %>% kbl() %>%
kable_material(c("striped", "hover"))
regressor | SST | SSE | SSR | R_2 |
---|---|---|---|---|
0.1021978 | 1.02875 | 0.5940247 | 0.4347253 | 0.5774238 |
Therefore, 57.74% of the variation in GPA is explained by ACT.