Problem set 1 - Econometrics

Part I

\(~\)

Bastien PATRAS

Question 1 - Summarize the dataset

First we start by loading the libraries and clean the dataset :

#Load libraries

library(readr)
library(haven)
library(tidyverse)
library(ggplot2)
library(hrbrthemes)
library(stargazer)
library(sandwich)
library(lmtest)
library(lindia)
library(gplots)
library(kableExtra)

# Import data

PBset_1_data <- read_dta("/Users/bastienpatras/Downloads/Pbset 1 Econometrics/ee2002ext.dta")

# Cleaning dataset

PBset_1_data <- PBset_1_data %>%
                rename(Education = ddipl1, Monthly_Income = salfr) %>%
                na.omit()

# Summarize the dataset

summary(PBset_1_data)
##        s              agd             noi               fi       
##  Min.   :1.000   Min.   :16.00   Min.   : 1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:30.00   1st Qu.: 1.000   1st Qu.:1.000  
##  Median :1.000   Median :39.00   Median : 1.000   Median :1.000  
##  Mean   :1.484   Mean   :39.13   Mean   : 1.755   Mean   :1.019  
##  3rd Qu.:2.000   3rd Qu.:48.00   3rd Qu.: 2.000   3rd Qu.:1.000  
##  Max.   :2.000   Max.   :75.00   Max.   :13.000   Max.   :8.000  
##       pub              hh        Monthly_Income        cspp      
##  Min.   :1.000   Min.   : 1.00   Min.   :     0   Min.   :10.00  
##  1st Qu.:4.000   1st Qu.:35.00   1st Qu.:  6300   1st Qu.:38.00  
##  Median :5.000   Median :35.00   Median :  8000   Median :54.00  
##  Mean   :4.142   Mean   :35.42   Mean   :  9319   Mean   :49.37  
##  3rd Qu.:5.000   3rd Qu.:39.00   3rd Qu.: 11000   3rd Qu.:63.00  
##  Max.   :5.000   Max.   :99.00   Max.   :338894   Max.   :69.00  
##      extri          nafg4                csei           dipl1       
##  Min.   :155.0   Length:22447       Min.   :10.00   Min.   : 1.000  
##  1st Qu.:279.0   Class :character   1st Qu.:46.00   1st Qu.: 3.000  
##  Median :367.0   Mode  :character   Median :54.00   Median : 5.000  
##  Mean   :354.2                      Mean   :51.29   Mean   : 6.576  
##  3rd Qu.:419.0                      3rd Qu.:61.00   3rd Qu.:12.000  
##  Max.   :841.0                      Max.   :69.00   Max.   :16.000  
##    Education          adfe            enf3             enf6       
##  Min.   :1.000   Min.   : 0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:16.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3.000   Median :18.00   Median :0.0000   Median :0.0000  
##  Mean   :3.357   Mean   :20.47   Mean   :0.1076   Mean   :0.2432  
##  3rd Qu.:5.000   3rd Qu.:21.00   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :7.000   Max.   :99.00   Max.   :3.0000   Max.   :4.0000  
##      enf18              np            tymen90r         nomen      
##  Min.   :0.0000   Min.   : 1.000   Min.   :1.000   Min.   :    1  
##  1st Qu.:0.0000   1st Qu.: 2.000   1st Qu.:4.000   1st Qu.: 7436  
##  Median :0.0000   Median : 3.000   Median :5.000   Median :15173  
##  Mean   :0.8284   Mean   : 3.055   Mean   :4.097   Mean   :15759  
##  3rd Qu.:2.0000   3rd Qu.: 4.000   3rd Qu.:5.000   3rd Qu.:23859  
##  Max.   :9.0000   Max.   :16.000   Max.   :5.000   Max.   :32739

\(~\)

Question 2 - Summarize variable salfr (monthly earnings)

\(~\)

# Select the wanted vector

Monthly_Income_data <- PBset_1_data %>%
                        select(Monthly_Income)

# Compute the summary statistics 

summary(Monthly_Income_data)
##  Monthly_Income  
##  Min.   :     0  
##  1st Qu.:  6300  
##  Median :  8000  
##  Mean   :  9319  
##  3rd Qu.: 11000  
##  Max.   :338894

\(~\)

Question 3 - Tabulate the education variable ddipl1

\(~\)

Education_data <- PBset_1_data %>%
                  select(Education) %>%
                  arrange(Education)

head(Education_data) %>% kbl() %>%
                   kable_material(c("striped", "hover"))
Education
1
1
1
1
1
1

\(~\)

Question 4 - Associate a label to variable ddipl1

\(~\)

PBset_1_data_Education <- PBset_1_data %>%
                          mutate(Label_Education = case_when(Education == 1 ~ "No degree",
                                                             Education == 2 ~ "Middle school degree (brevet)",
                                                             Education == 3 ~ "High school vocational degree (eg CAP)",
                                                             Education == 4 ~ "High school degree (baccalaureat)",
                                                             Education == 5 ~ "Some college (bac +2)",
                                                             Education == 6 ~ "College plus (> licence)",
                                                             Education == 7 ~ "Still in school")) %>%
                          select(Label_Education, Education)

head(PBset_1_data_Education) %>% kbl() %>%
                                 kable_material(c("striped", "hover"))
Label_Education Education
High school vocational degree (eg CAP) 3
Still in school 7
Some college (bac +2) 5
High school vocational degree (eg CAP) 3
Middle school degree (brevet) 2
High school vocational degree (eg CAP) 3

\(~\)

Question 5 - Computing mean wage by education

\(~\)

Mean_Education_Group <- PBset_1_data %>%
                        mutate(Label_Education = case_when(Education == 1 ~ "No degree",
                                                           Education == 2 ~ "Middle school degree (brevet)",
                                                           Education == 3 ~ "High school vocational degree (eg CAP)",
                                                           Education == 4 ~ "High school degree (baccalaureat)",
                                                           Education == 5 ~ "Some college (bac +2)",
                                                           Education == 6 ~ "College plus (> licence)",
                                                           Education == 7 ~ "Still in school")) %>%
                        group_by(Label_Education) %>%
                        summarise(Income_mean = mean(Monthly_Income),
                                  Income_Standar_dev = sd(Monthly_Income),
                                  Income_var = var(Monthly_Income)) %>%
                        arrange(Income_mean)
                        
Mean_Education_Group %>% kbl() %>%
                         kable_material(c("striped", "hover"))
Label_Education Income_mean Income_Standar_dev Income_var
Still in school 4702.048 3454.259 11931905
No degree 7028.577 4473.792 20014811
High school vocational degree (eg CAP) 8390.705 6182.135 38218798
Middle school degree (brevet) 8648.555 4634.186 21475677
High school degree (baccalaureat) 9284.632 6500.892 42261600
Some college (bac +2) 10932.263 5889.771 34689397
College plus (> licence) 15381.187 9979.600 99592422

\(~\)

Question 6 - Tabulating adfe (“Age de fin d’études calculé”) and ddipl1 as a two-entry table.

\(~\)

  1. 00 stands for : people that have not study at all during their lifetime
  2. 99 stands for : people that are still studying
Education_table <- PBset_1_data %>%
                   rename(Graduation_age = adfe) %>%
                   select(Graduation_age, Education)
                   
head(Education_table) %>% kbl() %>%
                         kable_material(c("striped", "hover"))
Graduation_age Education
17 3
99 7
22 5
16 3
16 2
18 3

\(~\)

Question 7 - Calculate the mean of adfe by ddipl1.

\(~\)

Mean_Graduation_age_by_Education <- Education_table %>%
                                    mutate(Label_Education = case_when(Education == 1 ~ "No degree",
                                                                       Education == 2 ~ "Middle school degree (brevet)",
                                                                       Education == 3 ~ "High school vocational degree (eg CAP)",
                                                                       Education == 4 ~ "High school degree (baccalaureat)",
                                                                       Education == 5 ~ "Some college (bac +2)",
                                                                       Education == 6 ~ "College plus (> licence)",
                                                                       Education == 7 ~ "Still in school")) %>%
                                    group_by(Label_Education) %>%
                                    summarise(Graduation_age_mean = mean(Graduation_age)) %>%
                                    arrange(Graduation_age_mean)

Mean_Graduation_age_by_Education %>% kbl() %>%
                                     kable_material(c("striped", "hover"))
Label_Education Graduation_age_mean
No degree 15.30157
High school vocational degree (eg CAP) 17.89495
Middle school degree (brevet) 17.90432
High school degree (baccalaureat) 20.52958
Some college (bac +2) 21.98057
College plus (> licence) 24.42862
Still in school 68.18587

\(~\)

Question 8 - Produce a scatterplot of salfr and adfe for adfe different from 0 and 99.

\(~\)

  1. First build the correct dataset with filter
Scatter_plot_dataset <- PBset_1_data %>%
                        rename(Graduation_age = adfe) %>%
                        select(Monthly_Income, Graduation_age) %>%
                        filter(Graduation_age < 99,Graduation_age > 0)
  1. Second build the graph with ggplot2
Scatter_plot_1 <- ggplot(Scatter_plot_dataset, aes(x=Graduation_age, y=Monthly_Income,col= Graduation_age)) +
                  geom_point()

show(Scatter_plot_1)

\(~\)

Question 9 - Generate a variable lw that is the log of salfr.

\(~\)

  1. First modify the dataset by introducing Log_Monthly_Income
Scatter_plot_dataset <- Scatter_plot_dataset %>%
                        mutate(Log_Monthly_Income = log(Monthly_Income)) %>%
                        filter(Log_Monthly_Income>0)

head(Scatter_plot_dataset) %>% kbl() %>%
                         kable_material(c("striped", "hover"))
Monthly_Income Graduation_age Log_Monthly_Income
5523 17 8.616677
7500 22 8.922658
14000 16 9.546813
10800 16 9.287301
7800 18 8.961879
8800 17 9.082507
  1. Second build the 2nd graph with ggplot2 with the ‘Log_Monthly_Income’ variable
Scatter_plot_2 <- ggplot(Scatter_plot_dataset, aes(x=Graduation_age, y=Log_Monthly_Income,col= Graduation_age)) + 
                  geom_point()

show(Scatter_plot_2)

\(~\)

Question 10 - Calculate the 1st and the 99th percentiles

\(~\)

quantile(Scatter_plot_dataset$Log_Monthly_Income, probs = seq(0,1, by=0.01))
##        0%        1%        2%        3%        4%        5%        6%        7% 
##  1.945910  7.090077  7.518278  7.783224  7.937375  8.006368  8.085782  8.185350 
##        8%        9%       10%       11%       12%       13%       14%       15% 
##  8.294050  8.342840  8.411833  8.476371  8.517193  8.575462  8.612503  8.630522 
##       16%       17%       18%       19%       20%       21%       22%       23% 
##  8.665613  8.677204  8.699515  8.699515  8.699515  8.716044  8.732305  8.748305 
##       24%       25%       26%       27%       28%       29%       30%       31% 
##  8.764053  8.779557  8.779557  8.779557  8.794825  8.809863  8.824678  8.853665 
##       32%       33%       34%       35%       36%       37%       38%       39% 
##  8.853665  8.853665  8.853665  8.867850  8.881836  8.894689  8.909235  8.922658 
##       40%       41%       42%       43%       44%       45%       46%       47% 
##  8.922658  8.922658  8.935904  8.961879  8.970940  8.987197  8.987197  8.987197 
##       48%       49%       50%       51%       52%       53%       54%       55% 
##  8.987197  8.999002  9.011889  9.029298  9.047821  9.047821  9.047821  9.071078 
##       56%       57%       58%       59%       60%       61%       62%       63% 
##  9.090534  9.104980  9.104980  9.104980  9.106423  9.126959  9.159047  9.159047 
##       64%       65%       66%       67%       68%       69%       70%       71% 
##  9.169518  9.190138  9.210340  9.210340  9.210340  9.210340  9.210340  9.230143 
##       72%       73%       74%       75%       76%       77%       78%       79% 
##  9.259131  9.268609  9.305651  9.305651  9.305651  9.319284  9.350102  9.376448 
##       80%       81%       82%       83%       84%       85%       86%       87% 
##  9.392662  9.392662  9.392662  9.433484  9.472705  9.472705  9.487972  9.539211 
##       88%       89%       90%       91%       92%       93%       94%       95% 
##  9.546813  9.581904  9.615805  9.621589  9.680344  9.740969  9.798127  9.852194 
##       96%       97%       98%       99%      100% 
##  9.903488 10.017832 10.126631 10.373491 12.733443
  1. We can then define the variable of interest corresponding to the 1st and the 99th percentiles:
First_percentile = 7.090077
Ninety_nine_th_percentile = 10.373491

\(~\)

Question 11 - Calculate the 0.5th and 99.5th percentiles

\(~\)

quantile(Scatter_plot_dataset$Log_Monthly_Income, probs = seq(0,1, by=0.005))
##      0.0%      0.5%      1.0%      1.5%      2.0%      2.5%      3.0%      3.5% 
##  1.945910  6.684612  7.090077  7.313220  7.518278  7.610878  7.783224  7.863267 
##      4.0%      4.5%      5.0%      5.5%      6.0%      6.5%      7.0%      7.5% 
##  7.937375  8.006368  8.006368  8.070906  8.085782  8.160518  8.185350  8.242756 
##      8.0%      8.5%      9.0%      9.5%     10.0%     10.5%     11.0%     11.5% 
##  8.294050  8.294050  8.342840  8.389360  8.411833  8.455318  8.476371  8.517193 
##     12.0%     12.5%     13.0%     13.5%     14.0%     14.5%     15.0%     15.5% 
##  8.517193  8.556414  8.575462  8.602269  8.612503  8.630522  8.630522  8.648221 
##     16.0%     16.5%     17.0%     17.5%     18.0%     18.5%     19.0%     19.5% 
##  8.665613  8.665613  8.677204  8.683385  8.699515  8.699515  8.699515  8.699515 
##     20.0%     20.5%     21.0%     21.5%     22.0%     22.5%     23.0%     23.5% 
##  8.699515  8.699515  8.716044  8.732143  8.732305  8.732305  8.748305  8.748305 
##     24.0%     24.5%     25.0%     25.5%     26.0%     26.5%     27.0%     27.5% 
##  8.764053  8.770405  8.779557  8.779557  8.779557  8.779557  8.779557  8.788746 
##     28.0%     28.5%     29.0%     29.5%     30.0%     30.5%     31.0%     31.5% 
##  8.794825  8.809863  8.809863  8.824678  8.824678  8.839277  8.853665  8.853665 
##     32.0%     32.5%     33.0%     33.5%     34.0%     34.5%     35.0%     35.5% 
##  8.853665  8.853665  8.853665  8.853665  8.853665  8.853665  8.867850  8.881836 
##     36.0%     36.5%     37.0%     37.5%     38.0%     38.5%     39.0%     39.5% 
##  8.881836  8.884056  8.894689  8.895630  8.909235  8.917303  8.922658  8.922658 
##     40.0%     40.5%     41.0%     41.5%     42.0%     42.5%     43.0%     43.5% 
##  8.922658  8.922658  8.922658  8.922658  8.935904  8.948976  8.961879  8.961879 
##     44.0%     44.5%     45.0%     45.5%     46.0%     46.5%     47.0%     47.5% 
##  8.970940  8.974618  8.987197  8.987197  8.987197  8.987197  8.987197  8.987197 
##     48.0%     48.5%     49.0%     49.5%     50.0%     50.5%     51.0%     51.5% 
##  8.987197  8.987197  8.999002  9.011767  9.011889  9.024011  9.029298  9.040264 
##     52.0%     52.5%     53.0%     53.5%     54.0%     54.5%     55.0%     55.5% 
##  9.047821  9.047821  9.047821  9.047821  9.047821  9.054855  9.071078  9.081370 
##     56.0%     56.5%     57.0%     57.5%     58.0%     58.5%     59.0%     59.5% 
##  9.090534  9.104155  9.104980  9.104980  9.104980  9.104980  9.104980  9.104980 
##     60.0%     60.5%     61.0%     61.5%     62.0%     62.5%     63.0%     63.5% 
##  9.106423  9.125109  9.126959  9.137773  9.159047  9.159047  9.159047  9.159047 
##     64.0%     64.5%     65.0%     65.5%     66.0%     66.5%     67.0%     67.5% 
##  9.169518  9.184722  9.190138  9.194109  9.210340  9.210340  9.210340  9.210340 
##     68.0%     68.5%     69.0%     69.5%     70.0%     70.5%     71.0%     71.5% 
##  9.210340  9.210340  9.210340  9.210340  9.210340  9.210640  9.230143  9.253171 
##     72.0%     72.5%     73.0%     73.5%     74.0%     74.5%     75.0%     75.5% 
##  9.259131  9.259131  9.268609  9.289246  9.305651  9.305651  9.305651  9.305651 
##     76.0%     76.5%     77.0%     77.5%     78.0%     78.5%     79.0%     79.5% 
##  9.305651  9.305651  9.319284  9.348079  9.350102  9.364177  9.376448  9.392662 
##     80.0%     80.5%     81.0%     81.5%     82.0%     82.5%     83.0%     83.5% 
##  9.392662  9.392662  9.392662  9.392662  9.392662  9.421270  9.433484  9.456497 
##     84.0%     84.5%     85.0%     85.5%     86.0%     86.5%     87.0%     87.5% 
##  9.472705  9.472705  9.472705  9.474726  9.487972  9.510445  9.539211  9.546813 
##     88.0%     88.5%     89.0%     89.5%     90.0%     90.5%     91.0%     91.5% 
##  9.546813  9.577069  9.581904  9.615805  9.615805  9.615805  9.621589  9.664151 
##     92.0%     92.5%     93.0%     93.5%     94.0%     94.5%     95.0%     95.5% 
##  9.680344  9.704976  9.740969  9.741166  9.798127  9.798127  9.852194  9.903488 
##     96.0%     96.5%     97.0%     97.5%     98.0%     98.5%     99.0%     99.5% 
##  9.903488  9.952278 10.017832 10.085809 10.126631 10.250405 10.373491 10.596635 
##    100.0% 
## 12.733443
  1. We can then define the variable of interest corresponding to the 0.5th and the 99.5th percentiles:
Zero.five_th_percentile = 6.684612
Ninety_nine.five_th_percentile = 10.596635

\(~\)

Question 14 - Calculate the variances of lw and adfe and the covariance between lw and adfe for 0 < adfe < 99 and p0050 < lw < p9995

\(~\)

Notice that Scatter_plot_dataset is already filtered to consider only adfe in between 0 and 99

Variance_Covariance_table <- Scatter_plot_dataset %>%
  
                             filter(Log_Monthly_Income > Zero.five_th_percentile, 
                                    Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
  
                             summarise(Log_Monthly_Income_var = var(Log_Monthly_Income), 
                                       Graduation_age_var = var(Graduation_age), 
                                       Covariance = cov(Log_Monthly_Income, Graduation_age))

Variance_Covariance_table %>% kbl() %>%
                         kable_material(c("striped", "hover"))
Log_Monthly_Income_var Graduation_age_var Covariance
0.2649288 12.31437 0.5994552

\(~\)

Question 15 - Calculate the total sum of squares (SST) for lw

\(~\)

We know that the total sum of square is equal to : \[\text{TSS} = \sum_{i=1}^{n}\left(y_i-\bar{y}\right)^2\] with :

  • \(n\) : The number of observation
  • \(y_i\) : Log_Monthly_Income variable
  • \(y\) : The mean of the value sample
SST <- Scatter_plot_dataset %>% 
       filter(Log_Monthly_Income > Zero.five_th_percentile, 
              Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
       summarise(SST = sum((Log_Monthly_Income-mean(Log_Monthly_Income))^2))

SST %>% kbl() %>%
        kable_material(c("striped", "hover"))
SST
5698.619

\(~\)

16 - Deduce the OLS estimator of the regression of lw on adfe

\(~\)

OLS_Estimator <- Variance_Covariance_table %>%
                 summarise(OLS_Estimator = Covariance/Graduation_age_var)

OLS_Estimator %>% kbl() %>%
                  kable_material(c("striped", "hover"))
OLS_Estimator
0.0486793

\(~\)

Question 17 - Regress lw on adfe without selection and repeat the exercise with the selection.

\(~\)

  1. First we build the table with the correct filter with the trimming
## Build the table with filter

Scatter_plot_dataset_with <- Scatter_plot_dataset %>%
                             filter(Log_Monthly_Income > Zero.five_th_percentile, 
                                    Log_Monthly_Income < Ninety_nine.five_th_percentile) 

head(Scatter_plot_dataset_with) %>% kbl() %>%
                                    kable_material(c("striped", "hover"))
Monthly_Income Graduation_age Log_Monthly_Income
5523 17 8.616677
7500 22 8.922658
14000 16 9.546813
10800 16 9.287301
7800 18 8.961879
8800 17 9.082507
  1. Then we regress on the filtered dataset
model_filtered = lm(Log_Monthly_Income ~ Graduation_age, data=Scatter_plot_dataset_with)
summary(model_filtered)
## 
## Call:
## lm(formula = Log_Monthly_Income ~ Graduation_age, data = Scatter_plot_dataset_with)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.61619 -0.22280  0.02423  0.28821  1.80473 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.0925318  0.0181406   446.1   <2e-16 ***
## Graduation_age 0.0486793  0.0009434    51.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4855 on 21509 degrees of freedom
## Multiple R-squared:  0.1101, Adjusted R-squared:  0.1101 
## F-statistic:  2662 on 1 and 21509 DF,  p-value: < 2.2e-16

Note that the r lm comand produce the same result as the OLS estimator computed using the cov() and var() functions

  1. We now repeat the same process without trimming :
## Build the table without filter

head(Scatter_plot_dataset) %>% kbl() %>%
                               kable_material(c("striped", "hover"))
Monthly_Income Graduation_age Log_Monthly_Income
5523 17 8.616677
7500 22 8.922658
14000 16 9.546813
10800 16 9.287301
7800 18 8.961879
8800 17 9.082507
## Regress on the filtered dataset

model_without_filtered = lm(Log_Monthly_Income ~ Graduation_age, data=Scatter_plot_dataset)
summary(model_without_filtered)
## 
## Call:
## lm(formula = Log_Monthly_Income ~ Graduation_age, data = Scatter_plot_dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1146 -0.2218  0.0341  0.3044  3.9902 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.002904   0.020920  382.55   <2e-16 ***
## Graduation_age 0.052879   0.001088   48.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5641 on 21721 degrees of freedom
## Multiple R-squared:  0.09814,    Adjusted R-squared:  0.0981 
## F-statistic:  2364 on 1 and 21721 DF,  p-value: < 2.2e-16
  1. To get a better idea of the effect of the trimming we can produce the corresponding plots of the tow models and produce a latex output summarizing the two models estimates :

\(~\)

model_with_filtered_plot <- ggplot(Scatter_plot_dataset_with, aes(x=Graduation_age, y=Log_Monthly_Income, col = Graduation_age)) + geom_point() + geom_smooth(method=lm , color="black", fill="#69b3a2", se=TRUE) + ggtitle("With trimming") + theme(plot.title = element_text(hjust = 0.5))
model_without_filtered_plot <- ggplot(Scatter_plot_dataset, aes(x=Graduation_age, y=Log_Monthly_Income, col = Graduation_age)) + geom_point() + geom_smooth(method=lm , color="black", fill="#69b3a2", se=TRUE)+ ggtitle("Without trimming") + theme(plot.title = element_text(hjust = 0.5))

show(model_with_filtered_plot)
show(model_without_filtered_plot)

\(~\)

Adding this filter allows to suppress the extreme values (outliers) and make the coefficient consistent. The filter decreases slightly the coefficient in front of Log_Monthly_Income, which means that, the filter allows for less variation and makes the coefficient more accurate.

\(~\)

Question 18 - For the regression with the trimming:

\(~\)

How much more earnings yield an additional year of education?

In the log-linear model, the literal interpretation of the estimated coefficient in front of Graduation_age is that a one-unit increase in Graduation_age (additionnal year of studying) will produce an expected increase in Log_Monthly_Income of 0.049 units. Thus each 1-unit increase in Graduation_age multiplies the expected value of Monthly_Income by \[e^{(0,049)}-1 = 0,05022\] .

Provide the SST, SSE and SSR from the above regression.

We know that the SST, SSE and SSR are related by the following relation :

\[ \text{SST} = \text{SSE} + \text{SSR} \Longleftrightarrow \sum_{i=1}^{n}\left(y_i-\bar{y}\right)^2 = \sum_{i=1}^{n}\left(\hat{y}_i-\bar{y}\right)^2 + \sum_{i=1}^{n}\left(\hat{y}_i-y_i\right)^2\]

SST <- Scatter_plot_dataset %>% 
       filter(Log_Monthly_Income > Zero.five_th_percentile, 
              Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
       summarise(SST = sum((Log_Monthly_Income-mean(Log_Monthly_Income))^2), 
                 SSE = sum((fitted(model_filtered)-mean(Log_Monthly_Income))^2),
                 SSR = SST - SSE)

SST %>% kbl() %>%
        kable_material(c("striped", "hover"))
SST SSE SSR
5698.619 627.6849 5070.934

\(~\)

Calculate the R-squared.

\(~\)

We then compute the \(R^2\) such that :

\[ R^2 = 1 - \frac{\text{SSE}}{\text{SST}} = 1-\frac{\sum_{i=1}^{n}\left(\hat{y}_i-\bar{y}\right)^2}{\sum_{i=1}^{n}\left(\hat{y}_i-y_i\right)^2}\]

SST <- Scatter_plot_dataset %>% 
       filter(Log_Monthly_Income > Zero.five_th_percentile, 
              Log_Monthly_Income < Ninety_nine.five_th_percentile) %>%
       summarise(SST = sum((Log_Monthly_Income-mean(Log_Monthly_Income))^2), 
                 SSE = sum((fitted(model_filtered)-mean(Log_Monthly_Income))^2),
                 SSR = SST - SSE,
                 R_2 = 1-(SSR/SST))

SST %>% kbl() %>%
        kable_material(c("striped", "hover"))
SST SSE SSR R_2
5698.619 627.6849 5070.934 0.1101469

\(~\)

Question 19 - Plot a histogram of the residuals. What does the “normal” option of command “hist” do? Use it and provide a brief conclusion about the distribution of the residuals.

\(~\)

Residual = resid(model_filtered)
Residual_data = data.frame(Residual)
Fitted_1 = fitted(model_filtered)



Residual_plt <- ggplot(Residual_data, aes(x=Residual, fill=cond)) +
                geom_histogram(aes(y=..density..),
                               binwidth=.5,
                               colour="#3D78B2", fill="#F3F8FC") +
                geom_density(alpha=.2, colour="#3D78B2", fill="#3D78B2")
          
show(Residual_plt)

Part II

\(~\)

Question - 1

\(~\)

We want to estimate the relationship between the GPA and the ACT using OLS on the following model :

\[\hat{\text{GPA}} = \hat{\beta}_0+\hat{\beta}_1\text{ACT}\]

# Read the data

library(readxl)
data_pbset_1_1 <- read_excel("/Users/bastienpatras/Downloads/Pbset 1 Econometrics/data_pbset_1_1.xlsx")

data_pbset_1_1 %>% kbl() %>%
                    kable_material(c("striped", "hover"))  
Student GPA ACT
1 2.8 21
2 3.4 24
3 3.0 26
4 3.5 27
5 3.6 29
6 3.0 25
7 2.7 25
8 3.7 30

As it is a simple regression estimation, we know that the \(\hat{\beta}_1\) that minimizes the sum of squared deviations from the fitted line corresponds to :

\[ \hat{\beta}_1 = \frac{cov(\text{GPA},\text{ACT})}{var(\text{ACT})} = 0.10220 \]

# Lm model 

regressor <- data_pbset_1_1 %>%
             summarise(regressor = cov(GPA,ACT)/var(ACT))

regressor %>% kbl() %>%
              kable_material(c("striped", "hover"))
regressor
0.1021978

Once we have found the estimate of \(\hat{\beta}_1\) we can find the intercept by rewriting the model as follows :

\[ \hat{\beta}_0 = \bar{\text{GPA}}-\hat{\beta}_1\bar{\text{ACT}} = 0.568 \]

We can also use the lm function to get the summary of our regression :

lm_GPA <- lm(GPA ~ ACT, data=data_pbset_1_1)
summary(lm_GPA)
## 
## Call:
## lm(formula = GPA ~ ACT, data = data_pbset_1_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42308 -0.14863  0.06703  0.10742  0.37912 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.56813    0.92842   0.612   0.5630  
## ACT          0.10220    0.03569   2.863   0.0287 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2692 on 6 degrees of freedom
## Multiple R-squared:  0.5774, Adjusted R-squared:  0.507 
## F-statistic: 8.199 on 1 and 6 DF,  p-value: 0.02868

\(~\)

Question - 2

\(~\)

Once we get the estimator of \(\hat{\beta}_1\) and \(\hat{\beta}_0\) we can use them to compute the fitted values by implementing the value of into the model in order to get the values of \(\hat{\text{GPA}}\) :

table_residual_fitted <- data_pbset_1_1 %>% 
                         mutate(Residual_2 = resid(lm_GPA),
                                Fitted_2 = fitted(lm_GPA))  %>% 
                         select(ACT, Residual_2, Fitted_2, GPA)  %>% 
                         rename(Actual_Value_GPA = GPA)

table_residual_fitted %>% kbl() %>%
                          kable_material(c("striped", "hover"))
ACT Residual_2 Fitted_2 Actual_Value_GPA
21 0.0857143 2.714286 2.8
24 0.3791209 3.020879 3.4
26 -0.2252747 3.225275 3.0
27 0.1725275 3.327472 3.5
29 0.0681319 3.531868 3.6
25 -0.1230769 3.123077 3.0
25 -0.4230769 3.123077 2.7
30 0.0659341 3.634066 3.7

Finally we can compute the sum of residuals to verify if it is equal to zero :

\[ \text{Residuals} = \sum_{i=0}^{n=8} \left(\hat{\text{GPA}}-\text{GPA}_i\right) = -2.775558e^{-17} \approx 0 \]

Residual_2 = resid(lm_GPA)
sum(Residual_2)
## [1] -2.775558e-17

\(~\)

Question - 3

\(~\)

To get the estimated value of GPA when ATC is equal to 20 we simply have to run our model when ATC is equal to 20 such that :

\[ \text{GPA}_{20} = \hat{\beta}_0+\hat{\beta}_1\left(\text{ACT=20}\right) = 2.61213 \]

\(~\)

Question - 4

\(~\)

To know how much of the variation in GPA is explained by ACT we simply have to compute the \(R^2\) statistic :

\[ R^2 = 1 - \frac{\text{SSE}}{\text{SST}} = 1-\frac{\sum_{i=1}^{n}\left(\hat{y}_i-\bar{y}\right)^2}{\sum_{i=1}^{n}\left(\hat{y}_i-y_i\right)^2} = 1 - \frac{0.5940247}{1.02875} = 0.5774238 \]

Residual_2 = resid(lm_GPA)
Residual_data_2 = data.frame(Residual)
Fitted_2 = fitted(lm_GPA)

Regressor_table <- data_pbset_1_1 %>%
                   summarise(regressor = cov(GPA,ACT)/var(ACT),
                             SST = sum((GPA-mean(GPA))^2), 
                             SSE = sum((Fitted_2-mean(GPA))^2),
                             SSR = SST - SSE,
                             R_2 = 1-(SSR/SST))

Regressor_table %>% kbl() %>%
                    kable_material(c("striped", "hover"))
regressor SST SSE SSR R_2
0.1021978 1.02875 0.5940247 0.4347253 0.5774238

Therefore, 57.74% of the variation in GPA is explained by ACT.