## Descriptive Statistics ~ 
#Load Libraries
library(wooldridge)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
data(bwght)
?bwght
## starting httpd help server ... done
head(bwght)
##   faminc cigtax cigprice bwght fatheduc motheduc parity male white cigs
## 1   13.5   16.5    122.3   109       12       12      1    1     1    0
## 2    7.5   16.5    122.3   133        6       12      2    1     0    0
## 3    0.5   16.5    122.3   129       NA       12      2    0     0    0
## 4   15.5   16.5    122.3   126       12       12      2    1     0    0
## 5   27.5   16.5    122.3   134       14       12      2    1     1    0
## 6    7.5   16.5    122.3   118       12       14      6    1     0    0
##     lbwght bwghtlbs packs    lfaminc
## 1 4.691348   6.8125     0  2.6026897
## 2 4.890349   8.3125     0  2.0149031
## 3 4.859812   8.0625     0 -0.6931472
## 4 4.836282   7.8750     0  2.7408400
## 5 4.897840   8.3750     0  3.3141861
## 6 4.770685   7.3750     0  2.0149031
bwght_data <- bwght %>%
  select(motheduc, faminc, cigs, parity)
describe(bwght_data)
##          vars    n  mean    sd median trimmed   mad min max range  skew
## motheduc    1 1387 12.94  2.38   12.0   12.95  1.48 2.0  18  16.0 -0.03
## faminc      2 1388 29.03 18.74   27.5   27.72 17.79 0.5  65  64.5  0.62
## cigs        3 1388  2.09  5.97    0.0    0.39  0.00 0.0  50  50.0  3.56
## parity      4 1388  1.63  0.89    1.0    1.47  0.00 1.0   6   5.0  1.63
##          kurtosis   se
## motheduc     0.64 0.06
## faminc      -0.53 0.50
## cigs        14.91 0.16
## parity       2.93 0.02
stargazer(bwght_data, type = "text", title = "Descriptive statistics", digits = 2)
## 
## Descriptive statistics
## =========================================
## Statistic   N   Mean  St. Dev. Min   Max 
## -----------------------------------------
## motheduc  1,387 12.94   2.38    2    18  
## faminc    1,388 29.03  18.74   0.50 65.00
## cigs      1,388 2.09    5.97    0    50  
## parity    1,388 1.63    0.89    1     6  
## -----------------------------------------
# this is a data based on descriptive statistics. This gives- mean, median, standard deviation, min/max and kurtosis. 


# The data shows that most of the mothers have completed high school with an average income of $29,000 per year. Cigarette used during pregnancy is generally low. Average parity is 1.63, showing most mothers are first or second time parents. It shows how mother education, income, health behavior and reproductive history influences maternal and child outcome. 
## Histogram and scatterplot
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(wooldridge)
#load the data set 
data(wage1)
head(wage1)
##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0      1       0      2    1        0     0
## 2 3.24   12    22      2        0      1       1      3    1        0     0
## 3 3.00   11     2      0        0      0       0      2    0        0     0
## 4 6.00    8    44     28        0      0       1      0    1        0     0
## 5 5.30   12     7      2        0      0       1      1    0        0     0
## 6 8.75   16     9      8        0      0       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64
ggplot(wage1, aes(x = wage)) + geom_histogram(binwidth = 1, fill = "plum", color = "blue") +
  labs(title = "histogram of wages",
       x = "wages ($ per hour)",
       y = "Frequency") +
  theme_minimal()

#this histogram shows how wages are distributed. Y-axis shows how many individuals earn wages within each bin range. As it shows right skewed, more people earn lower wages and very less people earn high wages.
ggplot(wage1, aes(x = educ, y = wage)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "wage vs education",
       x = "years of eductaion",
y = "wage ($ per hour)") + 
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# the above scatterpot shows a positive slop, indicating a positive relationship between education and higher wages
ggplot(wage1, aes(x = exper, y = wage)) + 
  geom_point(colour = "Purple", alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE, color = "blue") + 
  labs(title = "wage vs experience", 
     x = "years of experience",
     y = "wage ($ per hour)") + 
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# This scatterplot shows wage trends along with work experience
# blue - loess curve, indicates a non linear pattern where the wages rise early in career
## Bivariate regression ~
#install.packages("wooldgridge")
library(wooldridge)
data("wage1")
head(wage1)
##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0      1       0      2    1        0     0
## 2 3.24   12    22      2        0      1       1      3    1        0     0
## 3 3.00   11     2      0        0      0       0      2    0        0     0
## 4 6.00    8    44     28        0      0       1      0    1        0     0
## 5 5.30   12     7      2        0      0       1      1    0        0     0
## 6 8.75   16     9      8        0      0       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64
model1 <-lm(wage ~ educ, data = wage1)
summary(model1)
## 
## Call:
## lm(formula = wage ~ educ, data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3396 -2.1501 -0.9674  1.1921 16.6085 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.90485    0.68497  -1.321    0.187    
## educ         0.54136    0.05325  10.167   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.378 on 524 degrees of freedom
## Multiple R-squared:  0.1648, Adjusted R-squared:  0.1632 
## F-statistic: 103.4 on 1 and 524 DF,  p-value: < 2.2e-16
coef(summary(model1))
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -0.9048516 0.68496782 -1.321013 1.870735e-01
## educ         0.5413593 0.05324804 10.166746 2.782599e-22
summary(model1)$r.squared
## [1] 0.1647575
model2 <- lm(wage ~ exper, data = wage1)
summary(model2)
## 
## Call:
## lm(formula = wage ~ exper, data = wage1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.936 -2.458 -1.112  1.077 18.716 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.37331    0.25699  20.908  < 2e-16 ***
## exper        0.03072    0.01181   2.601  0.00955 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.673 on 524 degrees of freedom
## Multiple R-squared:  0.01275,    Adjusted R-squared:  0.01086 
## F-statistic: 6.766 on 1 and 524 DF,  p-value: 0.009555
stargazer(model1, model2, type = "text",
          title = "Bivariate Regression results",
          column.labels = c("Educ vs wage", "Exper vs wage"),
          dep.var.labels = "wage",
          covariate.labels = 
            c("Education", "Experience"),
          align = TRUE)
## 
## Bivariate Regression results
## ===========================================================
##                                    Dependent variable:     
##                                ----------------------------
##                                            wage            
##                                 Educ vs wage  Exper vs wage
##                                     (1)            (2)     
## -----------------------------------------------------------
## Education                         0.541***                 
##                                   (0.053)                  
##                                                            
## Experience                                      0.031***   
##                                                  (0.012)   
##                                                            
## Constant                           -0.905       5.373***   
##                                   (0.685)        (0.257)   
##                                                            
## -----------------------------------------------------------
## Observations                        526            526     
## R2                                 0.165          0.013    
## Adjusted R2                        0.163          0.011    
## Residual Std. Error (df = 524)     3.378          3.673    
## F Statistic (df = 1; 524)        103.363***     6.766***   
## ===========================================================
## Note:                           *p<0.1; **p<0.05; ***p<0.01
# Both education and experience affect wages significantly. However, education has a stronger impact explaining more variation in hourly wages than experience alone. 
##Multivariate regression ~
library(wooldridge)
library(stargazer)
library(dplyr)
library(ggplot2)
data("wage1")
head(wage1)
##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0      1       0      2    1        0     0
## 2 3.24   12    22      2        0      1       1      3    1        0     0
## 3 3.00   11     2      0        0      0       0      2    0        0     0
## 4 6.00    8    44     28        0      0       1      0    1        0     0
## 5 5.30   12     7      2        0      0       1      1    0        0     0
## 6 8.75   16     9      8        0      0       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64
model_multi <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model_multi)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
stargazer(model_multi, type = "text", 
          title = "Multivariate regression : Determinants of wage",
          dep.var.labels = "wage",
          covariate.labels = 
            c("Education", "Experience", "Tenure"),
          report = "vcstp", 
          digits = 3)
## 
## Multivariate regression : Determinants of wage
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                wage            
## -----------------------------------------------
## Education                      0.599           
##                               (0.051)          
##                             t = 11.679         
##                              p = 0.000         
##                                                
## Experience                     0.022           
##                               (0.012)          
##                              t = 1.853         
##                              p = 0.065         
##                                                
## Tenure                         0.169           
##                               (0.022)          
##                              t = 7.820         
##                              p = 0.000         
##                                                
## Constant                      -2.873           
##                               (0.729)          
##                             t = -3.941         
##                             p = 0.0001         
##                                                
## -----------------------------------------------
## Observations                    526            
## R2                             0.306           
## Adjusted R2                    0.302           
## Residual Std. Error      3.084 (df = 522)      
## F Statistic           76.873*** (df = 3; 522)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
#Education has strongest positive effect  as each year it increases the age by $0.59, showing highly significant result

#tenure significantly increases wage by about 0.16

# experience has smaller impact