library(modelsummary)
library(tidyverse)
library(sjPlot)
# Read the wage data set from disk, then print summary statistics for
# every column.
wage <- read_csv(file = "E:/hw/wage.csv")
summary(object = wage)
##       wage            educ           exper         nonwhite        
##  Min.   : 5.53   Min.   : 4.00   Min.   : 1.00   Length:100        
##  1st Qu.: 8.60   1st Qu.:12.00   1st Qu.: 5.00   Class :character  
##  Median :10.81   Median :12.00   Median :13.00   Mode  :character  
##  Mean   :11.85   Mean   :12.82   Mean   :15.03                     
##  3rd Qu.:13.64   3rd Qu.:14.00   3rd Qu.:22.00                     
##  Max.   :27.20   Max.   :18.00   Max.   :45.00                     
##     female            married         
##  Length:100         Length:100        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
#datasummary_skim(wage)
# Draw a histogram of the variable wage.
# Try other `binwidth` and `alpha` values to see how the plot changes.
ggplot(data = wage, mapping = aes(x = wage)) +
  geom_histogram(
    binwidth = 2,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  # Title and axis labels.
  labs(title = "Histogram of wage", x = "wage", y = "Count") +
  # Minimal theme for a clean look.
  theme_minimal()

# Scatter plot of wage against educ, drawn with base graphics.
plot(
  x = wage[["educ"]],
  y = wage[["wage"]],
  xlab = "years of education",
  ylab = "hourly wage",
  pch = 16,
  cex = 0.3,
  xlim = c(3, 20)
)
# Annotate every point with its row index, placed to the right of the
# point (pos = 4). Row indices are what text() draws when no labels are
# supplied; they are spelled out here for clarity.
text(
  x = wage[["educ"]],
  y = wage[["wage"]],
  labels = seq_len(nrow(wage)),
  pos = 4,
  cex = 0.4
)
# Fit a simple linear regression of wage on educ and overlay its fitted
# line (intercept and slope are taken from the model's coefficients).
m1 <- lm(wage ~ educ, data = wage)
abline(reg = m1, col = "red")

# Recode the Y/N indicator columns into labelled factors. None of these
# columns enters model1, so doing the recoding first does not affect it.
wage[["female"]] <- car::recode(
  wage[["female"]],
  "'Y'='female'; 'N'='male'",
  as.factor = TRUE
)
wage[["nonwhite"]] <- car::recode(
  wage[["nonwhite"]],
  "'Y'='nonwhite'; 'N'='white'",
  as.factor = TRUE
)
wage[["married"]] <- car::recode(
  wage[["married"]],
  "'Y'='married'; 'N'='unmarried'",
  as.factor = TRUE
)

# OLS of wage on educ.
model1 <- lm(wage ~ educ, data = wage)
# OLS of wage on educ and female.
model2 <- lm(wage ~ educ + female, data = wage)
# OLS of wage on educ, female and their interaction
# (`educ * female` expands to educ + female + educ:female).
model3 <- lm(wage ~ educ * female, data = wage)
# Same as model3, with nonwhite and married added.
model4 <- lm(wage ~ educ * female + nonwhite + married, data = wage)

# Side-by-side regression table for the four models (sjPlot).
tab_model(model1, model2, model3, model4)
##   wage wage wage wage
## Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p
## (Intercept) 3.19 -1.18 – 7.56 0.151 2.57 -1.55 – 6.69 0.219 0.25 -6.86 – 7.36 0.945 1.20 -6.17 – 8.57 0.747
## educ 0.68 0.34 – 1.01 <0.001 0.62 0.30 – 0.93 <0.001 0.80 0.24 – 1.36 0.005 0.86 0.31 – 1.42 0.003
## female [male] 2.83 1.32 – 4.35 <0.001 6.30 -2.48 – 15.09 0.158 6.74 -1.97 – 15.46 0.128
## educ * female [male] -0.27 -0.95 – 0.41 0.428 -0.33 -1.01 – 0.34 0.331
## nonwhite [white] -1.02 -3.43 – 1.39 0.404
## married [unmarried] -1.43 -3.00 – 0.15 0.075
## Observations 100 100 100 100
## R2 / R2 adjusted 0.140 / 0.132 0.247 / 0.232 0.252 / 0.229 0.285 / 0.247
# Heteroskedasticity diagnostic: plot the residuals of model4 against its
# fitted values.
# Note: the variable `residuals` shares its name with the residuals()
# function; R still resolves function calls to the function.
fitted_values <- fitted.values(model4)
residuals <- resid(model4)
plot(
  x = fitted_values,
  y = residuals,
  main = "Residuals vs Fitted Values",
  xlab = "Fitted Values",
  ylab = "Residuals"
)
# Dashed red reference line at zero.
abline(h = 0, lty = 2, col = "red")