# Read in the data
ddi <- read_ipums_ddi("/Users/xiaoling/Desktop/ecp3203/Assignments /Assignment_4/cps_00010.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.
# Change column names to all lowercase 
colnames(data) <- tolower(colnames(data))
# Sample to use in the analysis 

# Filter the data using filter() function 
filtered_data <- data %>%
  filter(age >= 21 & age <= 64,  # Workers aged 21-64
         incwage > 0 & incwage < 99999998,  # Having valid values for annual wage and salary income
         wkswork2 > 1 & wkswork2 < 6, # Weeks worked greater than 1 and less than 6 
         asecwt >= 0) # Exclude persons who have a negative sampling weight 
# Create variables
filtered_data <- filtered_data %>%
  mutate(
    schooling = case_when (educ == 20 ~ 5.5, 
                           TRUE ~ educ), # Code 20 corresponds to 5 or 6 years of schooling
    weeks_worked = case_when (wkswork2 == 4 ~ 43.5, 
                              TRUE ~ wkswork2), # Code 4 represents 40-47 weeks worked 
    lwkearnings = log(incwage / weeks_worked), # Log of weekly earnings 
    experience = age - schooling - 6 # Years of labor market experience 
  )
# Statistical analysis

# 1.Estimate the Mincer earning function separately for men and women.
library(broom)

# Create a squared experience variable
filtered_data <- filtered_data %>%
  mutate(
    experience_squared = experience^2
  )

# Separate the data for men and women
data_men <- filtered_data %>%
  filter(sex == 1)

data_women <- filtered_data %>%
  filter(sex == 2)

# Fit the model for men
model_men <- lm(lwkearnings ~ schooling + experience + experience_squared, data = data_men)
summary_men <- summary(model_men)
tidy_men <- tidy(model_men)

# Fit the model for women
model_women <- lm(lwkearnings ~ schooling + experience + experience_squared, data = data_women)
summary_women <- summary(model_women)
tidy_women <- tidy(model_women)

# Print the results
print("Regression results for men:")
## [1] "Regression results for men:"
print(summary_men)
## 
## Call:
## lm(formula = lwkearnings ~ schooling + experience + experience_squared, 
##     data = data_men)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9009  -1.0670   0.2252   1.0912  19.5114 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.993e+00  1.029e-02  582.46   <2e-16 ***
## schooling           2.848e-02  2.336e-04  121.92   <2e-16 ***
## experience          1.567e-02  2.199e-04   71.24   <2e-16 ***
## experience_squared -3.227e-05  1.249e-06  -25.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.455 on 327773 degrees of freedom
## Multiple R-squared:  0.05422,    Adjusted R-squared:  0.05421 
## F-statistic:  6264 on 3 and 327773 DF,  p-value: < 2.2e-16
print(tidy_men)
## # A tibble: 4 × 5
##   term                 estimate  std.error statistic   p.value
##   <chr>                   <dbl>      <dbl>     <dbl>     <dbl>
## 1 (Intercept)         5.99      0.0103         582.  0        
## 2 schooling           0.0285    0.000234       122.  0        
## 3 experience          0.0157    0.000220        71.2 0        
## 4 experience_squared -0.0000323 0.00000125     -25.8 5.39e-147
print("Regression results for women:")
## [1] "Regression results for women:"
print(summary_women)
## 
## Call:
## lm(formula = lwkearnings ~ schooling + experience + experience_squared, 
##     data = data_women)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3494  -0.9933   0.1572   1.0838   5.8929 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.632e+00  1.040e-02 541.662  < 2e-16 ***
## schooling          1.939e-02  2.201e-04  88.065  < 2e-16 ***
## experience         6.797e-03  2.477e-04  27.444  < 2e-16 ***
## experience_squared 1.523e-05  1.958e-06   7.778  7.4e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.464 on 407453 degrees of freedom
## Multiple R-squared:  0.05423,    Adjusted R-squared:  0.05422 
## F-statistic:  7788 on 3 and 407453 DF,  p-value: < 2.2e-16
print(tidy_women)
## # A tibble: 4 × 5
##   term                estimate  std.error statistic   p.value
##   <chr>                  <dbl>      <dbl>     <dbl>     <dbl>
## 1 (Intercept)        5.63      0.0104        542.   0        
## 2 schooling          0.0194    0.000220       88.1  0        
## 3 experience         0.00680   0.000248       27.4  1.15e-165
## 4 experience_squared 0.0000152 0.00000196      7.78 7.40e- 15
# 2. Retrieve the coefficient of the education variable as the rate of return to schooling
# for men and women 

# For men
schooling_coef_men <- coef(model_men)["schooling"]
# Convert log points to percentage 
percentage_change_men <- (exp(schooling_coef_men) - 1) * 100

# For women
schooling_coef_women <- coef(model_women)["schooling"]
# Convert log points to percentage 
percentage_change_women <- (exp(schooling_coef_women) - 1) * 100

# Print the report
cat("
# Report on the Rate of Return to Schooling for Men and Women

This report presents the results of an analysis on the rate of return to schooling for men and women. The analysis was conducted using a Mincer earnings function, with log weekly earnings as the dependent variable and schooling, experience, and experience squared as the independent variables.

## Results

The rate of return to schooling was calculated for both men and women. The results are as follows:

### Men

The coefficient of the schooling variable for men is ", schooling_coef_men, ". This represents the rate of return to schooling for men. When converted to a percentage, the rate of return to schooling for men is approximately ", round(percentage_change_men, 2), "%.

### Women

The coefficient of the schooling variable for women is ", schooling_coef_women, ". This represents the rate of return to schooling for women. When converted to a percentage, the rate of return to schooling for women is approximately ", round(percentage_change_women, 2), "%.

## Conclusion

These results provide valuable insights into the rate of return to schooling for men and women. Further analysis could explore the factors contributing to these rates and how they change over time.
")
## 
## # Report on the Rate of Return to Schooling for Men and Women
## 
## This report presents the results of an analysis on the rate of return to schooling for men and women. The analysis was conducted using a Mincer earnings function, with log weekly earnings as the dependent variable and schooling, experience, and experience squared as the independent variables.
## 
## ## Results
## 
## The rate of return to schooling was calculated for both men and women. The results are as follows:
## 
## ### Men
## 
## The coefficient of the schooling variable for men is  0.02848117 . This represents the rate of return to schooling for men. When converted to a percentage, the rate of return to schooling for men is approximately  2.89 %.
## 
## ### Women
## 
## The coefficient of the schooling variable for women is  0.01938579 . This represents the rate of return to schooling for women. When converted to a percentage, the rate of return to schooling for women is approximately  1.96 %.
## 
## ## Conclusion
## 
## These results provide valuable insights into the rate of return to schooling for men and women. Further analysis could explore the factors contributing to these rates and how they change over time.
# 3. Calculate the slope of the age-earnings profile at 10 years of experience for men and women.

# For men
experience_coef_men <- coef(model_men)["experience"]
experience_squared_coef_men <- coef(model_men)["experience_squared"]
slope_men <- experience_coef_men + 2 * 10 * experience_squared_coef_men

# For women
experience_coef_women <- coef(model_women)["experience"]
experience_squared_coef_women <- coef(model_women)["experience_squared"]
slope_women <- experience_coef_women + 2 * 10 * experience_squared_coef_women

# Print the results
cat("The slope of the age-earnings profile at 10 years of experience is:\n")
## The slope of the age-earnings profile at 10 years of experience is:
cat("\nFor men: ", round(slope_men, 6), "\n")
## 
## For men:  0.015022
cat("This means that, for men, each additional year of experience beyond 10 years is associated with a", round(slope_men * 100, 2), "% increase in weekly earnings, holding all other variables constant.\n")
## This means that, for men, each additional year of experience beyond 10 years is associated with a 1.5 % increase in weekly earnings, holding all other variables constant.
cat("\nFor women: ", round(slope_women, 6), "\n")
## 
## For women:  0.007101
cat("This means that, for women, each additional year of experience beyond 10 years is associated with a", round(slope_women * 100, 2), "% increase in weekly earnings, holding all other variables constant.\n")
## This means that, for women, each additional year of experience beyond 10 years is associated with a 0.71 % increase in weekly earnings, holding all other variables constant.
# 4. Produce a diagram illustrating the trends in the rate of return to schooling. 

# Initialize an empty data frame to store the results
results <- data.frame()

# Get the unique years in your data
years <- unique(filtered_data$year)

# Loop over the years
for (year in years) {
  # Filter the data for the current year
  data_year <- filtered_data %>%
    filter(year == !!year)
  
  # Loop over the genders
  for (gender in c(1, 2)) {
    # Filter the data for the current gender
    data_gender <- data_year %>%
      filter(sex == !!gender)
    
    # Fit the model
    model <- lm(lwkearnings ~ schooling + experience + experience_squared, data = data_gender)
    
    # Extract the schooling coefficient
    schooling_coef <- coef(model)["schooling"]
    
    # Store the results
    results <- rbind(results, data.frame(year = year, gender = gender, schooling_coef = schooling_coef))
  }
}

# Convert the gender variable to a factor
results$gender <- factor(results$gender, levels = c(1, 2), labels = c("Men", "Women"))

# Plot the results
ggplot(results, aes(x = year, y = schooling_coef, color = gender)) +
  geom_line() +
  labs(x = "Year of the Survey", y = "Rate of Return to Schooling", color = "Gender") +
  theme_minimal()

# Initialize an empty data frame to store the results
results_exp <- data.frame()

# Loop over the years
for (year in years) {
  # Filter the data for the current year
  data_year <- filtered_data %>%
    filter(year == !!year)
  
  # Loop over the genders
  for (gender in c(1, 2)) {
    # Filter the data for the current gender
    data_gender <- data_year %>%
      filter(sex == !!gender)
    
    # Fit the model
    model <- lm(lwkearnings ~ schooling + experience + experience_squared, data = data_gender)
    
    # Extract the experience coefficient
    experience_coef <- coef(model)["experience"]
    
    # Store the results
    results_exp <- rbind(results_exp, data.frame(year = year, gender = gender, experience_coef = experience_coef))
  }
}

# Convert the gender variable to a factor
results_exp$gender <- factor(results_exp$gender, levels = c(1, 2), labels = c("Men", "Women"))

# Plot the results
ggplot(results_exp, aes(x = year, y = experience_coef, color = gender)) +
  geom_line() +
  labs(x = "Year of the Survey", y = "Rate of Return to Experience", color = "Gender") +
  theme_minimal()