# Add additional packages you need
library(tidyverse)
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'hms'
library(psych)
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.1.2
library(interactions)
library(gridExtra)
#install.packages("sjPlot")
library(sjPlot)
# Use R to import the data file `salary.txt`.
salary_dat <- read.table("salary.txt", header=TRUE)
Example: \[\text{salary}_i = \beta_0 + \beta_1 \text{sex}_i + \beta_2 \text{time}_i + e_i\] \[e_i \sim N(0, \sigma)\]
# Run regression using lm()
salary_regression = lm(salary ~ sex + time, data = salary_dat)
summary(salary_regression)
##
## Call:
## lm(formula = salary ~ sex + time, data = salary_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15139.0 -5351.9 -464.2 4289.7 24117.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46349.0 2237.8 20.712 < 2e-16 ***
## sex -1493.8 2043.6 -0.731 0.468
## time 1342.7 238.8 5.623 5.41e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7801 on 59 degrees of freedom
## Multiple R-squared: 0.3752, Adjusted R-squared: 0.354
## F-statistic: 17.71 on 2 and 59 DF, p-value: 9.429e-07
# Summarize your regression coefficients in a table.
msummary(salary_regression)
| Model 1 | |
|---|---|
| (Intercept) | 46348.999 |
| (2237.773) | |
| sex | −1493.783 |
| (2043.559) | |
| time | 1342.687 |
| (238.793) | |
| Num.Obs. | 62 |
| R2 | 0.375 |
| R2 Adj. | 0.354 |
| AIC | 1292.2 |
| BIC | 1300.7 |
| Log.Lik. | −642.081 |
| F | 17.715 |
| RMSE | 7801.04 |
# Insert code for plotting the slope. You can use the
# `sjPlot::plot_model(..., show.data = TRUE)` function.
sjPlot::plot_model(salary_regression, type = "pred", show.data = TRUE)
## $sex
##
## $time
Based on the model, for two people with the same level of time, the person with one unit higher on sex is expected to report a higher salary by $1,342.70.
[The only change you need to change should be m3 in the following code.]
# Simulate 100 data sets based on your model results by modifying the following
# codes
your_model <- salary_regression # change `m3` to the object name of your regression model
# Set the seed so that you always get the same simulated data set
set.seed(1331)
# No need to change the following
num_simulations <- 1000
# Simulate new salary variables
simulated_salary100 <- simulate(your_model, nsim = num_simulations)
# Placeholders for storing simulation results
simulated_slopes <- matrix(nrow = num_simulations, ncol = 2,
dimnames = list(NULL,
c("slopes_x1", "slopes_x2")))
# Run regression 100 times, each time using a different simulated salary outcome
for (i in 1:num_simulations) {
reg_model_sim <- update(your_model,
simulated_salary100[ , i] ~ .)
# Store slope results
simulated_slopes[i, ] <- coef(reg_model_sim)[2:3]
}
# Plot the sampling distributions
as_tibble(simulated_slopes) %>%
pivot_longer(1:2, names_to = "predictor",
values_to = "slopes",
names_prefix = "slopes_") %>%
ggplot(aes(x = slopes)) +
geom_histogram() +
facet_wrap(~ predictor, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histograms both had very messy distributions at 100 permutations, but when changing it from 100 to 1,000 allowed the histograms to better converge to a normal distribution.
# Include interaction in lm()
salary_dat <- salary_dat %>%
mutate(time_c = time - mean(time),
sex_c = sex - mean(sex))
cor(select(salary_dat, time, time_c, sex, sex_c))
## time time_c sex sex_c
## time 1.0000000 1.0000000 -0.2095909 -0.2095909
## time_c 1.0000000 1.0000000 -0.2095909 -0.2095909
## sex -0.2095909 -0.2095909 1.0000000 1.0000000
## sex_c -0.2095909 -0.2095909 1.0000000 1.0000000
mra3 <- lm(salary ~ time * sex, salary_dat)
mra3_c <- lm(salary ~ time_c * sex_c, salary_dat)
summary(mra3)
##
## Call:
## lm(formula = salary ~ time * sex, data = salary_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14396.5 -5596.2 -47.5 4513.7 23037.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45150.0 2406.3 18.763 < 2e-16 ***
## time 1501.0 266.5 5.632 5.46e-07 ***
## sex 3215.3 4137.0 0.777 0.440
## time:sex -765.9 586.1 -1.307 0.196
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7755 on 58 degrees of freedom
## Multiple R-squared: 0.3931, Adjusted R-squared: 0.3617
## F-statistic: 12.52 on 3 and 58 DF, p-value: 2.034e-06
summary(mra3_c)
##
## Call:
## lm(formula = salary ~ time_c * sex_c, data = salary_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14396.5 -5596.2 -47.5 4513.7 23037.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54478.0 1018.2 53.505 < 2e-16 ***
## time_c 1167.5 272.6 4.283 7.03e-05 ***
## sex_c -1985.2 2065.9 -0.961 0.341
## time_c:sex_c -765.9 586.1 -1.307 0.196
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7755 on 58 degrees of freedom
## Multiple R-squared: 0.3931, Adjusted R-squared: 0.3617
## F-statistic: 12.52 on 3 and 58 DF, p-value: 2.034e-06
# Interaction plot
interact_plot(mra3_c, pred=time_c, modx = sex_c)
interact_plot(mra3_c, pred=sex_c, modx = time_c)