Descriptive Statistics

# One-row overview of the data: the number of rows, then centre, spread and
# range of salary, followed by mean, spread and range of years of employment.
# Each statistic drops missing values itself via na.rm = TRUE; n() counts rows.
summary_stats <- summarise(
  salary_data,
  count = n(),
  mean_salary = mean(salary, na.rm = TRUE),
  median_salary = median(salary, na.rm = TRUE),
  sd_salary = sd(salary, na.rm = TRUE),
  min_salary = min(salary, na.rm = TRUE),
  max_salary = max(salary, na.rm = TRUE),
  mean_years = mean(years_empl, na.rm = TRUE),
  sd_years = sd(years_empl, na.rm = TRUE),
  min_years = min(years_empl, na.rm = TRUE),
  max_years = max(years_empl, na.rm = TRUE)
)

# Render the summary as a captioned table in the report.
kable(
  x = summary_stats,
  caption = "Descriptive statistics for salary and years of employment"
)
Descriptive statistics for salary and years of employment
count mean_salary median_salary sd_salary min_salary max_salary mean_years sd_years min_years max_years
200 122303.5 97496.12 79030.12 30202.92 331348.3 15.73436 9.035618 0.0071669 29.66675
# Frequency of each gender category in the data
table(salary_data[["gender"]])
## 
## Female   Male 
##    100    100
# Report the gender split as a sentence.
# The counts are tabulated once and reused instead of calling table() twice,
# and the trailing "\n" terminates the line so that later console output does
# not run on from the end of this sentence (cat() adds no newline itself).
# NOTE(review): the "balanced" conclusion is hard-coded text, not derived from
# the counts -- re-check the wording if the data changes.
gender_counts <- table(salary_data$gender)
cat(
  "The dataset includes", gender_counts[["Male"]], "male and",
  gender_counts[["Female"]],
  "female employees, indicating a balanced gender distribution.\n"
)
## The dataset includes 100 male and 100 female employees, indicating a balanced gender distribution.

Scatterplot: Salary vs. Years of Employment

# Salary plotted against years of employment: semi-transparent points with a
# straight least-squares line drawn on top (no confidence band).
ggplot(data = salary_data, mapping = aes(x = years_empl, y = salary)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Salary vs. Years of Employment",
    x = "Years of Employment",
    y = "Salary (€)"
  )

Model: Log-Transformed Salary

# Add the natural log of salary as a new column; it is the response variable
# of the regression fitted below.
salary_data[["log_salary"]] <- log(salary_data[["salary"]])

# Simple linear regression of log salary on years of employment, followed by
# the full coefficient and fit summary.
model_log <- lm(formula = log_salary ~ years_empl, data = salary_data)
summary(model_log)
## 
## Call:
## lm(formula = log_salary ~ years_empl, data = salary_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.77041 -0.12197 -0.00111  0.15234  0.41044 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.382774   0.027501  377.54   <2e-16 ***
## years_empl   0.070998   0.001517   46.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1933 on 198 degrees of freedom
## Multiple R-squared:  0.9171, Adjusted R-squared:  0.9167 
## F-statistic:  2191 on 1 and 198 DF,  p-value: < 2.2e-16

Interpretation

The model estimates the natural log of salary based on years of employment. To interpret the slope:

  • If the slope is β, then each additional year of employment multiplies the predicted salary by exp(β), i.e. changes it by (exp(β) - 1) * 100%; for small β this is approximately β * 100%.
# Slope of the log-salary model, selected by coefficient name; it is converted
# from the log scale to a percentage change in salary per additional year.
beta <- coef(model_log)["years_empl"]
paste(
  "Each additional year of employment increases salary by approximately",
  round(100 * (exp(beta) - 1), 2),
  "%"
)
## [1] "Each additional year of employment increases salary by approximately 7.36 %"

Gender-Specific Models

# Fit the same log-salary regression separately for each gender.
# The subset() expressions are kept inline because they are echoed verbatim in
# the "Call:" section of each printed model summary.
model_male <- lm(
  formula = log_salary ~ years_empl,
  data = subset(salary_data, gender == "Male")
)
model_female <- lm(
  formula = log_salary ~ years_empl,
  data = subset(salary_data, gender == "Female")
)

# Coefficients and fit statistics for the male-only model
summary(model_male)
## 
## Call:
## lm(formula = log_salary ~ years_empl, data = subset(salary_data, 
##     gender == "Male"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56063 -0.08644  0.00333  0.06960  0.38121 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.380951   0.030790  337.15   <2e-16 ***
## years_empl   0.076372   0.001698   44.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.153 on 98 degrees of freedom
## Multiple R-squared:  0.9538, Adjusted R-squared:  0.9533 
## F-statistic:  2023 on 1 and 98 DF,  p-value: < 2.2e-16
# Coefficients and fit statistics for the female-only model, for comparison
# with the male-only summary printed above.
summary(model_female)
## 
## Call:
## lm(formula = log_salary ~ years_empl, data = subset(salary_data, 
##     gender == "Female"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71847 -0.07628  0.01426  0.10656  0.40887 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.384598   0.036725   282.8   <2e-16 ***
## years_empl   0.065623   0.002025    32.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1825 on 98 degrees of freedom
## Multiple R-squared:  0.9146, Adjusted R-squared:  0.9138 
## F-statistic:  1050 on 1 and 98 DF,  p-value: < 2.2e-16