# Build a one-row table of descriptive statistics for salary and years of
# employment. Missing values are dropped from every aggregate; `count` is the
# total number of rows.
summary_stats <- summarise(
  salary_data,
  count = n(),
  mean_salary = mean(salary, na.rm = TRUE),
  median_salary = median(salary, na.rm = TRUE),
  sd_salary = sd(salary, na.rm = TRUE),
  min_salary = min(salary, na.rm = TRUE),
  max_salary = max(salary, na.rm = TRUE),
  mean_years = mean(years_empl, na.rm = TRUE),
  sd_years = sd(years_empl, na.rm = TRUE),
  min_years = min(years_empl, na.rm = TRUE),
  max_years = max(years_empl, na.rm = TRUE)
)

# Render the statistics as a captioned table.
kable(
  summary_stats,
  caption = "Descriptive statistics for salary and years of employment"
)
| count | mean_salary | median_salary | sd_salary | min_salary | max_salary | mean_years | sd_years | min_years | max_years |
|---|---|---|---|---|---|---|---|---|---|
| 200 | 122303.5 | 97496.12 | 79030.12 | 30202.92 | 331348.3 | 15.73436 | 9.035618 | 0.0071669 | 29.66675 |
# Number of employees per gender category
table(salary_data[["gender"]])
##
## Female Male
## 100 100
# Tabulate gender once, then report both counts in a single sentence.
gender_counts <- table(salary_data$gender)
cat(
  "The dataset includes", gender_counts[["Male"]],
  "male and", gender_counts[["Female"]],
  "female employees, indicating a balanced gender distribution."
)
## The dataset includes 100 male and 100 female employees, indicating a balanced gender distribution.
# Scatter plot of salary against years of employment, overlaid with a fitted
# straight line (method "lm") drawn in blue without a confidence band.
ggplot(data = salary_data, mapping = aes(x = years_empl, y = salary)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Salary vs. Years of Employment",
    x = "Years of Employment",
    y = "Salary (€)"
  )
# Add the natural log of salary as a new column, regress it on years of
# employment, and print the fit summary.
salary_data[["log_salary"]] <- log(salary_data[["salary"]])
model_log <- lm(formula = log_salary ~ years_empl, data = salary_data)
summary(model_log)
##
## Call:
## lm(formula = log_salary ~ years_empl, data = salary_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77041 -0.12197 -0.00111 0.15234 0.41044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.382774 0.027501 377.54 <2e-16 ***
## years_empl 0.070998 0.001517 46.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1933 on 198 degrees of freedom
## Multiple R-squared: 0.9171, Adjusted R-squared: 0.9167
## F-statistic: 2191 on 1 and 198 DF, p-value: < 2.2e-16
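The model regresses the natural log of salary on years of employment. Because the outcome is log-transformed, the slope is interpreted by exponentiating it: exp(slope) − 1 gives the proportional change in salary associated with one additional year of employment: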
# Slope on years_empl from the log-salary model. The response is log(salary),
# so exp(beta) - 1 is the proportional change in salary per additional year;
# it is reported as a percentage rounded to two decimals.
beta <- coef(model_log)["years_empl"]
paste(
  "Each additional year of employment increases salary by approximately",
  round((exp(beta) - 1) * 100, 2),
  "%"
)
## [1] "Each additional year of employment increases salary by approximately 7.36 %"
# Fit the same log-salary regression separately on the male and female
# subsamples, then print the summary of the male model.
# The `data = subset(...)` expressions are kept inline because summary()
# echoes them in its "Call:" section.
model_male <- lm(
  formula = log_salary ~ years_empl,
  data = subset(salary_data, gender == "Male")
)
model_female <- lm(
  formula = log_salary ~ years_empl,
  data = subset(salary_data, gender == "Female")
)
summary(model_male)
##
## Call:
## lm(formula = log_salary ~ years_empl, data = subset(salary_data,
## gender == "Male"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56063 -0.08644 0.00333 0.06960 0.38121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.380951 0.030790 337.15 <2e-16 ***
## years_empl 0.076372 0.001698 44.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.153 on 98 degrees of freedom
## Multiple R-squared: 0.9538, Adjusted R-squared: 0.9533
## F-statistic: 2023 on 1 and 98 DF, p-value: < 2.2e-16
# Print the regression summary for the model fitted on the female subsample.
summary(model_female)
##
## Call:
## lm(formula = log_salary ~ years_empl, data = subset(salary_data,
## gender == "Female"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71847 -0.07628 0.01426 0.10656 0.40887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.384598 0.036725 282.8 <2e-16 ***
## years_empl 0.065623 0.002025 32.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1825 on 98 degrees of freedom
## Multiple R-squared: 0.9146, Adjusted R-squared: 0.9138
## F-statistic: 1050 on 1 and 98 DF, p-value: < 2.2e-16