1. Sample description

This dataset contains salary information (€), years of employment, and gender of public service employees.

# Data preparation ----
# Strip leading/trailing whitespace from the column names so the columns can
# be addressed by their plain names.
names(df) <- trimws(names(df))

# Coerce the two measurement columns to numeric in one step.
numeric_cols <- c("salary", "years")
df[numeric_cols] <- lapply(df[numeric_cols], as.numeric)

# Gender is categorical, so store it as a factor.
df[["gender"]] <- as.factor(df[["gender"]])

# Sample size: number of rows (observations) in the data set
NROW(df)
## [1] 200
# Number of observations per gender category
table(df[["gender"]])
## 
## Female   Male 
##    100    100
# Descriptive statistics for salary and years ----
# na.rm = TRUE is passed everywhere: the as.numeric() coercion of these
# columns can turn unparseable entries into NA, and a single NA would
# otherwise make the mean/sd NA. This also matches the group means computed
# further down, which already use na.rm = TRUE.

# Means
mean_salary <- mean(df$salary, na.rm = TRUE)
mean_years <- mean(df$years, na.rm = TRUE)

# Standard deviations
sd_salary <- sd(df$salary, na.rm = TRUE)
sd_years <- sd(df$years, na.rm = TRUE)

# Five-number summary plus mean of salary
summary(df$salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30203   54208   97496  122304  179447  331348
# Mean salary overall and by gender ----
# Each mean ignores missing salaries (na.rm = TRUE).
overall_mean <- mean(df$salary, na.rm = TRUE)
female_mean <- mean(df$salary[df$gender == "Female"], na.rm = TRUE)
male_mean <- mean(df$salary[df$gender == "Male"], na.rm = TRUE)

# Collect the three means in one table for display.
means_df <- data.frame(
  Group = c("Overall", "Female", "Male"),
  Mean_Salary = c(overall_mean, female_mean, male_mean)
)

print(means_df)
##     Group Mean_Salary
## 1 Overall    122303.5
## 2  Female    109140.8
## 3    Male    135466.1


2. Association between years and salary as scatterplot.

The scatterplot below shows the relationship between years of employment and salary. There is a positive association: as the number of years increases, salary also tends to increase. The pattern is non-linear, with salary rising faster at higher experience levels (an upward-curving, roughly exponential shape), so a straight line fits the data only approximately.

# Base-graphics scatterplot: years of employment (independent variable) on
# the x-axis, salary (dependent variable) on the y-axis.
plot(df$years, df$salary)

# Overlay the least-squares line of salary on years as a thick pink line.
abline(reg = lm(salary ~ years, data = df), lwd = 2, col = "pink")

# Gender-specific log-linear salary models ----
# ggplot2 is used for the fitted-curve plot further down.
library(ggplot2)

# Grid of 100 evenly spaced values spanning the observed range of years
# (missing values ignored); used as x positions for the fitted curves.
years_range <- range(df$years, na.rm = TRUE)
years_empl <- seq(years_range[1], years_range[2], length.out = 100)

# Regress the natural log of salary on years, male employees only.
log_model_male <- lm(log(salary) ~ years, data = df[df$gender=="Male",])
summary(log_model_male)
## 
## Call:
## lm(formula = log(salary) ~ years, data = df[df$gender == "Male", 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56063 -0.08644  0.00333  0.06960  0.38121 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.380951   0.030790  337.15   <2e-16 ***
## years        0.076372   0.001698   44.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.153 on 98 degrees of freedom
## Multiple R-squared:  0.9538, Adjusted R-squared:  0.9533 
## F-statistic:  2023 on 1 and 98 DF,  p-value: < 2.2e-16
# Same log-linear specification as for the male subset, fitted to the
# female employees only.
log_model_female <- lm(log(salary) ~ years, data = df[df$gender=="Female",])
summary(log_model_female)
## 
## Call:
## lm(formula = log(salary) ~ years, data = df[df$gender == "Female", 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71847 -0.07628  0.01426  0.10656  0.40887 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.384598   0.036725   282.8   <2e-16 ***
## years        0.065623   0.002025    32.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1825 on 98 degrees of freedom
## Multiple R-squared:  0.9146, Adjusted R-squared:  0.9138 
## F-statistic:  1050 on 1 and 98 DF,  p-value: < 2.2e-16
# Scatterplot by gender with the fitted log-linear curves ----
# The models were fitted on the natural log of salary, so predictions are
# mapped back to the salary scale with exp() (not a power of 2).
# The curves live in their own data frame: the grid in years_empl has a
# different length than df, so it cannot be used as an aesthetic evaluated
# against df.
fitted_curves <- data.frame(
  years = rep(years_empl, times = 2),
  gender = factor(
    rep(c("Male", "Female"), each = length(years_empl)),
    levels = levels(df$gender)
  ),
  salary = c(
    exp(coef(log_model_male)[[1]] + coef(log_model_male)[[2]] * years_empl),
    exp(coef(log_model_female)[[1]] + coef(log_model_female)[[2]] * years_empl)
  )
)

# Points show the observed data; dashed lines show the back-transformed
# fits. Colours follow the original choice (male steelblue, female
# darkorange).
p <- ggplot(df, aes(x = years, y = salary, color = gender)) +
  geom_point(alpha = 0.6) +
  geom_line(data = fitted_curves, linetype = "longdash") +
  scale_color_manual(values = c(Male = "steelblue", Female = "darkorange"))
print(p)


3. Estimate salary by years of employment

We observe a non-linear relationship between salary and years of employment. To linearize the association, we apply a logarithmic transformation to the salary variable and fit a linear regression model.

# Pooled log-linear model ----
# Response: natural log of salary, stored as its own column.
df[["log_salary"]] <- log(df[["salary"]])

# Simple linear regression of log salary on years, all employees together.
model <- lm(log_salary ~ years, data = df)

# In-sample predictions on the log scale, then mapped back to the original
# salary scale with exp().
df[["log_salary_pred"]] <- predict(model)
df[["salary_pred"]] <- exp(df[["log_salary_pred"]])

summary(model)
## 
## Call:
## lm(formula = log_salary ~ years, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.77041 -0.12197 -0.00111  0.15234  0.41044 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.382774   0.027501  377.54   <2e-16 ***
## years        0.070998   0.001517   46.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1933 on 198 degrees of freedom
## Multiple R-squared:  0.9171, Adjusted R-squared:  0.9167 
## F-statistic:  2191 on 1 and 198 DF,  p-value: < 2.2e-16


4. Interpretation

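The regression model estimates the relationship between years of employment and the natural logarithm of salary. The coefficient for years is 0.071. Because the outcome is log-transformed, each additional year of employment is associated with an approximately 7.4% higher salary (exp(0.071) ≈ 1.074), so salary grows exponentially rather than linearly with experience. The relationship is statistically significant (p < 2e-16), and the model explains about 92% of the variance in log salary (R² = 0.917), suggesting that years of employment is an important predictor of salary.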