library(tidyverse) # data loading, manipulation and plotting
library(carData) # Salary dataset
library(broom) # tidy model output
library(car)
library(regclass)
library(MASS)
# load data into R
data(Salaries, package = "carData")
# six first rows of the data
head(Salaries)
## rank discipline yrs.since.phd yrs.service sex salary
## 1 Prof B 19 18 Male 139750
## 2 Prof B 20 16 Male 173200
## 3 AsstProf B 4 3 Male 79750
## 4 Prof B 45 39 Male 115000
## 5 Prof B 40 41 Male 141500
## 6 AssocProf B 6 6 Male 97000
# structure of the data
str(Salaries)
## 'data.frame': 397 obs. of 6 variables:
## $ rank : Factor w/ 3 levels "AsstProf","AssocProf",..: 3 3 1 3 3 2 3 3 3 3 ...
## $ discipline : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
## $ yrs.since.phd: int 19 20 4 45 40 6 30 45 21 18 ...
## $ yrs.service : int 18 16 3 39 41 6 23 45 20 18 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
## $ salary : int 139750 173200 79750 115000 141500 97000 175000 147765 119250 129000 ...
# dimension of the data
dim(Salaries)
## [1] 397 6
# summery of the data
summary(Salaries)
## rank discipline yrs.since.phd yrs.service sex
## AsstProf : 67 A:181 Min. : 1.00 Min. : 0.00 Female: 39
## AssocProf: 64 B:216 1st Qu.:12.00 1st Qu.: 7.00 Male :358
## Prof :266 Median :21.00 Median :16.00
## Mean :22.31 Mean :17.61
## 3rd Qu.:32.00 3rd Qu.:27.00
## Max. :56.00 Max. :60.00
## salary
## Min. : 57800
## 1st Qu.: 91000
## Median :107300
## Mean :113706
## 3rd Qu.:134185
## Max. :231545
# pre-processing (excluding missing data)
data <- na.omit(Salaries)
dim(data)
## [1] 397 6
# define qualitative variables
data$rank <- as.factor(data$rank)
data$discipline <- as.factor(data$discipline)
data$sex <- as.factor(data$sex)
# response (Salary) based on other independent variables
ggplot(data, aes(x = sex, y = salary, color = rank)) +
geom_boxplot()+
geom_point(size = 2, position = position_jitter(width = 0.2)) +
stat_summary(fun.y = mean, geom = "point", shape = 20, size = 6, color = "blue")+
theme_classic() +
facet_grid(.~rank)

ggplot(data, aes(x = sex, y = salary, color = discipline)) +
geom_boxplot()+
geom_point(size = 2, position = position_jitter(width = 0.2)) +
stat_summary(fun.y = mean, geom = "point", shape = 20, size = 6, color = "blue")+
theme_classic() +
facet_grid(.~discipline)

ggplot(data, aes(x = rank, y = salary, color = discipline)) +
geom_boxplot()+
geom_point(size = 2, position = position_jitter(width = 0.2)) +
stat_summary(fun.y = mean, geom = "point", shape = 20, size = 6, color = "blue")+
theme_classic() +
facet_grid(.~discipline)

ggplot(data, aes(x = rank, y = salary, color = yrs.since.phd)) +
geom_boxplot()+
geom_point(size = 2, position = position_jitter(width = 0.2)) +
theme_classic()

ggplot(data, aes(x = sex, y = salary, color = yrs.since.phd)) +
geom_boxplot()+
geom_point(size = 3, position = position_jitter(width = 0.2)) +
theme_classic()

ggplot(data, aes(x = sex, y = salary, color = yrs.service)) +
geom_boxplot()+
geom_point(size = 3, position = position_jitter(width = 0.2)) +
theme_classic()

# check the normality of the response variable
qqnorm(data$salary,lwd=2,col=2); qqline(data$salary,col=5,lwd=3)

shapiro.test(data$salary)
##
## Shapiro-Wilk normality test
##
## data: data$salary
## W = 0.95988, p-value = 6.076e-09
# Fitting a multiple linear regression vs sex
fit1 <- lm(salary~sex, data=data)
summary(fit1)
##
## Call:
## lm(formula = salary ~ sex, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101002 4809 21.001 < 2e-16 ***
## sexMale 14088 5065 2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
# relevel the sex variables
data$sex <- relevel(data$sex, ref = "Male")
levels(data$sex)
## [1] "Male" "Female"
# Fitting after relevel sex
fit2 <- lm(salary~sex, data=data)
summary(fit2)
##
## Call:
## lm(formula = salary ~ sex, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115090 1587 72.503 < 2e-16 ***
## sexFemale -14088 5065 -2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
# Fit a complete regression model
fit.total <- lm(salary~., data=data)
summary(fit.total)
##
## Call:
## lm(formula = salary ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65248 -13211 -1775 10384 99592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70738.7 3403.0 20.787 < 2e-16 ***
## rankAssocProf 12907.6 4145.3 3.114 0.00198 **
## rankProf 45066.0 4237.5 10.635 < 2e-16 ***
## disciplineB 14417.6 2342.9 6.154 1.88e-09 ***
## yrs.since.phd 535.1 241.0 2.220 0.02698 *
## yrs.service -489.5 211.9 -2.310 0.02143 *
## sexFemale -4783.5 3858.7 -1.240 0.21584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463
## F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
# Box-Cox transformation
library(MASS)
fit <- lm(salary~., data=data)
boxcox_model <- boxcox(fit)

# Exact lambda
lambda <- boxcox_model$x[which.max(boxcox_model$y)]
lambda
## [1] -0.8282828
# boxcox transformation
salary.transformed <- (data$salary ^ lambda - 1) / lambda
data$salary.transformed <- salary.transformed
ggplot(data, aes(x = salary.transformed)) +
geom_histogram(fill = 'darkblue', col='red',bins = 30)+
theme_classic()

# salary.transformed <- 1/sqrt(data$salary)
# Fit a model after transformation
fit.total <- lm(salary.transformed~rank+discipline+yrs.since.phd+yrs.service+sex, data=data)
summary(fit.total)
##
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.since.phd +
## yrs.service + sex, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.505e-05 -6.282e-06 -1.010e-07 7.931e-06 3.345e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.207e+00 1.762e-06 685232.047 < 2e-16 ***
## rankAssocProf 1.263e-05 2.146e-06 5.884 8.61e-09 ***
## rankProf 3.308e-05 2.194e-06 15.078 < 2e-16 ***
## disciplineB 9.221e-06 1.213e-06 7.602 2.20e-13 ***
## yrs.since.phd 1.534e-07 1.248e-07 1.229 0.2197
## yrs.service -2.499e-07 1.097e-07 -2.277 0.0233 *
## sexFemale -3.449e-06 1.998e-06 -1.727 0.0850 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.167e-05 on 390 degrees of freedom
## Multiple R-squared: 0.5655, Adjusted R-squared: 0.5588
## F-statistic: 84.59 on 6 and 390 DF, p-value: < 2.2e-16
# linearity of data
plot(fit.total, 1)

# Homogeneity of variance
plot(fit.total, 3)

# Normality of residuals
plot(fit.total, 2)

# independence of residuals
acf(fit.total$residuals)

Box.test(fit.total$residuals, type='Ljung-Box')
##
## Box-Ljung test
##
## data: fit.total$residuals
## X-squared = 3.5483, df = 1, p-value = 0.05961
# Fit Stepwise regression
step(fit.total, direction = "backward")
## Start: AIC=-9011.81
## salary.transformed ~ rank + discipline + yrs.since.phd + yrs.service +
## sex
##
## Df Sum of Sq RSS AIC
## - yrs.since.phd 1 2.0600e-10 5.3306e-08 -9012.3
## <none> 5.3100e-08 -9011.8
## - sex 1 4.0600e-10 5.3506e-08 -9010.8
## - yrs.service 1 7.0600e-10 5.3806e-08 -9008.6
## - discipline 1 7.8680e-09 6.0968e-08 -8958.9
## - rank 2 3.4341e-08 8.7441e-08 -8817.8
##
## Step: AIC=-9012.27
## salary.transformed ~ rank + discipline + yrs.service + sex
##
## Df Sum of Sq RSS AIC
## <none> 5.3306e-08 -9012.3
## - sex 1 4.0500e-10 5.3711e-08 -9011.3
## - yrs.service 1 7.5000e-10 5.4056e-08 -9008.7
## - discipline 1 7.6660e-09 6.0971e-08 -8960.9
## - rank 2 4.6369e-08 9.9674e-08 -8767.8
##
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service +
## sex, data = data)
##
## Coefficients:
## (Intercept) rankAssocProf rankProf disciplineB yrs.service
## 1.207e+00 1.310e-05 3.425e-05 8.950e-06 -1.350e-07
## sexFemale
## -3.446e-06
# Fitting the improved model
lm_backward <- lm(salary.transformed ~ rank + discipline + yrs.service + sex,
data = data)
summary(lm_backward)
##
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service +
## sex, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.494e-05 -6.409e-06 1.300e-08 7.646e-06 3.325e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.207e+00 1.673e-06 721659.146 < 2e-16 ***
## rankAssocProf 1.310e-05 2.113e-06 6.202 1.42e-09 ***
## rankProf 3.425e-05 1.977e-06 17.329 < 2e-16 ***
## disciplineB 8.950e-06 1.194e-06 7.499 4.37e-13 ***
## yrs.service -1.350e-07 5.755e-08 -2.346 0.0195 *
## sexFemale -3.446e-06 1.999e-06 -1.724 0.0855 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.168e-05 on 391 degrees of freedom
## Multiple R-squared: 0.5638, Adjusted R-squared: 0.5582
## F-statistic: 101.1 on 5 and 391 DF, p-value: < 2.2e-16
VIF(lm_backward)
## GVIF Df GVIF^(1/(2*Df))
## rank 1.597441 2 1.124233
## discipline 1.029040 1 1.014416
## yrs.service 1.627110 1 1.275582
## sex 1.030803 1 1.015284
# Fit a trend line
library(ggplot2)
ggplot(data, aes(x = yrs.service, y = salary.transformed)) +
geom_point()+
stat_smooth()+
theme_classic()

# Converting it to categorical
range(data$yrs.service)
## [1] 0 60
# Binning yrs.service variable into categories
library(tidyverse)
Salaries_mod <- data %>%
mutate(service_time_cat = case_when(
between(yrs.service, 0, 20)~"upto20",
between(yrs.service, 20, 40)~"20_40",
between(yrs.service, 40, 60)~"40_60"))
# Converting it to a factor variable
Salaries_mod$service_time_cat <- as.factor(Salaries_mod$service_time_cat)
levels(Salaries_mod$service_time_cat)
## [1] "20_40" "40_60" "upto20"
# Changing the order of the levels
Salaries_mod$service_time_cat <- relevel(Salaries_mod$service_time_cat, ref = "upto20")
levels(Salaries_mod$service_time_cat)
## [1] "upto20" "20_40" "40_60"
# Generating a count table
table(Salaries_mod$service_time_cat)
##
## upto20 20_40 40_60
## 250 126 21
# Model Fitting
fit4 <- lm(salary.transformed ~ rank + discipline + yrs.service + sex + service_time_cat, data = Salaries_mod)
summary(fit4)
##
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service +
## sex + service_time_cat, data = Salaries_mod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.300e-05 -6.668e-06 7.800e-08 7.694e-06 3.613e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.207e+00 1.685e-06 716545.175 < 2e-16 ***
## rankAssocProf 1.238e-05 2.198e-06 5.632 3.42e-08 ***
## rankProf 3.304e-05 2.181e-06 15.150 < 2e-16 ***
## disciplineB 8.883e-06 1.195e-06 7.431 6.89e-13 ***
## yrs.service -1.619e-08 1.224e-07 -0.132 0.8949
## sexFemale -3.552e-06 2.002e-06 -1.774 0.0768 .
## service_time_cat20_40 -1.734e-06 2.469e-06 -0.702 0.4829
## service_time_cat40_60 -6.565e-06 5.016e-06 -1.309 0.1914
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.168e-05 on 389 degrees of freedom
## Multiple R-squared: 0.5659, Adjusted R-squared: 0.5581
## F-statistic: 72.46 on 7 and 389 DF, p-value: < 2.2e-16
# checking the collinearity
VIF(fit4)
## GVIF Df GVIF^(1/(2*Df))
## rank 1.958284 2 1.182957
## discipline 1.031846 1 1.015798
## yrs.service 7.362341 1 2.713363
## sex 1.033842 1 1.016780
## service_time_cat 5.763038 2 1.549398
# Performing a stepwise regression
step(fit4, direction = "backward")
## Start: AIC=-9010.23
## salary.transformed ~ rank + discipline + yrs.service + sex +
## service_time_cat
##
## Df Sum of Sq RSS AIC
## - service_time_cat 2 2.6200e-10 5.3306e-08 -9012.3
## - yrs.service 1 2.0000e-12 5.3046e-08 -9012.2
## <none> 5.3044e-08 -9010.2
## - sex 1 4.2900e-10 5.3473e-08 -9009.0
## - discipline 1 7.5300e-09 6.0574e-08 -8959.5
## - rank 2 3.6343e-08 8.9387e-08 -8807.1
##
## Step: AIC=-9012.27
## salary.transformed ~ rank + discipline + yrs.service + sex
##
## Df Sum of Sq RSS AIC
## <none> 5.3306e-08 -9012.3
## - sex 1 4.0500e-10 5.3711e-08 -9011.3
## - yrs.service 1 7.5000e-10 5.4056e-08 -9008.7
## - discipline 1 7.6660e-09 6.0971e-08 -8960.9
## - rank 2 4.6369e-08 9.9674e-08 -8767.8
##
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service +
## sex, data = Salaries_mod)
##
## Coefficients:
## (Intercept) rankAssocProf rankProf disciplineB yrs.service
## 1.207e+00 1.310e-05 3.425e-05 8.950e-06 -1.350e-07
## sexFemale
## -3.446e-06
# Fitting final model
fit5 <- lm(formula = salary ~ rank + discipline + yrs.service + sex,
data = Salaries_mod)
summary(fit5)
##
## Call:
## lm(formula = salary ~ rank + discipline + yrs.service + sex,
## data = Salaries_mod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64202 -14255 -1533 10571 99163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73122.92 3245.27 22.532 < 2e-16 ***
## rankAssocProf 14560.40 4098.32 3.553 0.000428 ***
## rankProf 49159.64 3834.49 12.820 < 2e-16 ***
## disciplineB 13473.38 2315.50 5.819 1.24e-08 ***
## yrs.service -88.78 111.64 -0.795 0.426958
## sexFemale -4771.25 3878.00 -1.230 0.219311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22650 on 391 degrees of freedom
## Multiple R-squared: 0.4478, Adjusted R-squared: 0.4407
## F-statistic: 63.41 on 5 and 391 DF, p-value: < 2.2e-16
library(car)
car::qqPlot(rstandard(fit5))

## [1] 44 365
shapiro.test(fit5$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit5$residuals
## W = 0.96073, p-value = 8.202e-09
# checking the collinearity
VIF(fit5)
## GVIF Df GVIF^(1/(2*Df))
## rank 1.597441 2 1.124233
## discipline 1.029040 1 1.014416
## yrs.service 1.627110 1 1.275582
## sex 1.030803 1 1.015284
# Making result tidy
tidy(fit5)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 73123. 3245. 22.5 1.17e-72
## 2 rankAssocProf 14560. 4098. 3.55 4.28e- 4
## 3 rankProf 49160. 3834. 12.8 1.18e-31
## 4 disciplineB 13473. 2315. 5.82 1.24e- 8
## 5 yrs.service -88.8 112. -0.795 4.27e- 1
## 6 sexFemale -4771. 3878. -1.23 2.19e- 1
# ANOVA analysis
anova(fit5)
## Analysis of Variance Table
##
## Response: salary
## Df Sum Sq Mean Sq F value Pr(>F)
## rank 2 1.4323e+11 7.1616e+10 139.5761 < 2.2e-16 ***
## discipline 1 1.8430e+10 1.8430e+10 35.9191 4.675e-09 ***
## yrs.service 1 2.4187e+08 2.4187e+08 0.4714 0.4928
## sex 1 7.7669e+08 7.7669e+08 1.5137 0.2193
## Residuals 391 2.0062e+11 5.1310e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1