library(tidyverse)  # data loading, manipulation and plotting
library(carData)    # Salary dataset
library(broom)      # tidy model output
library(car)
library(regclass)
library(MASS)

# load data into R
data(Salaries, package = "carData")
# six first rows of the data
head(Salaries)
##        rank discipline yrs.since.phd yrs.service  sex salary
## 1      Prof          B            19          18 Male 139750
## 2      Prof          B            20          16 Male 173200
## 3  AsstProf          B             4           3 Male  79750
## 4      Prof          B            45          39 Male 115000
## 5      Prof          B            40          41 Male 141500
## 6 AssocProf          B             6           6 Male  97000
# structure of the data
str(Salaries)
## 'data.frame':    397 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AsstProf","AssocProf",..: 3 3 1 3 3 2 3 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
##  $ yrs.since.phd: int  19 20 4 45 40 6 30 45 21 18 ...
##  $ yrs.service  : int  18 16 3 39 41 6 23 45 20 18 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ salary       : int  139750 173200 79750 115000 141500 97000 175000 147765 119250 129000 ...
# dimension of the data
dim(Salaries)
## [1] 397   6
# summery of the data
summary(Salaries)
##         rank     discipline yrs.since.phd    yrs.service        sex     
##  AsstProf : 67   A:181      Min.   : 1.00   Min.   : 0.00   Female: 39  
##  AssocProf: 64   B:216      1st Qu.:12.00   1st Qu.: 7.00   Male  :358  
##  Prof     :266              Median :21.00   Median :16.00               
##                             Mean   :22.31   Mean   :17.61               
##                             3rd Qu.:32.00   3rd Qu.:27.00               
##                             Max.   :56.00   Max.   :60.00               
##      salary      
##  Min.   : 57800  
##  1st Qu.: 91000  
##  Median :107300  
##  Mean   :113706  
##  3rd Qu.:134185  
##  Max.   :231545
# pre-processing (excluding missing data)
data <- na.omit(Salaries)
dim(data)
## [1] 397   6
# define qualitative variables
data$rank <- as.factor(data$rank)
data$discipline <- as.factor(data$discipline)
data$sex <- as.factor(data$sex)


# response (Salary) based on other independent variables
ggplot(data, aes(x = sex, y = salary, color = rank)) +
  geom_boxplot()+
  geom_point(size = 2, position = position_jitter(width = 0.2)) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 6, color = "blue")+
  theme_classic() +
  facet_grid(.~rank)

ggplot(data, aes(x = sex, y = salary, color = discipline)) +
  geom_boxplot()+
  geom_point(size = 2, position = position_jitter(width = 0.2)) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 6, color = "blue")+
  theme_classic() +
  facet_grid(.~discipline)

ggplot(data, aes(x = rank, y = salary, color = discipline)) +
  geom_boxplot()+
  geom_point(size = 2, position = position_jitter(width = 0.2)) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 6, color = "blue")+
  theme_classic() +
  facet_grid(.~discipline)

ggplot(data, aes(x = rank, y = salary, color = yrs.since.phd)) +
  geom_boxplot()+
  geom_point(size = 2, position = position_jitter(width = 0.2)) +
  theme_classic()

ggplot(data, aes(x = sex, y = salary, color = yrs.since.phd)) +
  geom_boxplot()+
  geom_point(size = 3, position = position_jitter(width = 0.2)) +
  theme_classic()

ggplot(data, aes(x = sex, y = salary, color = yrs.service)) +
  geom_boxplot()+
  geom_point(size = 3, position = position_jitter(width = 0.2)) +
  theme_classic()

# check the normality of the response variable
qqnorm(data$salary,lwd=2,col=2); qqline(data$salary,col=5,lwd=3)

shapiro.test(data$salary)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$salary
## W = 0.95988, p-value = 6.076e-09
# Fitting a multiple linear regression vs sex
fit1 <- lm(salary~sex, data=data)
summary(fit1)
## 
## Call:
## lm(formula = salary ~ sex, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101002       4809  21.001  < 2e-16 ***
## sexMale        14088       5065   2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667
# relevel the sex variables
data$sex <- relevel(data$sex, ref = "Male")
levels(data$sex)
## [1] "Male"   "Female"
# Fitting after relevel sex
fit2 <- lm(salary~sex, data=data)
summary(fit2)
## 
## Call:
## lm(formula = salary ~ sex, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   115090       1587  72.503  < 2e-16 ***
## sexFemale     -14088       5065  -2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667
# Fit a complete regression model
fit.total <- lm(salary~., data=data)
summary(fit.total)
## 
## Call:
## lm(formula = salary ~ ., data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    70738.7     3403.0  20.787  < 2e-16 ***
## rankAssocProf  12907.6     4145.3   3.114  0.00198 ** 
## rankProf       45066.0     4237.5  10.635  < 2e-16 ***
## disciplineB    14417.6     2342.9   6.154 1.88e-09 ***
## yrs.since.phd    535.1      241.0   2.220  0.02698 *  
## yrs.service     -489.5      211.9  -2.310  0.02143 *  
## sexFemale      -4783.5     3858.7  -1.240  0.21584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
# Box-Cox transformation
library(MASS)
fit <- lm(salary~., data=data)
boxcox_model <- boxcox(fit)

# Exact lambda
lambda <- boxcox_model$x[which.max(boxcox_model$y)]
lambda
## [1] -0.8282828
# boxcox transformation
salary.transformed <- (data$salary ^ lambda - 1) / lambda
data$salary.transformed <- salary.transformed
ggplot(data, aes(x = salary.transformed)) +
  geom_histogram(fill = 'darkblue', col='red',bins = 30)+
  theme_classic()

# salary.transformed <- 1/sqrt(data$salary)

# Fit a model after transformation
fit.total <- lm(salary.transformed~rank+discipline+yrs.since.phd+yrs.service+sex, data=data)
summary(fit.total)
## 
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.since.phd + 
##     yrs.service + sex, data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.505e-05 -6.282e-06 -1.010e-07  7.931e-06  3.345e-05 
## 
## Coefficients:
##                 Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)    1.207e+00  1.762e-06 685232.047  < 2e-16 ***
## rankAssocProf  1.263e-05  2.146e-06      5.884 8.61e-09 ***
## rankProf       3.308e-05  2.194e-06     15.078  < 2e-16 ***
## disciplineB    9.221e-06  1.213e-06      7.602 2.20e-13 ***
## yrs.since.phd  1.534e-07  1.248e-07      1.229   0.2197    
## yrs.service   -2.499e-07  1.097e-07     -2.277   0.0233 *  
## sexFemale     -3.449e-06  1.998e-06     -1.727   0.0850 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.167e-05 on 390 degrees of freedom
## Multiple R-squared:  0.5655, Adjusted R-squared:  0.5588 
## F-statistic: 84.59 on 6 and 390 DF,  p-value: < 2.2e-16
# linearity of data
plot(fit.total, 1)

# Homogeneity of variance
plot(fit.total, 3)

# Normality of residuals
plot(fit.total, 2)

# independence of residuals
acf(fit.total$residuals)

Box.test(fit.total$residuals, type='Ljung-Box')
## 
##  Box-Ljung test
## 
## data:  fit.total$residuals
## X-squared = 3.5483, df = 1, p-value = 0.05961
# Fit Stepwise regression
step(fit.total, direction = "backward")
## Start:  AIC=-9011.81
## salary.transformed ~ rank + discipline + yrs.since.phd + yrs.service + 
##     sex
## 
##                 Df  Sum of Sq        RSS     AIC
## - yrs.since.phd  1 2.0600e-10 5.3306e-08 -9012.3
## <none>                        5.3100e-08 -9011.8
## - sex            1 4.0600e-10 5.3506e-08 -9010.8
## - yrs.service    1 7.0600e-10 5.3806e-08 -9008.6
## - discipline     1 7.8680e-09 6.0968e-08 -8958.9
## - rank           2 3.4341e-08 8.7441e-08 -8817.8
## 
## Step:  AIC=-9012.27
## salary.transformed ~ rank + discipline + yrs.service + sex
## 
##               Df  Sum of Sq        RSS     AIC
## <none>                      5.3306e-08 -9012.3
## - sex          1 4.0500e-10 5.3711e-08 -9011.3
## - yrs.service  1 7.5000e-10 5.4056e-08 -9008.7
## - discipline   1 7.6660e-09 6.0971e-08 -8960.9
## - rank         2 4.6369e-08 9.9674e-08 -8767.8
## 
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service + 
##     sex, data = data)
## 
## Coefficients:
##   (Intercept)  rankAssocProf       rankProf    disciplineB    yrs.service  
##     1.207e+00      1.310e-05      3.425e-05      8.950e-06     -1.350e-07  
##     sexFemale  
##    -3.446e-06
# Fitting the improved model
lm_backward <- lm(salary.transformed ~ rank + discipline + yrs.service + sex, 
                       data = data)
summary(lm_backward)
## 
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service + 
##     sex, data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.494e-05 -6.409e-06  1.300e-08  7.646e-06  3.325e-05 
## 
## Coefficients:
##                 Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)    1.207e+00  1.673e-06 721659.146  < 2e-16 ***
## rankAssocProf  1.310e-05  2.113e-06      6.202 1.42e-09 ***
## rankProf       3.425e-05  1.977e-06     17.329  < 2e-16 ***
## disciplineB    8.950e-06  1.194e-06      7.499 4.37e-13 ***
## yrs.service   -1.350e-07  5.755e-08     -2.346   0.0195 *  
## sexFemale     -3.446e-06  1.999e-06     -1.724   0.0855 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.168e-05 on 391 degrees of freedom
## Multiple R-squared:  0.5638, Adjusted R-squared:  0.5582 
## F-statistic: 101.1 on 5 and 391 DF,  p-value: < 2.2e-16
VIF(lm_backward)
##                 GVIF Df GVIF^(1/(2*Df))
## rank        1.597441  2        1.124233
## discipline  1.029040  1        1.014416
## yrs.service 1.627110  1        1.275582
## sex         1.030803  1        1.015284
# Fit a trend line
library(ggplot2)
ggplot(data, aes(x = yrs.service, y = salary.transformed)) +
  geom_point()+
  stat_smooth()+
  theme_classic()

# Converting it to categorical
range(data$yrs.service)
## [1]  0 60
# Binning yrs.service variable into categories
library(tidyverse)
Salaries_mod <- data %>% 
  mutate(service_time_cat = case_when(
    between(yrs.service, 0, 20)~"upto20",
    between(yrs.service, 20, 40)~"20_40",
    between(yrs.service, 40, 60)~"40_60"))

# Converting it to a factor variable
Salaries_mod$service_time_cat <- as.factor(Salaries_mod$service_time_cat)
levels(Salaries_mod$service_time_cat)
## [1] "20_40"  "40_60"  "upto20"
# Changing the order of the levels
Salaries_mod$service_time_cat <- relevel(Salaries_mod$service_time_cat, ref = "upto20")
levels(Salaries_mod$service_time_cat)
## [1] "upto20" "20_40"  "40_60"
# Generating a count table
table(Salaries_mod$service_time_cat)
## 
## upto20  20_40  40_60 
##    250    126     21
# Model Fitting
fit4 <- lm(salary.transformed ~ rank + discipline + yrs.service + sex + service_time_cat, data = Salaries_mod)
summary(fit4)
## 
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service + 
##     sex + service_time_cat, data = Salaries_mod)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.300e-05 -6.668e-06  7.800e-08  7.694e-06  3.613e-05 
## 
## Coefficients:
##                         Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)            1.207e+00  1.685e-06 716545.175  < 2e-16 ***
## rankAssocProf          1.238e-05  2.198e-06      5.632 3.42e-08 ***
## rankProf               3.304e-05  2.181e-06     15.150  < 2e-16 ***
## disciplineB            8.883e-06  1.195e-06      7.431 6.89e-13 ***
## yrs.service           -1.619e-08  1.224e-07     -0.132   0.8949    
## sexFemale             -3.552e-06  2.002e-06     -1.774   0.0768 .  
## service_time_cat20_40 -1.734e-06  2.469e-06     -0.702   0.4829    
## service_time_cat40_60 -6.565e-06  5.016e-06     -1.309   0.1914    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.168e-05 on 389 degrees of freedom
## Multiple R-squared:  0.5659, Adjusted R-squared:  0.5581 
## F-statistic: 72.46 on 7 and 389 DF,  p-value: < 2.2e-16
# checking the collinearity
VIF(fit4)
##                      GVIF Df GVIF^(1/(2*Df))
## rank             1.958284  2        1.182957
## discipline       1.031846  1        1.015798
## yrs.service      7.362341  1        2.713363
## sex              1.033842  1        1.016780
## service_time_cat 5.763038  2        1.549398
# Performing a stepwise regression
step(fit4, direction = "backward")
## Start:  AIC=-9010.23
## salary.transformed ~ rank + discipline + yrs.service + sex + 
##     service_time_cat
## 
##                    Df  Sum of Sq        RSS     AIC
## - service_time_cat  2 2.6200e-10 5.3306e-08 -9012.3
## - yrs.service       1 2.0000e-12 5.3046e-08 -9012.2
## <none>                           5.3044e-08 -9010.2
## - sex               1 4.2900e-10 5.3473e-08 -9009.0
## - discipline        1 7.5300e-09 6.0574e-08 -8959.5
## - rank              2 3.6343e-08 8.9387e-08 -8807.1
## 
## Step:  AIC=-9012.27
## salary.transformed ~ rank + discipline + yrs.service + sex
## 
##               Df  Sum of Sq        RSS     AIC
## <none>                      5.3306e-08 -9012.3
## - sex          1 4.0500e-10 5.3711e-08 -9011.3
## - yrs.service  1 7.5000e-10 5.4056e-08 -9008.7
## - discipline   1 7.6660e-09 6.0971e-08 -8960.9
## - rank         2 4.6369e-08 9.9674e-08 -8767.8
## 
## Call:
## lm(formula = salary.transformed ~ rank + discipline + yrs.service + 
##     sex, data = Salaries_mod)
## 
## Coefficients:
##   (Intercept)  rankAssocProf       rankProf    disciplineB    yrs.service  
##     1.207e+00      1.310e-05      3.425e-05      8.950e-06     -1.350e-07  
##     sexFemale  
##    -3.446e-06
# Fitting final model
fit5 <- lm(formula = salary ~ rank + discipline + yrs.service + sex, 
          data = Salaries_mod)
summary(fit5)
## 
## Call:
## lm(formula = salary ~ rank + discipline + yrs.service + sex, 
##     data = Salaries_mod)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64202 -14255  -1533  10571  99163 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   73122.92    3245.27  22.532  < 2e-16 ***
## rankAssocProf 14560.40    4098.32   3.553 0.000428 ***
## rankProf      49159.64    3834.49  12.820  < 2e-16 ***
## disciplineB   13473.38    2315.50   5.819 1.24e-08 ***
## yrs.service     -88.78     111.64  -0.795 0.426958    
## sexFemale     -4771.25    3878.00  -1.230 0.219311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22650 on 391 degrees of freedom
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4407 
## F-statistic: 63.41 on 5 and 391 DF,  p-value: < 2.2e-16
library(car)
car::qqPlot(rstandard(fit5))

## [1]  44 365
shapiro.test(fit5$residuals)       
## 
##  Shapiro-Wilk normality test
## 
## data:  fit5$residuals
## W = 0.96073, p-value = 8.202e-09
# checking the collinearity
VIF(fit5)
##                 GVIF Df GVIF^(1/(2*Df))
## rank        1.597441  2        1.124233
## discipline  1.029040  1        1.014416
## yrs.service 1.627110  1        1.275582
## sex         1.030803  1        1.015284
# Making result tidy
tidy(fit5)
## # A tibble: 6 x 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    73123.      3245.    22.5   1.17e-72
## 2 rankAssocProf  14560.      4098.     3.55  4.28e- 4
## 3 rankProf       49160.      3834.    12.8   1.18e-31
## 4 disciplineB    13473.      2315.     5.82  1.24e- 8
## 5 yrs.service      -88.8      112.    -0.795 4.27e- 1
## 6 sexFemale      -4771.      3878.    -1.23  2.19e- 1
# ANOVA analysis
anova(fit5)
## Analysis of Variance Table
## 
## Response: salary
##              Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## rank          2 1.4323e+11 7.1616e+10 139.5761 < 2.2e-16 ***
## discipline    1 1.8430e+10 1.8430e+10  35.9191 4.675e-09 ***
## yrs.service   1 2.4187e+08 2.4187e+08   0.4714    0.4928    
## sex           1 7.7669e+08 7.7669e+08   1.5137    0.2193    
## Residuals   391 2.0062e+11 5.1310e+08                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

follow me for more on

  1. RPubs

  2. Telegram

    1. Channel

    2. Q & A

  3. YouTube

  4. Website

  5. Aparat