Linear Regression

Importing data

# The standard model: Y = β0 + β1X1 + · · · + βnXn + ϵ, is commonly used 
# The basic syntax is lm(y ∼ x, data)

library(readxl)
df = read_excel("C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Professorial Salaries.xlsx")

head(df, 10)
## # A tibble: 10 × 7
##       ID Rank      Discipline Yrs.since.phd Yrs.service Sex    Salary
##    <dbl> <chr>     <chr>              <dbl>       <dbl> <chr>   <dbl>
##  1     1 Prof      B                     19          18 Male   139750
##  2     2 Prof      B                     20          16 Male   173200
##  3     3 AsstProf  B                      4           3 Male    79750
##  4     4 Prof      B                     45          39 Male   115000
##  5     5 Prof      B                     40          41 Male   141500
##  6     6 AssocProf B                      6           6 Male    97000
##  7     7 Prof      B                     30          23 Male   175000
##  8     8 Prof      B                     45          45 Male   147765
##  9     9 Prof      B                     21          20 Male   119250
## 10    10 Prof      B                     18          18 Female 129000
tail(df, 10)
## # A tibble: 10 × 7
##       ID Rank     Discipline Yrs.since.phd Yrs.service Sex   Salary
##    <dbl> <chr>    <chr>              <dbl>       <dbl> <chr>  <dbl>
##  1   388 Prof     A                     29          15 Male  109305
##  2   389 Prof     A                     38          36 Male  119450
##  3   390 Prof     A                     33          18 Male  186023
##  4   391 Prof     A                     40          19 Male  166605
##  5   392 Prof     A                     30          19 Male  151292
##  6   393 Prof     A                     33          30 Male  103106
##  7   394 Prof     A                     31          19 Male  150564
##  8   395 Prof     A                     42          25 Male  101738
##  9   396 Prof     A                     25          15 Male   95329
## 10   397 AsstProf A                      8           4 Male   81035
str(df)
## tibble [397 × 7] (S3: tbl_df/tbl/data.frame)
##  $ ID           : num [1:397] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Rank         : chr [1:397] "Prof" "Prof" "AsstProf" "Prof" ...
##  $ Discipline   : chr [1:397] "B" "B" "B" "B" ...
##  $ Yrs.since.phd: num [1:397] 19 20 4 45 40 6 30 45 21 18 ...
##  $ Yrs.service  : num [1:397] 18 16 3 39 41 6 23 45 20 18 ...
##  $ Sex          : chr [1:397] "Male" "Male" "Male" "Male" ...
##  $ Salary       : num [1:397] 139750 173200 79750 115000 141500 ...

1. Simple linear model

# Simple Linear model about the relationship between Salary and years after PhD graduation
mod_linear = lm(Salary ~ Yrs.since.phd, data = df)
mod_linear # --> Salary ~ 91718.7 + 985.3 x Yrs.since.phd
## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = df)
## 
## Coefficients:
##   (Intercept)  Yrs.since.phd  
##       91718.7          985.3
summary(mod_linear)
## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84171 -19432  -2858  16086 102383 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    91718.7     2765.8  33.162   <2e-16 ***
## Yrs.since.phd    985.3      107.4   9.177   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1737 
## F-statistic: 84.23 on 1 and 395 DF,  p-value: < 2.2e-16

1.1

# In order to obtain a confidence interval for the coefficient estimates, we can use the confint() command.
confint (mod_linear)
##                    2.5 %    97.5 %
## (Intercept)   86281.1714 97156.199
## Yrs.since.phd   774.2636  1196.421
predict (mod_linear, data.frame ( Yrs.since.phd = (c(5, 10, 15) )),
         interval = "confidence")
##        fit       lwr      upr
## 1  96645.4  92091.47 101199.3
## 2 101572.1  97812.11 105332.1
## 3 106498.8 103373.97 109623.7

1.2 Plot

attach(df)
plot (Yrs.since.phd, Salary)
abline (mod_linear)
abline (mod_linear, lwd = 3)
abline (mod_linear, lwd = 3, col = "red")

plot (Yrs.since.phd, Salary , col = " red ")

plot (Yrs.since.phd, Salary , pch = 20)

plot (Yrs.since.phd, Salary , pch = "+")

2. Multiple Linear Regression

multi_linear = lm(Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, df)
summary(multi_linear)
## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, 
##     data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64844 -14939  -1401  12137 107498 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    89596.4     4891.1  18.318  < 2e-16 ***
## Yrs.since.phd    265.8      247.9   1.072  0.28423    
## Yrs.service     -373.8      220.8  -1.693  0.09132 .  
## SexMale         5499.1     4034.7   1.363  0.17368    
## RankAsstProf  -13886.6     4333.1  -3.205  0.00146 ** 
## RankProf       33053.4     3700.7   8.932  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23580 on 391 degrees of freedom
## Multiple R-squared:  0.4017, Adjusted R-squared:  0.3941 
## F-statistic: 52.51 on 5 and 391 DF,  p-value: < 2.2e-16
# Change reference

df$Rank = as.factor(df$Rank)
df$Sex = as.factor(df$Sex)

df$Rank = relevel(df$Rank, ref = "AsstProf")
df$Sex = relevel(df$Sex, ref = "Female")
multi_linear = lm(Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, df)
summary(multi_linear)
## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, 
##     data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64844 -14939  -1401  12137 107498 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    75709.8     4504.6  16.807  < 2e-16 ***
## Yrs.since.phd    265.8      247.9   1.072  0.28423    
## Yrs.service     -373.8      220.8  -1.693  0.09132 .  
## SexMale         5499.1     4034.7   1.363  0.17368    
## RankAssocProf  13886.6     4333.1   3.205  0.00146 ** 
## RankProf       46940.0     4421.4  10.617  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23580 on 391 degrees of freedom
## Multiple R-squared:  0.4017, Adjusted R-squared:  0.3941 
## F-statistic: 52.51 on 5 and 391 DF,  p-value: < 2.2e-16
# VIFs are commonly used as a diagnostic tool to identify collinearity in multiple regression models. 

library(car)
## Warning: package 'car' was built under R version 4.2.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.2
vif(multi_linear)
##                   GVIF Df GVIF^(1/(2*Df))
## Yrs.since.phd 7.271171  1        2.696511
## Yrs.service   5.876405  1        2.424130
## Sex           1.029868  1        1.014824
## Rank          2.002583  2        1.189591

If high VIFs are detected, one approach to addressing the problem is to remove one or more of the highly correlated predictor variables from the model