Linear Regression

Importing data

# The standard model: Y = β0 + β1X1 + · · · + βnXn + ϵ, is commonly used 
# The basic syntax is lm(y ∼ x, data)

library(readxl)
df = read_excel("C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Professorial Salaries.xlsx")

head(df, 10)

## # A tibble: 10 × 7
##       ID Rank      Discipline Yrs.since.phd Yrs.service Sex    Salary
##    <dbl> <chr>     <chr>              <dbl>       <dbl> <chr>   <dbl>
##  1     1 Prof      B                     19          18 Male   139750
##  2     2 Prof      B                     20          16 Male   173200
##  3     3 AsstProf  B                      4           3 Male    79750
##  4     4 Prof      B                     45          39 Male   115000
##  5     5 Prof      B                     40          41 Male   141500
##  6     6 AssocProf B                      6           6 Male    97000
##  7     7 Prof      B                     30          23 Male   175000
##  8     8 Prof      B                     45          45 Male   147765
##  9     9 Prof      B                     21          20 Male   119250
## 10    10 Prof      B                     18          18 Female 129000

tail(df, 10)

## # A tibble: 10 × 7
##       ID Rank     Discipline Yrs.since.phd Yrs.service Sex   Salary
##    <dbl> <chr>    <chr>              <dbl>       <dbl> <chr>  <dbl>
##  1   388 Prof     A                     29          15 Male  109305
##  2   389 Prof     A                     38          36 Male  119450
##  3   390 Prof     A                     33          18 Male  186023
##  4   391 Prof     A                     40          19 Male  166605
##  5   392 Prof     A                     30          19 Male  151292
##  6   393 Prof     A                     33          30 Male  103106
##  7   394 Prof     A                     31          19 Male  150564
##  8   395 Prof     A                     42          25 Male  101738
##  9   396 Prof     A                     25          15 Male   95329
## 10   397 AsstProf A                      8           4 Male   81035

str(df)

## tibble [397 × 7] (S3: tbl_df/tbl/data.frame)
##  $ ID           : num [1:397] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Rank         : chr [1:397] "Prof" "Prof" "AsstProf" "Prof" ...
##  $ Discipline   : chr [1:397] "B" "B" "B" "B" ...
##  $ Yrs.since.phd: num [1:397] 19 20 4 45 40 6 30 45 21 18 ...
##  $ Yrs.service  : num [1:397] 18 16 3 39 41 6 23 45 20 18 ...
##  $ Sex          : chr [1:397] "Male" "Male" "Male" "Male" ...
##  $ Salary       : num [1:397] 139750 173200 79750 115000 141500 ...

1. Simple linear model

# Simple Linear model about the relationship between Salary and years after PhD graduation
mod_linear = lm(Salary ~ Yrs.since.phd, data = df)
mod_linear # --> Salary ~ 91718.7 + 985.3 x Yrs.since.phd

## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = df)
## 
## Coefficients:
##   (Intercept)  Yrs.since.phd  
##       91718.7          985.3

summary(mod_linear)

## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84171 -19432  -2858  16086 102383 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    91718.7     2765.8  33.162   <2e-16 ***
## Yrs.since.phd    985.3      107.4   9.177   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1737 
## F-statistic: 84.23 on 1 and 395 DF,  p-value: < 2.2e-16

1.1

# In order to obtain a confidence interval for the coefficient estimates, we can use the confint() command.
confint (mod_linear)

##                    2.5 %    97.5 %
## (Intercept)   86281.1714 97156.199
## Yrs.since.phd   774.2636  1196.421

predict (mod_linear, data.frame ( Yrs.since.phd = (c(5, 10, 15) )),
         interval = "confidence")

##        fit       lwr      upr
## 1  96645.4  92091.47 101199.3
## 2 101572.1  97812.11 105332.1
## 3 106498.8 103373.97 109623.7

1.2 Plot

attach(df)
plot (Yrs.since.phd, Salary)
abline (mod_linear)
abline (mod_linear, lwd = 3)
abline (mod_linear, lwd = 3, col = "red")

plot (Yrs.since.phd, Salary , col = " red ")

plot (Yrs.since.phd, Salary , pch = 20)

plot (Yrs.since.phd, Salary , pch = "+")

2. Multiple Linear Regression

multi_linear = lm(Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, df)
summary(multi_linear)

## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, 
##     data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64844 -14939  -1401  12137 107498 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    89596.4     4891.1  18.318  < 2e-16 ***
## Yrs.since.phd    265.8      247.9   1.072  0.28423    
## Yrs.service     -373.8      220.8  -1.693  0.09132 .  
## SexMale         5499.1     4034.7   1.363  0.17368    
## RankAsstProf  -13886.6     4333.1  -3.205  0.00146 ** 
## RankProf       33053.4     3700.7   8.932  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23580 on 391 degrees of freedom
## Multiple R-squared:  0.4017, Adjusted R-squared:  0.3941 
## F-statistic: 52.51 on 5 and 391 DF,  p-value: < 2.2e-16

# Change reference

df$Rank = as.factor(df$Rank)
df$Sex = as.factor(df$Sex)

df$Rank = relevel(df$Rank, ref = "AsstProf")
df$Sex = relevel(df$Sex, ref = "Female")

multi_linear = lm(Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, df)
summary(multi_linear)

## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, 
##     data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64844 -14939  -1401  12137 107498 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    75709.8     4504.6  16.807  < 2e-16 ***
## Yrs.since.phd    265.8      247.9   1.072  0.28423    
## Yrs.service     -373.8      220.8  -1.693  0.09132 .  
## SexMale         5499.1     4034.7   1.363  0.17368    
## RankAssocProf  13886.6     4333.1   3.205  0.00146 ** 
## RankProf       46940.0     4421.4  10.617  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23580 on 391 degrees of freedom
## Multiple R-squared:  0.4017, Adjusted R-squared:  0.3941 
## F-statistic: 52.51 on 5 and 391 DF,  p-value: < 2.2e-16

# VIFs are commonly used as a diagnostic tool to identify collinearity in multiple regression models. 

library(car)

## Warning: package 'car' was built under R version 4.2.2

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.2

vif(multi_linear)

##                   GVIF Df GVIF^(1/(2*Df))
## Yrs.since.phd 7.271171  1        2.696511
## Yrs.service   5.876405  1        2.424130
## Sex           1.029868  1        1.014824
## Rank          2.002583  2        1.189591

If high VIFs are detected, one approach to addressing the problem is to remove one or more of the highly correlated predictor variables from the model

SUMS - Linear Regression

Vu Thien

2023-03-18

Linear Regression

Importing data

1. Simple linear model

1.1

1.2 Plot

2. Multiple Linear Regression

If high VIFs are detected, one approach to addressing the problem is to remove one or more of the highly correlated predictor variables from the model