Linear Regression
Importing data
# The standard model: Y = β0 + β1X1 + · · · + βnXn + ϵ, is commonly used
# The basic syntax is lm(y ∼ x, data)
library(readxl)
df = read_excel("C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Professorial Salaries.xlsx")
head(df, 10)
## # A tibble: 10 × 7
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 3 AsstProf B 4 3 Male 79750
## 4 4 Prof B 45 39 Male 115000
## 5 5 Prof B 40 41 Male 141500
## 6 6 AssocProf B 6 6 Male 97000
## 7 7 Prof B 30 23 Male 175000
## 8 8 Prof B 45 45 Male 147765
## 9 9 Prof B 21 20 Male 119250
## 10 10 Prof B 18 18 Female 129000
tail(df, 10)
## # A tibble: 10 × 7
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 388 Prof A 29 15 Male 109305
## 2 389 Prof A 38 36 Male 119450
## 3 390 Prof A 33 18 Male 186023
## 4 391 Prof A 40 19 Male 166605
## 5 392 Prof A 30 19 Male 151292
## 6 393 Prof A 33 30 Male 103106
## 7 394 Prof A 31 19 Male 150564
## 8 395 Prof A 42 25 Male 101738
## 9 396 Prof A 25 15 Male 95329
## 10 397 AsstProf A 8 4 Male 81035
str(df)
## tibble [397 × 7] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:397] 1 2 3 4 5 6 7 8 9 10 ...
## $ Rank : chr [1:397] "Prof" "Prof" "AsstProf" "Prof" ...
## $ Discipline : chr [1:397] "B" "B" "B" "B" ...
## $ Yrs.since.phd: num [1:397] 19 20 4 45 40 6 30 45 21 18 ...
## $ Yrs.service : num [1:397] 18 16 3 39 41 6 23 45 20 18 ...
## $ Sex : chr [1:397] "Male" "Male" "Male" "Male" ...
## $ Salary : num [1:397] 139750 173200 79750 115000 141500 ...
1. Simple linear model
# Simple Linear model about the relationship between Salary and years after PhD graduation
mod_linear = lm(Salary ~ Yrs.since.phd, data = df)
mod_linear # --> Salary ~ 91718.7 + 985.3 x Yrs.since.phd
##
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = df)
##
## Coefficients:
## (Intercept) Yrs.since.phd
## 91718.7 985.3
summary(mod_linear)
##
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84171 -19432 -2858 16086 102383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91718.7 2765.8 33.162 <2e-16 ***
## Yrs.since.phd 985.3 107.4 9.177 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1737
## F-statistic: 84.23 on 1 and 395 DF, p-value: < 2.2e-16
1.1
# In order to obtain a confidence interval for the coefficient estimates, we can use the confint() command.
confint (mod_linear)
## 2.5 % 97.5 %
## (Intercept) 86281.1714 97156.199
## Yrs.since.phd 774.2636 1196.421
predict (mod_linear, data.frame ( Yrs.since.phd = (c(5, 10, 15) )),
interval = "confidence")
## fit lwr upr
## 1 96645.4 92091.47 101199.3
## 2 101572.1 97812.11 105332.1
## 3 106498.8 103373.97 109623.7
1.2 Plot
attach(df)
plot (Yrs.since.phd, Salary)
abline (mod_linear)
abline (mod_linear, lwd = 3)
abline (mod_linear, lwd = 3, col = "red")

plot (Yrs.since.phd, Salary , col = " red ")

plot (Yrs.since.phd, Salary , pch = 20)

plot (Yrs.since.phd, Salary , pch = "+")

2. Multiple Linear Regression
multi_linear = lm(Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, df)
summary(multi_linear)
##
## Call:
## lm(formula = Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64844 -14939 -1401 12137 107498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89596.4 4891.1 18.318 < 2e-16 ***
## Yrs.since.phd 265.8 247.9 1.072 0.28423
## Yrs.service -373.8 220.8 -1.693 0.09132 .
## SexMale 5499.1 4034.7 1.363 0.17368
## RankAsstProf -13886.6 4333.1 -3.205 0.00146 **
## RankProf 33053.4 3700.7 8.932 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23580 on 391 degrees of freedom
## Multiple R-squared: 0.4017, Adjusted R-squared: 0.3941
## F-statistic: 52.51 on 5 and 391 DF, p-value: < 2.2e-16
# Change reference
df$Rank = as.factor(df$Rank)
df$Sex = as.factor(df$Sex)
df$Rank = relevel(df$Rank, ref = "AsstProf")
df$Sex = relevel(df$Sex, ref = "Female")
multi_linear = lm(Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank, df)
summary(multi_linear)
##
## Call:
## lm(formula = Salary ~ Yrs.since.phd + Yrs.service + Sex + Rank,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64844 -14939 -1401 12137 107498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75709.8 4504.6 16.807 < 2e-16 ***
## Yrs.since.phd 265.8 247.9 1.072 0.28423
## Yrs.service -373.8 220.8 -1.693 0.09132 .
## SexMale 5499.1 4034.7 1.363 0.17368
## RankAssocProf 13886.6 4333.1 3.205 0.00146 **
## RankProf 46940.0 4421.4 10.617 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23580 on 391 degrees of freedom
## Multiple R-squared: 0.4017, Adjusted R-squared: 0.3941
## F-statistic: 52.51 on 5 and 391 DF, p-value: < 2.2e-16
# VIFs are commonly used as a diagnostic tool to identify collinearity in multiple regression models.
library(car)
## Warning: package 'car' was built under R version 4.2.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.2
vif(multi_linear)
## GVIF Df GVIF^(1/(2*Df))
## Yrs.since.phd 7.271171 1 2.696511
## Yrs.service 5.876405 1 2.424130
## Sex 1.029868 1 1.014824
## Rank 2.002583 2 1.189591
If high VIFs are detected, one approach to addressing the problem is
to remove one or more of the highly correlated predictor variables from
the model