rm(list = ls())
graphics.off()
setwd("/Users/iamarmenain2001/Desktop/UNI/B3/ECONOMETRICS R/Project")
dataset <- read.csv("Salary.csv", sep = ",", dec = ".", header = TRUE)
dataset_cleaned <- subset(dataset, Country == "USA")
colnames(dataset_cleaned)[3] <- "Education"
dataset_cleaned <- dataset_cleaned[, !names(dataset_cleaned) %in% c("Job.Title")]
str(dataset_cleaned)
## 'data.frame': 1356 obs. of 8 variables:
## $ Age : num 28 36 52 29 42 40 33 41 29 46 ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Education : int 2 1 2 1 2 2 2 2 2 3 ...
## $ Years.of.Experience: num 3 7 20 2 12 14 7 13 3 20 ...
## $ Salary : num 65000 60000 200000 55000 120000 130000 90000 140000 75000 170000 ...
## $ Country : chr "USA" "USA" "USA" "USA" ...
## $ Race : chr "Hispanic" "Hispanic" "Asian" "Hispanic" ...
## $ Senior : int 0 0 0 0 0 0 0 0 0 1 ...
head(dataset_cleaned)
## Age Gender Education Years.of.Experience Salary Country Race
## 2 28 Female 2 3 65000 USA Hispanic
## 4 36 Female 1 7 60000 USA Hispanic
## 5 52 Male 2 20 200000 USA Asian
## 6 29 Male 1 2 55000 USA Hispanic
## 7 42 Female 2 12 120000 USA Asian
## 14 40 Female 2 14 130000 USA African American
## Senior
## 2 0
## 4 0
## 5 0
## 6 0
## 7 0
## 14 0
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effectsize)
library(report)
library(parameters)
library(visreg)
library(interactions)
library(corrplot)
## corrplot 0.95 loaded
# Male dummy
dataset_cleaned$Male <- ifelse(dataset_cleaned$Gender == "Male", 1, 0)
# Race dummies (White = reference: 0 0 0)
dataset_cleaned$Race_AA <- ifelse(dataset_cleaned$Race == "African American", 1, 0)
dataset_cleaned$Race_Asian <- ifelse(dataset_cleaned$Race == "Asian", 1, 0)
dataset_cleaned$Race_Hisp <- ifelse(dataset_cleaned$Race == "Hispanic", 1, 0)
num_only <- dataset_cleaned[sapply(dataset_cleaned, is.numeric)]
corr <- cor(num_only, use = "pairwise.complete.obs")
corrplot(corr, method = "number")
corrplot(corr, method = "ellipse", type = "upper", tl.srt = 45)
colSums(is.na(dataset_cleaned))
## Age Gender Education Years.of.Experience
## 0 0 0 0
## Salary Country Race Senior
## 0 0 0 0
## Male Race_AA Race_Asian Race_Hisp
## 0 0 0 0
m_age <- lm(Salary ~ Age, data = dataset_cleaned)
m_exp <- lm(Salary ~ Years.of.Experience, data = dataset_cleaned)
m_edu <- lm(Salary ~ Education, data = dataset_cleaned)
m_gender <- lm(Salary ~ Male, data = dataset_cleaned)
m_race <- lm(Salary ~ Race_AA + Race_Asian + Race_Hisp,
data = dataset_cleaned)
m_senior <- lm(Salary ~ Senior, data = dataset_cleaned)
m_all <- lm(Salary ~ Age + Years.of.Experience + Education +
Male + Race_AA + Race_Asian + Race_Hisp + Senior,
data = dataset_cleaned)
summary(m_all)
##
## Call:
## lm(formula = Salary ~ Age + Years.of.Experience + Education +
## Male + Race_AA + Race_Asian + Race_Hisp + Senior, data = dataset_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101305 -18083 -6055 15844 78239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104443.7 7731.4 13.509 < 2e-16 ***
## Age -2751.8 314.2 -8.757 < 2e-16 ***
## Years.of.Experience 9035.4 398.6 22.670 < 2e-16 ***
## Education 15863.3 1141.7 13.895 < 2e-16 ***
## Male 7949.8 1566.4 5.075 4.41e-07 ***
## Race_AA -723.1 2138.0 -0.338 0.73526
## Race_Asian 3381.9 2168.2 1.560 0.11906
## Race_Hisp 119.6 2190.3 0.055 0.95648
## Senior -7818.1 2411.4 -3.242 0.00122 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28230 on 1347 degrees of freedom
## Multiple R-squared: 0.7059, Adjusted R-squared: 0.7041
## F-statistic: 404.1 on 8 and 1347 DF, p-value: < 2.2e-16
m_age_exp <- lm(Salary ~ Age + Years.of.Experience, data = dataset_cleaned)
m_age_edu <- lm(Salary ~ Age + Education, data = dataset_cleaned)
m_exp_edu <- lm(Salary ~ Years.of.Experience + Education, data = dataset_cleaned)
m_exp_gender <- lm(Salary ~ Years.of.Experience + Male, data = dataset_cleaned)
m_edu_race <- lm(Salary ~ Education + Race_AA + Race_Asian + Race_Hisp,
data = dataset_cleaned)
m_gender_race <- lm(Salary ~ Male + Race_AA + Race_Asian + Race_Hisp,
data = dataset_cleaned)
m_senior_exp <- lm(Salary ~ Senior + Years.of.Experience, data = dataset_cleaned)
par(mfrow = c(1,3))
plot(dataset_cleaned$Age, dataset_cleaned$Salary,
main = "Salary vs Age", pch=8, col="blue")
abline(m_age)
plot(dataset_cleaned$Education, dataset_cleaned$Salary,
main = "Salary vs Education", pch=8, col="blue")
abline(m_edu)
plot(dataset_cleaned$Years.of.Experience, dataset_cleaned$Salary,
main = "Salary vs Experience", pch=8, col="blue")
abline(m_exp)
############################################################ # STARGAZER
— SIMPLE + FULL
############################################################
stargazer(m_exp, m_age, m_edu, m_all, type="text")
##
## ===============================================================================================================================
## Dependent variable:
## -----------------------------------------------------------------------------------------------------------
## Salary
## (1) (2) (3) (4)
## -------------------------------------------------------------------------------------------------------------------------------
## Years.of.Experience 7,151.466*** 9,035.395***
## (144.093) (398.569)
##
## Age 5,089.125*** -2,751.754***
## (134.353) (314.221)
##
## Education 38,581.560*** 15,863.340***
## (1,260.373) (1,141.656)
##
## Male 7,949.754***
## (1,566.386)
##
## Race_AA -723.095
## (2,138.029)
##
## Race_Asian 3,381.855
## (2,168.220)
##
## Race_Hisp 119.557
## (2,190.329)
##
## Senior -7,818.115***
## (2,411.405)
##
## Constant 57,021.130*** -56,139.580*** 51,484.500*** 104,443.700***
## (1,406.154) (4,572.052) (2,283.227) (7,731.420)
##
## -------------------------------------------------------------------------------------------------------------------------------
## Observations 1,356 1,356 1,356 1,356
## R2 0.645 0.514 0.409 0.706
## Adjusted R2 0.645 0.514 0.409 0.704
## Residual Std. Error 30,922.480 (df = 1354) 36,177.590 (df = 1354) 39,914.580 (df = 1354) 28,230.290 (df = 1347)
## F Statistic 2,463.231*** (df = 1; 1354) 1,434.804*** (df = 1; 1354) 937.048*** (df = 1; 1354) 404.126*** (df = 8; 1347)
## ===============================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
par(mfrow = c(2,2))
plot(m_exp$residuals); abline(h=0, col="red")
plot(m_age$residuals); abline(h=0, col="red")
plot(m_edu$residuals); abline(h=0, col="red")
plot(m_all$residuals); abline(h=0, col="red")
############################################################ # OUTLIER
DETECTION AND CLEANING FOR SALARY
############################################################
# Boxplot (visual check)
boxplot(dataset_cleaned$Salary,
main = "Boxplot of Salary",
ylab = "Salary",
col = "lightblue")
# Compute IQR bounds for Salary
Q1_S <- quantile(dataset_cleaned$Salary, 0.25, na.rm = TRUE)
Q3_S <- quantile(dataset_cleaned$Salary, 0.75, na.rm = TRUE)
IQR_S <- Q3_S - Q1_S
LOW_S <- Q1_S - 1.5 * IQR_S
UPP_S <- Q3_S + 1.5 * IQR_S
# Identify outliers
is_out_S <- dataset_cleaned$Salary < LOW_S | dataset_cleaned$Salary > UPP_S
table(is_out_S)
## is_out_S
## FALSE
## 1356
# CREATE REMOVED-OUTLIER DATASET
data_removeS <- dataset_cleaned[!is_out_S, ]
# CREATE CAPPED-OUTLIER DATASET
data_capS <- dataset_cleaned
data_capS$Salary[data_capS$Salary < LOW_S] <- LOW_S
data_capS$Salary[data_capS$Salary > UPP_S] <- UPP_S
# Check dimensions
nrow(dataset_cleaned)
## [1] 1356
nrow(data_removeS)
## [1] 1356
nrow(data_capS)
## [1] 1356
# Boxplot (visual check)
boxplot(dataset_cleaned$Age,
main = "Boxplot of Age",
ylab = "Age",
col = "lightblue")
# Compute IQR bounds for Age
Q1_A <- quantile(dataset_cleaned$Age, 0.25, na.rm = TRUE)
Q3_A <- quantile(dataset_cleaned$Age, 0.75, na.rm = TRUE)
IQR_A <- Q3_A - Q1_A
LOW_A <- Q1_A - 1.5 * IQR_A
UPP_A <- Q3_A + 1.5 * IQR_A
# Identify outliers
is_out_A <- dataset_cleaned$Age < LOW_A | dataset_cleaned$Age > UPP_A
table(is_out_A)
## is_out_A
## FALSE TRUE
## 1322 34
# CREATE REMOVED-OUTLIER DATASET
data_removeA <- dataset_cleaned[!is_out_A, ]
# CREATE CAPPED-OUTLIER DATASET
data_capA <- dataset_cleaned
data_capA$Age[data_capA$Age < LOW_A] <- LOW_A
data_capA$Age[data_capA$Age > UPP_A] <- UPP_A
# Check dimensions
nrow(dataset_cleaned)
## [1] 1356
nrow(data_removeA)
## [1] 1322
nrow(data_capA)
## [1] 1356
# Boxplot (visual check)
boxplot(dataset_cleaned$Years.of.Experience,
main = "Boxplot of Years of Experience",
ylab = "Years of Experience",
col = "lightblue")
# Compute IQR bounds for Experience
Q1_E <- quantile(dataset_cleaned$Years.of.Experience, 0.25, na.rm = TRUE)
Q3_E <- quantile(dataset_cleaned$Years.of.Experience, 0.75, na.rm = TRUE)
IQR_E <- Q3_E - Q1_E
LOW_E <- Q1_E - 1.5 * IQR_E
UPP_E <- Q3_E + 1.5 * IQR_E
# Identify outliers
is_out_E <- dataset_cleaned$Years.of.Experience < LOW_E | dataset_cleaned$Years.of.Experience > UPP_E
table(is_out_E)
## is_out_E
## FALSE TRUE
## 1334 22
# CREATE REMOVED-OUTLIER DATASET
data_removeE <- dataset_cleaned[!is_out_E, ]
# CREATE CAPPED-OUTLIER DATASET
data_capE <- dataset_cleaned
data_capE$Years.of.Experience[data_capE$Years.of.Experience < LOW_E] <- LOW_E
data_capE$Years.of.Experience[data_capE$Years.of.Experience > UPP_E] <- UPP_E
# Check dimensions
nrow(dataset_cleaned)
## [1] 1356
nrow(data_removeE)
## [1] 1334
nrow(data_capE)
## [1] 1356
# Unified function to produce removed + capped datasets
trim_outliers <- function(df, var){
Q1 <- quantile(df[[var]], 0.25, na.rm=TRUE)
Q3 <- quantile(df[[var]], 0.75, na.rm=TRUE)
IQRv <- Q3 - Q1
LOW <- Q1 - 1.5*IQRv
UPP <- Q3 + 1.5*IQRv
is_out <- df[[var]] < LOW | df[[var]] > UPP
df_remove <- df[!is_out, ]
df_cap <- df
df_cap[df_cap[[var]] < LOW, var] <- LOW
df_cap[df_cap[[var]] > UPP, var] <- UPP
return(list(remove=df_remove, cap=df_cap))
}
out_S <- trim_outliers(dataset_cleaned, "Salary")
out_A <- trim_outliers(dataset_cleaned, "Age")
out_E <- trim_outliers(dataset_cleaned, "Years.of.Experience")
data_removeS <- out_S$remove
data_capS <- out_S$cap
m_age_edu_O <- m_age_edu
m_age_edu_R <- lm(Salary ~ Age + Education, data=data_removeS)
m_age_edu_C <- lm(Salary ~ Age + Education, data=data_capS)
m_exp_edu_O <- m_exp_edu
m_exp_edu_R <- lm(Salary ~ Years.of.Experience + Education, data=data_removeS)
m_exp_edu_C <- lm(Salary ~ Years.of.Experience + Education, data=data_capS)
m_age_exp_O <- m_age_exp
m_age_exp_R <- lm(Salary ~ Age + Years.of.Experience, data=data_removeS)
m_age_exp_C <- lm(Salary ~ Age + Years.of.Experience, data=data_capS)
stargazer(m_age_edu_O, m_age_edu_R, m_age_edu_C, type="text",
title="Age + Education: Outlier Comparison")
##
## Age + Education: Outlier Comparison
## ============================================================================
## Dependent variable:
## --------------------------------------------
## Salary
## (1) (2) (3)
## ----------------------------------------------------------------------------
## Age 3,699.390*** 3,699.390*** 3,699.390***
## (154.096) (154.096) (154.096)
##
## Education 19,956.250*** 19,956.250*** 19,956.250***
## (1,310.244) (1,310.244) (1,310.244)
##
## Constant -41,769.540*** -41,769.540*** -41,769.540***
## (4,329.836) (4,329.836) (4,329.836)
##
## ----------------------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.586 0.586 0.586
## Adjusted R2 0.585 0.585 0.585
## Residual Std. Error (df = 1353) 33,437.730 33,437.730 33,437.730
## F Statistic (df = 2; 1353) 955.776*** 955.776*** 955.776***
## ============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(m_exp_edu_O, m_exp_edu_R, m_exp_edu_C, type="text",
title="Experience + Education: Outlier Comparison")
##
## Experience + Education: Outlier Comparison
## =========================================================================
## Dependent variable:
## -----------------------------------------
## Salary
## (1) (2) (3)
## -------------------------------------------------------------------------
## Years.of.Experience 5,859.633*** 5,859.633*** 5,859.633***
## (172.685) (172.685) (172.685)
##
## Education 14,337.230*** 14,337.230*** 14,337.230***
## (1,170.182) (1,170.182) (1,170.182)
##
## Constant 44,273.660*** 44,273.660*** 44,273.660***
## (1,692.220) (1,692.220) (1,692.220)
##
## -------------------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.681 0.681 0.681
## Adjusted R2 0.680 0.680 0.680
## Residual Std. Error (df = 1353) 29,348.620 29,348.620 29,348.620
## F Statistic (df = 2; 1353) 1,442.310*** 1,442.310*** 1,442.310***
## =========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(m_age_exp_O, m_age_exp_R, m_age_exp_C, type="text",
title="Age + Experience: Outlier Comparison")
##
## Age + Experience: Outlier Comparison
## ============================================================================
## Dependent variable:
## --------------------------------------------
## Salary
## (1) (2) (3)
## ----------------------------------------------------------------------------
## Age -2,443.303*** -2,443.303*** -2,443.303***
## (334.504) (334.504) (334.504)
##
## Years.of.Experience 10,038.050*** 10,038.050*** 10,038.050***
## (419.722) (419.722) (419.722)
##
## Constant 115,630.400*** 115,630.400*** 115,630.400***
## (8,141.734) (8,141.734) (8,141.734)
##
## ----------------------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.659 0.659 0.659
## Adjusted R2 0.658 0.658 0.658
## Residual Std. Error (df = 1353) 30,341.470 30,341.470 30,341.470
## F Statistic (df = 2; 1353) 1,305.912*** 1,305.912*** 1,305.912***
## ============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
dat <- dataset_cleaned
dat$Salary_norm <- (dat$Salary - min(dat$Salary))/(max(dat$Salary)-min(dat$Salary))
dat$Salary_z <- scale(dat$Salary)
m_norm <- lm(Salary_norm ~ Age + Education + Years.of.Experience, data=dat)
m_z <- lm(Salary_z ~ Age + Education + Years.of.Experience, data=dat)
m_all_O <- lm(Salary ~ Age + Education + Years.of.Experience, data=dataset_cleaned)
m_all_R <- lm(Salary ~ Age + Education + Years.of.Experience, data=data_removeS)
m_all_C <- lm(Salary ~ Age + Education + Years.of.Experience, data=data_capS)
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z,
type="text",
title="Scaling + Outlier Strategies Combined")
##
## Scaling + Outlier Strategies Combined
## ======================================================================================================
## Dependent variable:
## ----------------------------------------------------------------------
## Salary Salary_norm Salary_z
## (1) (2) (3) (4) (5)
## ------------------------------------------------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680*** -0.011*** -0.052***
## (315.866) (315.866) (315.866) (0.001) (0.006)
##
## Education 14,970.470*** 14,970.470*** 14,970.470*** 0.062*** 0.288***
## (1,142.364) (1,142.364) (1,142.364) (0.005) (0.022)
##
## Years.of.Experience 9,005.050*** 9,005.050*** 9,005.050*** 0.038*** 0.174***
## (403.287) (403.287) (403.287) (0.002) (0.008)
##
## Constant 108,733.600*** 108,733.600*** 108,733.600*** 0.452*** -0.082
## (7,690.070) (7,690.070) (7,690.070) (0.032) (0.148)
##
## ------------------------------------------------------------------------------------------------------
## Observations 1,356 1,356 1,356 1,356 1,356
## R2 0.697 0.697 0.697 0.697 0.697
## Adjusted R2 0.697 0.697 0.697 0.697 0.697
## Residual Std. Error (df = 1352) 28,591.080 28,591.080 28,591.080 0.119 0.551
## F Statistic (df = 3; 1352) 1,037.716*** 1,037.716*** 1,037.716*** 1,037.716*** 1,037.716***
## ======================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
m_full <- lm(Salary ~ Age + Education + Years.of.Experience + Male,
data = dataset_cleaned)
summary(m_full)
##
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience +
## Male, data = dataset_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98564 -18166 -5796 15763 77636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 107814.0 7626.4 14.137 < 2e-16 ***
## Age -2844.1 314.3 -9.048 < 2e-16 ***
## Education 15550.6 1138.6 13.657 < 2e-16 ***
## Years.of.Experience 9029.5 399.9 22.582 < 2e-16 ***
## Male 7773.3 1570.6 4.949 8.39e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28350 on 1351 degrees of freedom
## Multiple R-squared: 0.7026, Adjusted R-squared: 0.7017
## F-statistic: 797.9 on 4 and 1351 DF, p-value: < 2.2e-16
vif(m_full)
## Age Education Years.of.Experience Male
## 8.915277 1.618227 9.164269 1.032334
t_exp <- summary(m_full)$coefficients["Years.of.Experience", "t value"]
p_two <- summary(m_full)$coefficients["Years.of.Experience","Pr(>|t|)"]
p_one_gt <- ifelse(t_exp > 0, p_two/2, 1 - p_two/2)
p_one_gt
## [1] 2.343384e-96
library(car)
vif(m_all)
## Age Years.of.Experience Education Male
## 8.983097 9.179956 1.640231 1.035230
## Race_AA Race_Asian Race_Hisp Senior
## 1.494903 1.487944 1.478103 1.117492
library(stargazer)
stargazer(m_all_O, m_all_R, m_all_C,
type = "text",
title = "Outlier Comparison: Original vs Removed vs Capped",
dep.var.labels = "Salary",
column.labels = c("Original", "Removed", "Capped"),
covariate.labels = c("Age","Education","Experience"))
##
## Outlier Comparison: Original vs Removed vs Capped
## ============================================================================
## Dependent variable:
## --------------------------------------------
## Salary
## Original Removed Capped
## (1) (2) (3)
## ----------------------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680***
## (315.866) (315.866) (315.866)
##
## Education 14,970.470*** 14,970.470*** 14,970.470***
## (1,142.364) (1,142.364) (1,142.364)
##
## Experience 9,005.050*** 9,005.050*** 9,005.050***
## (403.287) (403.287) (403.287)
##
## Constant 108,733.600*** 108,733.600*** 108,733.600***
## (7,690.070) (7,690.070) (7,690.070)
##
## ----------------------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.697 0.697 0.697
## Adjusted R2 0.697 0.697 0.697
## Residual Std. Error (df = 1352) 28,591.080 28,591.080 28,591.080
## F Statistic (df = 3; 1352) 1,037.716*** 1,037.716*** 1,037.716***
## ============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z,
type = "text",
title = "Scaling Comparison",
dep.var.labels = "Salary (scaled)",
column.labels = c("Orig","RemOut","CapOut","Norm","Std"))
##
## Scaling Comparison
## ======================================================================================================
## Dependent variable:
## ----------------------------------------------------------------------
## Salary (scaled) Salary_norm Salary_z
## Orig RemOut CapOut Norm Std
## (1) (2) (3) (4) (5)
## ------------------------------------------------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680*** -0.011*** -0.052***
## (315.866) (315.866) (315.866) (0.001) (0.006)
##
## Education 14,970.470*** 14,970.470*** 14,970.470*** 0.062*** 0.288***
## (1,142.364) (1,142.364) (1,142.364) (0.005) (0.022)
##
## Years.of.Experience 9,005.050*** 9,005.050*** 9,005.050*** 0.038*** 0.174***
## (403.287) (403.287) (403.287) (0.002) (0.008)
##
## Constant 108,733.600*** 108,733.600*** 108,733.600*** 0.452*** -0.082
## (7,690.070) (7,690.070) (7,690.070) (0.032) (0.148)
##
## ------------------------------------------------------------------------------------------------------
## Observations 1,356 1,356 1,356 1,356 1,356
## R2 0.697 0.697 0.697 0.697 0.697
## Adjusted R2 0.697 0.697 0.697 0.697 0.697
## Residual Std. Error (df = 1352) 28,591.080 28,591.080 28,591.080 0.119 0.551
## F Statistic (df = 3; 1352) 1,037.716*** 1,037.716*** 1,037.716*** 1,037.716*** 1,037.716***
## ======================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Original model (O)
m_all_O <- lm(Salary ~ Age + Education + Years.of.Experience,
data = dataset_cleaned)
# Removed-outlier model (R)
m_all_R <- lm(Salary ~ Age + Education + Years.of.Experience,
data = data_removeS)
# Capped-outlier model (C)
m_all_C <- lm(Salary ~ Age + Education + Years.of.Experience,
data = data_capS)
summary(m_all_O)
##
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience,
## data = dataset_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104163 -18677 -4894 16549 80997
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108733.6 7690.1 14.139 <2e-16 ***
## Age -2710.7 315.9 -8.582 <2e-16 ***
## Education 14970.5 1142.4 13.105 <2e-16 ***
## Years.of.Experience 9005.0 403.3 22.329 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28590 on 1352 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6965
## F-statistic: 1038 on 3 and 1352 DF, p-value: < 2.2e-16
summary(m_all_R)
##
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience,
## data = data_removeS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104163 -18677 -4894 16549 80997
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108733.6 7690.1 14.139 <2e-16 ***
## Age -2710.7 315.9 -8.582 <2e-16 ***
## Education 14970.5 1142.4 13.105 <2e-16 ***
## Years.of.Experience 9005.0 403.3 22.329 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28590 on 1352 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6965
## F-statistic: 1038 on 3 and 1352 DF, p-value: < 2.2e-16
summary(m_all_C)
##
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience,
## data = data_capS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104163 -18677 -4894 16549 80997
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108733.6 7690.1 14.139 <2e-16 ***
## Age -2710.7 315.9 -8.582 <2e-16 ***
## Education 14970.5 1142.4 13.105 <2e-16 ***
## Years.of.Experience 9005.0 403.3 22.329 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28590 on 1352 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6965
## F-statistic: 1038 on 3 and 1352 DF, p-value: < 2.2e-16
stargazer(m_all_O, m_all_R, m_all_C, type = "text")
##
## ============================================================================
## Dependent variable:
## --------------------------------------------
## Salary
## (1) (2) (3)
## ----------------------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680***
## (315.866) (315.866) (315.866)
##
## Education 14,970.470*** 14,970.470*** 14,970.470***
## (1,142.364) (1,142.364) (1,142.364)
##
## Years.of.Experience 9,005.050*** 9,005.050*** 9,005.050***
## (403.287) (403.287) (403.287)
##
## Constant 108,733.600*** 108,733.600*** 108,733.600***
## (7,690.070) (7,690.070) (7,690.070)
##
## ----------------------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.697 0.697 0.697
## Adjusted R2 0.697 0.697 0.697
## Residual Std. Error (df = 1352) 28,591.080 28,591.080 28,591.080
## F Statistic (df = 3; 1352) 1,037.716*** 1,037.716*** 1,037.716***
## ============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z, type = "text")
##
## ======================================================================================================
## Dependent variable:
## ----------------------------------------------------------------------
## Salary Salary_norm Salary_z
## (1) (2) (3) (4) (5)
## ------------------------------------------------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680*** -0.011*** -0.052***
## (315.866) (315.866) (315.866) (0.001) (0.006)
##
## Education 14,970.470*** 14,970.470*** 14,970.470*** 0.062*** 0.288***
## (1,142.364) (1,142.364) (1,142.364) (0.005) (0.022)
##
## Years.of.Experience 9,005.050*** 9,005.050*** 9,005.050*** 0.038*** 0.174***
## (403.287) (403.287) (403.287) (0.002) (0.008)
##
## Constant 108,733.600*** 108,733.600*** 108,733.600*** 0.452*** -0.082
## (7,690.070) (7,690.070) (7,690.070) (0.032) (0.148)
##
## ------------------------------------------------------------------------------------------------------
## Observations 1,356 1,356 1,356 1,356 1,356
## R2 0.697 0.697 0.697 0.697 0.697
## Adjusted R2 0.697 0.697 0.697 0.697 0.697
## Residual Std. Error (df = 1352) 28,591.080 28,591.080 28,591.080 0.119 0.551
## F Statistic (df = 3; 1352) 1,037.716*** 1,037.716*** 1,037.716*** 1,037.716*** 1,037.716***
## ======================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Use capped salary dataset if you created it earlier:
dat <- dataset_cleaned # or data_capS if you used outlier capping
# Normalization (0–1)
min_S <- min(dat$Salary, na.rm = TRUE)
max_S <- max(dat$Salary, na.rm = TRUE)
dat$Salary_norm <- (dat$Salary - min_S) / (max_S - min_S)
# Standardization (z-score)
mean_S <- mean(dat$Salary, na.rm = TRUE)
sd_S <- sd(dat$Salary, na.rm = TRUE)
dat$Salary_z <- (dat$Salary - mean_S) / sd_S
# Normalized model
m_norm <- lm(Salary_norm ~ Age + Education + Years.of.Experience,
data = dat)
# Standardized model (THIS IS m_z)
m_z <- lm(Salary_z ~ Age + Education + Years.of.Experience,
data = dat)
summary(m_norm)
##
## Call:
## lm(formula = Salary_norm ~ Age + Education + Years.of.Experience,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43465 -0.07794 -0.02042 0.06905 0.33798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.452258 0.032089 14.094 <2e-16 ***
## Age -0.011311 0.001318 -8.582 <2e-16 ***
## Education 0.062468 0.004767 13.105 <2e-16 ***
## Years.of.Experience 0.037576 0.001683 22.329 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1193 on 1352 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6965
## F-statistic: 1038 on 3 and 1352 DF, p-value: < 2.2e-16
summary(m_z)
##
## Call:
## lm(formula = Salary_z ~ Age + Education + Years.of.Experience,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0069 -0.3599 -0.0943 0.3189 1.5606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.082178 0.148167 -0.555 0.579
## Age -0.052228 0.006086 -8.582 <2e-16 ***
## Education 0.288441 0.022010 13.105 <2e-16 ***
## Years.of.Experience 0.173503 0.007770 22.329 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5509 on 1352 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6965
## F-statistic: 1038 on 3 and 1352 DF, p-value: < 2.2e-16
best_model <- m_z # standardized-salary model from earlier
# Residuals vs Fitted
plot(fitted(best_model), resid(best_model),
main = "Residuals vs Fitted (Standardized Salary)",
xlab = "Fitted values", ylab = "Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)
# Histogram + QQ plot
par(mfrow = c(1,2))
hist(resid(best_model), breaks = 20, col = "gray",
main = "Residuals Histogram", xlab = "Residuals")
qqnorm(resid(best_model))
qqline(resid(best_model), col = "red")
par(mfrow = c(1,1))
# Compare outlier strategies – multiple regression
stargazer(m_all_O, m_all_R, m_all_C,
type = "text",
title = "Outliers on Salary: Original vs Removed vs Capped",
dep.var.labels = "Salary (USD)",
column.labels = c("Original", "Removed outliers", "Capped outliers"),
covariate.labels = c("Age", "Education", "Experience"),
digits = 3,
omit.stat = c("f", "ser", "adj.rsq"))
##
## Outliers on Salary: Original vs Removed vs Capped
## ============================================================
## Dependent variable:
## -----------------------------------------------
## Salary (USD)
## Original Removed outliers Capped outliers
## (1) (2) (3)
## ------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680***
## (315.866) (315.866) (315.866)
##
## Education 14,970.470*** 14,970.470*** 14,970.470***
## (1,142.364) (1,142.364) (1,142.364)
##
## Experience 9,005.050*** 9,005.050*** 9,005.050***
## (403.287) (403.287) (403.287)
##
## Constant 108,733.600*** 108,733.600*** 108,733.600***
## (7,690.070) (7,690.070) (7,690.070)
##
## ------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.697 0.697 0.697
## ============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Compare scaling choices – original vs normalized vs standardized salary
m_all_cap <- lm(Salary ~ Age + Education + Years.of.Experience,
data = dat)
stargazer(m_all_cap, m_norm, m_z,
type = "text",
title = "Scaling Salary: Original vs Normalized vs Standardized",
dep.var.labels = "Salary (scaled in different ways)",
column.labels = c("Original (capped)", "Normalized (0-1)", "Standardized (z)"),
covariate.labels = c("Age", "Education", "Experience"),
digits = 3,
omit.stat = c("f", "ser", "adj.rsq"))
##
## Scaling Salary: Original vs Normalized vs Standardized
## ================================================================================
## Dependent variable:
## -------------------------------------------------------------------
## Salary (scaled in different ways) Salary_norm Salary_z
## Original (capped) Normalized (0-1) Standardized (z)
## (1) (2) (3)
## --------------------------------------------------------------------------------
## Age -2,710.680*** -0.011*** -0.052***
## (315.866) (0.001) (0.006)
##
## Education 14,970.470*** 0.062*** 0.288***
## (1,142.364) (0.005) (0.022)
##
## Experience 9,005.050*** 0.038*** 0.174***
## (403.287) (0.002) (0.008)
##
## Constant 108,733.600*** 0.452*** -0.082
## (7,690.070) (0.032) (0.148)
##
## --------------------------------------------------------------------------------
## Observations 1,356 1,356 1,356
## R2 0.697 0.697 0.697
## ================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Combined table
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z,
type = "text",
title = "All models: Salary Outliers and Scaling",
dep.var.labels = "Salary (original and scaled)",
column.labels = c("Orig", "RemOut", "CapOut", "Norm", "Std"),
digits = 3,
keep.stat = c("n","rsq","adj.rsq"),
no.space = TRUE)
##
## All models: Salary Outliers and Scaling
## ======================================================================================
## Dependent variable:
## ------------------------------------------------------------------
## Salary (original and scaled) Salary_norm Salary_z
## Orig RemOut CapOut Norm Std
## (1) (2) (3) (4) (5)
## --------------------------------------------------------------------------------------
## Age -2,710.680*** -2,710.680*** -2,710.680*** -0.011*** -0.052***
## (315.866) (315.866) (315.866) (0.001) (0.006)
## Education 14,970.470*** 14,970.470*** 14,970.470*** 0.062*** 0.288***
## (1,142.364) (1,142.364) (1,142.364) (0.005) (0.022)
## Years.of.Experience 9,005.050*** 9,005.050*** 9,005.050*** 0.038*** 0.174***
## (403.287) (403.287) (403.287) (0.002) (0.008)
## Constant 108,733.600*** 108,733.600*** 108,733.600*** 0.452*** -0.082
## (7,690.070) (7,690.070) (7,690.070) (0.032) (0.148)
## --------------------------------------------------------------------------------------
## Observations 1,356 1,356 1,356 1,356 1,356
## R2 0.697 0.697 0.697 0.697 0.697
## Adjusted R2 0.697 0.697 0.697 0.697 0.697
## ======================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
summary(m_full)
##
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience +
## Male, data = dataset_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98564 -18166 -5796 15763 77636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 107814.0 7626.4 14.137 < 2e-16 ***
## Age -2844.1 314.3 -9.048 < 2e-16 ***
## Education 15550.6 1138.6 13.657 < 2e-16 ***
## Years.of.Experience 9029.5 399.9 22.582 < 2e-16 ***
## Male 7773.3 1570.6 4.949 8.39e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28350 on 1351 degrees of freedom
## Multiple R-squared: 0.7026, Adjusted R-squared: 0.7017
## F-statistic: 797.9 on 4 and 1351 DF, p-value: < 2.2e-16
# Two-sided test for Experience coefficient
t_exp <- summary(m_full)$coefficients["Years.of.Experience", "t value"]
p_twos <- summary(m_full)$coefficients["Years.of.Experience", "Pr(>|t|)"]
# One-sided test H0: beta_exp <= 0 vs H1: > 0
p_one_gt <- ifelse(t_exp > 0, p_twos / 2, 1 - p_two_)
library(visreg)
visreg(m_all, "Years.of.Experience")
visreg(m_all, "Age")
visreg(m_all, "Education")
############################################################ #
HETEROSKEDASTICITY TESTS AND ROBUST STANDARD ERRORS
############################################################
# Breusch–Pagan test for main model with core predictors
bptest(m_full)
##
## studentized Breusch-Pagan test
##
## data: m_full
## BP = 128.16, df = 4, p-value < 2.2e-16
# If you want, also for the richer model m_all
bptest(m_all)
##
## studentized Breusch-Pagan test
##
## data: m_all
## BP = 134.35, df = 8, p-value < 2.2e-16
# Robust (HC3) standard errors for m_full
coeftest(m_full, vcov = vcovHC(m_full, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 107814.00 8978.19 12.0084 < 2.2e-16 ***
## Age -2844.07 361.93 -7.8581 7.910e-15 ***
## Education 15550.56 1241.09 12.5298 < 2.2e-16 ***
## Years.of.Experience 9029.51 533.99 16.9094 < 2.2e-16 ***
## Male 7773.33 1589.51 4.8904 1.127e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Robust (HC3) standard errors for m_all
coeftest(m_all, vcov = vcovHC(m_all, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104443.67 8887.92 11.7512 < 2.2e-16 ***
## Age -2751.75 353.61 -7.7818 1.415e-14 ***
## Years.of.Experience 9035.40 529.30 17.0704 < 2.2e-16 ***
## Education 15863.34 1231.82 12.8780 < 2.2e-16 ***
## Male 7949.75 1585.59 5.0138 6.048e-07 ***
## Race_AA -723.09 2120.55 -0.3410 0.733162
## Race_Asian 3381.86 2173.03 1.5563 0.119876
## Race_Hisp 119.56 2259.63 0.0529 0.957812
## Senior -7818.12 2145.12 -3.6446 0.000278 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Choose which model you treat as final:
final_lm <- m_full # or m_all if that is your preferred one
# Leverage values
lev <- hatvalues(final_lm)
plot(lev,
main = "Leverage values – final salary model",
ylab = "Leverage", xlab = "Observation")
abline(h = 2 * length(coef(final_lm)) / nrow(dataset_cleaned),
col = "red", lty = 2)
# Cook's distance
cd <- cooks.distance(final_lm)
plot(cd,
main = "Cook's distance – final salary model",
ylab = "Cook's D", xlab = "Observation")
abline(h = 4 / (nrow(dataset_cleaned) - length(coef(final_lm)) - 1),
col = "red", lty = 2)
# Identify potentially influential points
which(cd > 4 / (nrow(dataset_cleaned) - length(coef(final_lm)) - 1))
## 88 96 117 134 259 428 435 439 462 530 537 542 554 566 570 586
## 21 22 30 34 61 82 83 84 91 101 102 103 107 108 109 113
## 657 666 671 696 703 712 716 721 741 757 777 780 790 807 818 876
## 121 122 123 129 133 134 135 137 143 145 149 151 153 157 159 171
## 887 893 903 907 926 932 936 937 942 946 951 1030 1040 1060 1107 1111
## 173 174 176 178 179 180 181 182 184 186 189 205 208 210 221 222
## 1126 1163 1175 1187 1200 1239 1276 1318 1368 1414 1429 1432 1500 1637 1639 1641
## 224 228 230 235 237 241 246 253 265 277 280 281 292 318 319 320
## 1730 1749 2379 2411 2440 2454 2469 2486 2514 2529 2550 2559 2573 2593 2610 2616
## 335 338 468 472 480 483 485 488 495 498 502 504 509 516 519 521
## 2639 2792 2832 2840 2874 2882 2894 2919 2927 2935 2992 3010 3017 3104 4545 4597
## 527 565 571 575 581 586 588 597 599 602 612 617 618 634 929 939
## 4616 5508 5657 5945 6140 6351 6393 6539 6635
## 944 1140 1167 1230 1265 1296 1303 1331 1350
m_full_quad <- lm(Salary ~ Age + I(Age^2) +
Education +
Years.of.Experience + I(Years.of.Experience^2),
data = dataset_cleaned)
summary(m_full_quad)
##
## Call:
## lm(formula = Salary ~ Age + I(Age^2) + Education + Years.of.Experience +
## I(Years.of.Experience^2), data = dataset_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100171 -16903 -4746 14488 75115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 335695.92 23496.87 14.29 < 2e-16 ***
## Age -16914.58 1367.84 -12.37 < 2e-16 ***
## I(Age^2) 176.16 17.27 10.20 < 2e-16 ***
## Education 6930.50 1056.51 6.56 7.66e-11 ***
## Years.of.Experience 21943.80 751.04 29.22 < 2e-16 ***
## I(Years.of.Experience^2) -497.30 25.29 -19.67 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24720 on 1350 degrees of freedom
## Multiple R-squared: 0.774, Adjusted R-squared: 0.7732
## F-statistic: 924.8 on 5 and 1350 DF, p-value: < 2.2e-16
# Compare linear vs quadratic specification
anova(m_full, m_full_quad)
## Analysis of Variance Table
##
## Model 1: Salary ~ Age + Education + Years.of.Experience + Male
## Model 2: Salary ~ Age + I(Age^2) + Education + Years.of.Experience + I(Years.of.Experience^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1351 1.0855e+12
## 2 1350 8.2480e+11 1 2.6071e+11 426.73 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m_all_O <- lm(Salary ~ Age + Education + Years.of.Experience, data = dataset_cleaned)
m_all_R <- lm(Salary ~ Age + Education + Years.of.Experience, data = data_removeS)
m_all_C <- lm(Salary ~ Age + Education + Years.of.Experience, data = data_capS)
# Original
coeftest(m_all_O, vcov = vcovHC(m_all_O, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108733.62 9151.62 11.8814 < 2.2e-16 ***
## Age -2710.68 371.28 -7.3008 4.865e-13 ***
## Education 14970.47 1231.27 12.1586 < 2.2e-16 ***
## Years.of.Experience 9005.05 547.66 16.4429 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Removed outliers
coeftest(m_all_R, vcov = vcovHC(m_all_R, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108733.62 9151.62 11.8814 < 2.2e-16 ***
## Age -2710.68 371.28 -7.3008 4.865e-13 ***
## Education 14970.47 1231.27 12.1586 < 2.2e-16 ***
## Years.of.Experience 9005.05 547.66 16.4429 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Capped outliers
coeftest(m_all_C, vcov = vcovHC(m_all_C, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108733.62 9151.62 11.8814 < 2.2e-16 ***
## Age -2710.68 371.28 -7.3008 4.865e-13 ***
## Education 14970.47 1231.27 12.1586 < 2.2e-16 ***
## Years.of.Experience 9005.05 547.66 16.4429 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_all)
##
## Call:
## lm(formula = Salary ~ Age + Years.of.Experience + Education +
## Male + Race_AA + Race_Asian + Race_Hisp + Senior, data = dataset_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101305 -18083 -6055 15844 78239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104443.7 7731.4 13.509 < 2e-16 ***
## Age -2751.8 314.2 -8.757 < 2e-16 ***
## Years.of.Experience 9035.4 398.6 22.670 < 2e-16 ***
## Education 15863.3 1141.7 13.895 < 2e-16 ***
## Male 7949.8 1566.4 5.075 4.41e-07 ***
## Race_AA -723.1 2138.0 -0.338 0.73526
## Race_Asian 3381.9 2168.2 1.560 0.11906
## Race_Hisp 119.6 2190.3 0.055 0.95648
## Senior -7818.1 2411.4 -3.242 0.00122 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28230 on 1347 degrees of freedom
## Multiple R-squared: 0.7059, Adjusted R-squared: 0.7041
## F-statistic: 404.1 on 8 and 1347 DF, p-value: < 2.2e-16
confint(m_all, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 89276.7408 119610.607
## Age -3368.1704 -2135.338
## Years.of.Experience 8253.5115 9817.279
## Education 13623.7230 18102.957
## Male 4876.9326 11022.575
## Race_AA -4917.3223 3471.133
## Race_Asian -871.5987 7635.310
## Race_Hisp -4177.2710 4416.384
## Senior -12548.6322 -3087.598