rm(list = ls())
graphics.off()

setwd("/Users/iamarmenain2001/Desktop/UNI/B3/ECONOMETRICS R/Project")

dataset <- read.csv("Salary.csv", sep = ",", dec = ".", header = TRUE)

FILTER + BASIC SETUP

dataset_cleaned <- subset(dataset, Country == "USA")
colnames(dataset_cleaned)[3] <- "Education"
dataset_cleaned <- dataset_cleaned[, !names(dataset_cleaned) %in% c("Job.Title")]

str(dataset_cleaned)
## 'data.frame':    1356 obs. of  8 variables:
##  $ Age                : num  28 36 52 29 42 40 33 41 29 46 ...
##  $ Gender             : chr  "Female" "Female" "Male" "Male" ...
##  $ Education          : int  2 1 2 1 2 2 2 2 2 3 ...
##  $ Years.of.Experience: num  3 7 20 2 12 14 7 13 3 20 ...
##  $ Salary             : num  65000 60000 200000 55000 120000 130000 90000 140000 75000 170000 ...
##  $ Country            : chr  "USA" "USA" "USA" "USA" ...
##  $ Race               : chr  "Hispanic" "Hispanic" "Asian" "Hispanic" ...
##  $ Senior             : int  0 0 0 0 0 0 0 0 0 1 ...
head(dataset_cleaned)
##    Age Gender Education Years.of.Experience Salary Country             Race
## 2   28 Female         2                   3  65000     USA         Hispanic
## 4   36 Female         1                   7  60000     USA         Hispanic
## 5   52   Male         2                  20 200000     USA            Asian
## 6   29   Male         1                   2  55000     USA         Hispanic
## 7   42 Female         2                  12 120000     USA            Asian
## 14  40 Female         2                  14 130000     USA African American
##    Senior
## 2       0
## 4       0
## 5       0
## 6       0
## 7       0
## 14      0

PACKAGES

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effectsize)
library(report)
library(parameters)
library(visreg)
library(interactions)
library(corrplot)
## corrplot 0.95 loaded

CREATE NUMERIC VARIABLES

# Male dummy
dataset_cleaned$Male <- ifelse(dataset_cleaned$Gender == "Male", 1, 0)

# Race dummies (White = reference: 0 0 0)
dataset_cleaned$Race_AA    <- ifelse(dataset_cleaned$Race == "African American", 1, 0)
dataset_cleaned$Race_Asian <- ifelse(dataset_cleaned$Race == "Asian",            1, 0)
dataset_cleaned$Race_Hisp  <- ifelse(dataset_cleaned$Race == "Hispanic",         1, 0)

CORRELATIONS

num_only <- dataset_cleaned[sapply(dataset_cleaned, is.numeric)]
corr <- cor(num_only, use = "pairwise.complete.obs")

corrplot(corr, method = "number")

corrplot(corr, method = "ellipse", type = "upper", tl.srt = 45)

colSums(is.na(dataset_cleaned))
##                 Age              Gender           Education Years.of.Experience 
##                   0                   0                   0                   0 
##              Salary             Country                Race              Senior 
##                   0                   0                   0                   0 
##                Male             Race_AA          Race_Asian           Race_Hisp 
##                   0                   0                   0                   0

SIMPLE LINEAR MODELS

m_age    <- lm(Salary ~ Age,                  data = dataset_cleaned)
m_exp    <- lm(Salary ~ Years.of.Experience,  data = dataset_cleaned)
m_edu    <- lm(Salary ~ Education,            data = dataset_cleaned)
m_gender <- lm(Salary ~ Male,                 data = dataset_cleaned)
m_race <- lm(Salary ~ Race_AA + Race_Asian + Race_Hisp,
             data = dataset_cleaned)
m_senior <- lm(Salary ~ Senior,               data = dataset_cleaned)

MULTIPLE REGRESSION — FULL

m_all <- lm(Salary ~ Age + Years.of.Experience + Education +
              Male + Race_AA + Race_Asian + Race_Hisp + Senior,
            data = dataset_cleaned)

summary(m_all)
## 
## Call:
## lm(formula = Salary ~ Age + Years.of.Experience + Education + 
##     Male + Race_AA + Race_Asian + Race_Hisp + Senior, data = dataset_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101305  -18083   -6055   15844   78239 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         104443.7     7731.4  13.509  < 2e-16 ***
## Age                  -2751.8      314.2  -8.757  < 2e-16 ***
## Years.of.Experience   9035.4      398.6  22.670  < 2e-16 ***
## Education            15863.3     1141.7  13.895  < 2e-16 ***
## Male                  7949.8     1566.4   5.075 4.41e-07 ***
## Race_AA               -723.1     2138.0  -0.338  0.73526    
## Race_Asian            3381.9     2168.2   1.560  0.11906    
## Race_Hisp              119.6     2190.3   0.055  0.95648    
## Senior               -7818.1     2411.4  -3.242  0.00122 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28230 on 1347 degrees of freedom
## Multiple R-squared:  0.7059, Adjusted R-squared:  0.7041 
## F-statistic: 404.1 on 8 and 1347 DF,  p-value: < 2.2e-16

TWO-PREDICTOR MODELS

m_age_exp    <- lm(Salary ~ Age + Years.of.Experience,      data = dataset_cleaned)
m_age_edu    <- lm(Salary ~ Age + Education,                 data = dataset_cleaned)
m_exp_edu    <- lm(Salary ~ Years.of.Experience + Education, data = dataset_cleaned)
m_exp_gender <- lm(Salary ~ Years.of.Experience + Male,      data = dataset_cleaned)
m_edu_race <- lm(Salary ~ Education + Race_AA + Race_Asian + Race_Hisp,
                 data = dataset_cleaned)
m_gender_race <- lm(Salary ~ Male + Race_AA + Race_Asian + Race_Hisp,
                    data = dataset_cleaned)
m_senior_exp <- lm(Salary ~ Senior + Years.of.Experience,    data = dataset_cleaned)

PLOTS

par(mfrow = c(1,3))
plot(dataset_cleaned$Age, dataset_cleaned$Salary,
     main = "Salary vs Age", pch=8, col="blue")
abline(m_age)

plot(dataset_cleaned$Education, dataset_cleaned$Salary,
     main = "Salary vs Education", pch=8, col="blue")
abline(m_edu)

plot(dataset_cleaned$Years.of.Experience, dataset_cleaned$Salary,
     main = "Salary vs Experience", pch=8, col="blue")
abline(m_exp)

############################################################ # STARGAZER — SIMPLE + FULL ############################################################

stargazer(m_exp, m_age, m_edu, m_all, type="text")
## 
## ===============================================================================================================================
##                                                                 Dependent variable:                                            
##                     -----------------------------------------------------------------------------------------------------------
##                                                                       Salary                                                   
##                                 (1)                         (2)                        (3)                       (4)           
## -------------------------------------------------------------------------------------------------------------------------------
## Years.of.Experience        7,151.466***                                                                     9,035.395***       
##                              (144.093)                                                                        (398.569)        
##                                                                                                                                
## Age                                                    5,089.125***                                         -2,751.754***      
##                                                          (134.353)                                            (314.221)        
##                                                                                                                                
## Education                                                                         38,581.560***             15,863.340***      
##                                                                                    (1,260.373)               (1,141.656)       
##                                                                                                                                
## Male                                                                                                        7,949.754***       
##                                                                                                              (1,566.386)       
##                                                                                                                                
## Race_AA                                                                                                       -723.095         
##                                                                                                              (2,138.029)       
##                                                                                                                                
## Race_Asian                                                                                                    3,381.855        
##                                                                                                              (2,168.220)       
##                                                                                                                                
## Race_Hisp                                                                                                      119.557         
##                                                                                                              (2,190.329)       
##                                                                                                                                
## Senior                                                                                                      -7,818.115***      
##                                                                                                              (2,411.405)       
##                                                                                                                                
## Constant                   57,021.130***              -56,139.580***              51,484.500***            104,443.700***      
##                             (1,406.154)                 (4,572.052)                (2,283.227)               (7,731.420)       
##                                                                                                                                
## -------------------------------------------------------------------------------------------------------------------------------
## Observations                   1,356                       1,356                      1,356                     1,356          
## R2                             0.645                       0.514                      0.409                     0.706          
## Adjusted R2                    0.645                       0.514                      0.409                     0.704          
## Residual Std. Error   30,922.480 (df = 1354)      36,177.590 (df = 1354)     39,914.580 (df = 1354)    28,230.290 (df = 1347)  
## F Statistic         2,463.231*** (df = 1; 1354) 1,434.804*** (df = 1; 1354) 937.048*** (df = 1; 1354) 404.126*** (df = 8; 1347)
## ===============================================================================================================================
## Note:                                                                                               *p<0.1; **p<0.05; ***p<0.01

RESIDUAL CHECKS

par(mfrow = c(2,2))
plot(m_exp$residuals);    abline(h=0, col="red")
plot(m_age$residuals);    abline(h=0, col="red")
plot(m_edu$residuals);    abline(h=0, col="red")
plot(m_all$residuals);    abline(h=0, col="red")

############################################################ # OUTLIER DETECTION AND CLEANING FOR SALARY ############################################################

# Boxplot (visual check)
boxplot(dataset_cleaned$Salary,
        main = "Boxplot of Salary",
        ylab = "Salary",
        col = "lightblue")

# Compute IQR bounds for Salary
Q1_S  <- quantile(dataset_cleaned$Salary, 0.25, na.rm = TRUE)
Q3_S  <- quantile(dataset_cleaned$Salary, 0.75, na.rm = TRUE)
IQR_S <- Q3_S - Q1_S

LOW_S <- Q1_S - 1.5 * IQR_S
UPP_S <- Q3_S + 1.5 * IQR_S

# Identify outliers
is_out_S <- dataset_cleaned$Salary < LOW_S | dataset_cleaned$Salary > UPP_S
table(is_out_S)
## is_out_S
## FALSE 
##  1356
# CREATE REMOVED-OUTLIER DATASET
data_removeS <- dataset_cleaned[!is_out_S, ]

# CREATE CAPPED-OUTLIER DATASET
data_capS <- dataset_cleaned
data_capS$Salary[data_capS$Salary < LOW_S] <- LOW_S
data_capS$Salary[data_capS$Salary > UPP_S] <- UPP_S

# Check dimensions
nrow(dataset_cleaned)
## [1] 1356
nrow(data_removeS)
## [1] 1356
nrow(data_capS)
## [1] 1356

OUTLIER DETECTION AND CLEANING FOR AGE

# Boxplot (visual check)
boxplot(dataset_cleaned$Age,
        main = "Boxplot of Age",
        ylab = "Age",
        col = "lightblue")

# Compute IQR bounds for Age
Q1_A  <- quantile(dataset_cleaned$Age, 0.25, na.rm = TRUE)
Q3_A  <- quantile(dataset_cleaned$Age, 0.75, na.rm = TRUE)
IQR_A <- Q3_A - Q1_A

LOW_A <- Q1_A - 1.5 * IQR_A
UPP_A <- Q3_A + 1.5 * IQR_A

# Identify outliers
is_out_A <- dataset_cleaned$Age < LOW_A | dataset_cleaned$Age > UPP_A
table(is_out_A)
## is_out_A
## FALSE  TRUE 
##  1322    34
# CREATE REMOVED-OUTLIER DATASET
data_removeA <- dataset_cleaned[!is_out_A, ]

# CREATE CAPPED-OUTLIER DATASET
data_capA <- dataset_cleaned
data_capA$Age[data_capA$Age < LOW_A] <- LOW_A
data_capA$Age[data_capA$Age > UPP_A] <- UPP_A

# Check dimensions
nrow(dataset_cleaned)
## [1] 1356
nrow(data_removeA)
## [1] 1322
nrow(data_capA)
## [1] 1356

OUTLIER DETECTION AND CLEANING FOR EXPERIENCE

# Boxplot (visual check)
boxplot(dataset_cleaned$Years.of.Experience,
        main = "Boxplot of Years of Experience",
        ylab = "Years of Experience",
        col = "lightblue")

# Compute IQR bounds for Experience
Q1_E  <- quantile(dataset_cleaned$Years.of.Experience, 0.25, na.rm = TRUE)
Q3_E  <- quantile(dataset_cleaned$Years.of.Experience, 0.75, na.rm = TRUE)
IQR_E <- Q3_E - Q1_E

LOW_E <- Q1_E - 1.5 * IQR_E
UPP_E <- Q3_E + 1.5 * IQR_E

# Identify outliers
is_out_E <- dataset_cleaned$Years.of.Experience < LOW_E | dataset_cleaned$Years.of.Experience > UPP_E
table(is_out_E)
## is_out_E
## FALSE  TRUE 
##  1334    22
# CREATE REMOVED-OUTLIER DATASET
data_removeE <- dataset_cleaned[!is_out_E, ]

# CREATE CAPPED-OUTLIER DATASET
data_capE <- dataset_cleaned
data_capE$Years.of.Experience[data_capE$Years.of.Experience < LOW_E] <- LOW_E
data_capE$Years.of.Experience[data_capE$Years.of.Experience > UPP_E] <- UPP_E

# Check dimensions
nrow(dataset_cleaned)
## [1] 1356
nrow(data_removeE)
## [1] 1334
nrow(data_capE)
## [1] 1356

OUTLIER DETECTION — SALARY / AGE / EXPERIENCE

# Unified function to produce removed + capped datasets
trim_outliers <- function(df, var){
  Q1  <- quantile(df[[var]], 0.25, na.rm=TRUE)
  Q3  <- quantile(df[[var]], 0.75, na.rm=TRUE)
  IQRv <- Q3 - Q1
  
  LOW <- Q1 - 1.5*IQRv
  UPP <- Q3 + 1.5*IQRv
  
  is_out <- df[[var]] < LOW | df[[var]] > UPP
  
  df_remove <- df[!is_out, ]
  df_cap    <- df
  df_cap[df_cap[[var]] < LOW, var] <- LOW
  df_cap[df_cap[[var]] > UPP, var] <- UPP
  
  return(list(remove=df_remove, cap=df_cap))
}

out_S <- trim_outliers(dataset_cleaned, "Salary")
out_A <- trim_outliers(dataset_cleaned, "Age")
out_E <- trim_outliers(dataset_cleaned, "Years.of.Experience")

data_removeS <- out_S$remove
data_capS    <- out_S$cap

TWO-PREDICTOR MODELS — ORIGINAL / REMOVED / CAPPED

m_age_edu_O <- m_age_edu
m_age_edu_R <- lm(Salary ~ Age + Education, data=data_removeS)
m_age_edu_C <- lm(Salary ~ Age + Education, data=data_capS)

m_exp_edu_O <- m_exp_edu
m_exp_edu_R <- lm(Salary ~ Years.of.Experience + Education, data=data_removeS)
m_exp_edu_C <- lm(Salary ~ Years.of.Experience + Education, data=data_capS)

m_age_exp_O <- m_age_exp
m_age_exp_R <- lm(Salary ~ Age + Years.of.Experience, data=data_removeS)
m_age_exp_C <- lm(Salary ~ Age + Years.of.Experience, data=data_capS)

STARGAZER — TWO-PREDICTOR OUTLIER COMPARISONS

stargazer(m_age_edu_O, m_age_edu_R, m_age_edu_C, type="text",
          title="Age + Education: Outlier Comparison")
## 
## Age + Education: Outlier Comparison
## ============================================================================
##                                             Dependent variable:             
##                                 --------------------------------------------
##                                                    Salary                   
##                                      (1)            (2)            (3)      
## ----------------------------------------------------------------------------
## Age                              3,699.390***   3,699.390***   3,699.390*** 
##                                   (154.096)      (154.096)      (154.096)   
##                                                                             
## Education                       19,956.250***  19,956.250***  19,956.250*** 
##                                  (1,310.244)    (1,310.244)    (1,310.244)  
##                                                                             
## Constant                        -41,769.540*** -41,769.540*** -41,769.540***
##                                  (4,329.836)    (4,329.836)    (4,329.836)  
##                                                                             
## ----------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356     
## R2                                  0.586          0.586          0.586     
## Adjusted R2                         0.585          0.585          0.585     
## Residual Std. Error (df = 1353)   33,437.730     33,437.730     33,437.730  
## F Statistic (df = 2; 1353)        955.776***     955.776***     955.776***  
## ============================================================================
## Note:                                            *p<0.1; **p<0.05; ***p<0.01
stargazer(m_exp_edu_O, m_exp_edu_R, m_exp_edu_C, type="text",
          title="Experience + Education: Outlier Comparison")
## 
## Experience + Education: Outlier Comparison
## =========================================================================
##                                            Dependent variable:           
##                                 -----------------------------------------
##                                                  Salary                  
##                                      (1)           (2)           (3)     
## -------------------------------------------------------------------------
## Years.of.Experience             5,859.633***  5,859.633***  5,859.633*** 
##                                   (172.685)     (172.685)     (172.685)  
##                                                                          
## Education                       14,337.230*** 14,337.230*** 14,337.230***
##                                  (1,170.182)   (1,170.182)   (1,170.182) 
##                                                                          
## Constant                        44,273.660*** 44,273.660*** 44,273.660***
##                                  (1,692.220)   (1,692.220)   (1,692.220) 
##                                                                          
## -------------------------------------------------------------------------
## Observations                        1,356         1,356         1,356    
## R2                                  0.681         0.681         0.681    
## Adjusted R2                         0.680         0.680         0.680    
## Residual Std. Error (df = 1353)  29,348.620    29,348.620    29,348.620  
## F Statistic (df = 2; 1353)      1,442.310***  1,442.310***  1,442.310*** 
## =========================================================================
## Note:                                         *p<0.1; **p<0.05; ***p<0.01
stargazer(m_age_exp_O, m_age_exp_R, m_age_exp_C, type="text",
          title="Age + Experience: Outlier Comparison")
## 
## Age + Experience: Outlier Comparison
## ============================================================================
##                                             Dependent variable:             
##                                 --------------------------------------------
##                                                    Salary                   
##                                      (1)            (2)            (3)      
## ----------------------------------------------------------------------------
## Age                             -2,443.303***  -2,443.303***  -2,443.303*** 
##                                   (334.504)      (334.504)      (334.504)   
##                                                                             
## Years.of.Experience             10,038.050***  10,038.050***  10,038.050*** 
##                                   (419.722)      (419.722)      (419.722)   
##                                                                             
## Constant                        115,630.400*** 115,630.400*** 115,630.400***
##                                  (8,141.734)    (8,141.734)    (8,141.734)  
##                                                                             
## ----------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356     
## R2                                  0.659          0.659          0.659     
## Adjusted R2                         0.658          0.658          0.658     
## Residual Std. Error (df = 1353)   30,341.470     30,341.470     30,341.470  
## F Statistic (df = 2; 1353)       1,305.912***   1,305.912***   1,305.912*** 
## ============================================================================
## Note:                                            *p<0.1; **p<0.05; ***p<0.01

SCALING — NORMALIZATION + STANDARDIZATION

dat <- dataset_cleaned

dat$Salary_norm <- (dat$Salary - min(dat$Salary))/(max(dat$Salary)-min(dat$Salary))
dat$Salary_z    <- scale(dat$Salary)

m_norm <- lm(Salary_norm ~ Age + Education + Years.of.Experience, data=dat)
m_z    <- lm(Salary_z    ~ Age + Education + Years.of.Experience, data=dat)

STARGAZER — SCALING COMPARISON

m_all_O <- lm(Salary ~ Age + Education + Years.of.Experience, data=dataset_cleaned)
m_all_R <- lm(Salary ~ Age + Education + Years.of.Experience, data=data_removeS)
m_all_C <- lm(Salary ~ Age + Education + Years.of.Experience, data=data_capS)

stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z,
          type="text",
          title="Scaling + Outlier Strategies Combined")
## 
## Scaling + Outlier Strategies Combined
## ======================================================================================================
##                                                          Dependent variable:                          
##                                 ----------------------------------------------------------------------
##                                                    Salary                     Salary_norm     Salary_z  
##                                      (1)            (2)            (3)           (4)          (5)     
## ------------------------------------------------------------------------------------------------------
## Age                             -2,710.680***  -2,710.680***  -2,710.680***   -0.011***    -0.052***  
##                                   (315.866)      (315.866)      (315.866)      (0.001)      (0.006)   
##                                                                                                       
## Education                       14,970.470***  14,970.470***  14,970.470***    0.062***     0.288***  
##                                  (1,142.364)    (1,142.364)    (1,142.364)     (0.005)      (0.022)   
##                                                                                                       
## Years.of.Experience              9,005.050***   9,005.050***   9,005.050***    0.038***     0.174***  
##                                   (403.287)      (403.287)      (403.287)      (0.002)      (0.008)   
##                                                                                                       
## Constant                        108,733.600*** 108,733.600*** 108,733.600***   0.452***      -0.082   
##                                  (7,690.070)    (7,690.070)    (7,690.070)     (0.032)      (0.148)   
##                                                                                                       
## ------------------------------------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356         1,356        1,356    
## R2                                  0.697          0.697          0.697         0.697        0.697    
## Adjusted R2                         0.697          0.697          0.697         0.697        0.697    
## Residual Std. Error (df = 1352)   28,591.080     28,591.080     28,591.080      0.119        0.551    
## F Statistic (df = 3; 1352)       1,037.716***   1,037.716***   1,037.716***  1,037.716*** 1,037.716***
## ======================================================================================================
## Note:                                                                      *p<0.1; **p<0.05; ***p<0.01

EXTENDED MODEL — AGE + EDUCATION + EXPERIENCE

m_full <- lm(Salary ~ Age + Education + Years.of.Experience + Male,
             data = dataset_cleaned)

summary(m_full)
## 
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience + 
##     Male, data = dataset_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98564 -18166  -5796  15763  77636 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         107814.0     7626.4  14.137  < 2e-16 ***
## Age                  -2844.1      314.3  -9.048  < 2e-16 ***
## Education            15550.6     1138.6  13.657  < 2e-16 ***
## Years.of.Experience   9029.5      399.9  22.582  < 2e-16 ***
## Male                  7773.3     1570.6   4.949 8.39e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28350 on 1351 degrees of freedom
## Multiple R-squared:  0.7026, Adjusted R-squared:  0.7017 
## F-statistic: 797.9 on 4 and 1351 DF,  p-value: < 2.2e-16
vif(m_full)
##                 Age           Education Years.of.Experience                Male 
##            8.915277            1.618227            9.164269            1.032334

HYPOTHESIS TESTING

t_exp <- summary(m_full)$coefficients["Years.of.Experience", "t value"]
p_two <- summary(m_full)$coefficients["Years.of.Experience","Pr(>|t|)"]
p_one_gt <- ifelse(t_exp > 0, p_two/2, 1 - p_two/2)

DONE — CLEAN, CONSISTENT, NO DUPLICATION

p_one_gt
## [1] 2.343384e-96

VIF CHECKS – MULTICOLLINEARITY

library(car)
vif(m_all)
##                 Age Years.of.Experience           Education                Male 
##            8.983097            9.179956            1.640231            1.035230 
##             Race_AA          Race_Asian           Race_Hisp              Senior 
##            1.494903            1.487944            1.478103            1.117492

STARGAZER TABLES

library(stargazer)

stargazer(m_all_O, m_all_R, m_all_C,
          type = "text",
          title = "Outlier Comparison: Original vs Removed vs Capped",
          dep.var.labels = "Salary",
          column.labels = c("Original", "Removed", "Capped"),
          covariate.labels = c("Age","Education","Experience"))
## 
## Outlier Comparison: Original vs Removed vs Capped
## ============================================================================
##                                             Dependent variable:             
##                                 --------------------------------------------
##                                                    Salary                   
##                                    Original       Removed         Capped    
##                                      (1)            (2)            (3)      
## ----------------------------------------------------------------------------
## Age                             -2,710.680***  -2,710.680***  -2,710.680*** 
##                                   (315.866)      (315.866)      (315.866)   
##                                                                             
## Education                       14,970.470***  14,970.470***  14,970.470*** 
##                                  (1,142.364)    (1,142.364)    (1,142.364)  
##                                                                             
## Experience                       9,005.050***   9,005.050***   9,005.050*** 
##                                   (403.287)      (403.287)      (403.287)   
##                                                                             
## Constant                        108,733.600*** 108,733.600*** 108,733.600***
##                                  (7,690.070)    (7,690.070)    (7,690.070)  
##                                                                             
## ----------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356     
## R2                                  0.697          0.697          0.697     
## Adjusted R2                         0.697          0.697          0.697     
## Residual Std. Error (df = 1352)   28,591.080     28,591.080     28,591.080  
## F Statistic (df = 3; 1352)       1,037.716***   1,037.716***   1,037.716*** 
## ============================================================================
## Note:                                            *p<0.1; **p<0.05; ***p<0.01
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z,
          type = "text",
          title = "Scaling Comparison",
          dep.var.labels = "Salary (scaled)",
          column.labels = c("Orig","RemOut","CapOut","Norm","Std"))
## 
## Scaling Comparison
## ======================================================================================================
##                                                          Dependent variable:                          
##                                 ----------------------------------------------------------------------
##                                               Salary (scaled)                 Salary_norm     Salary_z  
##                                      Orig          RemOut         CapOut         Norm         Std     
##                                      (1)            (2)            (3)           (4)          (5)     
## ------------------------------------------------------------------------------------------------------
## Age                             -2,710.680***  -2,710.680***  -2,710.680***   -0.011***    -0.052***  
##                                   (315.866)      (315.866)      (315.866)      (0.001)      (0.006)   
##                                                                                                       
## Education                       14,970.470***  14,970.470***  14,970.470***    0.062***     0.288***  
##                                  (1,142.364)    (1,142.364)    (1,142.364)     (0.005)      (0.022)   
##                                                                                                       
## Years.of.Experience              9,005.050***   9,005.050***   9,005.050***    0.038***     0.174***  
##                                   (403.287)      (403.287)      (403.287)      (0.002)      (0.008)   
##                                                                                                       
## Constant                        108,733.600*** 108,733.600*** 108,733.600***   0.452***      -0.082   
##                                  (7,690.070)    (7,690.070)    (7,690.070)     (0.032)      (0.148)   
##                                                                                                       
## ------------------------------------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356         1,356        1,356    
## R2                                  0.697          0.697          0.697         0.697        0.697    
## Adjusted R2                         0.697          0.697          0.697         0.697        0.697    
## Residual Std. Error (df = 1352)   28,591.080     28,591.080     28,591.080      0.119        0.551    
## F Statistic (df = 3; 1352)       1,037.716***   1,037.716***   1,037.716***  1,037.716*** 1,037.716***
## ======================================================================================================
## Note:                                                                      *p<0.1; **p<0.05; ***p<0.01

MULTIPLE REGRESSION — ORIGINAL / REMOVED / CAPPED

# Original model (O)
m_all_O <- lm(Salary ~ Age + Education + Years.of.Experience,
              data = dataset_cleaned)

# Removed-outlier model (R)
m_all_R <- lm(Salary ~ Age + Education + Years.of.Experience,
              data = data_removeS)

# Capped-outlier model (C)
m_all_C <- lm(Salary ~ Age + Education + Years.of.Experience,
              data = data_capS)

summary(m_all_O)
## 
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience, 
##     data = dataset_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -104163  -18677   -4894   16549   80997 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         108733.6     7690.1  14.139   <2e-16 ***
## Age                  -2710.7      315.9  -8.582   <2e-16 ***
## Education            14970.5     1142.4  13.105   <2e-16 ***
## Years.of.Experience   9005.0      403.3  22.329   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28590 on 1352 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6965 
## F-statistic:  1038 on 3 and 1352 DF,  p-value: < 2.2e-16
summary(m_all_R)
## 
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience, 
##     data = data_removeS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -104163  -18677   -4894   16549   80997 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         108733.6     7690.1  14.139   <2e-16 ***
## Age                  -2710.7      315.9  -8.582   <2e-16 ***
## Education            14970.5     1142.4  13.105   <2e-16 ***
## Years.of.Experience   9005.0      403.3  22.329   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28590 on 1352 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6965 
## F-statistic:  1038 on 3 and 1352 DF,  p-value: < 2.2e-16
summary(m_all_C)
## 
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience, 
##     data = data_capS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -104163  -18677   -4894   16549   80997 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         108733.6     7690.1  14.139   <2e-16 ***
## Age                  -2710.7      315.9  -8.582   <2e-16 ***
## Education            14970.5     1142.4  13.105   <2e-16 ***
## Years.of.Experience   9005.0      403.3  22.329   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28590 on 1352 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6965 
## F-statistic:  1038 on 3 and 1352 DF,  p-value: < 2.2e-16
stargazer(m_all_O, m_all_R, m_all_C, type = "text")
## 
## ============================================================================
##                                             Dependent variable:             
##                                 --------------------------------------------
##                                                    Salary                   
##                                      (1)            (2)            (3)      
## ----------------------------------------------------------------------------
## Age                             -2,710.680***  -2,710.680***  -2,710.680*** 
##                                   (315.866)      (315.866)      (315.866)   
##                                                                             
## Education                       14,970.470***  14,970.470***  14,970.470*** 
##                                  (1,142.364)    (1,142.364)    (1,142.364)  
##                                                                             
## Years.of.Experience              9,005.050***   9,005.050***   9,005.050*** 
##                                   (403.287)      (403.287)      (403.287)   
##                                                                             
## Constant                        108,733.600*** 108,733.600*** 108,733.600***
##                                  (7,690.070)    (7,690.070)    (7,690.070)  
##                                                                             
## ----------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356     
## R2                                  0.697          0.697          0.697     
## Adjusted R2                         0.697          0.697          0.697     
## Residual Std. Error (df = 1352)   28,591.080     28,591.080     28,591.080  
## F Statistic (df = 3; 1352)       1,037.716***   1,037.716***   1,037.716*** 
## ============================================================================
## Note:                                            *p<0.1; **p<0.05; ***p<0.01
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z, type = "text")
## 
## ======================================================================================================
##                                                          Dependent variable:                          
##                                 ----------------------------------------------------------------------
##                                                    Salary                     Salary_norm     Salary_z  
##                                      (1)            (2)            (3)           (4)          (5)     
## ------------------------------------------------------------------------------------------------------
## Age                             -2,710.680***  -2,710.680***  -2,710.680***   -0.011***    -0.052***  
##                                   (315.866)      (315.866)      (315.866)      (0.001)      (0.006)   
##                                                                                                       
## Education                       14,970.470***  14,970.470***  14,970.470***    0.062***     0.288***  
##                                  (1,142.364)    (1,142.364)    (1,142.364)     (0.005)      (0.022)   
##                                                                                                       
## Years.of.Experience              9,005.050***   9,005.050***   9,005.050***    0.038***     0.174***  
##                                   (403.287)      (403.287)      (403.287)      (0.002)      (0.008)   
##                                                                                                       
## Constant                        108,733.600*** 108,733.600*** 108,733.600***   0.452***      -0.082   
##                                  (7,690.070)    (7,690.070)    (7,690.070)     (0.032)      (0.148)   
##                                                                                                       
## ------------------------------------------------------------------------------------------------------
## Observations                        1,356          1,356          1,356         1,356        1,356    
## R2                                  0.697          0.697          0.697         0.697        0.697    
## Adjusted R2                         0.697          0.697          0.697         0.697        0.697    
## Residual Std. Error (df = 1352)   28,591.080     28,591.080     28,591.080      0.119        0.551    
## F Statistic (df = 3; 1352)       1,037.716***   1,037.716***   1,037.716***  1,037.716*** 1,037.716***
## ======================================================================================================
## Note:                                                                      *p<0.1; **p<0.05; ***p<0.01

SCALE SALARY: NORMALIZATION AND STANDARDIZATION

# Use capped salary dataset if you created it earlier:
dat <- dataset_cleaned   # or data_capS if you used outlier capping

# Normalization (0–1)
min_S <- min(dat$Salary, na.rm = TRUE)
max_S <- max(dat$Salary, na.rm = TRUE)
dat$Salary_norm <- (dat$Salary - min_S) / (max_S - min_S)

# Standardization (z-score)
mean_S <- mean(dat$Salary, na.rm = TRUE)
sd_S   <- sd(dat$Salary,   na.rm = TRUE)
dat$Salary_z <- (dat$Salary - mean_S) / sd_S

# Normalized model
m_norm <- lm(Salary_norm ~ Age + Education + Years.of.Experience,
             data = dat)

# Standardized model (THIS IS m_z)
m_z <- lm(Salary_z ~ Age + Education + Years.of.Experience,
          data = dat)

summary(m_norm)
## 
## Call:
## lm(formula = Salary_norm ~ Age + Education + Years.of.Experience, 
##     data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43465 -0.07794 -0.02042  0.06905  0.33798 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.452258   0.032089  14.094   <2e-16 ***
## Age                 -0.011311   0.001318  -8.582   <2e-16 ***
## Education            0.062468   0.004767  13.105   <2e-16 ***
## Years.of.Experience  0.037576   0.001683  22.329   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1193 on 1352 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6965 
## F-statistic:  1038 on 3 and 1352 DF,  p-value: < 2.2e-16
summary(m_z)
## 
## Call:
## lm(formula = Salary_z ~ Age + Education + Years.of.Experience, 
##     data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0069 -0.3599 -0.0943  0.3189  1.5606 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.082178   0.148167  -0.555    0.579    
## Age                 -0.052228   0.006086  -8.582   <2e-16 ***
## Education            0.288441   0.022010  13.105   <2e-16 ***
## Years.of.Experience  0.173503   0.007770  22.329   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5509 on 1352 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6965 
## F-statistic:  1038 on 3 and 1352 DF,  p-value: < 2.2e-16
best_model <- m_z   # standardized-salary model from earlier

# Residuals vs Fitted
plot(fitted(best_model), resid(best_model),
     main = "Residuals vs Fitted (Standardized Salary)",
     xlab = "Fitted values", ylab = "Residuals",
     pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)

# Histogram + QQ plot
par(mfrow = c(1,2))
hist(resid(best_model), breaks = 20, col = "gray",
     main = "Residuals Histogram", xlab = "Residuals")
qqnorm(resid(best_model))
qqline(resid(best_model), col = "red")

par(mfrow = c(1,1))

OUTLIERS AND SCALING – STARGAZER MODEL TABLES

# Compare outlier strategies – multiple regression
stargazer(m_all_O, m_all_R, m_all_C,
          type = "text",
          title = "Outliers on Salary: Original vs Removed vs Capped",
          dep.var.labels = "Salary (USD)",
          column.labels = c("Original", "Removed outliers", "Capped outliers"),
          covariate.labels = c("Age", "Education", "Experience"),
          digits = 3,
          omit.stat = c("f", "ser", "adj.rsq"))
## 
## Outliers on Salary: Original vs Removed vs Capped
## ============================================================
##                            Dependent variable:              
##              -----------------------------------------------
##                               Salary (USD)                  
##                 Original    Removed outliers Capped outliers
##                   (1)             (2)              (3)      
## ------------------------------------------------------------
## Age          -2,710.680***   -2,710.680***    -2,710.680*** 
##                (315.866)       (315.866)        (315.866)   
##                                                             
## Education    14,970.470***   14,970.470***    14,970.470*** 
##               (1,142.364)     (1,142.364)      (1,142.364)  
##                                                             
## Experience    9,005.050***    9,005.050***    9,005.050***  
##                (403.287)       (403.287)        (403.287)   
##                                                             
## Constant     108,733.600***  108,733.600***  108,733.600*** 
##               (7,690.070)     (7,690.070)      (7,690.070)  
##                                                             
## ------------------------------------------------------------
## Observations     1,356           1,356            1,356     
## R2               0.697           0.697            0.697     
## ============================================================
## Note:                            *p<0.1; **p<0.05; ***p<0.01
# Compare scaling choices – original vs normalized vs standardized salary
m_all_cap <- lm(Salary ~ Age + Education + Years.of.Experience,
                data = dat)

stargazer(m_all_cap, m_norm, m_z,
          type = "text",
          title = "Scaling Salary: Original vs Normalized vs Standardized",
          dep.var.labels = "Salary (scaled in different ways)",
          column.labels = c("Original (capped)", "Normalized (0-1)", "Standardized (z)"),
          covariate.labels = c("Age", "Education", "Experience"),
          digits = 3,
          omit.stat = c("f", "ser", "adj.rsq"))
## 
## Scaling Salary: Original vs Normalized vs Standardized
## ================================================================================
##                                      Dependent variable:                        
##              -------------------------------------------------------------------
##              Salary (scaled in different ways)   Salary_norm        Salary_z    
##                      Original (capped)         Normalized (0-1) Standardized (z)
##                             (1)                      (2)              (3)       
## --------------------------------------------------------------------------------
## Age                    -2,710.680***              -0.011***        -0.052***    
##                          (315.866)                 (0.001)          (0.006)     
##                                                                                 
## Education              14,970.470***               0.062***         0.288***    
##                         (1,142.364)                (0.005)          (0.022)     
##                                                                                 
## Experience             9,005.050***                0.038***         0.174***    
##                          (403.287)                 (0.002)          (0.008)     
##                                                                                 
## Constant              108,733.600***               0.452***          -0.082     
##                         (7,690.070)                (0.032)          (0.148)     
##                                                                                 
## --------------------------------------------------------------------------------
## Observations               1,356                    1,356            1,356      
## R2                         0.697                    0.697            0.697      
## ================================================================================
## Note:                                                *p<0.1; **p<0.05; ***p<0.01
# Combined table
stargazer(m_all_O, m_all_R, m_all_C, m_norm, m_z,
          type = "text",
          title = "All models: Salary Outliers and Scaling",
          dep.var.labels = "Salary (original and scaled)",
          column.labels = c("Orig", "RemOut", "CapOut", "Norm", "Std"),
          digits = 3,
          keep.stat = c("n","rsq","adj.rsq"),
          no.space = TRUE)
## 
## All models: Salary Outliers and Scaling
## ======================================================================================
##                                            Dependent variable:                        
##                     ------------------------------------------------------------------
##                             Salary (original and scaled)          Salary_norm   Salary_z 
##                          Orig          RemOut         CapOut        Norm        Std   
##                          (1)            (2)            (3)           (4)        (5)   
## --------------------------------------------------------------------------------------
## Age                 -2,710.680***  -2,710.680***  -2,710.680***   -0.011***  -0.052***
##                       (315.866)      (315.866)      (315.866)      (0.001)    (0.006) 
## Education           14,970.470***  14,970.470***  14,970.470***   0.062***   0.288*** 
##                      (1,142.364)    (1,142.364)    (1,142.364)     (0.005)    (0.022) 
## Years.of.Experience  9,005.050***   9,005.050***   9,005.050***   0.038***   0.174*** 
##                       (403.287)      (403.287)      (403.287)      (0.002)    (0.008) 
## Constant            108,733.600*** 108,733.600*** 108,733.600***  0.452***    -0.082  
##                      (7,690.070)    (7,690.070)    (7,690.070)     (0.032)    (0.148) 
## --------------------------------------------------------------------------------------
## Observations            1,356          1,356          1,356         1,356      1,356  
## R2                      0.697          0.697          0.697         0.697      0.697  
## Adjusted R2             0.697          0.697          0.697         0.697      0.697  
## ======================================================================================
## Note:                                                      *p<0.1; **p<0.05; ***p<0.01

HYPOTHESIS TESTING – SINGLE AND JOINT

summary(m_full)
## 
## Call:
## lm(formula = Salary ~ Age + Education + Years.of.Experience + 
##     Male, data = dataset_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98564 -18166  -5796  15763  77636 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         107814.0     7626.4  14.137  < 2e-16 ***
## Age                  -2844.1      314.3  -9.048  < 2e-16 ***
## Education            15550.6     1138.6  13.657  < 2e-16 ***
## Years.of.Experience   9029.5      399.9  22.582  < 2e-16 ***
## Male                  7773.3     1570.6   4.949 8.39e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28350 on 1351 degrees of freedom
## Multiple R-squared:  0.7026, Adjusted R-squared:  0.7017 
## F-statistic: 797.9 on 4 and 1351 DF,  p-value: < 2.2e-16
# Two-sided test for Experience coefficient
t_exp   <- summary(m_full)$coefficients["Years.of.Experience", "t value"]
p_twos  <- summary(m_full)$coefficients["Years.of.Experience", "Pr(>|t|)"]

# One-sided test H0: beta_exp <= 0 vs H1: > 0
p_one_gt <- ifelse(t_exp > 0, p_twos / 2, 1 - p_two_)

PARTIAL EFFECT VISUALIZATION

library(visreg)

visreg(m_all, "Years.of.Experience")

visreg(m_all, "Age")

visreg(m_all, "Education")

############################################################ # HETEROSKEDASTICITY TESTS AND ROBUST STANDARD ERRORS ############################################################

# Breusch–Pagan test for main model with core predictors
bptest(m_full)
## 
##  studentized Breusch-Pagan test
## 
## data:  m_full
## BP = 128.16, df = 4, p-value < 2.2e-16
# If you want, also for the richer model m_all
bptest(m_all)
## 
##  studentized Breusch-Pagan test
## 
## data:  m_all
## BP = 134.35, df = 8, p-value < 2.2e-16
# Robust (HC3) standard errors for m_full
coeftest(m_full, vcov = vcovHC(m_full, type = "HC3"))
## 
## t test of coefficients:
## 
##                      Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)         107814.00    8978.19 12.0084 < 2.2e-16 ***
## Age                  -2844.07     361.93 -7.8581 7.910e-15 ***
## Education            15550.56    1241.09 12.5298 < 2.2e-16 ***
## Years.of.Experience   9029.51     533.99 16.9094 < 2.2e-16 ***
## Male                  7773.33    1589.51  4.8904 1.127e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Robust (HC3) standard errors for m_all
coeftest(m_all, vcov = vcovHC(m_all, type = "HC3"))
## 
## t test of coefficients:
## 
##                      Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)         104443.67    8887.92 11.7512 < 2.2e-16 ***
## Age                  -2751.75     353.61 -7.7818 1.415e-14 ***
## Years.of.Experience   9035.40     529.30 17.0704 < 2.2e-16 ***
## Education            15863.34    1231.82 12.8780 < 2.2e-16 ***
## Male                  7949.75    1585.59  5.0138 6.048e-07 ***
## Race_AA               -723.09    2120.55 -0.3410  0.733162    
## Race_Asian            3381.86    2173.03  1.5563  0.119876    
## Race_Hisp              119.56    2259.63  0.0529  0.957812    
## Senior               -7818.12    2145.12 -3.6446  0.000278 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

INFLUENCE AND LEVERAGE DIAGNOSTICS – SALARY MODEL

# Choose which model you treat as final:
final_lm <- m_full    # or m_all if that is your preferred one

# Leverage values
lev <- hatvalues(final_lm)
plot(lev,
     main = "Leverage values – final salary model",
     ylab = "Leverage", xlab = "Observation")
abline(h = 2 * length(coef(final_lm)) / nrow(dataset_cleaned),
       col = "red", lty = 2)

# Cook's distance
cd <- cooks.distance(final_lm)
plot(cd,
     main = "Cook's distance – final salary model",
     ylab = "Cook's D", xlab = "Observation")
abline(h = 4 / (nrow(dataset_cleaned) - length(coef(final_lm)) - 1),
       col = "red", lty = 2)

# Identify potentially influential points
which(cd > 4 / (nrow(dataset_cleaned) - length(coef(final_lm)) - 1))
##   88   96  117  134  259  428  435  439  462  530  537  542  554  566  570  586 
##   21   22   30   34   61   82   83   84   91  101  102  103  107  108  109  113 
##  657  666  671  696  703  712  716  721  741  757  777  780  790  807  818  876 
##  121  122  123  129  133  134  135  137  143  145  149  151  153  157  159  171 
##  887  893  903  907  926  932  936  937  942  946  951 1030 1040 1060 1107 1111 
##  173  174  176  178  179  180  181  182  184  186  189  205  208  210  221  222 
## 1126 1163 1175 1187 1200 1239 1276 1318 1368 1414 1429 1432 1500 1637 1639 1641 
##  224  228  230  235  237  241  246  253  265  277  280  281  292  318  319  320 
## 1730 1749 2379 2411 2440 2454 2469 2486 2514 2529 2550 2559 2573 2593 2610 2616 
##  335  338  468  472  480  483  485  488  495  498  502  504  509  516  519  521 
## 2639 2792 2832 2840 2874 2882 2894 2919 2927 2935 2992 3010 3017 3104 4545 4597 
##  527  565  571  575  581  586  588  597  599  602  612  617  618  634  929  939 
## 4616 5508 5657 5945 6140 6351 6393 6539 6635 
##  944 1140 1167 1230 1265 1296 1303 1331 1350

NONLINEARITY CHECK – QUADRATIC TERMS

m_full_quad <- lm(Salary ~ Age + I(Age^2) +
                    Education +
                    Years.of.Experience + I(Years.of.Experience^2),
                  data = dataset_cleaned)

summary(m_full_quad)
## 
## Call:
## lm(formula = Salary ~ Age + I(Age^2) + Education + Years.of.Experience + 
##     I(Years.of.Experience^2), data = dataset_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100171  -16903   -4746   14488   75115 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              335695.92   23496.87   14.29  < 2e-16 ***
## Age                      -16914.58    1367.84  -12.37  < 2e-16 ***
## I(Age^2)                    176.16      17.27   10.20  < 2e-16 ***
## Education                  6930.50    1056.51    6.56 7.66e-11 ***
## Years.of.Experience       21943.80     751.04   29.22  < 2e-16 ***
## I(Years.of.Experience^2)   -497.30      25.29  -19.67  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24720 on 1350 degrees of freedom
## Multiple R-squared:  0.774,  Adjusted R-squared:  0.7732 
## F-statistic: 924.8 on 5 and 1350 DF,  p-value: < 2.2e-16
# Compare linear vs quadratic specification
anova(m_full, m_full_quad)
## Analysis of Variance Table
## 
## Model 1: Salary ~ Age + Education + Years.of.Experience + Male
## Model 2: Salary ~ Age + I(Age^2) + Education + Years.of.Experience + I(Years.of.Experience^2)
##   Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
## 1   1351 1.0855e+12                                   
## 2   1350 8.2480e+11  1 2.6071e+11 426.73 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m_all_O <- lm(Salary ~ Age + Education + Years.of.Experience, data = dataset_cleaned)
m_all_R <- lm(Salary ~ Age + Education + Years.of.Experience, data = data_removeS)
m_all_C <- lm(Salary ~ Age + Education + Years.of.Experience, data = data_capS)

ROBUST SE FOR OUTLIER STRATEGIES

# Original
coeftest(m_all_O, vcov = vcovHC(m_all_O, type = "HC3"))
## 
## t test of coefficients:
## 
##                      Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)         108733.62    9151.62 11.8814 < 2.2e-16 ***
## Age                  -2710.68     371.28 -7.3008 4.865e-13 ***
## Education            14970.47    1231.27 12.1586 < 2.2e-16 ***
## Years.of.Experience   9005.05     547.66 16.4429 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Removed outliers
coeftest(m_all_R, vcov = vcovHC(m_all_R, type = "HC3"))
## 
## t test of coefficients:
## 
##                      Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)         108733.62    9151.62 11.8814 < 2.2e-16 ***
## Age                  -2710.68     371.28 -7.3008 4.865e-13 ***
## Education            14970.47    1231.27 12.1586 < 2.2e-16 ***
## Years.of.Experience   9005.05     547.66 16.4429 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Capped outliers
coeftest(m_all_C, vcov = vcovHC(m_all_C, type = "HC3"))
## 
## t test of coefficients:
## 
##                      Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)         108733.62    9151.62 11.8814 < 2.2e-16 ***
## Age                  -2710.68     371.28 -7.3008 4.865e-13 ***
## Education            14970.47    1231.27 12.1586 < 2.2e-16 ***
## Years.of.Experience   9005.05     547.66 16.4429 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

FINAL MODEL SUMMARY

summary(m_all)
## 
## Call:
## lm(formula = Salary ~ Age + Years.of.Experience + Education + 
##     Male + Race_AA + Race_Asian + Race_Hisp + Senior, data = dataset_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101305  -18083   -6055   15844   78239 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         104443.7     7731.4  13.509  < 2e-16 ***
## Age                  -2751.8      314.2  -8.757  < 2e-16 ***
## Years.of.Experience   9035.4      398.6  22.670  < 2e-16 ***
## Education            15863.3     1141.7  13.895  < 2e-16 ***
## Male                  7949.8     1566.4   5.075 4.41e-07 ***
## Race_AA               -723.1     2138.0  -0.338  0.73526    
## Race_Asian            3381.9     2168.2   1.560  0.11906    
## Race_Hisp              119.6     2190.3   0.055  0.95648    
## Senior               -7818.1     2411.4  -3.242  0.00122 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28230 on 1347 degrees of freedom
## Multiple R-squared:  0.7059, Adjusted R-squared:  0.7041 
## F-statistic: 404.1 on 8 and 1347 DF,  p-value: < 2.2e-16
confint(m_all, level = 0.95)
##                           2.5 %     97.5 %
## (Intercept)          89276.7408 119610.607
## Age                  -3368.1704  -2135.338
## Years.of.Experience   8253.5115   9817.279
## Education            13623.7230  18102.957
## Male                  4876.9326  11022.575
## Race_AA              -4917.3223   3471.133
## Race_Asian            -871.5987   7635.310
## Race_Hisp            -4177.2710   4416.384
## Senior              -12548.6322  -3087.598