# Load the packages used by this analysis. library() stops with an error when
# a package is not installed, whereas require() only warns and returns FALSE,
# which would let the script run on until a later call such as stargazer()
# fails with a less helpful message.
library(fastDummies)
library(data.table)
library(car)
library(stargazer)
# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# that the relative path to the data file below still resolves.
setwd("/Users/olivia/Documents/Documents/Study/Semester 5/PBA")
# Read the bank salary data and preview the first rows.
data <- read.csv("banksalary.csv")
head(data)
## Employee EducLev JobGrade YrsExper Age Gender YrsPrior PCJob Salary
## 1 1 3 1 3 26 Male 1 No $32,000
## 2 2 1 1 14 38 Female 1 No $39,100
## 3 3 1 1 12 35 Female 0 No $33,200
## 4 4 2 1 8 40 Female 7 No $30,600
## 5 5 3 1 3 28 Male 0 No $29,000
## 6 6 3 1 3 24 Female 0 No $30,500
# Strip the dollar sign and thousands separators so Salary becomes numeric.
data[["Salary"]] <- as.numeric(gsub('\\$|,', '', data[["Salary"]]))
# Descriptive statistics for the continuous variables.
summary(data[, c("YrsExper", "Age", "YrsPrior", "Salary")])
## YrsExper Age YrsPrior Salary
## Min. : 2.000 Min. :22.00 Min. : 0.000 Min. :26700
## 1st Qu.: 5.000 1st Qu.:32.00 1st Qu.: 0.000 1st Qu.:33000
## Median : 8.000 Median :38.50 Median : 1.000 Median :37000
## Mean : 9.673 Mean :40.39 Mean : 2.375 Mean :39922
## 3rd Qu.:13.000 3rd Qu.:47.25 3rd Qu.: 4.000 3rd Qu.:44000
## Max. :39.000 Max. :65.00 Max. :18.000 Max. :97000
# Frequency counts for each categorical variable.
table(data[["EducLev"]])
##
## 1 2 3 4 5
## 36 35 63 8 66
table(data[["JobGrade"]])
##
## 1 2 3 4 5 6
## 60 42 43 28 21 14
table(data[["Gender"]])
##
## Female Male
## 140 68
table(data[["PCJob"]])
##
## No Yes
## 189 19
# Four-way cross-tabulation of the categorical variables.
xtabs(~ EducLev + JobGrade + Gender + PCJob, data = data)
## , , Gender = Female, PCJob = No
##
## JobGrade
## EducLev 1 2 3 4 5 6
## 1 17 8 3 1 0 0
## 2 14 6 6 3 0 0
## 3 13 4 11 4 2 0
## 4 0 1 2 0 1 0
## 5 0 3 9 6 6 1
##
## , , Gender = Male, PCJob = No
##
## JobGrade
## EducLev 1 2 3 4 5 6
## 1 2 0 0 1 0 0
## 2 2 3 1 0 0 0
## 3 8 5 2 2 1 2
## 4 0 1 1 0 0 1
## 5 0 4 3 8 11 10
##
## , , Gender = Female, PCJob = Yes
##
## JobGrade
## EducLev 1 2 3 4 5 6
## 1 1 2 0 1 0 0
## 2 0 0 0 0 0 0
## 3 2 2 3 2 0 0
## 4 0 0 1 0 0 0
## 5 1 3 1 0 0 0
##
## , , Gender = Male, PCJob = Yes
##
## JobGrade
## EducLev 1 2 3 4 5 6
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
# Shapiro-Wilk normality test on Salary for male employees.
shapiro.test(data[data$Gender=="Male","Salary"])
##
## Shapiro-Wilk normality test
##
## data: data[data$Gender == "Male", "Salary"]
## W = 0.83295, p-value = 2.744e-07
# Same test on Salary for female employees.
shapiro.test(data[data$Gender=="Female","Salary"])
##
## Shapiro-Wilk normality test
##
## data: data[data$Gender == "Female", "Salary"]
## W = 0.92025, p-value = 4.814e-07
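The Shapiro-Wilk p-values are far below 0.05 for both groups, so the salaries are unlikely to be normally distributed. We therefore use the Ansari-Bradley test, which does not assume normality, to check the equality of variances.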
# Ansari-Bradley test comparing the scale of Salary between the two genders.
ansari.test(Salary ~ Gender, data = data)
##
## Ansari-Bradley test
##
## data: Salary by Gender
## AB = 8024, p-value = 0.0009319
## alternative hypothesis: true ratio of scales is not equal to 1
The Ansari-Bradley p-value (0.0009319) is well below 0.05, so the variances are likely not equal. We therefore use the Welch two-sample t-test, which does not assume equal variances.
# Welch two-sample t-test of mean Salary by Gender (var.equal = FALSE).
t.test(Salary ~ Gender, data = data, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Salary by Gender
## t = -4.141, df = 78.898, p-value = 8.604e-05
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -12282.943 -4308.082
## sample estimates:
## mean in group Female mean in group Male
## 37209.93 45505.44
The p-value (8.604e-05) is below 0.01, so we reject the null hypothesis (H0: the average salaries of female and male employees are equal) and conclude that there is a significant difference in average salary between female and male employees.
# Build 0/1 indicator columns for the categorical predictors. EducLev 1,
# JobGrade 1, Gender "Male" and PCJob "No" get no column of their own.
data[paste0("EducLev_", 2:5)] <- lapply(
  c("2", "3", "4", "5"),
  function(level) as.numeric(data$EducLev == level)
)
data[paste0("JobGrade_", 2:6)] <- lapply(
  c("2", "3", "4", "5", "6"),
  function(level) as.numeric(data$JobGrade == level)
)
data[["Gender_F"]] <- as.numeric(data$Gender == "Female")
data[["PCJob_Y"]] <- as.numeric(data$PCJob == "Yes")
# Preview the data with the new indicator columns appended.
head(data)
## Employee EducLev JobGrade YrsExper Age Gender YrsPrior PCJob Salary EducLev_2
## 1 1 3 1 3 26 Male 1 No 32000 0
## 2 2 1 1 14 38 Female 1 No 39100 0
## 3 3 1 1 12 35 Female 0 No 33200 0
## 4 4 2 1 8 40 Female 7 No 30600 1
## 5 5 3 1 3 28 Male 0 No 29000 0
## 6 6 3 1 3 24 Female 0 No 30500 0
## EducLev_3 EducLev_4 EducLev_5 JobGrade_2 JobGrade_3 JobGrade_4 JobGrade_5
## 1 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0
## JobGrade_6 Gender_F PCJob_Y
## 1 0 0 0
## 2 0 1 0
## 3 0 1 0
## 4 0 1 0
## 5 0 0 0
## 6 0 1 0
(If EducLev_2 through EducLev_5 are all 0, it means that the EducLev is 1)
(If JobGrade_2 through JobGrade_6 are all 0, it means that the JobGrade is 1)
(If Gender_F == 0 the employee is male; otherwise the employee is female)
(If PCJob_Y == 0 the PCJob value is No; otherwise it is Yes)
# Fit a linear regression of Salary on all indicator and numeric predictors,
# then print it as a formatted text regression table.
temp <- lm(
  Salary ~ EducLev_2 + EducLev_3 + EducLev_4 + EducLev_5 +
    JobGrade_2 + JobGrade_3 + JobGrade_4 + JobGrade_5 + JobGrade_6 +
    YrsExper + Age + Gender_F + YrsPrior + PCJob_Y,
  data = data
)
stargazer(temp, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## Salary
## -----------------------------------------------
## EducLev_2 -485.552
## (1,398.657)
##
## EducLev_3 527.915
## (1,357.519)
##
## EducLev_4 285.176
## (2,404.727)
##
## EducLev_5 2,690.801*
## (1,620.891)
##
## JobGrade_2 1,564.497
## (1,185.771)
##
## JobGrade_3 5,219.358***
## (1,262.395)
##
## JobGrade_4 8,594.833***
## (1,496.018)
##
## JobGrade_5 13,659.410***
## (1,874.269)
##
## JobGrade_6 23,832.390***
## (2,799.888)
##
## YrsExper 515.583***
## (97.980)
##
## Age -8.962
## (57.699)
##
## Gender_F -2,554.474**
## (1,011.974)
##
## YrsPrior 167.727
## (140.442)
##
## PCJob_Y 4,922.846***
## (1,473.825)
##
## Constant 29,689.940***
## (2,490.014)
##
## -----------------------------------------------
## Observations 208
## R2 0.765
## Adjusted R2 0.748
## Residual Std. Error 5,648.080 (df = 193)
## F Statistic 44.939*** (df = 14; 193)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Print the residual summary, coefficient table and fit statistics of the model.
summary(temp)
##
## Call:
## lm(formula = Salary ~ EducLev_2 + EducLev_3 + EducLev_4 + EducLev_5 +
## JobGrade_2 + JobGrade_3 + JobGrade_4 + JobGrade_5 + JobGrade_6 +
## YrsExper + Age + Gender_F + YrsPrior + PCJob_Y, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40117 -2359 -397 1778 23958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29689.935 2490.014 11.924 < 2e-16 ***
## EducLev_2 -485.552 1398.657 -0.347 0.7289
## EducLev_3 527.915 1357.519 0.389 0.6978
## EducLev_4 285.176 2404.727 0.119 0.9057
## EducLev_5 2690.801 1620.891 1.660 0.0985 .
## JobGrade_2 1564.497 1185.771 1.319 0.1886
## JobGrade_3 5219.358 1262.395 4.134 5.30e-05 ***
## JobGrade_4 8594.833 1496.018 5.745 3.53e-08 ***
## JobGrade_5 13659.409 1874.269 7.288 7.86e-12 ***
## JobGrade_6 23832.391 2799.888 8.512 4.75e-15 ***
## YrsExper 515.583 97.980 5.262 3.77e-07 ***
## Age -8.962 57.699 -0.155 0.8767
## Gender_F -2554.474 1011.974 -2.524 0.0124 *
## YrsPrior 167.727 140.442 1.194 0.2338
## PCJob_Y 4922.846 1473.825 3.340 0.0010 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5648 on 193 degrees of freedom
## Multiple R-squared: 0.7652, Adjusted R-squared: 0.7482
## F-statistic: 44.94 on 14 and 193 DF, p-value: < 2.2e-16
R-squared : A statistical measure in a regression model that gives the proportion of variance in the dependent variable that can be explained by the independent variables. It shows how well the predictor variables predict the value of the response variable.
T-Values : The test statistic for each coefficient, obtained by dividing the estimate by its standard error. The larger the test statistic is in absolute value, the less likely it is that the result occurred by chance; equivalently, the standard error is small relative to the estimate. The test statistic is then used to obtain the p-value.
Coefficients : How much the mean of the dependent variable changes given a one-unit shift in the independent variable while holding other variables in the model constant.
In conclusion, we can see that several of the independent variables have a significant impact on the dependent variable, which is salary. The model has high explanatory power (adjusted R-squared = 0.7482). A negative coefficient indicates a negative (decreasing) relationship, while a positive coefficient indicates a positive (increasing) relationship. The equation of the linear regression is:
Y = b1x1 + b2x2 + … + bnxn + c
Y : Salary
x1…xn : Independent variables (e.g. EducLev_2, Age, Gender_F, etc.)
b1…bn : Coefficients of the independent variables
c : Intercept (constant)
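Yes, the regression indicates that there is discrimination against female employees in terms of salary. The coefficient of “Gender_F” (female) is negative (-2554.474) and statistically significant at the 5% level (p = 0.0124), which means that, holding the other variables in the model constant, a female employee earns on average about 2,554 less than a comparable male employee.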
Interaction : An interaction occurs when the effect of one independent variable on the outcome depends on the value of another independent variable. We could include an interaction term if the variables have large main effects and the effect of one changes across subgroups of the other. In the two-predictor case, the two-way interaction term is constructed by computing the product of X1 and X2.
If the p-value of the coefficient of the interaction term turns out to be lower than the significance level (usually 0.05), this suggests that the coefficient of the interaction term is significantly different from 0. In that case, we should keep the interaction term in the model.
We can also compare the R-squared of the models with and without the interaction term, preferably the adjusted R-squared, since the plain R-squared never decreases when a term is added.