data1 <- data.table::fread("C:/R-language/PBA/banksalary.csv")
require(tidyverse)
## 載入需要的套件:tidyverse
## Warning: 套件 'tidyverse' 是用 R 版本 4.2.2 來建造的
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## Warning: 套件 'ggplot2' 是用 R 版本 4.2.2 來建造的
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
glimpse(data1)
## Rows: 208
## Columns: 9
## $ Employee <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ EducLev <int> 3, 1, 1, 2, 3, 3, 3, 3, 1, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 2, 3…
## $ JobGrade <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ YrsExper <int> 3, 14, 12, 8, 3, 3, 4, 8, 4, 9, 9, 8, 9, 10, 4, 3, 6, 8, 5, 4…
## $ Age <int> 26, 38, 35, 40, 28, 24, 27, 33, 62, 31, 34, 37, 37, 58, 33, 2…
## $ Gender <chr> "Male", "Female", "Female", "Female", "Male", "Female", "Fema…
## $ YrsPrior <int> 1, 1, 0, 7, 0, 0, 0, 2, 0, 0, 2, 8, 0, 6, 0, 0, 0, 9, 6, 3, 2…
## $ PCJob <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…
## $ Salary <chr> "$32,000 ", "$39,100 ", "$33,200 ", "$30,600 ", "$29,000 ", "…
data1$Salary <- str_trim(data1$Salary, side = c("right"))
data1$Salary <- gsub(c("[$]"),"",data1$Salary)
data1$Salary <- gsub(c(","),"",data1$Salary)
data1$Salary <- as.integer(data1$Salary)
summary(data1)
## Employee EducLev JobGrade YrsExper
## Min. : 1.00 Min. :1.000 Min. :1.00 Min. : 2.000
## 1st Qu.: 52.75 1st Qu.:2.000 1st Qu.:1.00 1st Qu.: 5.000
## Median :104.50 Median :3.000 Median :3.00 Median : 8.000
## Mean :104.50 Mean :3.159 Mean :2.76 Mean : 9.673
## 3rd Qu.:156.25 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:13.000
## Max. :208.00 Max. :5.000 Max. :6.00 Max. :39.000
## Age Gender YrsPrior PCJob
## Min. :22.00 Length:208 Min. : 0.000 Length:208
## 1st Qu.:32.00 Class :character 1st Qu.: 0.000 Class :character
## Median :38.50 Mode :character Median : 1.000 Mode :character
## Mean :40.39 Mean : 2.375
## 3rd Qu.:47.25 3rd Qu.: 4.000
## Max. :65.00 Max. :18.000
## Salary
## Min. :26700
## 1st Qu.:33000
## Median :37000
## Mean :39922
## 3rd Qu.:44000
## Max. :97000
require(reshape2)
## 載入需要的套件:reshape2
## Warning: 套件 'reshape2' 是用 R 版本 4.2.2 來建造的
##
## 載入套件:'reshape2'
## 下列物件被遮斷自 'package:tidyr':
##
## smiths
require(data.table)
## 載入需要的套件:data.table
##
## 載入套件:'data.table'
## 下列物件被遮斷自 'package:reshape2':
##
## dcast, melt
## 下列物件被遮斷自 'package:dplyr':
##
## between, first, last
## 下列物件被遮斷自 'package:purrr':
##
## transpose
shapiro.test(data1$Salary)
##
## Shapiro-Wilk normality test
##
## data: data1$Salary
## W = 0.7792, p-value < 2.2e-16
data1[,shapiro.test(Salary),Gender] # match with normal distribution
## Gender statistic p.value method data.name
## 1: Male 0.8329482 2.744032e-07 Shapiro-Wilk normality test Salary
## 2: Female 0.9202464 4.814479e-07 Shapiro-Wilk normality test Salary
ansari.test(Salary ~ Gender, data1) #variance is not similar
##
## Ansari-Bradley test
##
## data: Salary by Gender
## AB = 8024, p-value = 0.0009319
## alternative hypothesis: true ratio of scales is not equal to 1
t.test(Salary ~ Gender, data = data1, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Salary by Gender
## t = -4.141, df = 78.898, p-value = 8.604e-05
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -12282.943 -4308.082
## sample estimates:
## mean in group Female mean in group Male
## 37209.93 45505.44
dataSummary <- data1[,.(
Salary.mean=mean(Salary), Salary.sd=sd(Salary),
Lower=mean(Salary) - 2*sd(Salary)/sqrt(NROW(Salary)),
Upper=mean(Salary) + 2*sd(Salary)/sqrt(NROW(Salary))
), Gender]; dataSummary
## Gender Salary.mean Salary.sd Lower Upper
## 1: Male 45505.44 15843.218 41662.90 49347.99
## 2: Female 37209.93 6710.867 36075.59 38344.27
ggplot(dataSummary, aes(x=Salary.mean, y=Gender)) + geom_point() +
geom_errorbarh(aes(xmin=Lower, xmax=Upper), height=.2) +
theme_bw()
data2 <- data1
data2$EducLev <- as.factor(data2$EducLev)
data2$JobGrade <- as.factor(data2$JobGrade)
data2$Gender <- factor(data2$Gender,levels = c(unique(data2$Gender)))
data2$PCJob <- factor(data2$PCJob,levels = c(unique(data2$PCJob)))
require(fastDummies)
## 載入需要的套件:fastDummies
## Warning: 套件 'fastDummies' 是用 R 版本 4.2.2 來建造的
data_2 <- dummy_cols(data2,select_columns = c("Gender","EducLev","JobGrade","PCJob"))
data_dum <- data_2[,c(-10,-12,-17,-23)];data_dum
## Employee EducLev JobGrade YrsExper Age Gender YrsPrior PCJob Salary
## 1: 1 3 1 3 26 Male 1 No 32000
## 2: 2 1 1 14 38 Female 1 No 39100
## 3: 3 1 1 12 35 Female 0 No 33200
## 4: 4 2 1 8 40 Female 7 No 30600
## 5: 5 3 1 3 28 Male 0 No 29000
## ---
## 204: 204 3 6 34 60 Male 0 No 95000
## 205: 205 5 6 36 61 Male 0 No 97000
## 206: 206 5 6 32 62 Male 0 No 88000
## 207: 207 5 6 35 59 Male 0 No 94000
## 208: 208 5 6 33 62 Female 0 No 30000
## Gender_Female EducLev_2 EducLev_3 EducLev_4 EducLev_5 JobGrade_2
## 1: 0 0 1 0 0 0
## 2: 1 0 0 0 0 0
## 3: 1 0 0 0 0 0
## 4: 1 1 0 0 0 0
## 5: 0 0 1 0 0 0
## ---
## 204: 0 0 1 0 0 0
## 205: 0 0 0 0 1 0
## 206: 0 0 0 0 1 0
## 207: 0 0 0 0 1 0
## 208: 1 0 0 0 1 0
## JobGrade_3 JobGrade_4 JobGrade_5 JobGrade_6 PCJob_Yes
## 1: 0 0 0 0 0
## 2: 0 0 0 0 0
## 3: 0 0 0 0 0
## 4: 0 0 0 0 0
## 5: 0 0 0 0 0
## ---
## 204: 0 0 0 1 0
## 205: 0 0 0 1 0
## 206: 0 0 0 1 0
## 207: 0 0 0 1 0
## 208: 0 0 0 1 0
original model
model1 <- lm(Salary ~ YrsExper+Age+YrsPrior+Gender_Female+EducLev_2+EducLev_3+EducLev_4+EducLev_5
+JobGrade_2+JobGrade_3+JobGrade_4+JobGrade_5+JobGrade_6+PCJob_Yes, data = data_dum)
summary(model1)
##
## Call:
## lm(formula = Salary ~ YrsExper + Age + YrsPrior + Gender_Female +
## EducLev_2 + EducLev_3 + EducLev_4 + EducLev_5 + JobGrade_2 +
## JobGrade_3 + JobGrade_4 + JobGrade_5 + JobGrade_6 + PCJob_Yes,
## data = data_dum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40117 -2359 -397 1778 23958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29689.935 2490.014 11.924 < 2e-16 ***
## YrsExper 515.583 97.980 5.262 3.77e-07 ***
## Age -8.962 57.699 -0.155 0.8767
## YrsPrior 167.727 140.442 1.194 0.2338
## Gender_Female -2554.474 1011.974 -2.524 0.0124 *
## EducLev_2 -485.552 1398.657 -0.347 0.7289
## EducLev_3 527.915 1357.519 0.389 0.6978
## EducLev_4 285.176 2404.727 0.119 0.9057
## EducLev_5 2690.801 1620.891 1.660 0.0985 .
## JobGrade_2 1564.497 1185.771 1.319 0.1886
## JobGrade_3 5219.358 1262.395 4.134 5.30e-05 ***
## JobGrade_4 8594.833 1496.018 5.745 3.53e-08 ***
## JobGrade_5 13659.409 1874.269 7.288 7.86e-12 ***
## JobGrade_6 23832.391 2799.888 8.512 4.75e-15 ***
## PCJob_Yes 4922.846 1473.825 3.340 0.0010 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5648 on 193 degrees of freedom
## Multiple R-squared: 0.7652, Adjusted R-squared: 0.7482
## F-statistic: 44.94 on 14 and 193 DF, p-value: < 2.2e-16
require(stargazer)
## 載入需要的套件:stargazer
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(model1, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## Salary
## -----------------------------------------------
## YrsExper 515.583***
## (97.980)
##
## Age -8.962
## (57.699)
##
## YrsPrior 167.727
## (140.442)
##
## Gender_Female -2,554.474**
## (1,011.974)
##
## EducLev_2 -485.552
## (1,398.657)
##
## EducLev_3 527.915
## (1,357.519)
##
## EducLev_4 285.176
## (2,404.727)
##
## EducLev_5 2,690.801*
## (1,620.891)
##
## JobGrade_2 1,564.497
## (1,185.771)
##
## JobGrade_3 5,219.358***
## (1,262.395)
##
## JobGrade_4 8,594.833***
## (1,496.018)
##
## JobGrade_5 13,659.410***
## (1,874.269)
##
## JobGrade_6 23,832.390***
## (2,799.888)
##
## PCJob_Yes 4,922.846***
## (1,473.825)
##
## Constant 29,689.940***
## (2,490.014)
##
## -----------------------------------------------
## Observations 208
## R2 0.765
## Adjusted R2 0.748
## Residual Std. Error 5,648.080 (df = 193)
## F Statistic 44.939*** (df = 14; 193)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
require(jtools) # install the package, "jtools", first!
## 載入需要的套件:jtools
## Warning: 套件 'jtools' 是用 R 版本 4.2.2 來建造的
plot_summs(model1)
## 載入需要的命名空間:broom.mixed
model2 <- lm(Salary ~ YrsExper+scale(Age)+YrsPrior+Gender+EducLev+
JobGrade+PCJob, data = data2)
summary(model2)
##
## Call:
## lm(formula = Salary ~ YrsExper + scale(Age) + YrsPrior + Gender +
## EducLev + JobGrade + PCJob, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40117 -2359 -397 1778 23958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29327.92 1637.94 17.905 < 2e-16 ***
## YrsExper 515.58 97.98 5.262 3.77e-07 ***
## scale(Age) -92.48 595.39 -0.155 0.8767
## YrsPrior 167.73 140.44 1.194 0.2338
## GenderFemale -2554.47 1011.97 -2.524 0.0124 *
## EducLev2 -485.55 1398.66 -0.347 0.7289
## EducLev3 527.91 1357.52 0.389 0.6978
## EducLev4 285.18 2404.73 0.119 0.9057
## EducLev5 2690.80 1620.89 1.660 0.0985 .
## JobGrade2 1564.50 1185.77 1.319 0.1886
## JobGrade3 5219.36 1262.39 4.134 5.30e-05 ***
## JobGrade4 8594.83 1496.02 5.745 3.53e-08 ***
## JobGrade5 13659.41 1874.27 7.288 7.86e-12 ***
## JobGrade6 23832.39 2799.89 8.512 4.75e-15 ***
## PCJobYes 4922.85 1473.82 3.340 0.0010 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5648 on 193 degrees of freedom
## Multiple R-squared: 0.7652, Adjusted R-squared: 0.7482
## F-statistic: 44.94 on 14 and 193 DF, p-value: < 2.2e-16
stargazer(model2, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## Salary
## -----------------------------------------------
## YrsExper 515.583***
## (97.980)
##
## scale(Age) -92.480
## (595.393)
##
## YrsPrior 167.727
## (140.442)
##
## GenderFemale -2,554.474**
## (1,011.974)
##
## EducLev2 -485.552
## (1,398.657)
##
## EducLev3 527.915
## (1,357.519)
##
## EducLev4 285.176
## (2,404.727)
##
## EducLev5 2,690.801*
## (1,620.891)
##
## JobGrade2 1,564.497
## (1,185.771)
##
## JobGrade3 5,219.358***
## (1,262.395)
##
## JobGrade4 8,594.833***
## (1,496.018)
##
## JobGrade5 13,659.410***
## (1,874.269)
##
## JobGrade6 23,832.390***
## (2,799.888)
##
## PCJobYes 4,922.846***
## (1,473.825)
##
## Constant 29,327.920***
## (1,637.944)
##
## -----------------------------------------------
## Observations 208
## R2 0.765
## Adjusted R2 0.748
## Residual Std. Error 5,648.080 (df = 193)
## F Statistic 44.939*** (df = 14; 193)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
plot_summs(model2)
## 載入需要的命名空間:broom.mixed
lm1 <- lm(Salary ~ YrsExper+Gender+JobGrade, data = data2)
summary(lm1)
##
## Call:
## lm(formula = Salary ~ YrsExper + Gender + JobGrade, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39461 -2763 -704 2190 24394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30531.85 1142.83 26.716 < 2e-16 ***
## YrsExper 406.22 77.79 5.222 4.42e-07 ***
## GenderFemale -1926.52 1005.27 -1.916 0.0567 .
## JobGrade2 2667.72 1180.04 2.261 0.0249 *
## JobGrade3 6420.38 1165.81 5.507 1.11e-07 ***
## JobGrade4 10524.67 1367.86 7.694 6.36e-13 ***
## JobGrade5 16115.28 1557.68 10.346 < 2e-16 ***
## JobGrade6 27450.30 2412.98 11.376 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5831 on 200 degrees of freedom
## Multiple R-squared: 0.7407, Adjusted R-squared: 0.7316
## F-statistic: 81.61 on 7 and 200 DF, p-value: < 2.2e-16
data3 <- data2
data3$JobGrade <- as.integer(data3$JobGrade)
data3 <- dummy_cols(data3,select_columns = "Gender")
data3 <- data3[,-10]
salary1 <- lm(Salary ~ YrsExper+JobGrade,data = data3,subset=(Gender_Female==0))
summary(salary1)
##
## Call:
## lm(formula = Salary ~ YrsExper + JobGrade, data = data3, subset = (Gender_Female ==
## 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15506.2 -2268.7 287.2 2044.2 15140.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22244.94 1505.05 14.780 < 2e-16 ***
## YrsExper 1054.43 98.48 10.707 5.50e-16 ***
## JobGrade 3627.35 503.92 7.198 7.74e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5524 on 65 degrees of freedom
## Multiple R-squared: 0.882, Adjusted R-squared: 0.8784
## F-statistic: 243 on 2 and 65 DF, p-value: < 2.2e-16
salary2 <- lm(Salary ~ YrsExper+JobGrade,data = data3,subset=(Gender_Female==1))
summary(salary2)
##
## Call:
## lm(formula = Salary ~ YrsExper + JobGrade, data = data3, subset = (Gender_Female ==
## 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21988.6 -2924.6 -449.6 2071.0 14828.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27877.34 980.98 28.418 <2e-16 ***
## YrsExper 64.45 73.36 0.879 0.381
## JobGrade 3664.06 322.56 11.359 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4715 on 137 degrees of freedom
## Multiple R-squared: 0.5136, Adjusted R-squared: 0.5065
## F-statistic: 72.32 on 2 and 137 DF, p-value: < 2.2e-16
(3627.35-3664.06) / sqrt(98.48^2+73.36^2)
## [1] -0.2989398