# Load the MBA starting-salary data and keep the rows with a usable salary.
#
# NOTE(review): the working directory is hard-coded to one user's machine.
# It is kept because later code may rely on it, but the script will not run
# elsewhere until this path is replaced (e.g. by a project-relative path).
setwd("C:/Users/Taiyyab Ali/Desktop/R language")
MBASalary <- read.csv("MBAStartingSalariesData.csv")

# Keep rows whose salary is above 999; rows with a missing salary are dropped.
# NOTE(review): presumably values of 999 and below are placeholder codes
# rather than real salaries -- confirm against the data dictionary.
placedMBA <- MBASalary[!is.na(MBASalary$salary) & MBASalary$salary > 999, ]
2. Variables affecting MBA starting salary
# Full specification: salary regressed on all eight candidate predictors.
model1 <- reformulate(
  termlabels = c(
    "age", "sex", "work_yrs", "gmat_tot",
    "f_avg", "s_avg", "quarter", "satis"
  ),
  response = "salary"
)

# Ordinary least squares fit on the filtered data, followed by the
# coefficient table and overall fit statistics.
fit1 <- lm(formula = model1, data = placedMBA)
summary(fit1)
##
## Call:
## lm(formula = model1, data = placedMBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24191 -7715 -1892 5226 84183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77506.24 41331.20 1.875 0.0639 .
## age 2378.24 1030.48 2.308 0.0232 *
## sex -4021.37 3518.14 -1.143 0.2559
## work_yrs 327.98 1118.32 0.293 0.7700
## gmat_tot -16.54 32.10 -0.515 0.6077
## f_avg -245.64 3870.72 -0.063 0.9495
## s_avg -2688.53 8041.77 -0.334 0.7389
## quarter -1691.87 2676.79 -0.632 0.5289
## satis -1877.21 2099.06 -0.894 0.3734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15840 on 94 degrees of freedom
## Multiple R-squared: 0.276, Adjusted R-squared: 0.2144
## F-statistic: 4.48 on 8 and 94 DF, p-value: 0.0001243
Except for age, all variables appear statistically insignificant.
library(leaps)
# Best-subset search over the model1 predictors, recording one subset
# (nbest = 1) per model size.
leap1 <- regsubsets(
  x = model1,
  data = placedMBA,
  nbest = 1
)
# summary(leap1)

# Show the recorded subsets on the adjusted R-squared scale.
plot(leap1, scale = "adjr2")
All the variables have very little significance for salary.
3. Let's exclude the MBA scores (f_avg and s_avg)
# Reduced specification: the same predictors as model1 without f_avg and s_avg.
model2 <- reformulate(
  termlabels = c("age", "sex", "work_yrs", "gmat_tot", "quarter", "satis"),
  response = "salary"
)

# Refit on the same filtered data and print the regression summary.
fit2 <- lm(formula = model2, data = placedMBA)
summary(fit2)
##
## Call:
## lm(formula = model2, data = placedMBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26282 -7776 -2189 5475 84544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68433.67 32014.97 2.138 0.0351 *
## age 2378.81 1017.95 2.337 0.0215 *
## sex -4221.88 3423.82 -1.233 0.2206
## work_yrs 313.32 1100.36 0.285 0.7765
## gmat_tot -18.49 31.33 -0.590 0.5564
## quarter -889.75 1456.19 -0.611 0.5426
## satis -1930.23 2073.30 -0.931 0.3542
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15680 on 96 degrees of freedom
## Multiple R-squared: 0.275, Adjusted R-squared: 0.2297
## F-statistic: 6.07 on 6 and 96 DF, p-value: 2.035e-05
library(coefplot)
## Loading required package: ggplot2
# Coefficient plot for fit2: every slope term is listed explicitly, the
# intercept is left out, and the outer interval uses a multiplier of 1.96.
coefplot(
  model = fit2,
  intercept = FALSE,
  outerCI = 1.96,
  coefficients = c(
    "age", "sex", "work_yrs",
    "gmat_tot", "quarter", "satis"
  )
)
## Warning: Ignoring unknown aesthetics: xmin, xmax
# Three-predictor model: age, work experience and GMAT total only.
# The formula is written inline so the echoed Call shows the full formula.
fit3 <- lm(
  formula = salary ~ age + work_yrs + gmat_tot,
  data = placedMBA
)
summary(fit3)
##
## Call:
## lm(formula = salary ~ age + work_yrs + gmat_tot, data = placedMBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32657 -8150 -2117 4705 78974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46876.35 29418.14 1.593 0.1142
## age 2448.62 1002.87 2.442 0.0164 *
## work_yrs 319.93 1094.82 0.292 0.7707
## gmat_tot -17.19 30.92 -0.556 0.5795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15680 on 99 degrees of freedom
## Multiple R-squared: 0.2529, Adjusted R-squared: 0.2303
## F-statistic: 11.17 on 3 and 99 DF, p-value: 2.228e-06
# Two-predictor model: age and GMAT total only (work_yrs dropped from fit3).
# The formula is written inline so the echoed Call shows the full formula.
fit4 <- lm(
  formula = salary ~ age + gmat_tot,
  data = placedMBA
)
summary(fit4)
##
## Call:
## lm(formula = salary ~ age + gmat_tot, data = placedMBA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32536 -8423 -1802 4955 79066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41776.16 23573.16 1.772 0.0794 .
## age 2706.59 473.72 5.713 1.15e-07 ***
## gmat_tot -18.21 30.58 -0.596 0.5528
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15600 on 100 degrees of freedom
## Multiple R-squared: 0.2523, Adjusted R-squared: 0.2373
## F-statistic: 16.87 on 2 and 100 DF, p-value: 4.859e-07
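It seems that starting salary depends mostly on factors that are not covered in the data, such as how candidates presented themselves in interviews or performed on other aptitude tests: the intercept is very large in every model (although it is statistically significant at the 5% level only in model 2), and even the full model explains less than 28% of the variation in salary. Age, sex, and the other scores still carry some information, because multiple R-squared decreases every time variables are removed (0.276, 0.275, 0.2529, 0.2523). Adjusted R-squared, however, increases slightly at each step (0.2144, 0.2297, 0.2303, 0.2373), so the dropped variables add little beyond age.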