# Load the MBA data set (MBASSD.csv, resolved against the working directory)
# and print a per-column summary.
mba.df <- read.csv("MBASSD.csv")
# View() opens a spreadsheet-style viewer; it is only useful in an interactive
# session and can fail when the script is run in batch mode, so guard it.
if (interactive()) {
  View(mba.df)
}
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Extended per-column descriptive statistics (n, mean, sd, median, trimmed
# mean, mad, min, max, range, skew, kurtosis, se) from the psych package.
library(psych)
psych::describe(x = mba.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Histogram of age distribution
# The attach(mba.df) call that used to precede this plot was dropped: every
# statement visible in this script reaches the columns through `mba.df$...`
# or a `data =` argument, and attach() only adds masked copies of the columns
# to the search path.
hist(mba.df$age, main = "Histogram of age distribution", xlab = "Age",
     col = "purple")
# Histogram of gender
# sex is recoded as a factor (1 = Male, 2 = Female); plot() on a factor draws
# one bar per level.
gender <- factor(mba.df$sex, levels = c(1, 2), labels = c("Male", "Female"))
plot(gender, col = "purple", main = "Histogram of sex distribution")
# Boxplot of gmat total
boxplot(mba.df$gmat_tot, main = "Boxplot of gmat total", xlab = "Gmat total",
        col = "maroon")
# Boxplot of Work experience
boxplot(mba.df$work_yrs, main = "Boxplot of work experience",
        xlab = "Work (years)", col = "yellow")
# Histogram of salary distribution
# The plot is a histogram, so the title no longer says "Boxplot".
# NOTE(review): salary still contains the 0 / 998 / 999 values that the script
# filters out later when it builds emp.df; they fall into the lowest bin.
hist(mba.df$salary, main = "Histogram of mba salary", xlab = "salary",
     col = "dark blue")
# Histogram of First language
# frstlang is recoded as a factor (1 = English, 2 = Others); plot() on a
# factor draws one bar per level.
flang <- factor(mba.df$frstlang, levels = c(1, 2),
                labels = c("English", "Others"))
plot(flang, col = "yellow", main = "Histogram of first language")
# Plot for satisfaction level
# Keep only rows whose satis value is at most 7 (the summary shows values up
# to 998, which are excluded from this plot).
# The threshold must be numeric: comparing against the string '7' coerces
# satis to character and compares lexicographically (e.g. "10" <= "7" is
# TRUE), which only gave the right rows by accident.
sl <- mba.df[which(mba.df$satis <= 7), ]
hist(sl$satis, breaks = 5, col = "dark blue", xlab = "Satisfaction level",
     main = "Histogram for Satisfaction")
# Correlogram of every column in mba.df: shaded cells below the diagonal, pie
# glyphs above it, variable names on the diagonal. order = TRUE lets corrgram
# reorder the variables.
library(corrgram)
corrgram(
  mba.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of the dataset"
)
# Covariance matrix of all columns in mba.df.
stats::cov(x = mba.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
# var() on a data frame returns the full variance-covariance matrix, so the
# result matches the cov(mba.df) output printed earlier in the script.
stats::var(x = mba.df)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
# Rows with a positive salary other than 998 / 999.
# NOTE(review): 998 and 999 look like placeholder codes rather than real
# salaries — confirm against the data dictionary.
emp.df <- subset(mba.df, salary > 0 & !(salary %in% c(998, 999)))
# View() is only meaningful in an interactive session; skip it in batch runs.
if (interactive()) {
  View(emp.df)
}
# Mean age for each value of sex within emp.df.
aggregate(age ~ sex, data = emp.df, FUN = mean)
## sex age
## 1 1 27.08333
## 2 2 26.06452
# Mean salary for each distinct age within emp.df.
aggregate(salary ~ age, data = emp.df, FUN = mean)
## age salary
## 1 22 85000.00
## 2 23 91651.20
## 3 24 101518.75
## 4 25 99086.96
## 5 26 101665.00
## 6 27 102214.29
## 7 28 103625.00
## 8 29 102083.33
## 9 30 109916.67
## 10 31 100500.00
## 11 32 107300.00
## 12 33 118000.00
## 13 34 105000.00
## 14 39 112000.00
## 15 40 183000.00
# Mean salary for each value of sex within emp.df.
aggregate(salary ~ sex, data = emp.df, FUN = mean)
## sex salary
## 1 1 104970.97
## 2 2 98524.39
# Mean salary for each distinct GMAT total within emp.df.
aggregate(salary ~ gmat_tot, data = emp.df, FUN = mean)
## gmat_tot salary
## 1 500 158250.0
## 2 520 78256.0
## 3 530 99500.0
## 4 540 104000.0
## 5 550 112236.7
## 6 560 94000.0
## 7 570 103857.1
## 8 580 99875.0
## 9 590 97000.0
## 10 600 107666.7
## 11 610 96200.0
## 12 620 104108.3
## 13 630 105812.5
## 14 640 110000.0
## 15 650 101285.7
## 16 660 92480.0
## 17 670 100642.9
## 18 680 102166.7
## 19 700 122333.3
## 20 710 101250.0
## 21 720 85000.0
# One-sample t-test of salary in emp.df against the default mu = 0.
# Passing the whole data frame pooled all 13 columns into a single sample
# (hence df = 1338 in the pasted output that follows), which is not
# interpretable.
# NOTE(review): salary is assumed to be the intended variable — confirm. The
# pasted output that follows predates this change and must be regenerated.
t.test(emp.df$salary)
##
## One Sample t-test
##
## data: emp.df
## t = 10.492, df = 1338, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 6500.198 9490.068
## sample estimates:
## mean of x
## 7995.133
# Welch two-sample t-test (unequal variances, the default) comparing mean
# salary between the two sex groups in emp.df.
t.test(salary ~ sex, data = emp.df, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
# Welch two-sample t-test (unequal variances, the default) comparing mean
# salary between the two first-language groups in emp.df.
t.test(salary ~ frstlang, data = emp.df, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
# Chi-squared test of independence between work years and satisfaction in
# emp.df; the p-value is simulated, so it varies slightly from run to run.
chisq.test(
  x = emp.df$work_yrs,
  y = emp.df$satis,
  simulate.p.value = TRUE
)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: emp.df$work_yrs and emp.df$satis
## X-squared = 131.13, df = NA, p-value = 0.02399
# Null hypothesis rejected (p = 0.024): work years and satisfaction rating are not independent.
# Chi-squared test of independence between sex and GMAT total in emp.df; the
# p-value is simulated, so it varies slightly from run to run.
chisq.test(
  x = emp.df$sex,
  y = emp.df$gmat_tot,
  simulate.p.value = TRUE
)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: emp.df$sex and emp.df$gmat_tot
## X-squared = 18.554, df = NA, p-value = 0.5982
# Failed to reject the null hypothesis (p = 0.598): no evidence that gender is associated with GMAT total.
# Chi-squared test of independence between sex and salary in emp.df; each
# distinct salary value is treated as its own category. The p-value is
# simulated, so it varies slightly from run to run.
chisq.test(
  x = emp.df$sex,
  y = emp.df$salary,
  simulate.p.value = TRUE
)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: emp.df$sex and emp.df$salary
## X-squared = 52.681, df = NA, p-value = 0.03398
# Null hypothesis rejected (p = 0.034): gender appears to be associated with salary.
# Chi-squared test of independence between first language and salary in
# emp.df; each distinct salary value is treated as its own category. The
# p-value is simulated, so it varies slightly from run to run.
chisq.test(
  x = emp.df$frstlang,
  y = emp.df$salary,
  simulate.p.value = TRUE
)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: emp.df$frstlang and emp.df$salary
## X-squared = 69.847, df = NA, p-value = 0.02349
# Null hypothesis rejected (p = 0.023): first language appears to be associated with salary.
# Model 1: salary regressed on GMAT total, sex, spring/fall averages and
# first language, fitted on emp.df.
m1 <- lm(
  formula = salary ~ gmat_tot + sex + s_avg + f_avg + frstlang,
  data = emp.df
)
summary(m1)
##
## Call:
## lm(formula = salary ~ gmat_tot + sex + s_avg + f_avg + frstlang,
## data = emp.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37896 -8763 -1578 5154 104055
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89490.94 25748.69 3.476 0.000764 ***
## gmat_tot -27.28 33.62 -0.811 0.419067
## sex -7102.35 3678.66 -1.931 0.056441 .
## s_avg 11401.42 4991.92 2.284 0.024554 *
## f_avg -5822.22 3861.27 -1.508 0.134843
## frstlang 20902.76 6711.49 3.114 0.002423 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16810 on 97 degrees of freedom
## Multiple R-squared: 0.1579, Adjusted R-squared: 0.1145
## F-statistic: 3.638 on 5 and 97 DF, p-value: 0.004649
# Model 2: same as model 1 but with work experience in place of GMAT total.
m2 <- lm(
  formula = salary ~ work_yrs + sex + f_avg + s_avg + frstlang,
  data = emp.df
)
summary(m2)
##
## Call:
## lm(formula = salary ~ work_yrs + sex + f_avg + s_avg + frstlang,
## data = emp.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31846 -9078 -1992 5210 83864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75605.2 16094.4 4.698 8.68e-06 ***
## work_yrs 2229.5 572.6 3.893 0.000182 ***
## sex -5829.1 3444.0 -1.693 0.093760 .
## f_avg -1172.7 3805.0 -0.308 0.758581
## s_avg 4562.8 4905.2 0.930 0.354584
## frstlang 15281.9 6426.9 2.378 0.019377 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15690 on 97 degrees of freedom
## Multiple R-squared: 0.2668, Adjusted R-squared: 0.229
## F-statistic: 7.058 on 5 and 97 DF, p-value: 1.15e-05
# Model 3: work experience and GMAT total together, plus sex, fall/spring
# averages and first language.
m3 <- lm(
  formula = salary ~ work_yrs + gmat_tot + sex + f_avg + s_avg + frstlang,
  data = emp.df
)
summary(m3)
##
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot + sex + f_avg + s_avg +
## frstlang, data = emp.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32652 -8940 -1709 5186 83182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82356.30 24206.88 3.402 0.000976 ***
## work_yrs 2201.83 579.92 3.797 0.000257 ***
## gmat_tot -11.90 31.77 -0.375 0.708712
## sex -5886.39 3462.79 -1.700 0.092388 .
## f_avg -1153.74 3822.28 -0.302 0.763422
## s_avg 4851.02 4986.79 0.973 0.333110
## frstlang 15101.77 6473.46 2.333 0.021743 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15760 on 96 degrees of freedom
## Multiple R-squared: 0.2678, Adjusted R-squared: 0.2221
## F-statistic: 5.853 on 6 and 96 DF, p-value: 3.114e-05
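# Model 3 has the highest multiple R-squared (0.2678), but model 2 has the higher adjusted R-squared (0.229 vs 0.2221): adding gmat_tot (p = 0.71) does not improve the fit, so model 2 is the better model.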
# Rows whose salary is exactly 0.
# NOTE(review): presumably the students who were not placed — confirm against
# the data dictionary.
notemp.df <- subset(mba.df, salary == 0)
# View() is only meaningful in an interactive session; skip it in batch runs.
if (interactive()) {
  View(notemp.df)
}
# Mean age for each value of sex within notemp.df.
aggregate(age ~ sex, data = notemp.df, FUN = mean)
## sex age
## 1 1 28.29851
## 2 2 29.13043
# Mean age for each distinct GMAT total within notemp.df.
aggregate(age ~ gmat_tot, data = notemp.df, FUN = mean)
## gmat_tot age
## 1 450 26.00000
## 2 480 43.00000
## 3 510 30.50000
## 4 530 30.33333
## 5 540 29.66667
## 6 550 27.00000
## 7 560 28.50000
## 8 570 27.57143
## 9 580 28.00000
## 10 590 34.66667
## 11 600 24.00000
## 12 610 28.77778
## 13 620 27.25000
## 14 630 32.20000
## 15 640 26.00000
## 16 650 32.00000
## 17 660 26.66667
## 18 670 26.00000
## 19 680 28.00000
## 20 700 32.00000
## 21 710 25.25000
## 22 720 25.00000
## 23 730 25.00000
## 24 740 27.00000
## 25 750 27.00000
## 26 760 32.00000
# Mean age for each distinct number of work years within notemp.df.
aggregate(age ~ work_yrs, data = notemp.df, FUN = mean)
## work_yrs age
## 1 0 24.00000
## 2 1 25.83333
## 3 2 24.95455
## 4 3 27.00000
## 5 4 27.55556
## 6 5 29.33333
## 7 6 28.50000
## 8 7 32.60000
## 9 8 35.00000
## 10 9 37.00000
## 11 10 31.00000
## 12 11 33.00000
## 13 12 34.50000
## 14 13 42.00000
## 15 16 43.00000
## 16 18 36.00000
## 17 22 45.50000
# Chi-squared test of independence between work years and satisfaction in
# notemp.df; the p-value is simulated, so it varies slightly from run to run.
chisq.test(
  x = notemp.df$work_yrs,
  y = notemp.df$satis,
  simulate.p.value = TRUE
)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: notemp.df$work_yrs and notemp.df$satis
## X-squared = 44.974, df = NA, p-value = 0.5392
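# Failed to reject the null hypothesis (p = 0.539): no evidence that work years and satisfaction are dependent in notemp.df.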
# Chi-squared test of independence between sex and GMAT total in notemp.df.
# The original call tested emp.df here, which only repeated the earlier
# emp.df test (same X-squared = 18.554) inside the notemp.df section.
# NOTE(review): the pasted output that follows came from the emp.df call;
# re-run to refresh it before drawing conclusions for notemp.df.
chisq.test(notemp.df$sex, notemp.df$gmat_tot, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: emp.df$sex and emp.df$gmat_tot
## X-squared = 18.554, df = NA, p-value = 0.6022
# Failed to reject the null hypothesis: no evidence that gender and GMAT total are dependent.