mba<-read.csv(paste("MBA Starting Salaries Data.csv",sep=" "))
View(mba)
summary(mba)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
describe(mba)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
str(mba)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
attach(mba)
hist(sex,col="light green")
Most of the students were Male.
boxplot(age,col="light green",main="Age Distribution of the students",ylab="Age")
Most of the people are young.
barplot(table(satis[satis!=998]),col=c("light green"))
hist(salary,main="Salary distribution",col="light green")
hist(quarter,col="light green",main="Histogram of Quartile Ranking")
hist(work_yrs,col="light green")
boxplot(salary~sex,col=c("light blue","pink"),las=2,main="Plot of Sex and Salary",horizontal=TRUE,xlab="Salary",ylab="Sex",names=c("Females","Males"))
Clearly we can see that the average and the 1st quartile salaries are almost same for both men and women but the peak salaries for females are much heigher than the males.
plot(gmat_tot,salary,main="Salary vs Total score in GMAT",xlab="GMAT score",ylab="Salary")
plot(gmat_tpc,salary)
boxplot(salary~work_yrs,main="Work experience vs Salary",col="peachpuff",xlab="Work Experience(in years)",ylab="Salary")
boxplot(salary~quarter,main="Quartile Ranking vs Salary",col="peachpuff",xlab="Quartile Rankings",ylab="Salary")
The plot shows that high quartile ranking will get you better salary.
plot(salary~age,main="Salary vs Age")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplotMatrix(formula = ~ age + gmat_tot +s_avg +f_avg + work_yrs +frstlang, cex=1,
data=mba,diagonal="histogram")
library(corrgram)
corrgram(mba,lower.panel = panel.shade,upper.panel = panel.pie,text.panel = panel.txt)
var(mba)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
cov(mba)
## age sex gmat_tot gmat_qpc
## age 1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex -4.513248e-02 1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00 3.310688e+03 6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00 6.200233e+02 2.210731e+02
## gmat_vpc -2.763643e+00 5.463758e-01 7.260006e+02 3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02 6.839911e+02 1.357997e+02
## s_avg 2.116874e-01 2.096227e-02 2.480257e+00 -1.691233e-01
## f_avg -3.399348e-02 2.082698e-02 3.154688e+00 5.753854e-01
## quarter -2.045935e-01 -6.414267e-02 -5.891153e+00 6.001979e-01
## work_yrs 1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang 6.796610e-02 2.138980e-04 -2.499933e+00 6.646346e-01
## salary -1.183042e+04 1.518264e+03 -1.611600e+05 -3.335823e+04
## satis -1.763499e+02 -8.780808e+00 1.765263e+03 3.348371e+02
## gmat_vpc gmat_tpc s_avg f_avg
## age -2.7636427 -8.8399775 0.21168739 -0.03399348
## sex 0.5463758 -0.0490896 0.02096227 0.02082698
## gmat_tot 726.0006417 683.9910698 2.48025721 3.15468838
## gmat_qpc 38.1482581 135.7996845 -0.16912329 0.57538542
## gmat_vpc 284.2481217 157.4932488 1.31357023 0.67207000
## gmat_tpc 157.4932488 196.6057057 0.62710008 0.58698618
## s_avg 1.3135702 0.6271001 0.14521760 0.11016898
## f_avg 0.6720700 0.5869862 0.11016898 0.27567237
## quarter -3.2676666 -1.2923719 -0.32237213 -0.26080880
## work_yrs -3.6181653 -7.8575172 0.15926392 -0.06628700
## frstlang -2.1145691 -0.4663244 -0.01671372 -0.00626026
## salary -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis 392.3562739 484.2466779 -4.62884495 2.12532927
## quarter work_yrs frstlang salary
## age -2.045935e-01 10.29493864 6.796610e-02 -1.183042e+04
## sex -6.414267e-02 -0.01580172 2.138980e-04 1.518264e+03
## gmat_tot -5.891153e+00 -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc 6.001979e-01 -11.37186171 6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00 -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00 -7.85751718 -4.663244e-01 3.522750e+03
## s_avg -3.223721e-01 0.15926392 -1.671372e-02 2.831601e+03
## f_avg -2.608088e-01 -0.06628700 -6.260260e-03 7.876560e+02
## quarter 1.232119e+00 -0.30866822 3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01 10.44882490 -2.898318e-02 1.486147e+03
## frstlang 3.553381e-02 -0.02898318 1.035266e-01 -1.419586e+03
## salary -9.296214e+03 1486.14704152 -1.419586e+03 2.596062e+09
## satis -5.227133e-03 -131.24080907 9.484532e+00 -6.347115e+06
## satis
## age -1.763499e+02
## sex -8.780808e+00
## gmat_tot 1.765263e+03
## gmat_qpc 3.348371e+02
## gmat_vpc 3.923563e+02
## gmat_tpc 4.842467e+02
## s_avg -4.628845e+00
## f_avg 2.125329e+00
## quarter -5.227133e-03
## work_yrs -1.312408e+02
## frstlang 9.484532e+00
## salary -6.347115e+06
## satis 1.380974e+05
selected<-mba[which(salary>1000),] #As 998 and 999 are the answers of those "did not answer" and "did not disclose salary" respectively.
library(car)
some(selected)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 46 23 2 650 93 81 93 3.4 3.00 1
## 53 30 1 600 60 91 83 3.3 3.25 1
## 55 30 1 620 60 96 87 3.5 3.00 1
## 67 30 2 670 87 95 95 3.3 3.25 1
## 120 24 1 560 52 81 72 3.2 3.25 2
## 193 28 1 580 72 71 78 2.8 3.00 3
## 194 24 2 670 83 98 96 2.9 3.25 3
## 199 29 1 710 93 98 99 2.9 3.25 3
## 207 32 1 660 83 95 94 2.9 3.50 3
## 256 24 2 560 55 78 71 3.5 3.25 4
## work_yrs frstlang salary satis
## 46 2 1 100000 7
## 53 5 1 105000 6
## 55 8 1 106000 7
## 67 8 1 120000 6
## 120 2 1 96000 7
## 193 3 1 97000 6
## 194 2 1 98000 7
## 199 7 1 98000 5
## 207 2 2 107300 7
## 256 2 1 64000 7
mytable1<-aggregate(salary~sex,data=selected,mean)
mytable1
## sex salary
## 1 1 104970.97
## 2 2 98524.39
It can easily be seen that the average salary of Male is much larger than Females.
mytable2<-aggregate(salary~frstlang,data=selected,mean)
mytable2
## frstlang salary
## 1 1 101748.6
## 2 2 120614.3
It can be clearly seen that english speaking students and non english speaking students both have comparable average salaries.
mytable3<-aggregate(salary~work_yrs,data=selected,mean)
mytable3
## work_yrs salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
There is almost no affect on salary of experiences and non experienced people.
mytable4<-aggregate(salary~gmat_tot,data=selected,mean)
mytable4
## gmat_tot salary
## 1 500 158250.0
## 2 520 78256.0
## 3 530 99500.0
## 4 540 104000.0
## 5 550 112236.7
## 6 560 94000.0
## 7 570 103857.1
## 8 580 99875.0
## 9 590 97000.0
## 10 600 107666.7
## 11 610 96200.0
## 12 620 104108.3
## 13 630 105812.5
## 14 640 110000.0
## 15 650 101285.7
## 16 660 92480.0
## 17 670 100642.9
## 18 680 102166.7
## 19 700 122333.3
## 20 710 101250.0
## 21 720 85000.0
The gmat score has no effect on the salary.
mytable5<-aggregate(salary~quarter,data=selected,mean)
mytable5
## quarter salary
## 1 1 106328.6
## 2 2 103612.0
## 3 3 98319.0
## 4 4 102142.6
Quartile Ranking has no effect on salary as well.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable1
## X-squared = 0.0030128, df = 1, p-value = 0.9562
As p-value>0.05 therefore the test shows that sex is independant of salary
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable2
## X-squared = 1.0827e-22, df = 1, p-value = 1
As p-value>0.05 therefore the test shows that First language is independant of salary.
chisq.test(mytable3)
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 33.445, df = 11, p-value = 0.0004455
As p-value<0.05 therefore the test shows that salary is dependant upon work experience a person have.
t.test(mytable4)
##
## One Sample t-test
##
## data: mytable4
## t = 6.343, df = 41, p-value = 1.406e-07
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 35520.98 68705.25
## sample estimates:
## mean of x
## 52113.12
As p-value<0.05 therefore the test shows that salary is dependant upon GMAT total score of a person.
t.test(mytable5)
##
## One Sample t-test
##
## data: mytable5
## t = 2.6438, df = 7, p-value = 0.03324
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 5416.909 97186.142
## sample estimates:
## mean of x
## 51301.53
As p-value<0.05 therefore the test shows that Quartile Ranking is dependant of salary.
notselected<-subset(mba,salary==0)
some(notselected)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1 23 2 620 77 87 87 3.40 3.00 1
## 7 25 1 610 89 74 87 3.40 3.50 1
## 24 28 2 540 75 50 65 3.60 4.00 1
## 29 32 1 640 79 91 91 3.60 3.75 1
## 33 42 2 650 75 98 93 3.38 3.00 1
## 34 48 1 590 84 62 81 3.80 4.00 1
## 92 27 1 720 99 95 99 3.10 3.25 2
## 150 25 1 550 72 58 69 2.90 3.00 3
## 183 34 1 610 79 81 86 2.80 3.00 3
## 236 28 1 710 94 98 99 3.40 3.75 4
## work_yrs frstlang salary satis
## 1 2 1 0 7
## 7 2 1 0 5
## 24 5 1 0 5
## 29 7 1 0 6
## 33 13 1 0 5
## 34 22 1 0 6
## 92 5 1 0 5
## 150 3 1 0 6
## 183 11 1 0 6
## 236 6 1 0 6
mytable1<-with(notselected,table(sex))
mytable1
## sex
## 1 2
## 67 23
More number of males are not selected.
mytable2<-with(notselected,table(frstlang))
mytable2
## frstlang
## 1 2
## 82 8
A large number of people exactly 82 of them knew English and didn’t get selected.
mytable3<-with(notselected,table(work_yrs))
mytable3
## work_yrs
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 16 18 22
## 1 12 22 14 9 12 2 5 2 1 1 2 2 1 1 1 2
People with less work experience and without selection are large in number.
mytable4<-with(notselected,table(gmat_tot))
mytable4
## gmat_tot
## 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670
## 1 1 2 3 3 4 8 7 4 3 3 9 4 5 6 5 3 4
## 680 700 710 720 730 740 750 760
## 3 2 4 2 1 1 1 1
People not getting selected vary wastly in their GMAT Scores.
mytable5<-with(notselected,table(quarter))
mytable5
## quarter
## 1 2 3 4
## 18 27 23 22
People with heigher quartile are not selected but the difference is not too large.
fit1<-lm(salary~gmat_qpc+gmat_tot+gmat_tpc+gmat_vpc,data=mba)
summary(fit1)
##
## Call:
## lm(formula = salary ~ gmat_qpc + gmat_tot + gmat_tpc + gmat_vpc,
## data = mba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48199 -41195 -33034 56735 182897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 141539.0 59303.9 2.387 0.0177 *
## gmat_qpc 465.7 615.2 0.757 0.4497
## gmat_tot -369.7 222.7 -1.660 0.0980 .
## gmat_tpc 523.2 443.0 1.181 0.2386
## gmat_vpc 573.4 563.0 1.018 0.3094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50900 on 269 degrees of freedom
## Multiple R-squared: 0.01651, Adjusted R-squared: 0.001889
## F-statistic: 1.129 on 4 and 269 DF, p-value: 0.343
The above model is not a good model as Multiple R-squared = 1.65% only and the Adjusted R-squared = 0.19% only.And by looking at the data we can say that salary is independent of the marks that you get in your GMAT exam.
fit3<-lm(salary~work_yrs+s_avg+f_avg+quarter,data=mba)
summary(fit3)
##
## Call:
## lm(formula = salary ~ work_yrs + s_avg + f_avg + quarter, data = mba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66801 -40287 -29000 54493 197266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41852.2 46698.6 0.896 0.371
## work_yrs -274.3 961.9 -0.285 0.776
## s_avg 12329.4 13418.1 0.919 0.359
## f_avg -7861.6 7042.9 -1.116 0.265
## quarter -6051.9 4257.7 -1.421 0.156
##
## Residual standard error: 50490 on 269 degrees of freedom
## Multiple R-squared: 0.03258, Adjusted R-squared: 0.01819
## F-statistic: 2.265 on 4 and 269 DF, p-value: 0.06256
This model is also not a good model as Multiple R-squared = 3.25% only and the Adjusted R-squared = 1.81% only.And by looking at the data we can say that salary is independent of Marks obtained in course and work experience.