# Load the MBA starting-salaries data set into `mba.df`.
# The data directory is held in a variable and joined with file.path() instead
# of calling setwd(), so the session's working directory is left untouched.
data_dir <- "C:\\Users\\Adithya Nataraj\\Downloads"
mba.df <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))
# View() opens a data-viewer window, which is only useful (and only reliably
# available) in an interactive session.
if (interactive()) {
  View(mba.df)
}

Summary

# mba1.df: rows of mba.df whose salary is neither 998 nor 999 and whose satis
# is not 998 (presumably placeholder codes -- confirm with the data dictionary).
# which() is used so that rows where the condition is NA are dropped.
keep_rows <- which(
  mba.df$salary != 998 &
    mba.df$salary != 999 &
    mba.df$satis != 998
)
mba1.df <- mba.df[keep_rows, ]

# mba2.df: the mba1.df rows with a non-zero salary.
mba2.df <- mba1.df[which(mba1.df$salary != 0), ]

# mba3.df: the mba1.df rows with a salary of exactly zero.
mba3.df <- mba1.df[which(mba1.df$salary == 0), ]

summary(mba1.df)
##       age             sex          gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.00   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.00   1st Qu.:570.0   1st Qu.:72.00  
##  Median :27.00   Median :1.00   Median :610.0   Median :82.00  
##  Mean   :27.59   Mean   :1.28   Mean   :615.2   Mean   :79.35  
##  3rd Qu.:29.00   3rd Qu.:2.00   3rd Qu.:650.0   3rd Qu.:91.00  
##  Max.   :48.00   Max.   :2.00   Max.   :760.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:75.00   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :87.00   Median :3.090   Median :3.000  
##  Mean   :78.13   Mean   :83.48   Mean   :3.064   Mean   :3.078  
##  3rd Qu.:91.00   3rd Qu.:93.00   3rd Qu.:3.300   3rd Qu.:3.330  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.000   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median : 85000  
##  Mean   :2.394   Mean   : 4.104   Mean   :1.078   Mean   : 54985  
##  3rd Qu.:3.000   3rd Qu.: 5.000   3rd Qu.:1.000   3rd Qu.:100000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.762  
##  3rd Qu.:6.000  
##  Max.   :7.000

Mean, Median and Standard deviation of Age

# Centre and spread of age over all rows of mba.df.
mean(mba.df[["age"]])
## [1] 27.35766
median(mba.df[["age"]])
## [1] 27
sd(mba.df[["age"]])
## [1] 3.710666

Mean, Median and Standard deviation of total GMAT score

# Centre and spread of the total GMAT score over all rows of mba.df.
mean(mba.df[["gmat_tot"]])
## [1] 619.4526
median(mba.df[["gmat_tot"]])
## [1] 620
sd(mba.df[["gmat_tot"]])
## [1] 57.53858

Mean, Median and Standard deviation of quantitative GMAT percentile

# Centre and spread of the quantitative GMAT percentile over all rows of mba.df.
mean(mba.df[["gmat_qpc"]])
## [1] 80.64234
median(mba.df[["gmat_qpc"]])
## [1] 83
sd(mba.df[["gmat_qpc"]])
## [1] 14.86853

Mean, Median and Standard deviation of verbal GMAT percentile

# Centre and spread of the verbal GMAT percentile over all rows of mba.df.
mean(mba.df[["gmat_vpc"]])
## [1] 78.32117
median(mba.df[["gmat_vpc"]])
## [1] 81
sd(mba.df[["gmat_vpc"]])
## [1] 16.85966

Mean, Median and Standard deviation of overall GMAT percentile

# Centre and spread of the overall GMAT percentile over all rows of mba.df.
mean(mba.df[["gmat_tpc"]])
## [1] 84.19708
median(mba.df[["gmat_tpc"]])
## [1] 87
sd(mba.df[["gmat_tpc"]])
## [1] 14.02162

Mean, Median and Standard deviation of spring MBA average

# Centre and spread of the spring MBA average over all rows of mba.df.
mean(mba.df[["s_avg"]])
## [1] 3.025401
median(mba.df[["s_avg"]])
## [1] 3
sd(mba.df[["s_avg"]])
## [1] 0.3810743

Mean, Median and Standard deviation of fall MBA average

# Centre and spread of the fall MBA average over all rows of mba.df.
mean(mba.df[["f_avg"]])
## [1] 3.061533
median(mba.df[["f_avg"]])
## [1] 3
sd(mba.df[["f_avg"]])
## [1] 0.5250451

Mean, Median and Standard deviation of years of work experience

# Centre and spread of years of work experience over all rows of mba.df.
mean(mba.df[["work_yrs"]])
## [1] 3.872263
median(mba.df[["work_yrs"]])
## [1] 3
sd(mba.df[["work_yrs"]])
## [1] 3.232464

Mean, Median and Standard deviation of salary

# Salary statistics use mba1.df; rows with a salary of 0 are still included.
mean(mba1.df[["salary"]])
## [1] 54985.32
median(mba1.df[["salary"]])
## [1] 85000
sd(mba1.df[["salary"]])
## [1] 53152.39

Boxplots of the various variables

# One boxplot per variable. The first eight are drawn from the full data set
# (mba.df); salary and satisfaction are drawn from mba1.df.
full_data_titles <- c(
  age = "Age",
  gmat_tot = "Total GMAT score",
  gmat_qpc = "Quantitative GMAT percentile",
  gmat_vpc = "Verbal GMAT percentile",
  gmat_tpc = "Overall GMAT percentile",
  s_avg = "Spring MBA average",
  f_avg = "Fall MBA average",
  work_yrs = "Years of work experience"
)
for (column in names(full_data_titles)) {
  boxplot(mba.df[[column]], main = full_data_titles[[column]])
}

filtered_titles <- c(salary = "Salary", satis = "Degree of satisfaction")
for (column in names(filtered_titles)) {
  boxplot(mba1.df[[column]], main = filtered_titles[[column]])
}

Barplots of the various variables

# One bar chart of value frequencies per variable. The first eight are drawn
# from the full data set (mba.df); salary and satisfaction from mba1.df.
# `count` is overwritten on every pass and ends as the satisfaction table.
full_data_titles <- c(
  age = "Age",
  gmat_tot = "Total GMAT score",
  gmat_qpc = "Quantitative GMAT percentile",
  gmat_vpc = "Verbal GMAT percentile",
  gmat_tpc = "Overall GMAT percentile",
  s_avg = "Spring MBA average",
  f_avg = "Fall MBA average",
  work_yrs = "Years of work experience"
)
for (column in names(full_data_titles)) {
  count <- table(mba.df[[column]])
  barplot(count, main = full_data_titles[[column]])
}

filtered_titles <- c(salary = "Salary", satis = "Degree of satisfaction")
for (column in names(filtered_titles)) {
  count <- table(mba1.df[[column]])
  barplot(count, main = filtered_titles[[column]])
}

Scatterplots between the variables

# Scatterplots of salary (x-axis) against each other variable (y-axis), all
# taken from mba1.df. Axis labels are supplied explicitly; without them plot()
# labels the axes with the argument expressions (e.g. "mba1.df$salary").
# Plot titles are unchanged: "Salary vs <variable description>".
y_labels <- c(
  age = "Age",
  gmat_tot = "Total GMAT score",
  gmat_qpc = "Quantitative GMAT percentile",
  gmat_vpc = "Verbal GMAT percentile",
  gmat_tpc = "Overall GMAT percentile",
  s_avg = "Spring MBA average",
  f_avg = "Fall MBA average",
  quarter = "Quartile ranking",
  satis = "Degree of satisfaction"
)
for (column in names(y_labels)) {
  plot(
    mba1.df[["salary"]], mba1.df[[column]],
    main = paste("Salary vs", y_labels[[column]]),
    xlab = "Salary",
    ylab = y_labels[[column]]
  )
}

Correlation plot of the variables

# Correlation matrix of the first 13 columns of mba1.df, computed on complete
# observations only and drawn with ellipse glyphs.
library(corrplot)
## corrplot 0.84 loaded
correlations <- cor(mba1.df[, 1:13], use = "complete.obs")
corrplot(corr = correlations, method = "ellipse")

Corrgram of all the variables

# Corrgram of every column of mba1.df: shaded cells below the diagonal, pie
# glyphs above it, min/max on the diagonal, variables reordered by corrgram.
library(corrgram)
corrgram(
  mba1.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  main = "Corrgram of all the Variables"
)

Variance-Covariance Matrix

# Covariance of each of eight background columns (x) with salary and satis (y),
# all taken from mba1.df.
x_columns <- c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc",
  "gmat_tpc", "s_avg", "f_avg", "work_yrs"
)
y_columns <- c("salary", "satis")
x <- mba1.df[, x_columns]
y <- mba1.df[, y_columns]
cov(x, y)
##               salary       satis
## age      -29185.2850 -0.23993415
## gmat_tot   -170.8814  3.49336140
## gmat_qpc  22855.7178 -0.23453692
## gmat_vpc   2901.3078  2.43207578
## gmat_tpc  43822.5292  1.39148856
## s_avg      1940.5276 -0.01361264
## f_avg       244.3157 -0.04613153
## work_yrs -10442.6267 -0.02204771

Contingency table and Chi-Square tests for the employed people

Two-way with Salary and Gender

Null hypothesis: Salary and Gender are independent of each other.

# Cross-tabulate salary (rows) against sex (columns) for mba2.df and print it.
mytable <- xtabs(formula = ~ salary + sex, data = mba2.df)
print(mytable)
##         sex
## salary    1  2
##   64000   0  1
##   77000   1  0
##   78256   0  1
##   82000   0  1
##   85000   1  3
##   86000   0  2
##   88000   0  1
##   88500   1  0
##   90000   3  0
##   92000   2  1
##   93000   2  1
##   95000   4  3
##   96000   3  1
##   96500   1  0
##   97000   2  0
##   98000   6  4
##   99000   0  1
##   100000  4  5
##   100400  1  0
##   101000  0  2
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  2  0
##   105000 11  0
##   106000  2  1
##   107000  1  0
##   107300  1  0
##   107500  1  0
##   108000  2  0
##   110000  0  1
##   112000  3  0
##   115000  5  0
##   118000  1  0
##   120000  3  1
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
# Row totals: number of graduates at each salary.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates in each sex category.
margin.table(mytable, margin = 2)
## sex
##  1  2 
## 72 31
# Pearson chi-squared test of independence on the salary-by-sex table.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 52.681, df = 41, p-value = 0.1045

Insights from the contingency table: Among the employed graduates there are 72 males and 31 females. Both the highest salary (220000) and the lowest salary (64000) were received by a female. The most common salary is 105000, received by 11 graduates, all of them male. The second most common salary is 98000, received by 10 graduates (6 males and 4 females).

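Insights from the Pearson’s Chi-square test: The p-value is 0.1045. As p>0.05, we fail to reject the null hypothesis: the data do not provide evidence of a relationship between Salary and Gender. Note that R warns the chi-squared approximation may be incorrect for this sparse table, so the result should be read with caution.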

Two-way with Salary and First Language

Null hypothesis: Salary and First language are independent of each other.

# Cross-tabulate salary (rows) against first language (columns) for mba2.df
# and print it.
mytable <- xtabs(formula = ~ salary + frstlang, data = mba2.df)
print(mytable)
##         frstlang
## salary    1  2
##   64000   1  0
##   77000   1  0
##   78256   1  0
##   82000   1  0
##   85000   4  0
##   86000   2  0
##   88000   1  0
##   88500   1  0
##   90000   3  0
##   92000   3  0
##   93000   3  0
##   95000   7  0
##   96000   4  0
##   96500   1  0
##   97000   2  0
##   98000   8  2
##   99000   0  1
##   100000  9  0
##   100400  1  0
##   101000  2  0
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  1  1
##   105000 11  0
##   106000  3  0
##   107000  1  0
##   107300  0  1
##   107500  1  0
##   108000  2  0
##   110000  1  0
##   112000  3  0
##   115000  5  0
##   118000  0  1
##   120000  4  0
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
# Row totals: number of graduates at each salary.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates in each first-language category.
margin.table(mytable, margin = 2)
## frstlang
##  1  2 
## 96  7
# Pearson chi-squared test of independence on the salary-by-frstlang table.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 69.847, df = 41, p-value = 0.003296

Insights from the contingency table: The highest salary was received by a graduate whose first language is not English, while the lowest salary was received by a graduate whose first language is English. The most common salary, 105000, was received by 11 graduates, all of whom have English as their first language. In total, 96 of the employed graduates have English as their first language and 7 do not.

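Insights from the Pearson’s Chi-square test: The p-value is 0.003296. As p<0.05, we reject the null hypothesis: there appears to be a relationship between Salary and First language. Note that R warns the chi-squared approximation may be incorrect for this sparse table, so the result should be read with caution.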

Two-way with Salary and prior-work experience

Null hypothesis: Salary and prior-work experience are independent of each other.

# Cross-tabulate salary (rows) against years of work experience (columns) for
# mba2.df and print it.
mytable <- xtabs(formula = ~ salary + work_yrs, data = mba2.df)
print(mytable)
##         work_yrs
## salary   0 1 2 3 4 5 6 7 8 10 15 16
##   64000  0 0 1 0 0 0 0 0 0  0  0  0
##   77000  0 0 1 0 0 0 0 0 0  0  0  0
##   78256  0 1 0 0 0 0 0 0 0  0  0  0
##   82000  0 1 0 0 0 0 0 0 0  0  0  0
##   85000  0 1 2 1 0 0 0 0 0  0  0  0
##   86000  0 0 1 1 0 0 0 0 0  0  0  0
##   88000  0 0 0 1 0 0 0 0 0  0  0  0
##   88500  0 0 0 1 0 0 0 0 0  0  0  0
##   90000  0 0 2 0 0 1 0 0 0  0  0  0
##   92000  0 0 3 0 0 0 0 0 0  0  0  0
##   93000  0 0 0 0 1 1 0 0 1  0  0  0
##   95000  1 1 2 2 0 1 0 0 0  0  0  0
##   96000  0 1 2 0 1 0 0 0 0  0  0  0
##   96500  0 0 1 0 0 0 0 0 0  0  0  0
##   97000  0 0 0 1 1 0 0 0 0  0  0  0
##   98000  0 0 7 1 1 0 0 1 0  0  0  0
##   99000  0 0 0 0 0 1 0 0 0  0  0  0
##   100000 0 0 6 1 1 0 1 0 0  0  0  0
##   100400 0 0 0 1 0 0 0 0 0  0  0  0
##   101000 0 0 2 0 0 0 0 0 0  0  0  0
##   101100 0 0 0 0 0 0 0 0 1  0  0  0
##   101600 0 0 0 1 0 0 0 0 0  0  0  0
##   102500 0 0 0 0 0 0 1 0 0  0  0  0
##   103000 0 0 0 1 0 0 0 0 0  0  0  0
##   104000 0 0 0 0 2 0 0 0 0  0  0  0
##   105000 0 0 4 4 0 1 1 0 0  0  0  1
##   106000 0 0 0 0 0 0 2 0 1  0  0  0
##   107000 0 0 1 0 0 0 0 0 0  0  0  0
##   107300 0 0 1 0 0 0 0 0 0  0  0  0
##   107500 0 0 0 1 0 0 0 0 0  0  0  0
##   108000 0 0 0 1 1 0 0 0 0  0  0  0
##   110000 0 0 0 0 0 0 1 0 0  0  0  0
##   112000 0 0 1 0 0 0 1 0 0  0  0  1
##   115000 0 2 0 1 2 0 0 0 0  0  0  0
##   118000 0 0 0 0 0 0 0 0 0  1  0  0
##   120000 0 0 0 1 0 2 0 0 1  0  0  0
##   126710 0 0 0 1 0 0 0 0 0  0  0  0
##   130000 0 0 0 0 1 0 0 0 0  0  0  0
##   145800 0 0 1 0 0 0 0 0 0  0  0  0
##   146000 0 0 0 0 0 0 0 0 0  0  1  0
##   162000 0 1 0 0 0 0 0 0 0  0  0  0
##   220000 0 0 0 0 0 0 0 0 0  0  1  0
# Row totals: number of graduates at each salary.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates at each number of years of experience.
margin.table(mytable, margin = 2)
## work_yrs
##  0  1  2  3  4  5  6  7  8 10 15 16 
##  1  8 38 21 11  7  7  1  4  1  2  2
# Pearson chi-squared test of independence on the salary-by-work_yrs table.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 535.23, df = 451, p-value = 0.003809

Insights from the contingency table: The largest group is the 38 graduates with 2 years of prior work experience, followed by a second peak of 21 graduates with 3 years. Among these employed graduates, prior work experience ranges from 0 years (1 graduate) to 16 years (2 graduates). The graduate with the lowest salary of 64000 has 2 years of prior work experience, and the graduate with the highest salary of 220000 has 15 years.

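Insights from the Pearson’s Chi-square test: The p-value is 0.003809. As p<0.05, we reject the null hypothesis: there appears to be a relationship between Salary and prior-work experience. Note that R warns the chi-squared approximation may be incorrect for this sparse table, so the result should be read with caution.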

Two way with Salary and Total GMAT score

Null hypothesis: Salary and Total GMAT score are independent of each other.

# Raise the print limit so the wide table below is printed in full.
options(max.print = 1000000)
# Cross-tabulate salary (rows) against total GMAT score (columns) for mba2.df
# and print it.
mytable <- xtabs(formula = ~ salary + gmat_tot, data = mba2.df)
print(mytable)
##         gmat_tot
## salary   500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
##   64000    0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
##   77000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   78256    0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   82000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   85000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   86000    0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   88000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   88500    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   90000    0   0   0   0   0   0   0   1   0   0   0   0   1   0   1   0
##   92000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   93000    0   0   0   1   0   0   0   0   0   0   1   1   0   0   0   0
##   95000    0   0   1   0   0   2   0   0   0   0   2   0   0   0   0   0
##   96000    0   0   0   0   0   1   0   0   1   1   0   0   0   0   1   0
##   96500    1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   97000    0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0
##   98000    0   0   0   0   0   1   3   1   1   0   1   0   0   0   0   0
##   99000    0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##   100000   0   0   0   0   0   2   0   1   0   1   1   0   1   0   2   0
##   100400   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   101000   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0
##   101100   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   101600   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   102500   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   103000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   104000   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0
##   105000   0   0   0   0   2   0   2   3   0   1   0   1   0   0   1   0
##   106000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   107000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   107300   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   107500   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   108000   0   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0
##   110000   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
##   112000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   115000   0   0   0   1   0   0   1   0   0   0   0   1   1   0   0   0
##   118000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   120000   0   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0
##   126710   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##   130000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   145800   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   146000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   162000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   220000   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##         gmat_tot
## salary   670 680 700 710 720
##   64000    0   0   0   0   0
##   77000    0   0   0   0   0
##   78256    0   0   0   0   0
##   82000    1   0   0   0   0
##   85000    0   0   1   0   1
##   86000    0   1   0   0   0
##   88000    0   0   0   0   0
##   88500    0   0   0   0   0
##   90000    0   0   0   0   0
##   92000    0   0   0   1   0
##   93000    0   0   0   0   0
##   95000    2   0   0   0   0
##   96000    0   0   0   0   0
##   96500    0   0   0   0   0
##   97000    0   0   0   0   0
##   98000    1   1   0   1   0
##   99000    0   0   0   0   0
##   100000   0   0   0   1   0
##   100400   0   0   0   0   0
##   101000   0   0   0   0   0
##   101100   0   0   0   0   0
##   101600   0   0   0   0   0
##   102500   1   0   0   0   0
##   103000   0   0   0   0   0
##   104000   0   0   0   0   0
##   105000   0   1   0   0   0
##   106000   0   2   0   0   0
##   107000   0   0   0   0   0
##   107300   0   0   0   0   0
##   107500   0   0   0   0   0
##   108000   0   0   0   0   0
##   110000   0   0   0   0   0
##   112000   1   1   0   0   0
##   115000   0   0   0   1   0
##   118000   0   0   0   0   0
##   120000   1   0   1   0   0
##   126710   0   0   0   0   0
##   130000   0   0   0   0   0
##   145800   0   0   0   0   0
##   146000   0   0   0   0   0
##   162000   0   0   1   0   0
##   220000   0   0   0   0   0
# Row totals: number of graduates at each salary.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates at each total GMAT score.
margin.table(mytable, margin = 2)
## gmat_tot
## 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680 
##   2   1   2   2   3   8   7   8   2   9   5  12   8   1   7   5   7   6 
## 700 710 720 
##   3   4   1
# Pearson chi-squared test of independence on the salary-by-gmat_tot table.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 927.24, df = 820, p-value = 0.005279

Insights from the contingency table: The person with the highest salary of 220000 has a GMAT score of 500. The person with the lowest salary of 64000 has a GMAT score of 560. The most common GMAT score is 620, shared by 12 graduates.

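Insights from the Pearson’s Chi-square test: The p-value is 0.005279. As p<0.05, we reject the null hypothesis: there appears to be a relationship between Salary and Total GMAT score. Note that R warns the chi-squared approximation may be incorrect for this sparse table, so the result should be read with caution.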

Two way with Salary and Spring MBA average

Null hypothesis: Salary and Spring MBA average are independent of each other.

# Cross-tabulate salary (rows) against spring MBA average (columns) for mba2.df
# and print it.
mytable <- xtabs(formula = ~ salary + s_avg, data = mba2.df)
print(mytable)
##         s_avg
## salary   2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.91 3 3.09 3.1 3.2 3.27 3.3 3.4
##   64000    0   0   0   0   0   0   0   0    0 0    0   0   0    0   0   0
##   77000    0   0   0   1   0   0   0   0    0 0    0   0   0    0   0   0
##   78256    0   0   0   0   0   0   0   1    0 0    0   0   0    0   0   0
##   82000    0   0   0   0   0   0   0   0    0 0    0   1   0    0   0   0
##   85000    0   0   1   0   0   0   0   0    0 0    0   0   0    0   1   0
##   86000    0   0   0   0   0   0   0   1    0 0    0   0   0    0   0   0
##   88000    0   0   0   0   0   0   0   0    0 0    0   0   0    0   0   1
##   88500    0   0   0   0   0   1   0   0    0 0    0   0   0    0   0   0
##   90000    0   0   1   0   0   1   0   1    0 0    0   0   0    0   0   0
##   92000    0   0   0   0   0   0   0   0    0 0    0   1   0    0   1   1
##   93000    0   0   0   0   0   1   0   0    0 0    0   1   0    0   0   1
##   95000    0   0   1   0   0   0   0   1    0 0    0   0   1    1   2   0
##   96000    0   0   0   1   0   0   0   0    0 0    0   0   1    0   2   0
##   96500    0   0   0   0   0   0   0   0    0 1    0   0   0    0   0   0
##   97000    0   0   0   0   0   0   1   1    0 0    0   0   0    0   0   0
##   98000    0   1   0   0   0   1   1   4    0 1    0   0   2    0   0   0
##   99000    0   0   0   0   0   0   0   0    0 0    0   1   0    0   0   0
##   100000   0   0   0   0   2   0   1   1    0 1    1   0   0    0   1   2
##   100400   0   0   0   0   1   0   0   0    0 0    0   0   0    0   0   0
##   101000   0   0   0   0   0   0   1   0    0 0    0   1   0    0   0   0
##   101100   0   0   0   0   0   0   1   0    0 0    0   0   0    0   0   0
##   101600   0   0   0   0   1   0   0   0    0 0    0   0   0    0   0   0
##   102500   0   0   0   0   0   0   1   0    0 0    0   0   0    0   0   0
##   103000   0   0   0   0   0   0   0   0    0 0    0   0   1    0   0   0
##   104000   0   0   1   0   0   0   0   0    0 0    0   0   1    0   0   0
##   105000   1   0   0   0   0   0   0   0    1 2    0   0   1    0   2   0
##   106000   0   0   0   0   0   0   0   1    0 0    0   0   0    0   0   0
##   107000   0   0   0   0   0   0   0   0    0 0    1   0   0    0   0   0
##   107300   0   0   0   0   0   0   0   1    0 0    0   0   0    0   0   0
##   107500   0   0   0   0   0   0   0   0    0 0    0   0   0    0   1   0
##   108000   0   0   0   0   0   0   0   1    0 0    0   0   0    0   0   0
##   110000   0   0   0   0   0   0   0   0    0 0    0   0   0    0   0   0
##   112000   0   0   0   0   0   0   1   0    0 0    0   1   0    0   0   0
##   115000   0   0   0   0   1   0   0   0    0 1    0   0   1    0   0   0
##   118000   0   0   0   0   0   0   0   0    0 0    0   0   0    0   0   0
##   120000   0   0   0   0   0   0   0   0    0 0    0   0   0    0   1   0
##   126710   0   0   0   0   1   0   0   0    0 0    0   0   0    0   0   0
##   130000   0   0   0   0   0   0   0   0    0 0    0   0   1    0   0   0
##   145800   0   0   0   0   0   0   0   0    0 0    0   1   0    0   0   0
##   146000   0   0   0   0   0   0   0   0    0 0    0   0   0    0   0   0
##   162000   0   0   0   0   0   0   0   0    0 0    0   0   0    0   0   0
##   220000   0   0   0   1   0   0   0   0    0 0    0   0   0    0   0   0
##         s_avg
## salary   3.45 3.5 3.6 3.7 3.8 4
##   64000     0   1   0   0   0 0
##   77000     0   0   0   0   0 0
##   78256     0   0   0   0   0 0
##   82000     0   0   0   0   0 0
##   85000     0   2   0   0   0 0
##   86000     0   1   0   0   0 0
##   88000     0   0   0   0   0 0
##   88500     0   0   0   0   0 0
##   90000     0   0   0   0   0 0
##   92000     0   0   0   0   0 0
##   93000     0   0   0   0   0 0
##   95000     0   0   1   0   0 0
##   96000     0   0   0   0   0 0
##   96500     0   0   0   0   0 0
##   97000     0   0   0   0   0 0
##   98000     0   0   0   0   0 0
##   99000     0   0   0   0   0 0
##   100000    0   0   0   0   0 0
##   100400    0   0   0   0   0 0
##   101000    0   0   0   0   0 0
##   101100    0   0   0   0   0 0
##   101600    0   0   0   0   0 0
##   102500    0   0   0   0   0 0
##   103000    0   0   0   0   0 0
##   104000    0   0   0   0   0 0
##   105000    1   1   1   0   1 0
##   106000    0   1   0   1   0 0
##   107000    0   0   0   0   0 0
##   107300    0   0   0   0   0 0
##   107500    0   0   0   0   0 0
##   108000    0   1   0   0   0 0
##   110000    0   0   1   0   0 0
##   112000    0   0   1   0   0 0
##   115000    0   0   1   1   0 0
##   118000    0   1   0   0   0 0
##   120000    0   2   0   0   1 0
##   126710    0   0   0   0   0 0
##   130000    0   0   0   0   0 0
##   145800    0   0   0   0   0 0
##   146000    0   0   0   0   0 1
##   162000    0   0   1   0   0 0
##   220000    0   0   0   0   0 0
# Row totals: number of graduates at each salary.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates at each spring MBA average.
margin.table(mytable, margin = 2)
## s_avg
##  2.2  2.3  2.4  2.5  2.6  2.7  2.8  2.9 2.91    3 3.09  3.1  3.2 3.27  3.3 
##    1    1    4    3    6    4    7   13    1    6    2    7    9    1   11 
##  3.4 3.45  3.5  3.6  3.7  3.8    4 
##    5    1   10    6    2    2    1
# Pearson chi-squared test of independence on the salary-by-s_avg table.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 792.97, df = 861, p-value = 0.9524

Insights from the contingency table: The person with the highest salary of 220000 has a spring MBA average of 2.5. The person with the highest spring MBA average of 4 has a salary of 146000. The person with the lowest spring MBA average of 2.2 earns 105000. The most common spring MBA average is 2.9, shared by 13 graduates.

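Insights from the Pearson’s Chi-square test: The p-value is 0.9524. As p>0.05, we fail to reject the null hypothesis: the data do not provide evidence of a relationship between Salary and Spring MBA average. Note that R warns the chi-squared approximation may be incorrect for this sparse table, so the result should be read with caution.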

Two way table of Salary and Fall MBA average

Null hypothesis: Salary and Fall MBA average are independent of each other.

# Cross-tabulate salary (rows) against fall MBA average (columns) for mba2.df
# and print it.
mytable <- xtabs(formula = ~ salary + f_avg, data = mba2.df)
print(mytable)
##         f_avg
## salary   0 2 2.25 2.5 2.67 2.75 2.83 3 3.25 3.33 3.5 3.6 3.67 3.75 4
##   64000  0 0    0   0    0    0    0 0    1    0   0   0    0    0 0
##   77000  0 0    0   0    0    0    0 1    0    0   0   0    0    0 0
##   78256  0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
##   82000  0 0    0   0    0    0    0 0    0    1   0   0    0    0 0
##   85000  0 1    0   0    0    0    0 0    1    0   0   1    0    1 0
##   86000  0 0    0   0    1    0    0 0    1    0   0   0    0    0 0
##   88000  0 0    0   0    0    0    0 0    1    0   0   0    0    0 0
##   88500  0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
##   90000  0 0    1   1    0    0    0 0    1    0   0   0    0    0 0
##   92000  0 0    0   0    0    0    0 0    1    0   2   0    0    0 0
##   93000  0 0    0   0    0    1    0 1    0    0   0   0    0    1 0
##   95000  0 0    0   0    0    1    0 1    2    0   2   0    1    0 0
##   96000  0 0    0   1    0    0    0 0    2    0   1   0    0    0 0
##   96500  0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
##   97000  0 0    0   0    0    1    0 1    0    0   0   0    0    0 0
##   98000  0 0    0   1    0    2    0 2    5    0   0   0    0    0 0
##   99000  0 0    0   0    0    0    0 1    0    0   0   0    0    0 0
##   100000 0 0    0   0    0    1    0 5    1    0   1   0    1    0 0
##   100400 0 0    0   1    0    0    0 0    0    0   0   0    0    0 0
##   101000 0 0    0   0    0    0    0 1    0    0   1   0    0    0 0
##   101100 0 0    0   0    0    0    0 1    0    0   0   0    0    0 0
##   101600 0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
##   102500 0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
##   103000 0 0    0   0    0    0    0 1    0    0   0   0    0    0 0
##   104000 0 0    0   0    0    1    0 0    1    0   0   0    0    0 0
##   105000 0 1    0   0    0    0    1 3    2    0   4   0    0    0 0
##   106000 0 0    0   0    0    0    0 2    0    0   0   1    0    0 0
##   107000 0 0    0   0    0    0    0 0    0    0   1   0    0    0 0
##   107300 0 0    0   0    0    0    0 0    0    0   1   0    0    0 0
##   107500 0 0    0   0    0    0    0 0    1    0   0   0    0    0 0
##   108000 0 0    0   0    0    0    0 1    1    0   0   0    0    0 0
##   110000 0 0    0   0    0    0    0 0    0    0   1   0    0    0 0
##   112000 0 0    0   0    0    1    0 1    0    0   1   0    0    0 0
##   115000 0 0    0   1    0    0    0 1    1    0   1   0    0    0 1
##   118000 0 0    0   0    0    0    0 0    0    0   1   0    0    0 0
##   120000 0 0    0   0    0    0    0 1    2    0   0   0    0    0 1
##   126710 0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
##   130000 0 0    0   0    0    0    0 0    1    0   0   0    0    0 0
##   145800 0 0    0   0    0    0    0 1    0    0   0   0    0    0 0
##   146000 1 0    0   0    0    0    0 0    0    0   0   0    0    0 0
##   162000 0 0    0   0    0    0    0 0    0    0   0   0    0    1 0
##   220000 0 0    0   0    0    1    0 0    0    0   0   0    0    0 0
# Row totals: number of graduates at each salary.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates at each fall MBA average.
margin.table(mytable, margin = 2)
## f_avg
##    0    2 2.25  2.5 2.67 2.75 2.83    3 3.25 3.33  3.5  3.6 3.67 3.75    4 
##    1    2    1    5    1   15    1   25   25    1   17    2    2    3    2
# Pearson chi-squared test of independence on the salary-by-f_avg table.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 596.28, df = 574, p-value = 0.2518

Insights from the contingency table: The person having the highest salary of 220000 has a fall MBA average of 2.75. The two people with the highest fall MBA average of 4 have salary packages of 115000 and 120000. One person has the lowest fall MBA average of 0 and earns 146000. The two largest groups, of 25 people each, have fall MBA averages of 3 and 3.25.

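Insights from the Pearson’s Chi-square test: The p-value is 0.2518. As p>0.05, we fail to reject the null hypothesis: there is no evidence of a relationship between Salary and Fall MBA average. Note that R warns that the chi-squared approximation may be incorrect for this table, so the result should be read with caution.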

Two way table of Salary and degree of satisfaction

Null hypothesis: Salary and degree of satisfaction are independent of each other.

# Cross-tabulate salary (rows) against satisfaction rating (columns) for the
# graduates in mba2.df.
mytable <- xtabs(formula = ~ salary + satis, data = mba2.df)
mytable
##         satis
## salary   3 4 5 6 7
##   64000  0 0 0 0 1
##   77000  0 0 0 1 0
##   78256  0 0 1 0 0
##   82000  0 0 0 0 1
##   85000  0 0 1 3 0
##   86000  0 0 2 0 0
##   88000  0 0 0 0 1
##   88500  0 0 0 1 0
##   90000  0 0 2 0 1
##   92000  0 0 1 1 1
##   93000  0 0 1 2 0
##   95000  1 1 1 2 2
##   96000  0 0 1 1 2
##   96500  0 0 0 1 0
##   97000  0 0 0 1 1
##   98000  0 0 2 5 3
##   99000  0 0 0 1 0
##   100000 0 0 1 6 2
##   100400 0 0 0 0 1
##   101000 0 0 1 1 0
##   101100 0 0 0 1 0
##   101600 0 0 0 1 0
##   102500 0 0 1 0 0
##   103000 0 0 0 1 0
##   104000 0 0 1 1 0
##   105000 0 0 4 6 1
##   106000 0 0 0 2 1
##   107000 0 0 1 0 0
##   107300 0 0 0 0 1
##   107500 0 0 1 0 0
##   108000 0 0 0 2 0
##   110000 0 0 1 0 0
##   112000 0 0 0 2 1
##   115000 0 0 3 2 0
##   118000 0 0 0 0 1
##   120000 0 0 2 2 0
##   126710 0 0 0 1 0
##   130000 0 0 0 0 1
##   145800 0 0 0 1 0
##   146000 0 0 0 1 0
##   162000 0 0 1 0 0
##   220000 0 0 0 1 0
# Row totals: number of graduates at each salary value.
margin.table(mytable, margin = 1)
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Column totals: number of graduates giving each satisfaction rating.
margin.table(mytable, margin = 2)
## satis
##  3  4  5  6  7 
##  1  1 29 50 22
# Pearson chi-squared test of independence on the salary x satis table.
# R warns below that the chi-squared approximation may be incorrect for this
# table, so the p-value should be read with caution.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 109.1, df = 164, p-value = 0.9997

Insights from the contingency table: Degree of satisfaction: 1 person has rated 3, 1 person has rated 4, 29 people have rated 5, 50 people have rated 6, and 22 people have rated 7.

The largest group, 50 people, rated it 6, and the smallest groups, 1 person each, rated it 3 and 4. 22 people gave it the maximum rating of 7. No one gave it the minimum rating of 1. The person with the maximum salary package of 220000 has rated it 6 and the person with the minimum salary package of 64000 has rated it 7.

Insights from the Pearson’s Chi-square test: The p-value is 0.9997. As p>0.05, we fail to reject the null hypothesis: there is no evidence of a relationship between Salary and degree of satisfaction.

T Tests

Null Hypothesis: There is no difference between the mean of Spring average and Fall average.

# Welch two-sample t-test comparing the mean Spring and Fall MBA averages.
# NOTE(review): both columns come from the same rows of mba2.df, so a paired
# test may be more appropriate -- confirm.
t.test(x = mba2.df$s_avg, y = mba2.df$f_avg)
## 
##  Welch Two Sample t-test
## 
## data:  mba2.df$s_avg and mba2.df$f_avg
## t = 0.022345, df = 192.16, p-value = 0.9822
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1186187  0.1213372
## sample estimates:
## mean of x mean of y 
##  3.092330  3.090971

The p-value is 0.9822. As p>0.05, we fail to reject the null hypothesis.

Null hypothesis: There is no difference between the mean of age and prior-work experience.

# Welch two-sample t-test comparing mean age with mean years of prior work.
t.test(x = mba2.df$age, y = mba2.df$work_yrs)
## 
##  Welch Two Sample t-test
## 
## data:  mba2.df$age and mba2.df$work_yrs
## t = 52.723, df = 202.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  22.23330 23.96088
## sample estimates:
## mean of x mean of y 
## 26.776699  3.679612

The p-value is less than 2.2e-16, which is definitely less than 0.05. We reject the null hypothesis, “There is no difference between the mean of age and prior-work experience.”

Null hypothesis: There is no difference between the mean of salary and first language.

# Welch two-sample t-test of mean salary against the mean of the frstlang codes.
# NOTE(review): this compares two variables on different scales (salary vs. a
# 1/2 code); a grouped test such as t.test(salary ~ frstlang, data = mba2.df)
# is presumably what was intended -- confirm.
t.test(x = mba2.df$salary, y = mba2.df$frstlang)
## 
##  Welch Two Sample t-test
## 
## data:  mba2.df$salary and mba2.df$frstlang
## t = 58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99537.4 106521.9
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 1.067961e+00

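The p-value is less than 2.2e-16, which is definitely less than 0.05. We reject the null hypothesis, “There is no difference between the mean of salary and first language.” Note that this test compares the mean salary (about 103031) with the mean of the first-language code (about 1.07), so the difference reflects the different scales of the two variables rather than an effect of first language on salary.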

Null hypothesis: There is no difference between the mean of salary and degree of satisfaction.

# Welch two-sample t-test of mean salary against mean satisfaction rating.
# NOTE(review): the two variables are on different scales, so the difference in
# means is not informative about their relationship -- confirm intent.
t.test(x = mba2.df$salary, y = mba2.df$satis)
## 
##  Welch Two Sample t-test
## 
## data:  mba2.df$salary and mba2.df$satis
## t = 58.515, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99532.58 106517.13
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 5.883495e+00

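The p-value is less than 2.2e-16, which is definitely less than 0.05. We reject the null hypothesis, “There is no difference between the mean of salary and degree of satisfaction.” Note that this test compares the mean salary (about 103031) with the mean satisfaction rating (about 5.88), so the difference reflects the different scales of the two variables rather than a relationship between them.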

Regression Analysis

# Linear model of salary on age, the GMAT score/percentile columns and the
# Spring/Fall MBA averages, fitted on mba2.df.
regr <- lm(
  formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg,
  data = mba2.df
)
summary(regr)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + s_avg + f_avg, data = mba2.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31186  -7438    622   5299  69725 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37730.6788 43445.3073   0.868   0.3873    
## age          2657.3914   503.9286   5.273 8.39e-07 ***
## gmat_tot       -0.8982   164.7408  -0.005   0.9957    
## gmat_qpc      859.6496   480.8645   1.788   0.0770 .  
## gmat_vpc      537.0116   480.7496   1.117   0.2668    
## gmat_tpc    -1454.7911   700.5012  -2.077   0.0405 *  
## s_avg        4069.5906  4808.3606   0.846   0.3995    
## f_avg       -1827.8260  3750.0292  -0.487   0.6271    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15290 on 95 degrees of freedom
## Multiple R-squared:  0.3181, Adjusted R-squared:  0.2679 
## F-statistic: 6.332 on 7 and 95 DF,  p-value: 4.124e-06
# Fitted coefficients of the model above, at full printed precision.
regr[["coefficients"]]
##   (Intercept)           age      gmat_tot      gmat_qpc      gmat_vpc 
## 37730.6788094  2657.3914327    -0.8981671   859.6496462   537.0115555 
##      gmat_tpc         s_avg         f_avg 
## -1454.7911249  4069.5906403 -1827.8260101

Variable age is statistically significant (p = 8.39e-07). Variable gmat_tpc is also significant at the 5% level (p = 0.0405).

Multiple R-squared value is 0.3181.

# Linear model of salary on sex, prior work experience, first language,
# satisfaction and quarter, fitted on mba2.df.
regr <- lm(
  formula = salary ~ sex + work_yrs + frstlang + satis + quarter,
  data = mba2.df
)
summary(regr)
## 
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + satis + quarter, 
##     data = mba2.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29352  -8342  -1943   5264  83154 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  99601.3    13868.8   7.182 1.41e-10 ***
## sex          -6037.8     3394.5  -1.779   0.0784 .  
## work_yrs      2321.1      534.1   4.346 3.42e-05 ***
## frstlang     15448.4     6326.4   2.442   0.0164 *  
## satis        -1800.6     2041.0  -0.882   0.3798    
## quarter      -1397.2     1441.5  -0.969   0.3348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15590 on 97 degrees of freedom
## Multiple R-squared:  0.2765, Adjusted R-squared:  0.2392 
## F-statistic: 7.414 on 5 and 97 DF,  p-value: 6.324e-06
# Fitted coefficients of the model above, at full printed precision.
regr[["coefficients"]]
## (Intercept)         sex    work_yrs    frstlang       satis     quarter 
##   99601.284   -6037.779    2321.095   15448.364   -1800.607   -1397.213

Variable work_yrs is statistically significant (p = 3.42e-05). Variable frstlang is also significant at the 5% level (p = 0.0164).

Multiple R-squared value is 0.2765.

# Linear model of salary on every other column of mba2.df.
regr <- lm(
  formula = salary ~ .,
  data = mba2.df
)
summary(regr)
## 
## Call:
## lm(formula = salary ~ ., data = mba2.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26489  -7983   -373   5923  70602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 78005.66   52981.93   1.472   0.1444  
## age          1750.65    1130.92   1.548   0.1251  
## sex         -3584.07    3595.85  -0.997   0.3216  
## gmat_tot       16.19     178.85   0.090   0.9281  
## gmat_qpc      796.55     496.78   1.603   0.1123  
## gmat_vpc      546.31     501.97   1.088   0.2794  
## gmat_tpc    -1457.09     714.94  -2.038   0.0445 *
## s_avg        -931.53    8240.31  -0.113   0.9102  
## f_avg       -2222.82    3894.57  -0.571   0.5696  
## quarter     -2336.56    2721.89  -0.858   0.3929  
## work_yrs      749.66    1135.90   0.660   0.5110  
## frstlang     7719.42    7373.27   1.047   0.2979  
## satis       -1086.54    2157.76  -0.504   0.6158  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared:  0.3422, Adjusted R-squared:  0.2545 
## F-statistic: 3.902 on 12 and 90 DF,  p-value: 8.086e-05
# Fitted coefficients of the full model, at full printed precision.
regr[["coefficients"]]
## (Intercept)         age         sex    gmat_tot    gmat_qpc    gmat_vpc 
## 78005.66171  1750.65216 -3584.07221    16.18545   796.54809   546.30750 
##    gmat_tpc       s_avg       f_avg     quarter    work_yrs    frstlang 
## -1457.08759  -931.53478 -2222.82135 -2336.55542   749.66083  7719.42304 
##       satis 
## -1086.54069

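Multiple R-squared is 0.3422, the highest of the three models, so this model fits the data most closely and is taken as the best model. Note, however, that multiple R-squared never decreases when predictors are added, and the adjusted R-squared of this model (0.2545) is lower than that of the first model (0.2679). The multiple R-squared indicates that the model accounts for 34.22% of the variance in salaries.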

Contingency table and Chi-Square tests for the unemployed people

Two-way table of salary and gender

Null hypothesis: Salary and Gender are independent of each other.

# Cross-tabulate salary against sex for the zero-salary rows in mba3.df.
mytable <- xtabs(formula = ~ salary + sex, data = mba3.df)
mytable
##       sex
## salary  1  2
##      0 67 23
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the sex
# columns), not a test of independence between salary and sex.
chisq.test(mytable)
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 21.511, df = 1, p-value = 3.518e-06

Insights from the contingency table: There are 90 unemployed people out of which 67 are males and 23 are females.

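Insights from the Chi-square test: The p-value is 3.518e-06. As p<0.05, we reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result shows that the unemployed people are not split evenly between the two genders; it does not by itself establish a relationship between null salary and gender.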

Two-way with Salary and First Language

Null hypothesis: Salary and First language are independent of each other.

# Cross-tabulate salary against first language for the zero-salary rows in
# mba3.df.
mytable <- xtabs(formula = ~ salary + frstlang, data = mba3.df)
mytable
##       frstlang
## salary  1  2
##      0 82  8
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# Column totals: count per first-language code.
margin.table(mytable, margin = 2)
## frstlang
##  1  2 
## 82  8
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the
# frstlang columns), not a test of independence between salary and frstlang.
chisq.test(mytable)
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 60.844, df = 1, p-value = 6.177e-15

Insights from the contingency table: Out of the 90 unemployed people, 82 have first language as english and 8 have a different first language.

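Insights from the Chi-square test: The p-value is 6.177e-15. As p<0.05, we reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result shows that the unemployed people are not split evenly between the two first-language groups; it does not by itself establish a relationship between null salary and first language.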

Two-way with Salary and prior work experience

Null hypothesis: Salary and Work years are independent of each other.

# Cross-tabulate salary against years of prior work experience for the
# zero-salary rows in mba3.df.
mytable <- xtabs(formula = ~ salary + work_yrs, data = mba3.df)
mytable
##       work_yrs
## salary  0  1  2  3  4  5  6  7  8  9 10 11 12 13 16 18 22
##      0  1 12 22 14  9 12  2  5  2  1  1  2  2  1  1  1  2
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# Column totals: count per number of prior work years.
margin.table(mytable, margin = 2)
## work_yrs
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 16 18 22 
##  1 12 22 14  9 12  2  5  2  1  1  2  2  1  1  1  2
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the
# work_yrs columns), not a test of independence between salary and work_yrs.
chisq.test(mytable)
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 117.78, df = 16, p-value < 2.2e-16

Insights from the contingency table: Out of the 90 unemployed people, a peak of 22 people have a prior work experience of 2 years.

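Insights from the Chi-square test: The p-value is less than 2.2e-16. As p<0.05, we reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result shows that the unemployed people are not spread evenly across the prior work experience values; it does not by itself establish a relationship between null salary and prior work experience.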

Two-way with Salary and Total GMAT score

Null hypothesis: Salary and Total GMAT score are independent of each other.

# Cross-tabulate salary against total GMAT score for the zero-salary rows in
# mba3.df.
mytable <- xtabs(formula = ~ salary + gmat_tot, data = mba3.df)
mytable
##       gmat_tot
## salary 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660
##      0   1   1   2   3   3   4   8   7   4   3   3   9   4   5   6   5   3
##       gmat_tot
## salary 670 680 700 710 720 730 740 750 760
##      0   4   3   2   4   2   1   1   1   1
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# Column totals: count per total GMAT score.
margin.table(mytable, margin = 2)
## gmat_tot
## 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 
##   1   1   2   3   3   4   8   7   4   3   3   9   4   5   6   5   3   4 
## 680 700 710 720 730 740 750 760 
##   3   2   4   2   1   1   1   1
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the
# gmat_tot columns), not a test of independence between salary and gmat_tot.
# R also warns below that the chi-squared approximation may be incorrect.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 34.8, df = 25, p-value = 0.09188

Insights from the contingency table: Out of the 90 unemployed people, the maximum GMAT score of 760 is bagged by one person and the least GMAT score of 450 is bagged by one person. A peak of 9 people have bagged the same GMAT score of 610.

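Insights from the Chi-square test: The p-value is 0.09188. As p>0.05, we fail to reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result means there is no evidence that the unemployed people are spread unevenly across the total GMAT scores; it does not by itself test a relationship between null salary and the total GMAT score.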

Two way with Salary and Spring MBA average

Null hypothesis: Salary and Spring MBA average are independent of each other.

# Cross-tabulate salary against Spring MBA average for the zero-salary rows in
# mba3.df.
mytable <- xtabs(formula = ~ salary + s_avg, data = mba3.df)
mytable
##       s_avg
## salary  2 2.1 2.2 2.3 2.4 2.6 2.7 2.8 2.82 2.9  3 3.08 3.09 3.1 3.17 3.2
##      0  1   2   1   2   2   1   8   9    1   9 10    1    2   6    1   4
##       s_avg
## salary 3.25 3.27 3.3 3.38 3.4 3.45 3.5 3.6 3.64 3.8 3.9
##      0    1    2   9    1   7    1   2   4    1   1   1
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# Column totals: count per Spring MBA average value.
margin.table(mytable, margin = 2)
## s_avg
##    2  2.1  2.2  2.3  2.4  2.6  2.7  2.8 2.82  2.9    3 3.08 3.09  3.1 3.17 
##    1    2    1    2    2    1    8    9    1    9   10    1    2    6    1 
##  3.2 3.25 3.27  3.3 3.38  3.4 3.45  3.5  3.6 3.64  3.8  3.9 
##    4    1    2    9    1    7    1    2    4    1    1    1
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the
# s_avg columns), not a test of independence between salary and s_avg.
# R also warns below that the chi-squared approximation may be incorrect.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 78, df = 26, p-value = 4.248e-07

Insights from the contingency table: The maximum Spring MBA average of 3.9 is bagged by one person and the minimum Spring MBA average of 2 is bagged by one person. A peak of 10 people have bagged the same Spring MBA average of 3.

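Insights from the Chi-square test: The p-value is 4.248e-07. As p<0.05, we reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result shows that the unemployed people are not spread evenly across the Spring MBA average values; it does not by itself establish a relationship between null salary and Spring MBA average.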

Two way table of Salary and Fall MBA average

Null hypothesis: Salary and Fall MBA average are independent of each other.

# Cross-tabulate salary against Fall MBA average for the zero-salary rows in
# mba3.df.
mytable <- xtabs(formula = ~ salary + f_avg, data = mba3.df)
mytable
##       f_avg
## salary  0  2 2.25 2.5 2.67 2.75  3 3.17 3.2 3.25 3.33 3.4 3.5 3.6 3.67
##      0  1  3    2   8    1    9 24    1   1   18    1   1   7   1    1
##       f_avg
## salary 3.75 3.83  4
##      0    6    1  4
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# Column totals: count per Fall MBA average value.
margin.table(mytable, margin = 2)
## f_avg
##    0    2 2.25  2.5 2.67 2.75    3 3.17  3.2 3.25 3.33  3.4  3.5  3.6 3.67 
##    1    3    2    8    1    9   24    1    1   18    1    1    7    1    1 
## 3.75 3.83    4 
##    6    1    4
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the
# f_avg columns), not a test of independence between salary and f_avg.
chisq.test(mytable)
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 143.6, df = 17, p-value < 2.2e-16

Insights from the contingency table: The maximum Fall MBA average of 4 is bagged by four people and the minimum Fall MBA average of 0 is bagged by one person. A peak of 24 people have bagged the same Fall MBA average of 3.

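Insights from the Chi-square test: The p-value is less than 2.2e-16. As p<0.05, we reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result shows that the unemployed people are not spread evenly across the Fall MBA average values; it does not by itself establish a relationship between null salary and Fall MBA average.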

Two way table of Salary and Degree of satisfaction

Null hypothesis: Salary and Degree of satisfaction are independent of each other.

# Cross-tabulate salary against satisfaction rating for the zero-salary rows in
# mba3.df.
mytable <- xtabs(formula = ~ salary + satis, data = mba3.df)
mytable
##       satis
## salary  4  5  6  7
##      0  4 36 40 10
# Row total: number of rows in the single salary = 0 group.
margin.table(mytable, margin = 1)
## salary
##  0 
## 90
# Column totals: count per satisfaction rating.
margin.table(mytable, margin = 2)
## satis
##  4  5  6  7 
##  4 36 40 10
# NOTE(review): mytable has a single salary row here, so the output below is a
# "Chi-squared test for given probabilities" (goodness of fit across the
# satis columns), not a test of independence between salary and satis.
chisq.test(mytable)
## 
##  Chi-squared test for given probabilities
## 
## data:  mytable
## X-squared = 43.867, df = 3, p-value = 1.611e-09

Insights from the contingency table: Out of the 90 unemployed people, 40 people have rated it 6, 36 people have rated it 5, 10 people have rated it 7 and 4 people have rated it 4.

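Insights from the Chi-square test: The p-value is 1.611e-09. As p<0.05, we reject the null hypothesis of the test that was run. Note that, because every row of this table has a salary of 0, R performed a chi-squared test for given probabilities (a goodness-of-fit test) rather than Pearson’s test of independence, so the result shows that the unemployed people are not spread evenly across the satisfaction ratings; it does not by itself establish a relationship between null salary and degree of satisfaction.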

Logistic regression

Since we are comparing people who have a job with those who don’t, I have replaced all the salary values which are greater than zero with ‘1’, while those who don’t have a job stay at ‘0’. With this binary response we can fit a model called the “binomial logistic regression”. Those who are employed are denoted by ‘1’ and those who aren’t by ‘0’.

# Recode salary in place as a binary indicator: every non-zero salary becomes 1,
# zero salaries are left at 0.
mba1.df$salary <- replace(mba1.df$salary, mba1.df$salary != 0, 1)

# Binomial logistic regression of the recoded salary on all other columns.
lregr <- glm(
  formula = salary ~ .,
  family = binomial(link = "logit"),
  data = mba1.df
)
summary(lregr)
## 
## Call:
## glm(formula = salary ~ ., family = binomial(link = "logit"), 
##     data = mba1.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0419  -1.1439   0.7517   1.0238   1.9665  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  4.56015    4.57567   0.997   0.3190  
## age         -0.19812    0.08562  -2.314   0.0207 *
## sex          0.14396    0.36089   0.399   0.6900  
## gmat_tot    -0.01027    0.01289  -0.797   0.4257  
## gmat_qpc    -0.02177    0.04606  -0.473   0.6364  
## gmat_vpc    -0.02058    0.04401  -0.468   0.6401  
## gmat_tpc     0.08833    0.06492   1.361   0.1736  
## s_avg        0.24369    0.67258   0.362   0.7171  
## f_avg       -0.09871    0.36617  -0.270   0.7875  
## quarter     -0.23624    0.21132  -1.118   0.2636  
## work_yrs     0.10479    0.09386   1.116   0.2642  
## frstlang     0.31939    0.64213   0.497   0.6189  
## satis        0.42640    0.21308   2.001   0.0454 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.68  on 192  degrees of freedom
## Residual deviance: 242.92  on 180  degrees of freedom
## AIC: 268.92
## 
## Number of Fisher Scoring iterations: 5

Interpreting the results of the binomial logistic regression model

# Analysis-of-deviance table for the logistic model; terms are added
# sequentially, so each row's p-value depends on the terms listed before it.
anova(lregr, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: salary
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                       192     266.68            
## age       1   8.4714       191     258.21 0.003608 **
## sex       1   0.3867       190     257.82 0.534052   
## gmat_tot  1   0.0156       189     257.81 0.900742   
## gmat_qpc  1   0.0186       188     257.79 0.891625   
## gmat_vpc  1   1.3430       187     256.44 0.246503   
## gmat_tpc  1   4.0117       186     252.43 0.045185 * 
## s_avg     1   2.0591       185     250.37 0.151303   
## f_avg     1   0.2676       184     250.10 0.604976   
## quarter   1   1.2924       183     248.81 0.255602   
## work_yrs  1   1.6279       182     247.19 0.201987   
## frstlang  1   0.1575       181     247.03 0.691446   
## satis     1   4.1033       180     242.92 0.042799 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-values suggest that age is the biggest factor in the variation of employment status (the recoded salary variable), as it has the lowest p-value. Following this are the degree of satisfaction and the overall GMAT percentile, which play the next biggest roles (after age) in the variation of employment status.

A large p-value here indicates that the model without the variable explains more or less the same amount of variation. Variables such as sex, gmat_tot, gmat_qpc, gmat_vpc, s_avg, f_avg, quarter, work_yrs and frstlang do not affect the recoded variable “Salary” (employed or not) much.

Summary from the Binomial logistic regression model: We can conclude that age is the biggest factor in determining whether an MBA graduate is employed or not. The next biggest roles are played by the overall GMAT percentile and the degree of satisfaction in determining whether an MBA graduate is employed or not.