setwd("C:\\Users\\Adithya Nataraj\\Downloads")
mba.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
View(mba.df)
mba1.df <- mba.df[which(mba.df$salary!= 998 & mba.df$salary!= 999 & mba.df$satis!= 998), ]
mba2.df <- mba.df[which(mba.df$salary!= 998 & mba.df$salary!= 999 & mba.df$satis!= 998 & mba.df$salary!= 0), ]
mba3.df <- mba.df[which(mba.df$salary!=998 & mba.df$salary!=999 & mba.df$salary==0 & mba.df$satis!= 998), ]
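An equivalent way to handle these special codes (assuming, as the filters above imply, that 998/999 mark missing answers and 0 marks graduates without a job offer) is to recode them as NA; the data frame name below is purely illustrative.
# Alternative sketch: recode the missing-value codes as NA instead of filtering rows.
# (Assumes 998/999 are "no answer" codes; mba.na.df is an illustrative name.)
mba.na.df <- mba.df
mba.na.df$salary[mba.na.df$salary %in% c(998, 999)] <- NA
mba.na.df$satis[mba.na.df$satis == 998] <- NA
# Employed graduates only, mirroring mba2.df:
# subset(mba.na.df, !is.na(salary) & !is.na(satis) & salary > 0)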
summary(mba1.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.00 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.00 1st Qu.:570.0 1st Qu.:72.00
## Median :27.00 Median :1.00 Median :610.0 Median :82.00
## Mean :27.59 Mean :1.28 Mean :615.2 Mean :79.35
## 3rd Qu.:29.00 3rd Qu.:2.00 3rd Qu.:650.0 3rd Qu.:91.00
## Max. :48.00 Max. :2.00 Max. :760.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :22.00 Min. : 0.00 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:75.00 1st Qu.:2.800 1st Qu.:2.750
## Median :81.00 Median :87.00 Median :3.090 Median :3.000
## Mean :78.13 Mean :83.48 Mean :3.064 Mean :3.078
## 3rd Qu.:91.00 3rd Qu.:93.00 3rd Qu.:3.300 3rd Qu.:3.330
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 85000
## Mean :2.394 Mean : 4.104 Mean :1.078 Mean : 54985
## 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.:1.000 3rd Qu.:100000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.762
## 3rd Qu.:6.000
## Max. :7.000
mean(mba.df$age)
## [1] 27.35766
median(mba.df$age)
## [1] 27
sd(mba.df$age)
## [1] 3.710666
mean(mba.df$gmat_tot)
## [1] 619.4526
median(mba.df$gmat_tot)
## [1] 620
sd(mba.df$gmat_tot)
## [1] 57.53858
mean(mba.df$gmat_qpc)
## [1] 80.64234
median(mba.df$gmat_qpc)
## [1] 83
sd(mba.df$gmat_qpc)
## [1] 14.86853
mean(mba.df$gmat_vpc)
## [1] 78.32117
median(mba.df$gmat_vpc)
## [1] 81
sd(mba.df$gmat_vpc)
## [1] 16.85966
mean(mba.df$gmat_tpc)
## [1] 84.19708
median(mba.df$gmat_tpc)
## [1] 87
sd(mba.df$gmat_tpc)
## [1] 14.02162
mean(mba.df$s_avg)
## [1] 3.025401
median(mba.df$s_avg)
## [1] 3
sd(mba.df$s_avg)
## [1] 0.3810743
mean(mba.df$f_avg)
## [1] 3.061533
median(mba.df$f_avg)
## [1] 3
sd(mba.df$f_avg)
## [1] 0.5250451
mean(mba.df$work_yrs)
## [1] 3.872263
median(mba.df$work_yrs)
## [1] 3
sd(mba.df$work_yrs)
## [1] 3.232464
mean(mba1.df$salary)
## [1] 54985.32
median(mba1.df$salary)
## [1] 85000
sd(mba1.df$salary)
## [1] 53152.39
boxplot(mba.df$age, main="Age")
boxplot(mba.df$gmat_tot, main="Total GMAT score")
boxplot(mba.df$gmat_qpc, main="Quantitative GMAT percentile")
boxplot(mba.df$gmat_vpc, main="Verbal GMAT percentile")
boxplot(mba.df$gmat_tpc, main="Overall GMAT percentile")
boxplot(mba.df$s_avg, main="Spring MBA average")
boxplot(mba.df$f_avg, main="Fall MBA average")
boxplot(mba.df$work_yrs, main="Years of work experience")
boxplot(mba1.df$salary, main="Salary")
boxplot(mba1.df$satis, main="Degree of satisfaction")
count <- table(mba.df$age)
barplot(count, main="Age")
count <- table(mba.df$gmat_tot)
barplot(count, main="Total GMAT score")
count <- table(mba.df$gmat_qpc)
barplot(count, main="Quantitative GMAT percentile")
count <- table(mba.df$gmat_vpc)
barplot(count, main="Verbal GMAT percentile")
count <- table(mba.df$gmat_tpc)
barplot(count, main="Overall GMAT percentile")
count <- table(mba.df$s_avg)
barplot(count, main="Spring MBA average")
count <- table(mba.df$f_avg)
barplot(count, main="Fall MBA average")
count <- table(mba.df$work_yrs)
barplot(count, main="Years of work experience")
count <- table(mba1.df$salary)
barplot(count, main="Salary")
count <- table(mba1.df$satis)
barplot(count, main="Degree of satisfaction")
plot(mba1.df$salary, mba1.df$age, main="Salary vs Age")
plot(mba1.df$salary, mba1.df$gmat_tot, main="Salary vs Total GMAT score")
plot(mba1.df$salary, mba1.df$gmat_qpc, main="Salary vs Quantitative GMAT percentile")
plot(mba1.df$salary, mba1.df$gmat_vpc, main="Salary vs Verbal GMAT percentile")
plot(mba1.df$salary, mba1.df$gmat_tpc, main="Salary vs Overall GMAT percentile")
plot(mba1.df$salary, mba1.df$s_avg, main="Salary vs Spring MBA average")
plot(mba1.df$salary, mba1.df$f_avg, main="Salary vs Fall MBA average")
plot(mba1.df$salary, mba1.df$quarter, main="Salary vs Quartile ranking")
plot(mba1.df$salary, mba1.df$satis, main="Salary vs Degree of satisfaction")
library(corrplot)
## corrplot 0.84 loaded
corrplot(corr=cor(mba1.df[ , c(1:13)], use="complete.obs"),
method ="ellipse")
library(corrgram)
corrgram(mba1.df, order=TRUE,
main="Corrgram of all the Variables",
lower.panel=panel.shade, upper.panel=panel.pie,
diag.panel=panel.minmax, text.panel=panel.txt)
x <- mba1.df[,c("age", "gmat_tot","gmat_qpc","gmat_vpc","gmat_tpc","s_avg", "f_avg","work_yrs")]
y <- mba1.df[,c("salary", "satis")]
cov(x,y)
## salary satis
## age -29185.2850 -0.23993415
## gmat_tot -170.8814 3.49336140
## gmat_qpc 22855.7178 -0.23453692
## gmat_vpc 2901.3078 2.43207578
## gmat_tpc 43822.5292 1.39148856
## s_avg 1940.5276 -0.01361264
## f_avg 244.3157 -0.04613153
## work_yrs -10442.6267 -0.02204771
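Because covariances depend on the units of each variable (salary is in dollars, satisfaction on a small ordinal scale), the corresponding correlations are easier to compare across rows; a quick sketch using the same x and y:
# Sketch: scale-free counterpart of the covariance table above.
cor(x, y)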
Null hypothesis: Salary and Gender are independent of each other.
mytable <- xtabs(~ salary+sex,data= mba2.df)
mytable
## sex
## salary 1 2
## 64000 0 1
## 77000 1 0
## 78256 0 1
## 82000 0 1
## 85000 1 3
## 86000 0 2
## 88000 0 1
## 88500 1 0
## 90000 3 0
## 92000 2 1
## 93000 2 1
## 95000 4 3
## 96000 3 1
## 96500 1 0
## 97000 2 0
## 98000 6 4
## 99000 0 1
## 100000 4 5
## 100400 1 0
## 101000 0 2
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 2 0
## 105000 11 0
## 106000 2 1
## 107000 1 0
## 107300 1 0
## 107500 1 0
## 108000 2 0
## 110000 0 1
## 112000 3 0
## 115000 5 0
## 118000 1 0
## 120000 3 1
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## sex
## 1 2
## 72 31
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 52.681, df = 41, p-value = 0.1045
Insights from the contingency table: There are 72 males and 31 females in the data set. The highest salary of 220000 went to a female, and the lowest salary of 64000 also went to a female. The most common salary is 105000, received by 11 graduates, all male. The second most common salary is 98000, received by 10 graduates: 6 males and 4 females.
Insights from the Pearson’s Chi-square test: The p-value is 0.1045. As p > 0.05, we have no evidence of a relationship between salary and gender, so we fail to reject the null hypothesis.
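Because many cells of this table have very small expected counts (the source of the warning above), a simulated p-value is arguably a safer check than the asymptotic approximation; a minimal sketch on the same table:
# Sketch: Monte Carlo p-value for the same salary-by-sex table.
# (set.seed is only for reproducibility; B is an arbitrary choice.)
set.seed(123)
chisq.test(mytable, simulate.p.value = TRUE, B = 10000)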
Null hypothesis: Salary and First language are independent of each other.
mytable <- xtabs(~ salary+frstlang,data= mba2.df)
mytable
## frstlang
## salary 1 2
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## frstlang
## 1 2
## 96 7
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 69.847, df = 41, p-value = 0.003296
Insights from the contingency table: The highest salary of 220000 went to a graduate whose first language is not English, while the lowest salary of 64000 went to a graduate whose first language is English. The most common salary is 105000, received by 11 graduates, all of whom have English as their first language. Of the employed graduates, 96 have English as their first language and 7 do not.
Insights from the Pearson’s Chi-square test: The p-value is 0.003296. As p < 0.05, there appears to be a relationship between salary and first language, so we reject the null hypothesis.
Null hypothesis: Salary and prior work experience are independent of each other.
mytable <- xtabs(~ salary+work_yrs,data= mba2.df)
mytable
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## work_yrs
## 0 1 2 3 4 5 6 7 8 10 15 16
## 1 8 38 21 11 7 7 1 4 1 2 2
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 535.23, df = 451, p-value = 0.003809
Insights from the contingency table: The most common amount of prior work experience is 2 years (38 graduates), followed by 3 years (21 graduates). The lowest work experience is 0 years (1 graduate) and the highest is 16 years (2 graduates). The lowest salary of 64000 corresponds to 2 years of prior work experience, while the highest salary of 220000 went to a graduate with 15 years of experience.
Insights from the Pearson’s Chi-square test: The p-value is 0.003809. As p < 0.05, there appears to be a relationship between salary and prior work experience, so we reject the null hypothesis.
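Since salary is essentially continuous, treating every distinct salary as its own category inflates the degrees of freedom and drives the sparse-cell warning. One option is to bin salary into broad brackets before cross-tabulating; the cut points below are illustrative, not part of the original analysis.
# Sketch: bin salary into brackets, then rerun the chi-square test against work_yrs.
salary_band <- cut(mba2.df$salary,
                   breaks = c(0, 90000, 100000, 110000, Inf),
                   labels = c("<=90k", "90-100k", "100-110k", ">110k"))
banded <- table(salary_band, mba2.df$work_yrs)
chisq.test(banded)  # expected counts may still be small for rare work_yrs values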
Null hypothesis: Salary and Total GMAT score are independent of each other.
mytable <- xtabs(~ salary+gmat_tot,data= mba2.df)
options(max.print=1000000)
mytable
## gmat_tot
## salary 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
## 64000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 77000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 86000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 88000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 88500 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 90000 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
## 92000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 93000 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0
## 95000 0 0 1 0 0 2 0 0 0 0 2 0 0 0 0 0
## 96000 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0
## 96500 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 98000 0 0 0 0 0 1 3 1 1 0 1 0 0 0 0 0
## 99000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 0 0 0 0 2 0 1 0 1 1 0 1 0 2 0
## 100400 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 101000 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 101600 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 102500 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 104000 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 105000 0 0 0 0 2 0 2 3 0 1 0 1 0 0 1 0
## 106000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 107500 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 108000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 112000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 115000 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 120000 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 145800 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gmat_tot
## salary 670 680 700 710 720
## 64000 0 0 0 0 0
## 77000 0 0 0 0 0
## 78256 0 0 0 0 0
## 82000 1 0 0 0 0
## 85000 0 0 1 0 1
## 86000 0 1 0 0 0
## 88000 0 0 0 0 0
## 88500 0 0 0 0 0
## 90000 0 0 0 0 0
## 92000 0 0 0 1 0
## 93000 0 0 0 0 0
## 95000 2 0 0 0 0
## 96000 0 0 0 0 0
## 96500 0 0 0 0 0
## 97000 0 0 0 0 0
## 98000 1 1 0 1 0
## 99000 0 0 0 0 0
## 100000 0 0 0 1 0
## 100400 0 0 0 0 0
## 101000 0 0 0 0 0
## 101100 0 0 0 0 0
## 101600 0 0 0 0 0
## 102500 1 0 0 0 0
## 103000 0 0 0 0 0
## 104000 0 0 0 0 0
## 105000 0 1 0 0 0
## 106000 0 2 0 0 0
## 107000 0 0 0 0 0
## 107300 0 0 0 0 0
## 107500 0 0 0 0 0
## 108000 0 0 0 0 0
## 110000 0 0 0 0 0
## 112000 1 1 0 0 0
## 115000 0 0 0 1 0
## 118000 0 0 0 0 0
## 120000 1 0 1 0 0
## 126710 0 0 0 0 0
## 130000 0 0 0 0 0
## 145800 0 0 0 0 0
## 146000 0 0 0 0 0
## 162000 0 0 1 0 0
## 220000 0 0 0 0 0
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## gmat_tot
## 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680
## 2 1 2 2 3 8 7 8 2 9 5 12 8 1 7 5 7 6
## 700 710 720
## 3 4 1
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 927.24, df = 820, p-value = 0.005279
Insights from the contingency table: The graduate with the highest salary of 220000 has a GMAT score of 500, while the graduate with the lowest salary of 64000 has a GMAT score of 560. The most common GMAT score is 620, shared by 12 graduates.
Insights from the Pearson’s Chi-square test: The p-value is 0.005279. As p < 0.05, there appears to be a relationship between salary and total GMAT score, so we reject the null hypothesis.
Null hypothesis: Salary and Spring MBA average are independent of each other.
mytable <- xtabs(~ salary+s_avg,data= mba2.df)
mytable
## s_avg
## salary 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.91 3 3.09 3.1 3.2 3.27 3.3 3.4
## 64000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 77000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 78256 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 85000 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
## 86000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 88500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 90000 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0
## 92000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
## 93000 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1
## 95000 0 0 1 0 0 0 0 1 0 0 0 0 1 1 2 0
## 96000 0 0 0 1 0 0 0 0 0 0 0 0 1 0 2 0
## 96500 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 97000 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## 98000 0 1 0 0 0 1 1 4 0 1 0 0 2 0 0 0
## 99000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 100000 0 0 0 0 2 0 1 1 0 1 1 0 0 0 1 2
## 100400 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 101000 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 101600 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 104000 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
## 105000 1 0 0 0 0 0 0 0 1 2 0 0 1 0 2 0
## 106000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 107500 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 108000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 112000 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## 115000 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 120000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 145800 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## s_avg
## salary 3.45 3.5 3.6 3.7 3.8 4
## 64000 0 1 0 0 0 0
## 77000 0 0 0 0 0 0
## 78256 0 0 0 0 0 0
## 82000 0 0 0 0 0 0
## 85000 0 2 0 0 0 0
## 86000 0 1 0 0 0 0
## 88000 0 0 0 0 0 0
## 88500 0 0 0 0 0 0
## 90000 0 0 0 0 0 0
## 92000 0 0 0 0 0 0
## 93000 0 0 0 0 0 0
## 95000 0 0 1 0 0 0
## 96000 0 0 0 0 0 0
## 96500 0 0 0 0 0 0
## 97000 0 0 0 0 0 0
## 98000 0 0 0 0 0 0
## 99000 0 0 0 0 0 0
## 100000 0 0 0 0 0 0
## 100400 0 0 0 0 0 0
## 101000 0 0 0 0 0 0
## 101100 0 0 0 0 0 0
## 101600 0 0 0 0 0 0
## 102500 0 0 0 0 0 0
## 103000 0 0 0 0 0 0
## 104000 0 0 0 0 0 0
## 105000 1 1 1 0 1 0
## 106000 0 1 0 1 0 0
## 107000 0 0 0 0 0 0
## 107300 0 0 0 0 0 0
## 107500 0 0 0 0 0 0
## 108000 0 1 0 0 0 0
## 110000 0 0 1 0 0 0
## 112000 0 0 1 0 0 0
## 115000 0 0 1 1 0 0
## 118000 0 1 0 0 0 0
## 120000 0 2 0 0 1 0
## 126710 0 0 0 0 0 0
## 130000 0 0 0 0 0 0
## 145800 0 0 0 0 0 0
## 146000 0 0 0 0 0 1
## 162000 0 0 1 0 0 0
## 220000 0 0 0 0 0 0
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## s_avg
## 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.91 3 3.09 3.1 3.2 3.27 3.3
## 1 1 4 3 6 4 7 13 1 6 2 7 9 1 11
## 3.4 3.45 3.5 3.6 3.7 3.8 4
## 5 1 10 6 2 2 1
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 792.97, df = 861, p-value = 0.9524
Insights from the contingency table: The graduate with the highest salary of 220000 has a Spring MBA average of 2.5. The graduate with the highest Spring MBA average of 4 earns 146000, and the graduate with the lowest Spring MBA average of 2.2 earns 105000. The most common Spring MBA average is 2.9, shared by 13 graduates.
Insights from the Pearson’s Chi-square test: The p-value is 0.9524. As p > 0.05, we have no evidence of a relationship between salary and Spring MBA average, so we fail to reject the null hypothesis.
Null hypothesis: Salary and Fall MBA average are independent of each other.
mytable <- xtabs(~ salary+f_avg,data= mba2.df)
mytable
## f_avg
## salary 0 2 2.25 2.5 2.67 2.75 2.83 3 3.25 3.33 3.5 3.6 3.67 3.75 4
## 64000 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 77000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 78256 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 85000 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0
## 86000 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## 88000 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 88500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 90000 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0
## 92000 0 0 0 0 0 0 0 0 1 0 2 0 0 0 0
## 93000 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
## 95000 0 0 0 0 0 1 0 1 2 0 2 0 1 0 0
## 96000 0 0 0 1 0 0 0 0 2 0 1 0 0 0 0
## 96500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
## 98000 0 0 0 1 0 2 0 2 5 0 0 0 0 0 0
## 99000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 100000 0 0 0 0 0 1 0 5 1 0 1 0 1 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 101000 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101600 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 104000 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## 105000 0 1 0 0 0 0 1 3 2 0 4 0 0 0 0
## 106000 0 0 0 0 0 0 0 2 0 0 0 1 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107500 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 108000 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 112000 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0
## 115000 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1
## 118000 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 120000 0 0 0 0 0 0 0 1 2 0 0 0 0 0 1
## 126710 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 145800 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 146000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 220000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## f_avg
## 0 2 2.25 2.5 2.67 2.75 2.83 3 3.25 3.33 3.5 3.6 3.67 3.75 4
## 1 2 1 5 1 15 1 25 25 1 17 2 2 3 2
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 596.28, df = 574, p-value = 0.2518
Insights from the contingency table: The graduate with the highest salary of 220000 has a Fall MBA average of 2.75. The two graduates with the highest Fall MBA average of 4 earn 115000 and 120000, while the one graduate with the lowest Fall MBA average of 0 earns 146000. The two largest groups, of 25 graduates each, have Fall MBA averages of 3 and 3.25.
Insights from the Pearson’s Chi-square test: The p-value is 0.2518. As p > 0.05, we have no evidence of a relationship between salary and Fall MBA average, so we fail to reject the null hypothesis.
Null hypothesis: Salary and degree of satisfaction are independent of each other.
mytable <- xtabs(~ salary+satis,data= mba2.df)
mytable
## satis
## salary 3 4 5 6 7
## 64000 0 0 0 0 1
## 77000 0 0 0 1 0
## 78256 0 0 1 0 0
## 82000 0 0 0 0 1
## 85000 0 0 1 3 0
## 86000 0 0 2 0 0
## 88000 0 0 0 0 1
## 88500 0 0 0 1 0
## 90000 0 0 2 0 1
## 92000 0 0 1 1 1
## 93000 0 0 1 2 0
## 95000 1 1 1 2 2
## 96000 0 0 1 1 2
## 96500 0 0 0 1 0
## 97000 0 0 0 1 1
## 98000 0 0 2 5 3
## 99000 0 0 0 1 0
## 100000 0 0 1 6 2
## 100400 0 0 0 0 1
## 101000 0 0 1 1 0
## 101100 0 0 0 1 0
## 101600 0 0 0 1 0
## 102500 0 0 1 0 0
## 103000 0 0 0 1 0
## 104000 0 0 1 1 0
## 105000 0 0 4 6 1
## 106000 0 0 0 2 1
## 107000 0 0 1 0 0
## 107300 0 0 0 0 1
## 107500 0 0 1 0 0
## 108000 0 0 0 2 0
## 110000 0 0 1 0 0
## 112000 0 0 0 2 1
## 115000 0 0 3 2 0
## 118000 0 0 0 0 1
## 120000 0 0 2 2 0
## 126710 0 0 0 1 0
## 130000 0 0 0 0 1
## 145800 0 0 0 1 0
## 146000 0 0 0 1 0
## 162000 0 0 1 0 0
## 220000 0 0 0 1 0
margin.table(mytable,1)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
margin.table(mytable,2)
## satis
## 3 4 5 6 7
## 1 1 29 50 22
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 109.1, df = 164, p-value = 0.9997
Insights from the contingency table: For degree of satisfaction, 1 person gave a rating of 3, 1 person a rating of 4, 29 people a rating of 5, 50 people a rating of 6, and 22 people a rating of 7.
The largest group of 50 people rated their satisfaction 6, and the smallest groups, of 1 person each, rated it 3 and 4. Twenty-two people gave the maximum rating of 7, while no one gave the minimum rating of 1. The graduate with the maximum salary of 220000 rated their satisfaction 6, and the graduate with the minimum salary of 64000 rated it 7.
Insights from the Pearson’s Chi-square test: The p-value is 0.9997. As p > 0.05, we have no evidence of a relationship between salary and degree of satisfaction, so we fail to reject the null hypothesis.
Null Hypothesis: There is no difference between the mean of Spring average and Fall average.
t.test(mba2.df$s_avg, mba2.df$f_avg)
##
## Welch Two Sample t-test
##
## data: mba2.df$s_avg and mba2.df$f_avg
## t = 0.022345, df = 192.16, p-value = 0.9822
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1186187 0.1213372
## sample estimates:
## mean of x mean of y
## 3.092330 3.090971
The p-value is 0.9822. As p>0.05, we fail to reject the null hypothesis.
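Since the Spring and Fall averages belong to the same students, a paired comparison is arguably more appropriate than the unpaired Welch test above; a minimal sketch:
# Sketch: paired t-test of the two GPA columns for the same graduates.
t.test(mba2.df$s_avg, mba2.df$f_avg, paired = TRUE)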
Null hypothesis: There is no difference between the mean of age and prior work experience.
t.test(mba2.df$age, mba2.df$work_yrs)
##
## Welch Two Sample t-test
##
## data: mba2.df$age and mba2.df$work_yrs
## t = 52.723, df = 202.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 22.23330 23.96088
## sample estimates:
## mean of x mean of y
## 26.776699 3.679612
The p-value is less than 2.2e-16, which is well below 0.05. We reject the null hypothesis that there is no difference between the mean of age and prior work experience.
Null hypothesis: There is no difference between the mean of salary and first language.
t.test(mba2.df$salary, mba2.df$frstlang)
##
## Welch Two Sample t-test
##
## data: mba2.df$salary and mba2.df$frstlang
## t = 58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 99537.4 106521.9
## sample estimates:
## mean of x mean of y
## 1.030307e+05 1.067961e+00
The p-value is less than 2.2e-16, which is well below 0.05. We reject the null hypothesis that there is no difference between the mean of salary and first language.
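Note that the test above compares the mean salary with the mean of the frstlang code, which are on very different scales. A comparison of mean salary between the two first-language groups is arguably closer to the question of interest; a hedged sketch:
# Sketch: compare mean salary across first-language groups
# (frstlang == 1 is English in this data set).
t.test(salary ~ frstlang, data = mba2.df)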
Null hypothesis: There is no difference between the mean of salary and degree of satisfaction.
t.test(mba2.df$salary, mba2.df$satis)
##
## Welch Two Sample t-test
##
## data: mba2.df$salary and mba2.df$satis
## t = 58.515, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 99532.58 106517.13
## sample estimates:
## mean of x mean of y
## 1.030307e+05 5.883495e+00
The p-value is less than 2.2e-16, which is well below 0.05. We reject the null hypothesis that there is no difference between the mean of salary and degree of satisfaction.
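Since both salary and satisfaction are numeric for the employed graduates, a correlation test may describe their association more directly than a comparison of their means; a sketch:
# Sketch: linear association between salary and degree of satisfaction.
cor.test(mba2.df$salary, mba2.df$satis)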
regr <- lm(salary ~ age+gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc+s_avg+f_avg, data = mba2.df)
summary(regr)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + s_avg + f_avg, data = mba2.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31186 -7438 622 5299 69725
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37730.6788 43445.3073 0.868 0.3873
## age 2657.3914 503.9286 5.273 8.39e-07 ***
## gmat_tot -0.8982 164.7408 -0.005 0.9957
## gmat_qpc 859.6496 480.8645 1.788 0.0770 .
## gmat_vpc 537.0116 480.7496 1.117 0.2668
## gmat_tpc -1454.7911 700.5012 -2.077 0.0405 *
## s_avg 4069.5906 4808.3606 0.846 0.3995
## f_avg -1827.8260 3750.0292 -0.487 0.6271
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15290 on 95 degrees of freedom
## Multiple R-squared: 0.3181, Adjusted R-squared: 0.2679
## F-statistic: 6.332 on 7 and 95 DF, p-value: 4.124e-06
regr$coefficients
## (Intercept) age gmat_tot gmat_qpc gmat_vpc
## 37730.6788094 2657.3914327 -0.8981671 859.6496462 537.0115555
## gmat_tpc s_avg f_avg
## -1454.7911249 4069.5906403 -1827.8260101
The variable age is statistically significant (p = 8.39e-07); gmat_tpc is also significant at the 5% level.
The multiple R-squared value is 0.3181.
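To go with the point estimates, confidence intervals for the coefficients of this first model can be pulled straight from the fitted object; a minimal sketch:
# Sketch: 95% confidence intervals for the coefficients of the model above.
confint(regr, level = 0.95)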
regr <- lm(salary ~ sex+work_yrs+frstlang+satis+quarter, data = mba2.df)
summary(regr)
##
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + satis + quarter,
## data = mba2.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29352 -8342 -1943 5264 83154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99601.3 13868.8 7.182 1.41e-10 ***
## sex -6037.8 3394.5 -1.779 0.0784 .
## work_yrs 2321.1 534.1 4.346 3.42e-05 ***
## frstlang 15448.4 6326.4 2.442 0.0164 *
## satis -1800.6 2041.0 -0.882 0.3798
## quarter -1397.2 1441.5 -0.969 0.3348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15590 on 97 degrees of freedom
## Multiple R-squared: 0.2765, Adjusted R-squared: 0.2392
## F-statistic: 7.414 on 5 and 97 DF, p-value: 6.324e-06
regr$coefficients
## (Intercept) sex work_yrs frstlang satis quarter
## 99601.284 -6037.779 2321.095 15448.364 -1800.607 -1397.213
The variables work_yrs (p = 3.42e-05) and frstlang (p = 0.0164) are statistically significant.
The multiple R-squared value is 0.2765.
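As an illustration of how this second model could be used, the expected salary of a hypothetical graduate can be predicted; the predictor values below are invented purely for the example.
# Sketch: predicted salary (with a prediction interval) for a hypothetical graduate.
new_grad <- data.frame(sex = 1, work_yrs = 4, frstlang = 1,
                       satis = 6, quarter = 2)
predict(regr, newdata = new_grad, interval = "prediction")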
regr <- lm(salary ~ ., data = mba2.df)
summary(regr)
##
## Call:
## lm(formula = salary ~ ., data = mba2.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## age 1750.65 1130.92 1.548 0.1251
## sex -3584.07 3595.85 -0.997 0.3216
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## quarter -2336.56 2721.89 -0.858 0.3929
## work_yrs 749.66 1135.90 0.660 0.5110
## frstlang 7719.42 7373.27 1.047 0.2979
## satis -1086.54 2157.76 -0.504 0.6158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
regr$coefficients
## (Intercept) age sex gmat_tot gmat_qpc gmat_vpc
## 78005.66171 1750.65216 -3584.07221 16.18545 796.54809 546.30750
## gmat_tpc s_avg f_avg quarter work_yrs frstlang
## -1457.08759 -931.53478 -2222.82135 -2336.55542 749.66083 7719.42304
## satis
## -1086.54069
Multiple R-squared is 0.3422, the highest of the three models, so by that measure this is the best-fitting model; it indicates that the model accounts for 34.22% of the variance in salaries (though its adjusted R-squared of 0.2545 is slightly below the first model's 0.2679).
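Since all three linear models were fitted on the same employed-graduate data, their AIC and adjusted R-squared values can be compared side by side rather than relying on multiple R-squared alone; the sketch below refits them under illustrative names so the comparison is self-contained.
# Sketch: refit the three salary models and compare AIC and adjusted R-squared.
m1 <- lm(salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
           s_avg + f_avg, data = mba2.df)
m2 <- lm(salary ~ sex + work_yrs + frstlang + satis + quarter, data = mba2.df)
m3 <- lm(salary ~ ., data = mba2.df)
AIC(m1, m2, m3)
sapply(list(m1, m2, m3), function(m) summary(m)$adj.r.squared)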
Null hypothesis: Salary and Gender are independent of each other.
mytable <- xtabs(~ salary+sex,data= mba3.df)
mytable
## sex
## salary 1 2
## 0 67 23
margin.table(mytable,1)
## salary
## 0
## 90
chisq.test(mytable)
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 21.511, df = 1, p-value = 3.518e-06
Insights from the contingency table: There are 90 unemployed graduates, of whom 67 are male and 23 are female.
Insights from the Pearson’s Chi-square test: The p-value is 3.518e-06. As p < 0.05, there appears to be a relationship between being unemployed (zero salary) and gender, so we reject the null hypothesis.
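Because mba3.df contains only the unemployed graduates, the test above is a one-sample (goodness-of-fit) chi-square of whether males and females are equally represented among them. A check that also uses the employed graduates is to compare employment status against gender in mba1.df; a hedged sketch:
# Sketch: 2x2 table of employment status (salary > 0) versus gender on mba1.df.
employed <- ifelse(mba1.df$salary > 0, "employed", "unemployed")
emp_by_sex <- table(employed, mba1.df$sex)
emp_by_sex
chisq.test(emp_by_sex)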
Null hypothesis: Salary and First language are independent of each other.
mytable <- xtabs(~ salary+frstlang,data= mba3.df)
mytable
## frstlang
## salary 1 2
## 0 82 8
margin.table(mytable,1)
## salary
## 0
## 90
margin.table(mytable,2)
## frstlang
## 1 2
## 82 8
chisq.test(mytable)
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 60.844, df = 1, p-value = 6.177e-15
Insights from the contingency table: Of the 90 unemployed graduates, 82 have English as their first language and 8 do not.
Insights from the Pearson’s Chi-square test: The p-value is 6.177e-15. As p < 0.05, there appears to be a relationship between being unemployed (zero salary) and first language, so we reject the null hypothesis.
Null hypothesis: Salary and Work years are independent of each other.
mytable <- xtabs(~ salary+work_yrs,data= mba3.df)
mytable
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 9 10 11 12 13 16 18 22
## 0 1 12 22 14 9 12 2 5 2 1 1 2 2 1 1 1 2
margin.table(mytable,1)
## salary
## 0
## 90
margin.table(mytable,2)
## work_yrs
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 16 18 22
## 1 12 22 14 9 12 2 5 2 1 1 2 2 1 1 1 2
chisq.test(mytable)
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 117.78, df = 16, p-value < 2.2e-16
Insights from the contingency table: Of the 90 unemployed graduates, the largest group of 22 has 2 years of prior work experience.
Insights from the Pearson’s Chi-square test: The p-value is less than 2.2e-16. As p < 0.05, there appears to be a relationship between being unemployed (zero salary) and prior work experience, so we reject the null hypothesis.
Null hypothesis: Salary and Total GMAT score are independent of each other.
mytable <- xtabs(~ salary+gmat_tot,data= mba3.df)
mytable
## gmat_tot
## salary 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660
## 0 1 1 2 3 3 4 8 7 4 3 3 9 4 5 6 5 3
## gmat_tot
## salary 670 680 700 710 720 730 740 750 760
## 0 4 3 2 4 2 1 1 1 1
margin.table(mytable,1)
## salary
## 0
## 90
margin.table(mytable,2)
## gmat_tot
## 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670
## 1 1 2 3 3 4 8 7 4 3 3 9 4 5 6 5 3 4
## 680 700 710 720 730 740 750 760
## 3 2 4 2 1 1 1 1
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 34.8, df = 25, p-value = 0.09188
Insights from the contingency table: Of the 90 unemployed graduates, the maximum GMAT score of 760 and the minimum of 450 each belong to one person. The largest group of 9 people share the same GMAT score of 610.
Insights from the Pearson’s Chi-square test: The p-value is 0.09188. As p > 0.05, we have no evidence of a relationship between being unemployed (zero salary) and total GMAT score, so we fail to reject the null hypothesis.
Null hypothesis: Salary and Spring MBA average are independent of each other.
mytable <- xtabs(~ salary+s_avg,data= mba3.df)
mytable
## s_avg
## salary 2 2.1 2.2 2.3 2.4 2.6 2.7 2.8 2.82 2.9 3 3.08 3.09 3.1 3.17 3.2
## 0 1 2 1 2 2 1 8 9 1 9 10 1 2 6 1 4
## s_avg
## salary 3.25 3.27 3.3 3.38 3.4 3.45 3.5 3.6 3.64 3.8 3.9
## 0 1 2 9 1 7 1 2 4 1 1 1
margin.table(mytable,1)
## salary
## 0
## 90
margin.table(mytable,2)
## s_avg
## 2 2.1 2.2 2.3 2.4 2.6 2.7 2.8 2.82 2.9 3 3.08 3.09 3.1 3.17
## 1 2 1 2 2 1 8 9 1 9 10 1 2 6 1
## 3.2 3.25 3.27 3.3 3.38 3.4 3.45 3.5 3.6 3.64 3.8 3.9
## 4 1 2 9 1 7 1 2 4 1 1 1
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 78, df = 26, p-value = 4.248e-07
Insights from the contingency table: The maximum Spring MBA average of 3.9 and the minimum of 2 each belong to one unemployed graduate. The largest group of 10 people share the same Spring MBA average of 3.
Insights from the Pearson’s Chi-square test: The p-value is 4.248e-07. As p < 0.05, there appears to be a relationship between being unemployed (zero salary) and Spring MBA average, so we reject the null hypothesis.
Null hypothesis: Salary and Fall MBA average are independent of each other.
mytable <- xtabs(~ salary+f_avg,data= mba3.df)
mytable
## f_avg
## salary 0 2 2.25 2.5 2.67 2.75 3 3.17 3.2 3.25 3.33 3.4 3.5 3.6 3.67
## 0 1 3 2 8 1 9 24 1 1 18 1 1 7 1 1
## f_avg
## salary 3.75 3.83 4
## 0 6 1 4
margin.table(mytable,1)
## salary
## 0
## 90
margin.table(mytable,2)
## f_avg
## 0 2 2.25 2.5 2.67 2.75 3 3.17 3.2 3.25 3.33 3.4 3.5 3.6 3.67
## 1 3 2 8 1 9 24 1 1 18 1 1 7 1 1
## 3.75 3.83 4
## 6 1 4
chisq.test(mytable)
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 143.6, df = 17, p-value < 2.2e-16
Insights from the contingency table: The maximum Fall MBA average of 4 belongs to four unemployed graduates and the minimum of 0 to one. The largest group of 24 people share the same Fall MBA average of 3.
Insights from the Pearson’s Chi-square test: The p-value is less than 2.2e-16. As p < 0.05, there appears to be a relationship between being unemployed (zero salary) and Fall MBA average, so we reject the null hypothesis.
Null hypothesis: Salary and Degree of satisfaction are independent of each other.
mytable <- xtabs(~ salary+satis,data= mba3.df)
mytable
## satis
## salary 4 5 6 7
## 0 4 36 40 10
margin.table(mytable,1)
## salary
## 0
## 90
margin.table(mytable,2)
## satis
## 4 5 6 7
## 4 36 40 10
chisq.test(mytable)
##
## Chi-squared test for given probabilities
##
## data: mytable
## X-squared = 43.867, df = 3, p-value = 1.611e-09
Insights from the contingency table: Of the 90 unemployed graduates, 40 rated their satisfaction 6, 36 rated it 5, 10 rated it 7, and 4 rated it 4.
Insights from the Pearson’s Chi-square test: The p-value is 1.611e-09. As p < 0.05, there appears to be a relationship between being unemployed (zero salary) and degree of satisfaction, so we reject the null hypothesis.
Since we are comparing graduates who have a job with those who do not, every salary value greater than zero is replaced with 1, while the unemployed keep the value 0. A binomial logistic regression model is then fitted on this indicator: 1 denotes employed and 0 denotes unemployed.
mba1.df$salary[mba1.df$salary!=0] <- 1
lregr <- glm(salary ~.,family=binomial(link='logit'),data=mba1.df)
summary(lregr)
##
## Call:
## glm(formula = salary ~ ., family = binomial(link = "logit"),
## data = mba1.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0419 -1.1439 0.7517 1.0238 1.9665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.56015 4.57567 0.997 0.3190
## age -0.19812 0.08562 -2.314 0.0207 *
## sex 0.14396 0.36089 0.399 0.6900
## gmat_tot -0.01027 0.01289 -0.797 0.4257
## gmat_qpc -0.02177 0.04606 -0.473 0.6364
## gmat_vpc -0.02058 0.04401 -0.468 0.6401
## gmat_tpc 0.08833 0.06492 1.361 0.1736
## s_avg 0.24369 0.67258 0.362 0.7171
## f_avg -0.09871 0.36617 -0.270 0.7875
## quarter -0.23624 0.21132 -1.118 0.2636
## work_yrs 0.10479 0.09386 1.116 0.2642
## frstlang 0.31939 0.64213 0.497 0.6189
## satis 0.42640 0.21308 2.001 0.0454 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.68 on 192 degrees of freedom
## Residual deviance: 242.92 on 180 degrees of freedom
## AIC: 268.92
##
## Number of Fisher Scoring iterations: 5
anova(lregr, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: salary
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 192 266.68
## age 1 8.4714 191 258.21 0.003608 **
## sex 1 0.3867 190 257.82 0.534052
## gmat_tot 1 0.0156 189 257.81 0.900742
## gmat_qpc 1 0.0186 188 257.79 0.891625
## gmat_vpc 1 1.3430 187 256.44 0.246503
## gmat_tpc 1 4.0117 186 252.43 0.045185 *
## s_avg 1 2.0591 185 250.37 0.151303
## f_avg 1 0.2676 184 250.10 0.604976
## quarter 1 1.2924 183 248.81 0.255602
## work_yrs 1 1.6279 182 247.19 0.201987
## frstlang 1 0.1575 181 247.03 0.691446
## satis 1 4.1033 180 242.92 0.042799 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-values suggest that age is the biggest factor in whether a graduate is employed, as it has the lowest p-value. It is followed by the degree of satisfaction and the overall GMAT percentile, which play the next largest roles.
A large p-value here indicates that the model without that variable explains roughly the same amount of deviance. Variables such as sex, gmat_tot, gmat_qpc, gmat_vpc, s_avg, f_avg, quarter, work_yrs and frstlang do not affect the employment indicator much.
Summary from the binomial logistic regression model: Age plays the biggest role in determining whether an MBA graduate is employed, followed by the overall GMAT percentile and the degree of satisfaction.
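As a final, optional check (a sketch, not part of the original write-up), the fitted probabilities can be turned into employed/unemployed predictions with an assumed 0.5 cutoff and compared to the observed labels to gauge in-sample accuracy:
# Sketch: in-sample classification accuracy of the logistic model,
# using an assumed 0.5 probability cutoff (mba1.df$salary is the 0/1 indicator here).
pred_prob <- predict(lregr, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
table(predicted = pred_class, observed = mba1.df$salary)
mean(pred_class == mba1.df$salary)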