Loading and summary
# Load the MBA starting-salaries data set from the working directory and
# print per-column summary statistics. The file name is passed directly:
# wrapping a single literal in paste(..., sep = "") returned it unchanged.
mba.df <- read.csv("MBA Starting Salaries Data.csv")
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
Correlation
# Kendall rank correlations between every pair of columns, computed only on
# rows that have no missing values.
cor(mba.df, method = "kendall", use = "complete.obs")
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.000000000 -0.064918735 -0.089348575 -0.14294895 0.004312123
## sex -0.064918735 1.000000000 -0.031512872 -0.11604667 0.063244361
## gmat_tot -0.089348575 -0.031512872 1.000000000 0.57049831 0.602830883
## gmat_qpc -0.142948954 -0.116046670 0.570498312 1.00000000 0.135761590
## gmat_vpc 0.004312123 0.063244361 0.602830883 0.13576159 1.000000000
## gmat_tpc -0.092059593 -0.029202995 0.976022086 0.56617453 0.594830301
## s_avg 0.074567920 0.104994627 0.075723411 -0.03360934 0.146074991
## f_avg 0.023463495 0.106433803 0.097979623 0.01921321 0.118891700
## quarter 0.004556967 -0.122029469 -0.065620299 0.04084046 -0.136979833
## work_yrs 0.655013245 0.003886011 -0.146648764 -0.19009952 -0.047736629
## frstlang 0.082749814 0.001536205 -0.104902855 0.12551916 -0.253495516
## salary -0.066964226 -0.006991235 0.001523055 -0.01833562 0.006500531
## satis -0.121775381 -0.030718623 0.079740627 -0.02494718 0.108479428
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.092059593 0.07456792 0.0234634952 0.004556967 0.655013245
## sex -0.029202995 0.10499463 0.1064338033 -0.122029469 0.003886011
## gmat_tot 0.976022086 0.07572341 0.0979796225 -0.065620299 -0.146648764
## gmat_qpc 0.566174526 -0.03360934 0.0192132078 0.040840458 -0.190099517
## gmat_vpc 0.594830301 0.14607499 0.1188916999 -0.136979833 -0.047736629
## gmat_tpc 1.000000000 0.06976878 0.0920819234 -0.057151609 -0.144030804
## s_avg 0.069768777 1.00000000 0.5906570112 -0.719221088 0.062138164
## f_avg 0.092081923 0.59065701 1.0000000000 -0.479986073 -0.003777988
## quarter -0.057151609 -0.71922109 -0.4799860731 1.000000000 -0.022169086
## work_yrs -0.144030804 0.06213816 -0.0037779876 -0.022169086 1.000000000
## frstlang -0.100679762 -0.12138557 -0.0700173031 0.091239794 -0.001161440
## salary 0.001900985 0.05469155 0.0236987836 -0.102670440 -0.009141085
## satis 0.080815648 -0.00332823 -0.0009964743 -0.006940549 -0.050227342
## frstlang salary satis
## age 0.082749814 -0.066964226 -0.1217753809
## sex 0.001536205 -0.006991235 -0.0307186232
## gmat_tot -0.104902855 0.001523055 0.0797406267
## gmat_qpc 0.125519156 -0.018335616 -0.0249471837
## gmat_vpc -0.253495516 0.006500531 0.1084794281
## gmat_tpc -0.100679762 0.001900985 0.0808156484
## s_avg -0.121385567 0.054691548 -0.0033282301
## f_avg -0.070017303 0.023698784 -0.0009964743
## quarter 0.091239794 -0.102670440 -0.0069405492
## work_yrs -0.001161440 -0.009141085 -0.0502273419
## frstlang 1.000000000 -0.010042495 -0.0910683922
## salary -0.010042495 1.000000000 -0.0020398247
## satis -0.091068392 -0.002039825 1.0000000000
# Corrgram of every column of mba.df, kept in the data frame's own column
# order (order = FALSE): shaded cells below the diagonal, pies above it.
# The title names the MBA data; the previous "store variables" title did not
# describe this data set.
library(corrgram)
corrgram(
  mba.df,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  main = "Corrgram of MBA variables"
)
Scatter Plot
library(car)

# Pairwise scatterplots of the main demographic, score, and outcome columns.
scatterplotMatrix(
  ~ age + sex + gmat_tot + s_avg + f_avg + work_yrs + salary + satis,
  data = mba.df
)

# Rows whose salary exceeds 1000 (rows with a missing salary are dropped).
placed.df <- subset(mba.df, salary > 1000)

# Mean salary, work experience, and age for each sex code, over all rows of
# mba.df (not just placed.df).
aggregate(cbind(salary, work_yrs, age) ~ sex, FUN = mean, data = mba.df)
## sex salary work_yrs age
## 1 1 37013.62 3.893204 27.41748
## 2 2 45121.07 3.808824 27.17647
# Salary distribution for each sex code. With the formula salary ~ sex the
# groups (sex) are laid out along the x axis and salary is on the y axis, so
# the axis labels are assigned accordingly.
boxplot(
  salary ~ sex,
  data = mba.df,
  main = "Effect of Gender on Salary",
  xlab = "Gender",
  ylab = "Starting Salary"
)

# Mean salary and work experience at each age, over all rows of mba.df.
aggregate(cbind(salary, work_yrs) ~ age, data = mba.df, FUN = mean)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
# Mean salary and work experience for each distinct value of satis, over all
# rows of mba.df.
aggregate(cbind(salary, work_yrs) ~ satis, FUN = mean, data = mba.df)
## satis salary work_yrs
## 1 1 999.000 3.000000
## 2 2 999.000 2.000000
## 3 3 19799.200 4.200000
## 4 4 6293.412 2.941176
## 5 5 40476.311 4.243243
## 6 6 54383.536 4.185567
## 7 7 65718.152 3.727273
## 8 998 998.000 3.086957
# Salary distribution at each number of years of work experience.
boxplot(
  salary ~ work_yrs,
  data = mba.df,
  xlab = "Work Experience",
  ylab = "Starting Salary",
  main = "Effect of Work Experience on Salary"
)

library(lattice)

# Histogram of the salary column over all rows.
histogram(
  ~ salary,
  data = mba.df,
  xlab = "Starting Salary",
  main = "Distribution of MBA's Starting Salary"
)

# Split rows by salary value; going by the variable names, salary > 1000 is a
# placed student, 0 is not placed, and 999 is a salary that was not disclosed.
placed.df <- subset(mba.df, salary > 1000)
notPlaced.df <- subset(mba.df, salary == 0)
notDisclosedSalary.df <- subset(mba.df, salary == 999)

# Stack the three groups (rows with any other salary value are left out) and
# flag the placed rows with a TRUE/FALSE factor.
clean.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
clean.df$GotPlaced <- factor(clean.df$salary > 1000)
Chi-Square Test
# Count of rows in clean.df that are / are not flagged as placed.
t1 <- table(clean.df$GotPlaced == "TRUE")
print(t1)
##
## FALSE TRUE
## 125 103
# Contingency table: placement flag (rows) by sex code (columns).
t2 <- xtabs(formula = ~ GotPlaced + sex, data = clean.df)
print(t2)
## sex
## GotPlaced 1 2
## FALSE 97 28
## TRUE 72 31
# Column proportions: share placed / not placed within each sex code.
prop.table(t2, margin = 2)
## sex
## GotPlaced 1 2
## FALSE 0.5739645 0.4745763
## TRUE 0.4260355 0.5254237
# Contingency table: placement flag (rows) by first-language code (columns).
t3 <- xtabs(formula = ~ GotPlaced + frstlang, data = clean.df)
print(t3)
## frstlang
## GotPlaced 1 2
## FALSE 108 17
## TRUE 96 7
# Column proportions: share placed / not placed within each first-language code.
prop.table(t3, margin = 2)
## frstlang
## GotPlaced 1 2
## FALSE 0.5294118 0.7083333
## TRUE 0.4705882 0.2916667
# Chi-squared test of independence between placement and sex.
chisq.test(x = t2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: t2
## X-squared = 1.366, df = 1, p-value = 0.2425
# Chi-squared test of independence between placement and first language.
chisq.test(x = t3)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: t3
## X-squared = 2.1002, df = 1, p-value = 0.1473
# Recode the numeric sex and first-language codes in mba.df as labelled
# factors so that they are modelled as categories.
mba.df$sex[mba.df$sex == 1] <- "Male"
mba.df$sex[mba.df$sex == 2] <- "Female"
mba.df$sex <- factor(mba.df$sex)
mba.df$frstlang[mba.df$frstlang == 1] <- "English"
mba.df$frstlang[mba.df$frstlang == 2] <- "Other"
mba.df$frstlang <- factor(mba.df$frstlang)

# Linear model of salary on experience, grades, GMAT percentiles, sex,
# first language, and satisfaction.
m <- salary ~ work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc +
  sex + frstlang + satis

# Fit on placed students only (salary > 1000), subsetting the recoded mba.df.
# placed.df was extracted before the recode above and still carries the
# numeric 1/2 codes, so fitting on it ignored the factor labels. Any recorded
# output from the earlier fit shows `sex` / `frstlang` as numeric terms and
# needs regenerating.
fit <- lm(m, data = mba.df, subset = salary > 1000)
summary(fit)
##
## Call:
## lm(formula = m, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29800 -7822 -1742 4869 82341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86719.94 23350.43 3.714 0.000346 ***
## work_yrs 2331.12 585.99 3.978 0.000137 ***
## s_avg 4659.05 5015.66 0.929 0.355320
## f_avg -1698.83 3834.70 -0.443 0.658773
## gmat_qpc 98.72 121.85 0.810 0.419884
## gmat_vpc -95.80 102.99 -0.930 0.354699
## sex -5289.24 3545.91 -1.492 0.139140
## frstlang 13994.76 6641.66 2.107 0.037770 *
## satis -1671.20 2070.62 -0.807 0.421643
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 94 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2241
## F-statistic: 4.683 on 8 and 94 DF, p-value: 7.574e-05
# Students without a job: rows whose salary is exactly 0. The former extra
# tests against "998" and "999" compared a numeric column to strings and were
# already implied by salary == 0, so the selected rows are unchanged.
nojob.df <- mba.df[which(mba.df$salary == 0), ]
head(nojob.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1 23 Female 620 77 87 87 3.4 3.00 1
## 2 24 Male 610 90 71 87 3.5 4.00 1
## 3 24 Male 670 99 78 95 3.3 3.25 1
## 4 24 Male 570 56 81 75 3.3 2.67 1
## 6 24 Male 640 82 89 91 3.9 3.75 1
## 7 25 Male 610 89 74 87 3.4 3.50 1
## work_yrs frstlang salary satis
## 1 2 English 0 7
## 2 2 English 0 6
## 3 2 English 0 6
## 4 1 English 0 7
## 6 2 English 0 6
## 7 2 English 0 5
# Histogram of total GMAT score among the rows in nojob.df.
hist(
  nojob.df$gmat_tot,
  breaks = 10,
  col = "blue",
  xlab = "GMAT score",
  main = "GMAT performance of students with no job"
)

# Chi-squared test of independence between years of work experience and
# satisfaction rating within nojob.df.
chisq.test(nojob.df$work_yrs, nojob.df$satis)
## Warning in chisq.test(nojob.df$work_yrs, nojob.df$satis): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: nojob.df$work_yrs and nojob.df$satis
## X-squared = 44.974, df = 48, p-value = 0.5976
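With p = 0.5976 we fail to reject the null hypothesis of independence: among unplaced students there is no evidence that satisfaction with the MBA program is associated with years of work experience. Because R warns that the chi-squared approximation may be incorrect, this result should be read with caution.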