This Rmd document is an analysis of the MBA Starting Salaries.csv file, comprising box plots, bar charts, histograms and scatter plots. It also includes various statistical tests performed on the dataset to verify the hypotheses and to evaluate whether the observed differences are significant.
#TASK 2-a:
# Load the MBA salary survey data into sal_mba.df.
# The file is addressed by its full path rather than via setwd(), so running
# this script no longer changes the session's working directory.
data_dir <- "F:/R-Internship/Course related files"
sal_mba.df <- read.csv(file.path(data_dir, "MBA Salary.csv"))
# View() opens a spreadsheet-style viewer; only do so in an interactive
# session so the document can also be knitted or run in batch mode.
if (interactive()) {
  View(sal_mba.df)
}
# Per-column minimum, quartiles, mean and maximum.
summary(sal_mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Extended descriptive statistics for every column (n, mean, sd, median,
# trimmed mean, mad, min, max, range, skew, kurtosis, se) via psych.
library(psych)
describe(x = sal_mba.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
#Histogram:
# Histogram of the raw salary column over all rows, asking for about 5 bins.
# NOTE(review): the 0 / 998 / 999 values that are filtered out right after
# this plot are still included here - confirm that this is intended.
hist(
  x = sal_mba.df$salary,
  breaks = 5,
  col = "light blue",
  main = "Salary of MBA students",
  xlab = "Salary earned",
  ylab = "Count"
)
# Build the subsets used by the plots below by dropping non-salary codes.
# salary == 0 is the "not placed" group (see Task 2-c). 998 and 999 are
# presumably "no answer" / "not disclosed" codes - TODO confirm against the
# data dictionary.
# %in% is used instead of != so that a missing salary yields FALSE (row
# dropped) rather than NA, which would inject an all-NA row into the result.
#   mba_disclosed  : salary is not 0
#   mba_disclosed1 : salary is not 0 or 998
#   mba_disclosed2 : salary is not 0, 998 or 999 (real, disclosed salaries)
mba_disclosed <- sal_mba.df[!(sal_mba.df$salary %in% 0), ]
mba_disclosed1 <- mba_disclosed[!(mba_disclosed$salary %in% 998), ]
mba_disclosed2 <- sal_mba.df[!(sal_mba.df$salary %in% c(0, 998, 999)), ]
#Bar-chart:
# Mean disclosed salary per value of `quarter`, drawn as a lattice bar chart
# with salary on the x axis and quarter on the y axis.
mba_mean <- aggregate(
  salary ~ quarter,
  data = mba_disclosed2,
  FUN = mean
)
library(lattice)
barchart(
  quarter ~ salary,
  data = mba_mean,
  col = "blue",
  xlab = "Mean salary",
  ylab = "Quartile Ranking",
  main = "Mean disclosed salary\nby quartile ranking"
)
#Box-Plot 1:
# Horizontal box plot of disclosed salary split by `sex`.
# The default y axis is suppressed (yaxt = "n") and redrawn with text labels
# for the codes 1 and 2.
boxplot(
  salary ~ sex,
  data = mba_disclosed2,
  horizontal = TRUE,
  yaxt = "n",
  main = "Disclosed Salaries of MBA students",
  xlab = "Salary",
  ylab = "Gender"
)
axis(side = 2, at = 1:2, labels = c("Male", "Female"))
#Box-Plot 2:
# Horizontal box plot of disclosed salary split by `frstlang`.
# The default y axis is suppressed and redrawn with text labels for the
# codes 1 and 2.
boxplot(
  salary ~ frstlang,
  data = mba_disclosed2,
  horizontal = TRUE,
  yaxt = "n",
  main = "Disclosed Salaries of MBA students",
  xlab = "Salary",
  ylab = "First Language"
)
axis(side = 2, at = 1:2, labels = c("English", "Other Language"))
#Box-Plot 3:
# Horizontal box plot of disclosed salary for each of the four `quarter`
# values; the y axis is redrawn manually with the tick labels 1 to 4.
boxplot(
  salary ~ quarter,
  data = mba_disclosed2,
  horizontal = TRUE,
  yaxt = "n",
  main = "Disclosed salary of MBA students",
  xlab = "Salary",
  ylab = "Quartile"
)
axis(side = 2, at = 1:4, labels = 1:4)
library(lattice)
#Scatter-Plot 1:
# Base-graphics scatter plot: disclosed salary (x) against GMAT total
# percentile (y).
with(
  mba_disclosed2,
  plot(
    x = salary,
    y = gmat_tpc,
    main = "Disclosed salary of MBA students",
    xlab = "Salary",
    ylab = "GMAT Total Percentile"
  )
)
#Scatter-Plot 2:
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# car scatter plot: disclosed salary (x) against spring MBA average (y).
with(
  mba_disclosed2,
  scatterplot(
    salary,
    s_avg,
    main = "Disclosed salary of MBA students",
    xlab = "Salary",
    ylab = "Spring MBA Average"
  )
)
#Scatter-Plot 3:
library(car)
# car scatter plot: disclosed salary (x) against fall MBA average (y).
with(
  mba_disclosed2,
  scatterplot(
    salary,
    f_avg,
    main = "Disclosed salary of students",
    xlab = "Salary",
    ylab = "Fall MBA Average"
  )
)
#Scatter-plot Matrix:
library(car)
# Scatter-plot matrix of GMAT total, spring/fall averages and salary.
# Uses mba_disclosed2 (salary codes 0, 998 and 999 removed), like every other
# plot in this task. The previous input, mba_disclosed, only dropped
# salary == 0, so the 998/999 placeholder values were plotted as salaries.
# NOTE(review): `spread` and `smoother.args` look like arguments from an older
# car release - confirm they are still accepted by the installed version.
scatterplotMatrix(
  mba_disclosed2[, c("gmat_tot", "s_avg", "f_avg", "salary")],
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "MBA Starting Salary"
)
#Corrgram:
library(corrgram)
# Correlogram of all columns: shaded cells below the diagonal, pie glyphs
# above it, variables reordered (order = TRUE).
# Uses mba_disclosed2 so that the 998/999 salary placeholder codes do not
# distort the correlations (mba_disclosed only dropped salary == 0).
# NOTE(review): `satis` also has a maximum of 998 in the summary above, which
# looks like a placeholder code as well - confirm whether any such rows remain.
corrgram(
  mba_disclosed2,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Starting Salaries"
)
#TASK 2-b:
# Respondents with a real, disclosed salary, used by the tables, tests and
# regressions of this task.
# Besides 0 (the "not placed" group of Task 2-c), the placeholder values 998
# and 999 are excluded too, as was already done for mba_disclosed2 in Task 2-a.
# Filtering only salary != 0 let those codes enter the t-tests and regressions
# as if they were salaries.
# NOTE: the `##` output recorded below for Task 2-b was produced with the old
# filter (184 rows) and must be regenerated by re-running the document.
mba_disclosed <- sal_mba.df[!(sal_mba.df$salary %in% c(0, 998, 999)), ]
#3-way Contingency tables:
# Three-way table of counts by sex x quarter x frstlang, printed in flat form.
mba_t1 <- xtabs(
  formula = ~ sex + quarter + frstlang,
  data = mba_disclosed
)
ftable(mba_t1)
## frstlang 1 2
## sex quarter
## 1 1 30 3
## 2 32 2
## 3 31 7
## 4 29 5
## 2 1 18 0
## 2 5 4
## 3 8 1
## 4 7 2
# Marginal counts over the first dimension of the table (sex).
margin.table(mba_t1, margin = 1)
## sex
## 1 2
## 139 45
# Marginal counts over the second dimension of the table (quarter).
margin.table(mba_t1, margin = 2)
## quarter
## 1 2 3 4
## 51 43 47 43
# Same three-way table with a "Sum" level appended along every dimension.
addmargins(A = mba_t1)
## , , frstlang = 1
##
## quarter
## sex 1 2 3 4 Sum
## 1 30 32 31 29 122
## 2 18 5 8 7 38
## Sum 48 37 39 36 160
##
## , , frstlang = 2
##
## quarter
## sex 1 2 3 4 Sum
## 1 3 2 7 5 17
## 2 0 4 1 2 7
## Sum 3 6 8 7 24
##
## , , frstlang = Sum
##
## quarter
## sex 1 2 3 4 Sum
## 1 33 34 38 34 139
## 2 18 9 9 9 45
## Sum 51 43 47 43 184
#Chi-square test:
# Chi-squared test on the 2 x 2 table of sex by first language.
mba_t2 <- xtabs(formula = ~ sex + frstlang, data = mba_disclosed)
chisq.test(x = mba_t2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mba_t2
## X-squared = 0.10308, df = 1, p-value = 0.7482
# Chi-squared test on the table of sex by quarter.
mba_t3 <- xtabs(formula = ~ sex + quarter, data = mba_disclosed)
chisq.test(x = mba_t3)
##
## Pearson's Chi-squared test
##
## data: mba_t3
## X-squared = 4.5377, df = 3, p-value = 0.209
# Chi-squared test on the table of quarter by first language.
mba_t4 <- xtabs(formula = ~ quarter + frstlang, data = mba_disclosed)
chisq.test(x = mba_t4)
##
## Pearson's Chi-squared test
##
## data: mba_t4
## X-squared = 3.3899, df = 3, p-value = 0.3353
#T-test / correlation tests:
# The earlier calls had the form t.test(<predictor>, salary): a Welch
# two-sample test of whether, say, the mean GMAT percentile equals the mean
# salary. The two variables are on unrelated scales, so every call trivially
# returned p < 2.2e-16 and said nothing about how salary relates to the
# predictor.
#
# Numeric predictors: test the association with salary (Pearson correlation).
cor.test(mba_disclosed$gmat_tpc, mba_disclosed$salary)
cor.test(mba_disclosed$age, mba_disclosed$salary)
cor.test(mba_disclosed$s_avg, mba_disclosed$salary)
# Two-level grouping variable: compare mean salary between the two sex groups
# (Welch two-sample t-test).
t.test(salary ~ sex, data = mba_disclosed)
# NOTE: the output previously recorded here belonged to the replaced calls and
# has been removed; re-run the document to regenerate it.
#Linear-Regression Model:
# Linear model of salary on age, GMAT total, GMAT verbal percentile, spring
# average, quarter and first language; all predictors enter as numeric.
mba_reg <- lm(
  formula = salary ~ age + gmat_tot + gmat_vpc + s_avg + quarter + frstlang,
  data = mba_disclosed
)
summary(mba_reg)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_vpc + s_avg + quarter +
## frstlang, data = mba_disclosed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94209 -47628 16695 39941 156113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102308.90 88904.42 1.151 0.2514
## age 1963.11 1410.42 1.392 0.1657
## gmat_tot -161.84 108.72 -1.489 0.1384
## gmat_vpc -40.45 372.04 -0.109 0.9135
## s_avg 16885.28 18004.24 0.938 0.3496
## quarter -4311.60 6035.29 -0.714 0.4759
## frstlang -29586.41 12685.82 -2.332 0.0208 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50350 on 177 degrees of freedom
## Multiple R-squared: 0.1109, Adjusted R-squared: 0.08075
## F-statistic: 3.679 on 6 and 177 DF, p-value: 0.001808
# Linear model of salary on sex, GMAT quantitative and total percentiles, fall
# average, years of work experience and satisfaction score.
# NOTE(review): `satis` has a maximum of 998 in the summary above, which looks
# like a placeholder code - confirm such rows are excluded before interpreting
# its coefficient.
mba_reg1 <- lm(
  formula = salary ~ sex + gmat_qpc + gmat_tpc + f_avg + work_yrs + satis,
  data = mba_disclosed
)
summary(mba_reg1)
##
## Call:
## lm(formula = salary ~ sex + gmat_qpc + gmat_tpc + f_avg + work_yrs +
## satis, data = mba_disclosed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92622 -6057 7848 24808 113513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56821.329 30441.807 1.867 0.0636 .
## sex 5777.726 7206.841 0.802 0.4238
## gmat_qpc -481.644 279.833 -1.721 0.0870 .
## gmat_tpc 292.677 291.330 1.005 0.3164
## f_avg 6262.471 6021.104 1.040 0.2997
## work_yrs 2354.024 1242.936 1.894 0.0599 .
## satis -75.005 7.012 -10.697 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40440 on 177 degrees of freedom
## Multiple R-squared: 0.4264, Adjusted R-squared: 0.407
## F-statistic: 21.93 on 6 and 177 DF, p-value: < 2.2e-16
#TASK 2-c:
# Students who were not placed: rows whose salary is recorded as 0.
mba_not_placed <- sal_mba.df[sal_mba.df$salary == 0, ]
# Open the data viewer only in an interactive session so the script can also
# be knitted or run in batch mode.
if (interactive()) {
  View(mba_not_placed)
}
#Contingency Table:
# Three-way table of counts by sex x quarter x frstlang for the not-placed
# students, printed in flat form.
mba_np1 <- xtabs(
  formula = ~ sex + quarter + frstlang,
  data = mba_not_placed
)
ftable(mba_np1)
## frstlang 1 2
## sex quarter
## 1 1 10 1
## 2 19 2
## 3 14 2
## 4 17 2
## 2 1 7 0
## 2 6 0
## 3 6 1
## 4 3 0
# Marginal counts over the first dimension of the table (sex).
margin.table(mba_np1, margin = 1)
## sex
## 1 2
## 67 23
# Marginal counts over the second dimension of the table (quarter).
margin.table(mba_np1, margin = 2)
## quarter
## 1 2 3 4
## 18 27 23 22
# Same three-way table with a "Sum" level appended along every dimension.
addmargins(A = mba_np1)
## , , frstlang = 1
##
## quarter
## sex 1 2 3 4 Sum
## 1 10 19 14 17 60
## 2 7 6 6 3 22
## Sum 17 25 20 20 82
##
## , , frstlang = 2
##
## quarter
## sex 1 2 3 4 Sum
## 1 1 2 2 2 7
## 2 0 0 1 0 1
## Sum 1 2 3 2 8
##
## , , frstlang = Sum
##
## quarter
## sex 1 2 3 4 Sum
## 1 11 21 16 19 67
## 2 7 6 7 3 23
## Sum 18 27 23 22 90
#Chi-square test:
# Chi-squared test on sex by quarter among the not-placed students.
# NOTE(review): the recorded run warns that the chi-squared approximation may
# be incorrect; an exact test (e.g. fisher.test) may be more appropriate -
# confirm.
mba_np2 <- xtabs(formula = ~ sex + quarter, data = mba_not_placed)
chisq.test(x = mba_np2)
## Warning in chisq.test(mba_np2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mba_np2
## X-squared = 3.7704, df = 3, p-value = 0.2874
# Chi-squared test on sex by first language among the not-placed students.
# NOTE(review): the recorded run warns that the chi-squared approximation may
# be incorrect; an exact test (e.g. fisher.test) may be more appropriate -
# confirm.
mba_np3 <- xtabs(formula = ~ sex + frstlang, data = mba_not_placed)
chisq.test(x = mba_np3)
## Warning in chisq.test(mba_np3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mba_np3
## X-squared = 0.21376, df = 1, p-value = 0.6438
# Chi-squared test on quarter by first language among the not-placed students.
# NOTE(review): the recorded run warns that the chi-squared approximation may
# be incorrect; an exact test (e.g. fisher.test) may be more appropriate -
# confirm.
mba_np4 <- xtabs(formula = ~ quarter + frstlang, data = mba_not_placed)
chisq.test(x = mba_np4)
## Warning in chisq.test(mba_np4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mba_np4
## X-squared = 0.81142, df = 3, p-value = 0.8467