R Markdown

This R Markdown document contains an analysis of the MBA Starting Salaries case, based on the MBA Starting Salaries dataset.

Reading the CSV file and creating summary statistics

# Load the MBA Starting Salaries data set and print per-column summary
# statistics.
# The data directory is held in one variable and joined to the file name with
# file.path(), so the session's working directory is left untouched (setwd()
# would change global state for everything that runs afterwards).
data_dir <- "D:/R Internship"
mba.df <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))
# View() opens a data viewer window, which is only useful (and only reliably
# available) in an interactive session, so skip it when knitting or scripting.
if (interactive()) {
  View(mba.df)
}
summary(mba.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
library(psych)
# Extended descriptive statistics (n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis, se) for every column of the data set.
psych::describe(mba.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Size of the full data set as (rows, columns).
c(nrow(mba.df), ncol(mba.df))
## [1] 274  13

Creating a subset containing the list of students with disclosed salaries

# Step 1 of 3: drop the rows whose recorded salary is exactly 0, then report
# the (rows, columns) size of what is left.
mba_sal.df1 <- mba.df[!(mba.df$salary == 0), ]
dim(mba_sal.df1)
## [1] 184  13
# Step 2 of 3: drop the rows whose salary field holds the value 998.
# NOTE(review): 998 is presumably a "salary not disclosed" code rather than a
# real amount -- confirm against the data dictionary.
mba_sal.df2 <- mba_sal.df1[!(mba_sal.df1$salary == 998), ]
dim(mba_sal.df2)
## [1] 138  13
# Step 3 of 3: drop the rows whose salary field holds the value 999, leaving
# the subset used as "disclosed salaries" in the plots that follow.
# NOTE(review): 999 is presumably another non-disclosure code -- confirm
# against the data dictionary.
mba_sal.df <- mba_sal.df2[!(mba_sal.df2$salary == 999), ]
dim(mba_sal.df)
## [1] 103  13

Histogram of Salaries

# Histogram of the salary column of the disclosed-salary subset.
hist(
  mba_sal.df$salary,
  col = "Grey",
  xlab = "Salary",
  ylab = "No.of Students",
  main = "Frequency count of Disclosed Salaries"
)

Boxplot of Salary and Gender

# Horizontal boxplots of salary, one per value of `sex`.
# The default group axis is suppressed (yaxt = "n") so that the two groups
# can be labelled with text instead of their numeric codes.
boxplot(
  salary ~ sex,
  data = mba_sal.df,
  horizontal = TRUE,
  yaxt = "n",
  xlab = "Salary",
  ylab = "Gender",
  main = "Boxplot of disclosed salaries and Gender"
)
# Positions 1 and 2 on the left-hand axis are labelled "Male" and "Female".
axis(side = 2, at = 1:2, labels = c("Male", "Female"))

Boxplot of Salary and First Language

# Horizontal boxplots of salary, one per value of `frstlang`.
# The default group axis is suppressed (yaxt = "n") so that the two groups
# can be labelled with text instead of their numeric codes.
boxplot(
  salary ~ frstlang,
  data = mba_sal.df,
  horizontal = TRUE,
  yaxt = "n",
  xlab = "Salary",
  ylab = "First Language",
  main = "Boxplot of disclosed salaries and First Language"
)
# Positions 1 and 2 on the left-hand axis are labelled "English" and "Other".
axis(side = 2, at = 1:2, labels = c("English", "Other"))

Barchart of Quartile Ranking and Salary

library(lattice)
## Warning: package 'lattice' was built under R version 3.3.3
# Mean salary within each value of `quarter`, computed on the
# disclosed-salary subset; the result has one row per quarter.
sal_mean_quart <- aggregate(
  salary ~ quarter,
  data = mba_sal.df,
  FUN = mean
)
sal_mean_quart
##   quarter   salary
## 1       1 106328.6
## 2       2 103612.0
## 3       3  98319.0
## 4       4 102142.6
# Lattice bar chart of the per-quarter mean salaries computed above:
# quarter on the vertical axis, mean salary on the horizontal axis.
barchart(
  quarter ~ salary,
  data = sal_mean_quart,
  col = "Grey",
  xlab = "Mean Salary",
  ylab = "Quartile Ranking",
  main = "Barchart of mean disclosed salary by quartile ranking"
)

Scatterplot matrix of GMAT total percentile, spring and fall averages, and salary

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Pairwise scatterplots of gmat_tpc, s_avg, f_avg and salary for the
# disclosed-salary subset; cex = 0.6 shrinks the plotting symbols.
scatterplotMatrix(
  formula = ~ gmat_tpc + s_avg + f_avg + salary,
  data = mba_sal.df,
  cex = 0.6
)

Corrgram of variables in the MBA Starting Salaries dataset

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
# Correlogram of every column in the disclosed-salary subset.
# Shaded cells go below the diagonal, pie glyphs above it, and the variable
# names on the diagonal; order = TRUE lets corrgram reorder the variables.
corrgram(
  mba_sal.df,
  order = TRUE,
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  main = "Corrgram of variables in the MBA Starting Salaries dataset"
)

3-way contingency table between sex, first language and quarter

# Counts of students cross-classified by sex, first language and quarter,
# taken from the rows with a non-zero salary (mba_sal.df1) and flattened
# into a two-dimensional layout for printing.
table1 <- ftable(
  xtabs(~ sex + frstlang + quarter, data = mba_sal.df1)
)
table1
##              quarter  1  2  3  4
## sex frstlang                    
## 1   1                30 32 31 29
##     2                 3  2  7  5
## 2   1                18  5  8  7
##     2                 0  4  1  2

Chi-sq test between Sex and first language

# 2 x 2 table of counts: sex (rows) by first language (columns), built from
# the rows with a non-zero salary.
table2 <- xtabs(
  ~ sex + frstlang,
  data = mba_sal.df1
)
table2
##    frstlang
## sex   1   2
##   1 122  17
##   2  38   7
# Chi-squared test of independence between sex and first language.
# correct = TRUE is chisq.test()'s default (Yates' continuity correction for
# 2 x 2 tables); it is spelled out here only to make that choice visible.
chisq.test(table2, correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table2
## X-squared = 0.10308, df = 1, p-value = 0.7482

Chi-sq test between Sex and Quartile ranking

# 2 x 4 table of counts: sex (rows) by quarter (columns), built from the
# rows with a non-zero salary.
table3 <- xtabs(
  ~ sex + quarter,
  data = mba_sal.df1
)
table3
##    quarter
## sex  1  2  3  4
##   1 33 34 38 34
##   2 18  9  9  9
# Chi-squared test of independence between sex and quarter.
chisq.test(x = table3)
## 
##  Pearson's Chi-squared test
## 
## data:  table3
## X-squared = 4.5377, df = 3, p-value = 0.209

So, neither test is significant at the 5% level (p = 0.75 and p = 0.21): there is no evidence of an association between Sex and First Language, or between Sex and Quartile Ranking.

T-tests between Age and Salary, Sex and Salary

# Test whether salary is associated with age.
# A two-sample t.test(age, salary) only compares the mean of age with the
# mean of salary -- two different variables on different scales -- so its
# p-value says nothing about a relationship between them. cor.test() instead
# tests the Pearson correlation (it also reports a t statistic).
# Only the disclosed-salary rows (mba_sal.df) are used, so the 998 and 999
# values that the subsetting step removes are not treated as real pay.
# NOTE(review): the output recorded below this line was produced by the
# earlier t.test() call; re-knit the document to refresh it.
cor.test(mba_sal.df$age, mba_sal.df$salary)
## 
##  Welch Two Sample t-test
## 
## data:  mba_sal.df1$age and mba_sal.df1$salary
## t = -15.005, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65725.43 -50449.67
## sample estimates:
##   mean of x   mean of y 
##    26.79348 58114.34239
# Test whether mean salary differs between the two sex groups.
# A two-sample t.test(sex, salary) only compares the mean of the sex code
# with the mean of salary, which is not a comparison of salaries at all. The
# formula interface splits salary by sex and runs a Welch two-sample t-test
# on the two groups.
# Only the disclosed-salary rows (mba_sal.df) are used, so the 998 and 999
# values that the subsetting step removes are not treated as real pay.
# NOTE(review): the output recorded below this line was produced by the
# earlier t.test() call; re-knit the document to refresh it.
t.test(salary ~ sex, data = mba_sal.df)
## 
##  Welch Two Sample t-test
## 
## data:  mba_sal.df1$sex and mba_sal.df1$salary
## t = -15.012, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65750.98 -50475.22
## sample estimates:
##    mean of x    mean of y 
##     1.244565 58114.342391

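Note that the t-tests reported above compare the mean of age (or of the sex code) with the mean of salary, so their very small p-values only show that those two means differ, which is expected for variables on such different scales. They do not by themselves show that salary differs between genders or varies with age; that conclusion requires comparing salary across the two sex groups and relating salary to age (for example with a correlation test or a regression).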

Linear regression model between Salary wrt Age, Sex, First Language and Quartile Ranking

# Linear model of salary on age, sex, quartile ranking and first language.
# Fitted on mba_sal.df (disclosed salaries only). mba_sal.df1 still contains
# the rows whose salary is 998 or 999 -- values the subsetting step removes
# before calling the data "disclosed" -- so fitting on it would treat those
# codes as real salaries.
# NOTE(review): the summary recorded below this chunk was produced with
# data = mba_sal.df1; re-knit the document to refresh it.
fit1 <- lm(salary ~ age + sex + quarter + frstlang, data = mba_sal.df)
summary(fit1)
## 
## Call:
## lm(formula = salary ~ age + sex + quarter + frstlang, data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82795 -51156  16846  40981 149776 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    14703      41367   0.355   0.7227  
## age             2820       1396   2.020   0.0449 *
## sex            14121       8865   1.593   0.1130  
## quarter        -7472       3378  -2.212   0.0282 *
## frstlang      -27806      11435  -2.432   0.0160 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50640 on 179 degrees of freedom
## Multiple R-squared:  0.09018,    Adjusted R-squared:  0.06985 
## F-statistic: 4.436 on 4 and 179 DF,  p-value: 0.001924

Linear regression model between Salary wrt Spring and Fall Average, Work experience and degree of satisfaction

# Linear model of salary on spring average, fall average, years of work
# experience and satisfaction score.
# Fitted on mba_sal.df (disclosed salaries only). mba_sal.df1 still contains
# the rows whose salary is 998 or 999 -- values the subsetting step removes
# before calling the data "disclosed" -- so fitting on it would treat those
# codes as real salaries.
# NOTE(review): `satis` has a maximum of 998 in the full data set; check
# whether any such value survives in mba_sal.df before trusting its slope.
# NOTE(review): the summary recorded below this chunk was produced with
# data = mba_sal.df1; re-knit the document to refresh it.
fit2 <- lm(salary ~ s_avg + f_avg + work_yrs + satis, data = mba_sal.df)
summary(fit2)
## 
## Call:
## lm(formula = salary ~ s_avg + f_avg + work_yrs + satis, data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90334  -7908   9167  23072 136462 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2606.510  24534.451  -0.106  0.91551    
## s_avg       28894.347   9196.268   3.142  0.00196 ** 
## f_avg       -4587.269   6939.022  -0.661  0.50941    
## work_yrs     1798.036   1218.902   1.475  0.14193    
## satis         -74.529      6.834 -10.906  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39610 on 179 degrees of freedom
## Multiple R-squared:  0.4434, Adjusted R-squared:  0.431 
## F-statistic: 35.65 on 4 and 179 DF,  p-value: < 2.2e-16

Linear regression model of Salary on GMAT total score and the GMAT quantitative, verbal and total percentiles

# Linear model of salary on the four GMAT measures (total score and the
# quantitative, verbal and total percentiles).
# Fitted on mba_sal.df (disclosed salaries only). mba_sal.df1 still contains
# the rows whose salary is 998 or 999 -- values the subsetting step removes
# before calling the data "disclosed" -- so fitting on it would treat those
# codes as real salaries.
# NOTE(review): the summary recorded below this chunk was produced with
# data = mba_sal.df1; re-knit the document to refresh it.
fit3 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = mba_sal.df
)
summary(fit3)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, 
##     data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74505 -55243  26417  41678 154546 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 224287.4    76794.0   2.921  0.00394 **
## gmat_tot      -488.9      287.6  -1.700  0.09091 . 
## gmat_qpc       423.2      794.1   0.533  0.59471   
## gmat_vpc       814.7      722.3   1.128  0.26086   
## gmat_tpc       462.1      622.7   0.742  0.45904   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52100 on 179 degrees of freedom
## Multiple R-squared:  0.03695,    Adjusted R-squared:  0.01542 
## F-statistic: 1.717 on 4 and 179 DF,  p-value: 0.1482

Linear regression model between Salary wrt the significant factors in the above 3 models

# Linear model of salary on first language, spring average and satisfaction.
# Fitted on mba_sal.df (disclosed salaries only). mba_sal.df1 still contains
# the rows whose salary is 998 or 999 -- values the subsetting step removes
# before calling the data "disclosed" -- so fitting on it would treat those
# codes as real salaries.
# NOTE(review): this predictor set was chosen from fits on mba_sal.df1;
# revisit the choice once the earlier models are re-run on mba_sal.df.
# NOTE(review): the summary recorded below this chunk was produced with
# data = mba_sal.df1; re-knit the document to refresh it.
fit_final <- lm(salary ~ frstlang + s_avg + satis, data = mba_sal.df)
summary(fit_final)
## 
## Call:
## lm(formula = salary ~ frstlang + s_avg + satis, data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -92001 -10717  10528  22444 168786 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18051.389  26863.106   0.672   0.5025    
## frstlang    -14319.747   8777.099  -1.631   0.1045    
## s_avg        24900.867   7722.454   3.224   0.0015 ** 
## satis          -74.961      6.808 -11.010   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39570 on 180 degrees of freedom
## Multiple R-squared:  0.4416, Adjusted R-squared:  0.4323 
## F-statistic: 47.45 on 3 and 180 DF,  p-value: < 2.2e-16

So, based on the R-squared values, the second and the final models fit the data best.

Creating a subset of those students who didn’t get a job

# Rows of the full data set whose recorded salary is exactly 0, followed by
# the (rows, columns) size of that subset.
mba_notplaced <- mba.df[mba.df$salary == 0, , drop = FALSE]
c(nrow(mba_notplaced), ncol(mba_notplaced))
## [1] 90 13

3-way contingency table between sex, first language and quarter

# Counts of zero-salary students cross-classified by sex, first language and
# quarter, flattened into a two-dimensional layout for printing.
table4 <- ftable(
  xtabs(~ sex + frstlang + quarter, data = mba_notplaced)
)
table4
##              quarter  1  2  3  4
## sex frstlang                    
## 1   1                10 19 14 17
##     2                 1  2  2  2
## 2   1                 7  6  6  3
##     2                 0  0  1  0

Chi-sq test between Sex and first language

# 2 x 2 table of counts for the zero-salary students: sex (rows) by first
# language (columns).
table5 <- xtabs(
  ~ sex + frstlang,
  data = mba_notplaced
)
table5
##    frstlang
## sex  1  2
##   1 60  7
##   2 22  1
# Chi-squared test of independence between sex and first language among the
# zero-salary students (counts in table5).
# NOTE(review): the recorded run warns that the chi-squared approximation may
# be incorrect, and table5 has a cell count of 1; an exact test such as
# fisher.test() may be more appropriate here -- confirm.
chisq.test(table5)
## Warning in chisq.test(table5): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table5
## X-squared = 0.21376, df = 1, p-value = 0.6438

Chi-sq test between Sex and Quartile ranking

# 2 x 4 table of counts for the zero-salary students: sex (rows) by quarter
# (columns).
table6 <- xtabs(
  ~ sex + quarter,
  data = mba_notplaced
)
table6
##    quarter
## sex  1  2  3  4
##   1 11 21 16 19
##   2  7  6  7  3
# Chi-squared test of independence between sex and quarter among the
# zero-salary students (counts in table6).
# NOTE(review): the recorded run warns that the chi-squared approximation may
# be incorrect; the smallest observed count in table6 is 3, so an exact test
# may be more appropriate here -- confirm.
chisq.test(table6)
## Warning in chisq.test(table6): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table6
## X-squared = 3.7704, df = 3, p-value = 0.2874

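Neither test is significant at the 5% level (p = 0.64 and p = 0.29), so there is no evidence of an association between Sex and First Language or between Sex and Quartile Ranking among these students. Note, however, that both tests warn that the chi-squared approximation may be unreliable because some cell counts are small.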