R Markdown

This is an R Markdown document which contains the analysis of the case MBA Starting Salaries. The MBA Starting Salaries Dataset has been used here.

Reading the CSV file and creating summary statistics

setwd("D:/R Internship")
mba.df<-read.csv(paste("MBA Starting Salaries Data.csv",sep = ""))
View(mba.df)
summary(mba.df)

##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

library(psych)
describe(mba.df)

##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

dim(mba.df)

## [1] 274  13

Creating a subset containing the list of students with disclosed salaries

mba_sal.df1<-mba.df[(mba.df$salary!=0),]
dim(mba_sal.df1)

## [1] 184  13

mba_sal.df2<-mba_sal.df1[(mba_sal.df1$salary!=998),]
dim(mba_sal.df2)

## [1] 138  13

mba_sal.df<-mba_sal.df2[(mba_sal.df2$salary!=999),]
dim(mba_sal.df)

## [1] 103  13

Histogram of Salaries

hist(mba_sal.df$salary,main ="Frequency count of Disclosed Salaries",
     xlab = "Salary",ylab = "No.of Students",col = "Grey")

Boxplot of Salary and Gender

boxplot(salary~sex,data = mba_sal.df,
        main="Boxplot of disclosed salaries and Gender",
        horizontal=TRUE,xlab="Salary",ylab="Gender",yaxt="n")
axis(side=2,at=c(1,2),labels=c("Male","Female"))

Boxplot of Salary and First Language

boxplot(salary~frstlang,data = mba_sal.df,
        main="Boxplot of disclosed salaries and First Language",
        horizontal=TRUE,xlab="Salary",ylab="First Language",yaxt="n")
axis(side=2,at=c(1,2),labels=c("English","Other"))

Barchart of Quartile Ranking and Salary

library(lattice)

## Warning: package 'lattice' was built under R version 3.3.3

sal_mean_quart<-aggregate(salary~quarter,data = mba_sal.df,mean)
sal_mean_quart

##   quarter   salary
## 1       1 106328.6
## 2       2 103612.0
## 3       3  98319.0
## 4       4 102142.6

barchart(quarter~salary,data = sal_mean_quart,col="Grey",
    main="Barchart of mean disclosed salary by quartile ranking",
    xlab="Mean Salary",ylab="Quartile Ranking")

Scatterplotmatrix between GMAT Percentile, Spring and Fall Average, Salary

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

scatterplotMatrix(formula=~gmat_tpc+s_avg+f_avg+salary,cex=0.6,data=mba_sal.df)

Corrgram of variables in the MBA Starting Salaries dataset

library(corrgram)

## Warning: package 'corrgram' was built under R version 3.3.3

corrgram(mba_sal.df,order = TRUE,lower.panel = panel.shade,
         upper.panel = panel.pie,text.panel = panel.txt,
         main="Corrgram of variables in the MBA Starting Salaries dataset")

3-way contingency table between sex, first language and quarter

table1<-ftable(xtabs(~sex+frstlang+quarter,data = mba_sal.df1))
table1

##              quarter  1  2  3  4
## sex frstlang                    
## 1   1                30 32 31 29
##     2                 3  2  7  5
## 2   1                18  5  8  7
##     2                 0  4  1  2

Chi-sq test between Sex and first language

table2<-xtabs(~sex+frstlang,data=mba_sal.df1)
table2

##    frstlang
## sex   1   2
##   1 122  17
##   2  38   7

chisq.test(table2)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table2
## X-squared = 0.10308, df = 1, p-value = 0.7482

Chi-sq test between Sex and Quartile ranking

table3<-xtabs(~sex+quarter,data=mba_sal.df1)
table3

##    quarter
## sex  1  2  3  4
##   1 33 34 38 34
##   2 18  9  9  9

chisq.test(table3)

## 
##  Pearson's Chi-squared test
## 
## data:  table3
## X-squared = 4.5377, df = 3, p-value = 0.209

SO, there is no correlation between Sex and First Language as well as between Sex and Quartile Ranking.

T-tests between Age and Salary, Sex and Salary

t.test(mba_sal.df1$age,mba_sal.df1$salary)

## 
##  Welch Two Sample t-test
## 
## data:  mba_sal.df1$age and mba_sal.df1$salary
## t = -15.005, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65725.43 -50449.67
## sample estimates:
##   mean of x   mean of y 
##    26.79348 58114.34239

t.test(mba_sal.df1$sex,mba_sal.df1$salary)

## 
##  Welch Two Sample t-test
## 
## data:  mba_sal.df1$sex and mba_sal.df1$salary
## t = -15.012, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65750.98 -50475.22
## sample estimates:
##    mean of x    mean of y 
##     1.244565 58114.342391

So, there is significant difference in salary for different genders and ages.

Linear regression model between Salary wrt Age, Sex, First Language and Quartile Ranking

fit1<-lm(salary~age+sex+quarter+frstlang,data = mba_sal.df1)
summary(fit1)

## 
## Call:
## lm(formula = salary ~ age + sex + quarter + frstlang, data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82795 -51156  16846  40981 149776 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    14703      41367   0.355   0.7227  
## age             2820       1396   2.020   0.0449 *
## sex            14121       8865   1.593   0.1130  
## quarter        -7472       3378  -2.212   0.0282 *
## frstlang      -27806      11435  -2.432   0.0160 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50640 on 179 degrees of freedom
## Multiple R-squared:  0.09018,    Adjusted R-squared:  0.06985 
## F-statistic: 4.436 on 4 and 179 DF,  p-value: 0.001924

Linear regression model between Salary wrt Spring and Fall Average, Work experience and degree of satisfaction

fit2<-lm(salary~s_avg+f_avg+work_yrs+satis,data = mba_sal.df1)
summary(fit2)

## 
## Call:
## lm(formula = salary ~ s_avg + f_avg + work_yrs + satis, data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90334  -7908   9167  23072 136462 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2606.510  24534.451  -0.106  0.91551    
## s_avg       28894.347   9196.268   3.142  0.00196 ** 
## f_avg       -4587.269   6939.022  -0.661  0.50941    
## work_yrs     1798.036   1218.902   1.475  0.14193    
## satis         -74.529      6.834 -10.906  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39610 on 179 degrees of freedom
## Multiple R-squared:  0.4434, Adjusted R-squared:  0.431 
## F-statistic: 35.65 on 4 and 179 DF,  p-value: < 2.2e-16

Linear regression model between Salary wrt GMAT total percentile, total score, verbal and quants percentile

fit3<-lm(salary~gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc,data = mba_sal.df1)
summary(fit3)

## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, 
##     data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74505 -55243  26417  41678 154546 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 224287.4    76794.0   2.921  0.00394 **
## gmat_tot      -488.9      287.6  -1.700  0.09091 . 
## gmat_qpc       423.2      794.1   0.533  0.59471   
## gmat_vpc       814.7      722.3   1.128  0.26086   
## gmat_tpc       462.1      622.7   0.742  0.45904   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52100 on 179 degrees of freedom
## Multiple R-squared:  0.03695,    Adjusted R-squared:  0.01542 
## F-statistic: 1.717 on 4 and 179 DF,  p-value: 0.1482

Linear regression model between Salary wrt the significant factors in the above 3 models

fit_final<-lm(salary~frstlang+s_avg+satis,data = mba_sal.df1)
summary(fit_final)

## 
## Call:
## lm(formula = salary ~ frstlang + s_avg + satis, data = mba_sal.df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -92001 -10717  10528  22444 168786 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18051.389  26863.106   0.672   0.5025    
## frstlang    -14319.747   8777.099  -1.631   0.1045    
## s_avg        24900.867   7722.454   3.224   0.0015 ** 
## satis          -74.961      6.808 -11.010   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39570 on 180 degrees of freedom
## Multiple R-squared:  0.4416, Adjusted R-squared:  0.4323 
## F-statistic: 47.45 on 3 and 180 DF,  p-value: < 2.2e-16

So, based on R-squared values, the second and the final models are the most relaible ones.

Creating a subset of those students who didn’t get a job

mba_notplaced<-mba.df[mba.df$salary==0,]
dim(mba_notplaced)

## [1] 90 13

3-way contingency table between sex, first language and quarter

table4<-ftable(xtabs(~sex+frstlang+quarter,data = mba_notplaced))
table4

##              quarter  1  2  3  4
## sex frstlang                    
## 1   1                10 19 14 17
##     2                 1  2  2  2
## 2   1                 7  6  6  3
##     2                 0  0  1  0

Chi-sq test between Sex and first language

table5<-xtabs(~sex+frstlang,data=mba_notplaced)
table5

##    frstlang
## sex  1  2
##   1 60  7
##   2 22  1

chisq.test(table5)

## Warning in chisq.test(table5): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table5
## X-squared = 0.21376, df = 1, p-value = 0.6438

Chi-sq test between Sex and Quartile ranking

table6<-xtabs(~sex+quarter,data=mba_notplaced)
table6

##    quarter
## sex  1  2  3  4
##   1 11 21 16 19
##   2  7  6  7  3

chisq.test(table6)

## Warning in chisq.test(table6): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  table6
## X-squared = 3.7704, df = 3, p-value = 0.2874

There isn’t any correlation between Sex and First Language as well as between Sex and Quartile ranking

MBA Starting Salaries Analysis

Chandrasekar Venkatarama

January 24, 2018

R Markdown

Reading the CSV file and creating summary statistics

Creating a subset containing the list of students with disclosed salaries

Histogram of Salaries

Boxplot of Salary and Gender

Boxplot of Salary and First Language

Barchart of Quartile Ranking and Salary

Scatterplotmatrix between GMAT Percentile, Spring and Fall Average, Salary

Corrgram of variables in the MBA Starting Salaries dataset

3-way contingency table between sex, first language and quarter

Chi-sq test between Sex and first language

Chi-sq test between Sex and Quartile ranking

T-tests between Age and Salary, Sex and Salary

Linear regression model between Salary wrt Age, Sex, First Language and Quartile Ranking

Linear regression model between Salary wrt Spring and Fall Average, Work experience and degree of satisfaction

Linear regression model between Salary wrt GMAT total percentile, total score, verbal and quants percentile

Linear regression model between Salary wrt the significant factors in the above 3 models

Creating a subset of those students who didn’t get a job

3-way contingency table between sex, first language and quarter

Chi-sq test between Sex and first language

Chi-sq test between Sex and Quartile ranking