MBA starting salaries

->Since many surveys use graduates’ salaries as a large component of their rating systems for MBA programs, the factors on which students’ starting salaries depend should be identified. The degree of satisfaction with the MBA program also plays a prominent role in the analysis.

->Here, variables such as GMAT score, GMAT percentile, spring average, fall average, years of work experience, quartile ranking, first language, sex, etc. may have a positive, neutral or negative effect on the starting salary and on the degree of satisfaction with the MBA program.

–>Reading the dataset into R and viewing it

# Read the survey data; the CSV is looked up relative to the working directory.
mba.df <- read.csv("MBA Starting Salaries Data.csv")
# Open the data frame in the spreadsheet-style viewer for a quick visual check.
View(mba.df)

–> Summarizing the data

library(psych)
describe(mba.df)

##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

–> Visualizing the distribution of each variable independently

# First 2 x 2 panel: age, GMAT score, GMAT percentile and work experience.
par(mfrow = c(2, 2))
boxplot(mba.df$age,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of age",
  xlab = "Age (in years)"
)
boxplot(mba.df$gmat_tot,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of GMAT Score",
  xlab = "GMAT Score"
)
boxplot(mba.df$gmat_tpc,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of GMAT Percentile",
  xlab = "Overall GMAT Percentile"
)
boxplot(mba.df$work_yrs,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of work experience",
  xlab = "Work Years Experience"
)

# Second 2 x 2 panel: starting salary, quartile ranking and the two averages.
par(mfrow = c(2, 2))
boxplot(mba.df$salary,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of Starting salary",
  xlab = "Starting salaries"
)
boxplot(mba.df$quarter,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of Quartile Ranking",
  xlab = "Quartile Ranking"
)
boxplot(mba.df$s_avg,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of Spring MBA Average",
  xlab = "Spring MBA Average"
)
boxplot(mba.df$f_avg,
  horizontal = TRUE, col = "yellow",
  main = "Boxplot of Fall MBA Average",
  xlab = "Fall MBA Average"
)

Frequency table of each variable

table(mba.df$sex)

## 
##   1   2 
## 206  68

table(mba.df$quarter)

## 
##  1  2  3  4 
## 69 70 70 65

table(mba.df$frstlang)

## 
##   1   2 
## 242  32

table(mba.df$satis)

## 
##   1   2   3   4   5   6   7 998 
##   1   1   5  17  74  97  33  46

–>Pair-wise Visualization

# Salary by gender. sex is coded 1 = male, 2 = female (the same coding the
# factor conversion further down in this analysis relies on), so the tick
# labels must be listed in that order; they were previously swapped.
boxplot(salary ~ sex, data = mba.df, horizontal = TRUE, yaxt = "n",
        ylab = "Gender", xlab = "Salary",
        main = "Comparison of Salaries of Males and Females")
axis(side = 2, at = c(1, 2), labels = c("Males", "Females"))

# Salary by first language (1 = English, 2 = other).
boxplot(salary ~ frstlang, data = mba.df, horizontal = TRUE, yaxt = "n",
        ylab = "First language", xlab = "Salary",
        main = "Comparison of Salaries based on their first language")
axis(side = 2, at = c(1, 2), labels = c("english", "others"))

# Salary by quartile ranking (1 to 4).
boxplot(salary ~ quarter, data = mba.df, horizontal = TRUE, yaxt = "n",
        ylab = "quartile rating", xlab = "Salary",
        main = "Comparison of Salaries of Quartile ranking")
axis(side = 2, at = c(1, 2, 3, 4),
     labels = c("first", "secnd", "thrd", "fourth"))

# Salary by satisfaction score. The eighth group is the code 998, which is
# labelled "no" on the axis.
boxplot(salary ~ satis, data = mba.df, horizontal = TRUE, yaxt = "n",
        ylab = "Satisfactory degree", xlab = "Salary",
        main = "Comparison of Salaries on degree of satisfaction of students")
axis(side = 2, at = c(1, 2, 3, 4, 5, 6, 7, 8),
     labels = c("1", "2", "3", "4", "5", "6", "7", "no"))

–>Scatterplots to understand variable correlation

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

# Scatterplots of salary against individual predictors (car::scatterplot).
# NOTE(review): `spread` and `smoother.args` are kept exactly as written;
# newer car releases may expect these settings inside `smooth = list(...)` --
# confirm against the installed version.

# Salary vs. overall GMAT percentile.
scatterplot(
  salary ~ gmat_tpc,
  data = mba.df,
  spread = FALSE, smoother.args = list(lty = 2),
  pch = 19, cex = 0.6,
  main = "Scatterplot of Salary vs. GMAT percentile",
  xlab = "gmat percentile", ylab = "salary "
)

# Salary vs. total GMAT score; x-axis fixed to 1-1000.
scatterplot(
  salary ~ gmat_tot,
  data = mba.df,
  spread = FALSE, smoother.args = list(lty = 2),
  pch = 19, cex = 0.6, xlim = c(1, 1000),
  main = "Scatterplot of Salary vs. GMAT SCore",
  xlab = "GMAT Score", ylab = "salary "
)

# Salary vs. years of work experience; x-axis fixed to 1-8 years.
scatterplot(
  salary ~ work_yrs,
  data = mba.df,
  spread = FALSE, smoother.args = list(lty = 2),
  pch = 19, cex = 0.6, xlim = c(1, 8),
  main = "Scatterplot of Salary vs. their work experience",
  xlab = "Work experience", ylab = "salary "
)

# Salary vs. spring average; x-axis fixed to 1-5.
scatterplot(
  salary ~ s_avg,
  data = mba.df,
  spread = FALSE, smoother.args = list(lty = 2),
  pch = 19, cex = 0.6, xlim = c(1, 5),
  main = "Scatterplot of Salary vs. Spring average",
  xlab = "Spring average", ylab = "salary "
)

–>Jitter plots visualization

# Jittered scatter of salary against satisfaction score; jitter() adds a
# small random offset so points sharing the same integer score do not overlap.
plot(
  jitter(mba.df$satis), jitter(mba.df$salary),
  xlab = "Satisfaction degree", ylab = "Salary",
  xlim = c(1, 10)
)

# Jittered scatter of salary (shifted by 1) against quartile ranking.
plot(
  jitter(mba.df$quarter), jitter(mba.df$salary + 1),
  xlab = "Quartile rating", ylab = "Salary",
  xlim = c(1, 5)
)

–>Scatterplotmatrix

library(car)
# Scatterplot matrix of age, the GMAT measures, both averages and salary,
# with histograms on the diagonal.
scatterplotMatrix(
  formula = ~ age + gmat_tpc + gmat_qpc + gmat_vpc + s_avg + f_avg +
    gmat_tot + salary,
  data = mba.df,
  diagonal = "histogram",
  spread = FALSE, smoother.args = list(lty = 2),
  pch = 19, cex = 0.6
)

–>Correlation between variables

# Pairwise correlations of every column, rounded to two decimals.
# NOTE(review): the 998/999 codes in salary and satis are still in the data
# at this point and are treated as ordinary numbers here.
round(cor(mba.df), 2)

##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.03    -0.15    -0.22    -0.04    -0.17  0.15 -0.02
## sex      -0.03  1.00    -0.05    -0.16     0.07    -0.01  0.13  0.09
## gmat_tot -0.15 -0.05     1.00     0.72     0.75     0.85  0.11  0.10
## gmat_qpc -0.22 -0.16     0.72     1.00     0.15     0.65 -0.03  0.07
## gmat_vpc -0.04  0.07     0.75     0.15     1.00     0.67  0.20  0.08
## gmat_tpc -0.17 -0.01     0.85     0.65     0.67     1.00  0.12  0.08
## s_avg     0.15  0.13     0.11    -0.03     0.20     0.12  1.00  0.55
## f_avg    -0.02  0.09     0.10     0.07     0.08     0.08  0.55  1.00
## quarter  -0.05 -0.13    -0.09     0.04    -0.17    -0.08 -0.76 -0.45
## work_yrs  0.86 -0.01    -0.18    -0.24    -0.07    -0.17  0.13 -0.04
## frstlang  0.06  0.00    -0.14     0.14    -0.39    -0.10 -0.14 -0.04
## salary   -0.06  0.07    -0.05    -0.04    -0.01     0.00  0.15  0.03
## satis    -0.13 -0.05     0.08     0.06     0.06     0.09 -0.03  0.01
##          quarter work_yrs frstlang salary satis
## age        -0.05     0.86     0.06  -0.06 -0.13
## sex        -0.13    -0.01     0.00   0.07 -0.05
## gmat_tot   -0.09    -0.18    -0.14  -0.05  0.08
## gmat_qpc    0.04    -0.24     0.14  -0.04  0.06
## gmat_vpc   -0.17    -0.07    -0.39  -0.01  0.06
## gmat_tpc   -0.08    -0.17    -0.10   0.00  0.09
## s_avg      -0.76     0.13    -0.14   0.15 -0.03
## f_avg      -0.45    -0.04    -0.04   0.03  0.01
## quarter     1.00    -0.09     0.10  -0.16  0.00
## work_yrs   -0.09     1.00    -0.03   0.01 -0.11
## frstlang    0.10    -0.03     1.00  -0.09  0.08
## salary     -0.16     0.01    -0.09   1.00 -0.34
## satis       0.00    -0.11     0.08  -0.34  1.00

–>Corrgram

library(corrgram)
# Corrgram of all columns in their original order: shaded cells below the
# diagonal, pie glyphs above it, min/max values on the diagonal.
corrgram(
  mba.df,
  order = FALSE,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt,
  diag.panel = panel.minmax,
  main = "Corrgram of all the  intercorrelations"
)

–>Correlation visualization

library(corrplot)

## corrplot 0.84 loaded

# Ellipse-style correlation plot built from the complete-case correlations.
corr_matrix <- cor(mba.df, use = "complete.obs")
corrplot(corr = corr_matrix, method = "ellipse")

–> Covariance between variables

# Pairwise covariances of every column, rounded to two decimals.
round(cov(mba.df), 2)

##                age     sex   gmat_tot  gmat_qpc gmat_vpc gmat_tpc   s_avg
## age          13.77   -0.05     -31.16    -11.93    -2.76    -8.84    0.21
## sex          -0.05    0.19      -1.33     -1.05     0.55    -0.05    0.02
## gmat_tot    -31.16   -1.33    3310.69    620.02   726.00   683.99    2.48
## gmat_qpc    -11.93   -1.05     620.02    221.07    38.15   135.80   -0.17
## gmat_vpc     -2.76    0.55     726.00     38.15   284.25   157.49    1.31
## gmat_tpc     -8.84   -0.05     683.99    135.80   157.49   196.61    0.63
## s_avg         0.21    0.02       2.48     -0.17     1.31     0.63    0.15
## f_avg        -0.03    0.02       3.15      0.58     0.67     0.59    0.11
## quarter      -0.20   -0.06      -5.89      0.60    -3.27    -1.29   -0.32
## work_yrs     10.29   -0.02     -33.92    -11.37    -3.62    -7.86    0.16
## frstlang      0.07    0.00      -2.50      0.66    -2.11    -0.47   -0.02
## salary   -11830.42 1518.26 -161159.99 -33358.23 -5273.85  3522.75 2831.60
## satis      -176.35   -8.78    1765.26    334.84   392.36   484.25   -4.63
##           f_avg  quarter work_yrs frstlang        salary       satis
## age       -0.03    -0.20    10.29     0.07     -11830.42     -176.35
## sex        0.02    -0.06    -0.02     0.00       1518.26       -8.78
## gmat_tot   3.15    -5.89   -33.92    -2.50    -161159.99     1765.26
## gmat_qpc   0.58     0.60   -11.37     0.66     -33358.23      334.84
## gmat_vpc   0.67    -3.27    -3.62    -2.11      -5273.85      392.36
## gmat_tpc   0.59    -1.29    -7.86    -0.47       3522.75      484.25
## s_avg      0.11    -0.32     0.16    -0.02       2831.60       -4.63
## f_avg      0.28    -0.26    -0.07    -0.01        787.66        2.13
## quarter   -0.26     1.23    -0.31     0.04      -9296.21       -0.01
## work_yrs  -0.07    -0.31    10.45    -0.03       1486.15     -131.24
## frstlang  -0.01     0.04    -0.03     0.10      -1419.59        9.48
## salary   787.66 -9296.21  1486.15 -1419.59 2596061571.52 -6347115.38
## satis      2.13    -0.01  -131.24     9.48   -6347115.38   138097.38

–>Dimensions of dataset

dim(mba.df)

## [1] 274  13

–>Finding the dimensions of the subset with people who actually got the job

-subset of people who did not get a job

# Rows whose salary is the code 0 (no job). The comparison is numeric:
# comparing against the string "0" only worked through implicit
# number-to-character coercion.
nojob.df <- mba.df[which(mba.df$salary == 0), ]
View(nojob.df)

-subset of people who did not fill in the survey

# Rows whose salary is the code 998 (survey not filled in). The comparison
# is numeric rather than against the string "998", which relied on implicit
# number-to-character coercion.
nojob1.df <- mba.df[which(mba.df$salary == 998), ]
View(nojob1.df)

-subset with people who answered the survey but did not disclose salary data

# Rows whose salary is the code 999 (survey answered, salary not disclosed).
# The comparison is numeric rather than against the string "999", which
# relied on implicit number-to-character coercion.
nojob2.df <- mba.df[which(mba.df$salary == 999), ]
View(nojob2.df)

–>Dimensions of each subset

dim(nojob.df)

## [1] 90 13

90 people did not get the job

dim(nojob1.df)

## [1] 46 13

*46 people did not fill the survey

dim(nojob2.df)

## [1] 35 13

*35 people answered the survey but did not disclose salary data

-So, in total, 171 people have no salary entry or no job

-This implies that 103 (274-171) people got a job

–>Subset (job) with people who got the job

# Keep only respondents who reported an actual salary. Salary values 0, 998
# and 999 are codes (no job / survey not filled in / salary not disclosed),
# so genuine salaries are the values above 999. Filtering on the value
# replaces the earlier attach() call and the hard-coded count of 103 rows,
# so the subset stays correct if the data change.
job <- mba.df[which(mba.df$salary > 999), ]
# Sort from highest to lowest salary; tied rows keep their original order.
job <- job[order(-job$salary), ]
View(job)

–>Contingency table

-One-way contingency tables

# Frequency of each satisfaction score among the placed graduates.
mytable <- table(satis = job$satis)
mytable

## satis
##  3  4  5  6  7 
##  1  1 29 50 22

# Frequency of each sex code among the placed graduates.
mytable1 <- table(sex = job$sex)
mytable1

## sex
##  1  2 
## 72 31

# Frequency of each quartile ranking among the placed graduates.
mytable2 <- table(quarter = job$quarter)
mytable2

## quarter
##  1  2  3  4 
## 35 25 24 19

# Frequency of each first-language code among the placed graduates.
mytable3 <- table(frstlang = job$frstlang)
mytable3

## frstlang
##  1  2 
## 96  7

-Three-way Contingency tables

# Three-way counts of sex x quartile x satisfaction, shown as a flat table.
mytable21 <- xtabs(
  ~ sex + quarter + satis,
  data = job
)
ftable(mytable21)

##             satis  3  4  5  6  7
## sex quarter                     
## 1   1              0  1  7 10  5
##     2              0  0  4 13  2
##     3              0  0  4  9  4
##     4              0  0  2  8  3
## 2   1              1  0  6  3  2
##     2              0  0  4  1  1
##     3              0  0  1  3  3
##     4              0  0  1  3  2

# Three-way counts of first language x sex x satisfaction, as a flat table.
mytable22 <- xtabs(
  ~ frstlang + sex + satis,
  data = job
)
ftable(mytable22)

##              satis  3  4  5  6  7
## frstlang sex                     
## 1        1          0  1 17 38 12
##          2          1  0 11  8  8
## 2        1          0  0  0  2  2
##          2          0  0  1  2  0

–>Chi-squared test

# Two-way counts of satisfaction score by sex.
mytable11 <- xtabs(~ satis + sex, data = job)
print(mytable11)

##      sex
## satis  1  2
##     3  0  1
##     4  1  0
##     5 17 12
##     6 40 10
##     7 14  8

chisq.test(mytable11)

## Warning in chisq.test(mytable11): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable11
## X-squared = 7.3413, df = 4, p-value = 0.1189

# Two-way counts of satisfaction score by quartile ranking.
mytable12 <- xtabs(~ satis + quarter, data = job)
print(mytable12)

##      quarter
## satis  1  2  3  4
##     3  1  0  0  0
##     4  1  0  0  0
##     5 13  8  5  3
##     6 13 14 12 11
##     7  7  3  7  5

chisq.test(mytable12)

## Warning in chisq.test(mytable12): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable12
## X-squared = 10.045, df = 12, p-value = 0.612

# Two-way counts of satisfaction score by first language.
mytable13 <- xtabs(~ satis + frstlang, data = job)
print(mytable13)

##      frstlang
## satis  1  2
##     3  1  0
##     4  1  0
##     5 28  1
##     6 46  4
##     7 20  2

chisq.test(mytable13)

## Warning in chisq.test(mytable13): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable13
## X-squared = 0.95627, df = 4, p-value = 0.9164

# Two-way counts of sex by first language.
mytable14 <- xtabs(~ sex + frstlang, data = job)
print(mytable14)

##    frstlang
## sex  1  2
##   1 68  4
##   2 28  3

chisq.test(mytable14)

## Warning in chisq.test(mytable14): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable14
## X-squared = 0.11264, df = 1, p-value = 0.7372

# Two-way counts of sex by quartile ranking.
mytable15 <- xtabs(~ sex + quarter, data = job)
print(mytable15)

##    quarter
## sex  1  2  3  4
##   1 23 19 17 13
##   2 12  6  7  6

chisq.test(mytable15)

## 
##  Pearson's Chi-squared test
## 
## data:  mytable15
## X-squared = 0.76332, df = 3, p-value = 0.8582

# Two-way counts of quartile ranking by first language.
mytable16 <- xtabs(~ quarter + frstlang, data = job)
print(mytable16)

##        frstlang
## quarter  1  2
##       1 34  1
##       2 23  2
##       3 22  2
##       4 17  2

chisq.test(mytable16)

## Warning in chisq.test(mytable16): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable16
## X-squared = 1.4214, df = 3, p-value = 0.7005

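-From the above Pearson’s chi-squared tests, there does not appear to be a relationship between any two of the categorical variables, since the p-value is > 0.05 in each of the possible cases. (Most of these tests raised the warning that the chi-squared approximation may be incorrect, so the p-values should be treated as approximate.)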

-Hence, we fail to reject the null hypothesis that the variables in each pair are independent.

–>Running appropriate t-tests

t.test(satis~sex, data=job)

## 
##  Welch Two Sample t-test
## 
## data:  satis by sex
## t = 0.82112, df = 44.389, p-value = 0.416
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2273204  0.5400444
## sample estimates:
## mean in group 1 mean in group 2 
##        5.930556        5.774194

t.test(satis~frstlang, data=job)

## 
##  Welch Two Sample t-test
## 
## data:  satis by frstlang
## t = -1.0194, df = 7.1964, p-value = 0.3411
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.9202355  0.3636879
## sample estimates:
## mean in group 1 mean in group 2 
##        5.864583        6.142857

t.test(quarter~sex,data=job)

## 
##  Welch Two Sample t-test
## 
## data:  quarter by sex
## t = 0.20966, df = 53.827, p-value = 0.8347
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4450450  0.5489876
## sample estimates:
## mean in group 1 mean in group 2 
##        2.277778        2.225806

t.test(sex~frstlang, data=job)

## 
##  Welch Two Sample t-test
## 
## data:  sex by frstlang
## t = -0.66028, df = 6.6552, p-value = 0.5313
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6323891  0.3585796
## sample estimates:
## mean in group 1 mean in group 2 
##        1.291667        1.428571

t.test(frstlang~sex,data=job)

## 
##  Welch Two Sample t-test
## 
## data:  frstlang by sex
## t = -0.68201, df = 45.9, p-value = 0.4987
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1628792  0.0804419
## sample estimates:
## mean in group 1 mean in group 2 
##        1.055556        1.096774

t.test(quarter~frstlang,data=job)

## 
##  Welch Two Sample t-test
## 
## data:  quarter by frstlang
## t = -1.1132, df = 6.9152, p-value = 0.3028
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.5181693  0.5479312
## sample estimates:
## mean in group 1 mean in group 2 
##        2.229167        2.714286

-From the above t-tests, none of the comparisons shows a statistically significant difference in group means (p-value > 0.05 in every case).

None of the variables appears to affect the other variable in its pair.

–>Regression model

All the variables are numeric (integer or double) vectors

-Fitting the model

# Linear model of starting salary on all other columns, with the coded
# variables (sex, quarter, frstlang, satis) entered as plain numbers.
fit <- lm(
  salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tot + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + frstlang + satis,
  data = job
)
summary(fit)

## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tot + 
##     gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + 
##     satis, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26489  -7983   -373   5923  70602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 78005.66   52981.93   1.472   0.1444  
## age          1750.65    1130.92   1.548   0.1251  
## sex         -3584.07    3595.85  -0.997   0.3216  
## gmat_qpc      796.55     496.78   1.603   0.1123  
## gmat_vpc      546.31     501.97   1.088   0.2794  
## gmat_tot       16.19     178.85   0.090   0.9281  
## gmat_tpc    -1457.09     714.94  -2.038   0.0445 *
## s_avg        -931.53    8240.31  -0.113   0.9102  
## f_avg       -2222.82    3894.57  -0.571   0.5696  
## quarter     -2336.56    2721.89  -0.858   0.3929  
## work_yrs      749.66    1135.90   0.660   0.5110  
## frstlang     7719.42    7373.27   1.047   0.2979  
## satis       -1086.54    2157.76  -0.504   0.6158  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared:  0.3422, Adjusted R-squared:  0.2545 
## F-statistic: 3.902 on 12 and 90 DF,  p-value: 8.086e-05

Beta coefficients values

fit$coefficients

## (Intercept)         age         sex    gmat_qpc    gmat_vpc    gmat_tot 
## 78005.66171  1750.65216 -3584.07221   796.54809   546.30750    16.18545 
##    gmat_tpc       s_avg       f_avg     quarter    work_yrs    frstlang 
## -1457.08759  -931.53478 -2222.82135 -2336.55542   749.66083  7719.42304 
##       satis 
## -1086.54069

-Actual starting salaries (job$salary), followed by the fitted (predicted) values of the model

job$salary

##   [1] 220000 162000 146000 145800 130000 126710 120000 120000 120000 120000
##  [11] 118000 115000 115000 115000 115000 115000 112000 112000 112000 110000
##  [21] 108000 108000 107500 107300 107000 106000 106000 106000 105000 105000
##  [31] 105000 105000 105000 105000 105000 105000 105000 105000 105000 104000
##  [41] 104000 103000 102500 101600 101100 101000 101000 100400 100000 100000
##  [51] 100000 100000 100000 100000 100000 100000 100000  99000  98000  98000
##  [61]  98000  98000  98000  98000  98000  98000  98000  98000  97000  97000
##  [71]  96500  96000  96000  96000  96000  95000  95000  95000  95000  95000
##  [81]  95000  95000  93000  93000  93000  92000  92000  92000  90000  90000
##  [91]  90000  88500  88000  86000  86000  85000  85000  85000  85000  82000
## [101]  78256  77000  64000

fitted(fit)

##       274        69        68       139       138       273        64 
## 149398.28 104602.16 136886.06  98698.76 101605.57 102906.99 101991.06 
##        65        66        67        63        61        62       136 
## 110813.72 111969.57 111539.59 123387.17 105944.40 109162.92  98124.50 
##       137       272        60       135       209        59        58 
## 101332.25 106938.92 131710.65 112372.72  97072.00 106077.91 103332.66 
##       208        57       207       134        55        56       206 
## 101699.75 106244.01 114012.72 100678.80 104293.45 110392.90 102646.51 
##        49        50        51        52        53        54       131 
## 104295.07 104340.23 104267.61 107844.22 105534.72 117003.43  98990.78 
##       132       133       205       271       130       270       129 
## 104492.05 131489.13 100340.38  92509.57 109427.51 125479.19 104309.68 
##       204       269       203       128       202       268        46 
## 108064.28  98365.05 110014.17  97637.02  92620.45  99994.57  93884.52 
##        47        48       126       127       200       201       266 
## 102921.62 102001.73  93821.35 113592.03  99865.37  96527.43  88378.10 
##       267       125       122       123       124       194       195 
##  92917.86 110371.32  95635.59 101205.47 109265.61  88146.31  84770.11 
##       196       197       198       199       265       192       193 
##  92335.22 112385.70 102417.83 110646.67  95647.25 101244.93 102476.61 
##       121        44        45       120       264        41        42 
## 116400.42  94605.99  93758.43  92996.97  97349.69  98952.20  94274.66 
##        43       118       119       191       263        40       117 
## 100859.19 102167.87  99344.77 103009.25  97034.18  90278.44 117353.34 
##       190        39       116       262       188       189       261 
## 101175.19 108809.43  97637.02  89968.42  94419.48  97955.82 105666.52 
##       187        38        37       260        35        36       258 
## 102495.59  94280.82  99728.91  92634.78  94121.52 105118.73  94852.84 
##       259       115       186       257       256 
##  96624.19  95874.74  93074.55  89360.18  86668.13

Regression model after converting the integer categorical variables into factor variables

-Converting the integer variables into factor variables

str(job)

## 'data.frame':    103 obs. of  13 variables:
##  $ age     : int  40 25 40 24 26 26 27 28 30 30 ...
##  $ sex     : int  2 1 1 1 1 1 1 1 1 2 ...
##  $ gmat_tot: int  500 700 630 620 650 550 600 700 600 670 ...
##  $ gmat_qpc: int  60 98 71 88 89 72 67 95 77 87 ...
##  $ gmat_vpc: int  45 93 95 74 87 58 84 95 81 95 ...
##  $ gmat_tpc: int  51 98 91 87 93 69 83 98 84 95 ...
##  $ s_avg   : num  2.5 3.6 4 3.1 3.2 2.6 3.5 3.8 3.5 3.3 ...
##  $ f_avg   : num  2.75 3.75 0 3 3.25 2.75 3 4 3.25 3.25 ...
##  $ quarter : int  4 1 1 2 2 4 1 1 1 1 ...
##  $ work_yrs: int  15 1 15 2 4 3 3 5 5 8 ...
##  $ frstlang: int  2 1 1 1 1 1 1 1 1 1 ...
##  $ salary  : int  220000 162000 146000 145800 130000 126710 120000 120000 120000 120000 ...
##  $ satis   : int  6 5 6 6 7 6 5 5 6 6 ...

# Convert the coded columns to factors. factor(levels =, labels =) maps the
# integer codes to names in one step, instead of assigning strings into an
# integer column (which silently turns the column into character half-way
# through the recode). Levels are listed so that "Female" and "English"
# remain the reference categories, as before.
job$sex <- factor(job$sex, levels = c(2, 1), labels = c("Female", "Male"))
job$quarter <- factor(job$quarter)
job$satis <- factor(job$satis)
job$frstlang <- factor(job$frstlang, levels = c(1, 2),
                       labels = c("English", "Other"))
str(job)

## 'data.frame':    103 obs. of  13 variables:
##  $ age     : int  40 25 40 24 26 26 27 28 30 30 ...
##  $ sex     : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 2 2 1 ...
##  $ gmat_tot: int  500 700 630 620 650 550 600 700 600 670 ...
##  $ gmat_qpc: int  60 98 71 88 89 72 67 95 77 87 ...
##  $ gmat_vpc: int  45 93 95 74 87 58 84 95 81 95 ...
##  $ gmat_tpc: int  51 98 91 87 93 69 83 98 84 95 ...
##  $ s_avg   : num  2.5 3.6 4 3.1 3.2 2.6 3.5 3.8 3.5 3.3 ...
##  $ f_avg   : num  2.75 3.75 0 3 3.25 2.75 3 4 3.25 3.25 ...
##  $ quarter : Factor w/ 4 levels "1","2","3","4": 4 1 1 2 2 4 1 1 1 1 ...
##  $ work_yrs: int  15 1 15 2 4 3 3 5 5 8 ...
##  $ frstlang: Factor w/ 2 levels "English","Other": 2 1 1 1 1 1 1 1 1 1 ...
##  $ salary  : int  220000 162000 146000 145800 130000 126710 120000 120000 120000 120000 ...
##  $ satis   : Factor w/ 5 levels "3","4","5","6",..: 4 3 4 4 5 4 3 3 4 4 ...

-Fitting the model

# Same model as before, refitted now that sex, quarter, frstlang and satis
# are factors, so each of them contributes one coefficient per non-reference
# level.
fit1 <- lm(
  salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tot + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + frstlang + satis,
  data = job
)
summary(fit1)

## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tot + 
##     gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + 
##     satis, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24954  -6770   -738   6286  68558 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   68070.03   57512.21   1.184    0.240  
## age            1702.40    1162.20   1.465    0.147  
## sexMale        3562.39    3746.26   0.951    0.344  
## gmat_qpc        724.79     517.91   1.399    0.165  
## gmat_vpc        508.37     515.89   0.985    0.327  
## gmat_tot         26.48     185.10   0.143    0.887  
## gmat_tpc      -1397.38     744.64  -1.877    0.064 .
## s_avg         -2203.55    8656.50  -0.255    0.800  
## f_avg         -1761.46    4040.47  -0.436    0.664  
## quarter2      -4501.19    5228.76  -0.861    0.392  
## quarter3      -8555.83    6985.38  -1.225    0.224  
## quarter4      -6913.29    8477.17  -0.816    0.417  
## work_yrs        737.61    1167.26   0.632    0.529  
## frstlangOther  8151.67    7550.06   1.080    0.283  
## satis4          827.83   23405.78   0.035    0.972  
## satis5         5217.70   17062.76   0.306    0.761  
## satis6         4987.12   17224.96   0.290    0.773  
## satis7         1932.31   17443.97   0.111    0.912  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15750 on 85 degrees of freedom
## Multiple R-squared:  0.3529, Adjusted R-squared:  0.2234 
## F-statistic: 2.726 on 17 and 85 DF,  p-value: 0.00126

-Beta coefficients values

fit1$coefficients

##   (Intercept)           age       sexMale      gmat_qpc      gmat_vpc 
##   68070.03198    1702.40208    3562.39249     724.78984     508.36517 
##      gmat_tot      gmat_tpc         s_avg         f_avg      quarter2 
##      26.48354   -1397.37709   -2203.54692   -1761.45997   -4501.18529 
##      quarter3      quarter4      work_yrs frstlangOther        satis4 
##   -8555.83307   -6913.29417     737.61280    8151.67046     827.82618 
##        satis5        satis6        satis7 
##    5217.69668    4987.12257    1932.30731

-Actual starting salaries (job$salary), followed by the fitted (predicted) values of the model

job$salary

##   [1] 220000 162000 146000 145800 130000 126710 120000 120000 120000 120000
##  [11] 118000 115000 115000 115000 115000 115000 112000 112000 112000 110000
##  [21] 108000 108000 107500 107300 107000 106000 106000 106000 105000 105000
##  [31] 105000 105000 105000 105000 105000 105000 105000 105000 105000 104000
##  [41] 104000 103000 102500 101600 101100 101000 101000 100400 100000 100000
##  [51] 100000 100000 100000 100000 100000 100000 100000  99000  98000  98000
##  [61]  98000  98000  98000  98000  98000  98000  98000  98000  97000  97000
##  [71]  96500  96000  96000  96000  96000  95000  95000  95000  95000  95000
##  [81]  95000  95000  93000  93000  93000  92000  92000  92000  90000  90000
##  [91]  90000  88500  88000  86000  86000  85000  85000  85000  85000  82000
## [101]  78256  77000  64000

fitted(fit1)

##       274        69        68       139       138       273        64 
## 151442.28 105512.43 136243.72  98584.38  99492.65 105425.28 103202.44 
##        65        66        67        63        61        62       136 
## 111531.37 113439.36 113378.22 123671.63 106489.88 109989.89  98286.15 
##       137       272        60       135       209        59        58 
## 101495.57 108183.27 130932.61 112025.31  95653.47 106754.81 104914.89 
##       208        57       207       134        55        56       206 
## 100590.00 107087.52 111135.20  99963.18 104679.34 111728.71 101631.54 
##        49        50        51        52        53        54       131 
## 103859.86 106111.92 105753.32 108537.82 108039.67 118292.94  99363.57 
##       132       133       205       271       130       270       129 
## 103620.59 129954.37  97552.96  95895.17 108254.60 128084.88 103742.56 
##       204       269       203       128       202       268        46 
## 105842.83 100805.37 108317.02  96799.02  91059.72 100898.69  93697.64 
##        47        48       126       127       200       201       266 
## 104739.05 104441.34  93427.87 113315.74  98446.31  94950.84  91434.88 
##       267       125       122       123       124       194       195 
##  93729.90 110557.98  95772.93 100885.42 108466.15  85237.46  82554.78 
##       196       197       198       199       265       192       193 
##  91285.76 110941.87  98939.94 108400.43  98737.53  97626.68 101133.33 
##       121        44        45       120       264        41        42 
## 115359.30  95220.17  95774.66  91706.61 100014.60  95000.00  96234.86 
##        43       118       119       191       263        40       117 
##  95000.00 102029.86  99092.08  99529.67  97869.85  92902.36 117155.98 
##       190        39       116       262       188       189       261 
##  99489.09 110866.05  96799.02  90046.34  91093.50  95931.61 107346.98 
##       187        38        37       260        35        36       258 
## 100971.54  94510.65 100627.87  94440.10  95286.67 107046.34  96990.51 
##       259       115       186       257       256 
##  99447.11  94149.12  91340.45  92799.55  87117.69

–> Comparing the beta coefficients of the two models

fit$coefficients

## (Intercept)         age         sex    gmat_qpc    gmat_vpc    gmat_tot 
## 78005.66171  1750.65216 -3584.07221   796.54809   546.30750    16.18545 
##    gmat_tpc       s_avg       f_avg     quarter    work_yrs    frstlang 
## -1457.08759  -931.53478 -2222.82135 -2336.55542   749.66083  7719.42304 
##       satis 
## -1086.54069

fit1$coefficients

##   (Intercept)           age       sexMale      gmat_qpc      gmat_vpc 
##   68070.03198    1702.40208    3562.39249     724.78984     508.36517 
##      gmat_tot      gmat_tpc         s_avg         f_avg      quarter2 
##      26.48354   -1397.37709   -2203.54692   -1761.45997   -4501.18529 
##      quarter3      quarter4      work_yrs frstlangOther        satis4 
##   -8555.83307   -6913.29417     737.61280    8151.67046     827.82618 
##        satis5        satis6        satis7 
##    5217.69668    4987.12257    1932.30731

–>Comparing multiple models

summary(fit)

## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tot + 
##     gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + 
##     satis, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26489  -7983   -373   5923  70602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 78005.66   52981.93   1.472   0.1444  
## age          1750.65    1130.92   1.548   0.1251  
## sex         -3584.07    3595.85  -0.997   0.3216  
## gmat_qpc      796.55     496.78   1.603   0.1123  
## gmat_vpc      546.31     501.97   1.088   0.2794  
## gmat_tot       16.19     178.85   0.090   0.9281  
## gmat_tpc    -1457.09     714.94  -2.038   0.0445 *
## s_avg        -931.53    8240.31  -0.113   0.9102  
## f_avg       -2222.82    3894.57  -0.571   0.5696  
## quarter     -2336.56    2721.89  -0.858   0.3929  
## work_yrs      749.66    1135.90   0.660   0.5110  
## frstlang     7719.42    7373.27   1.047   0.2979  
## satis       -1086.54    2157.76  -0.504   0.6158  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared:  0.3422, Adjusted R-squared:  0.2545 
## F-statistic: 3.902 on 12 and 90 DF,  p-value: 8.086e-05

# Print the regression summary for `fit1` (same layout as for `fit`).
# NOTE(review): the recorded Call is identical to that of `fit`, yet the output
# has factor-level terms (sexMale, quarter2, satis4, ...) — presumably the
# columns of `job` were converted to factors between the two fits; confirm.
summary(fit1)

## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tot + 
##     gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + 
##     satis, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24954  -6770   -738   6286  68558 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   68070.03   57512.21   1.184    0.240  
## age            1702.40    1162.20   1.465    0.147  
## sexMale        3562.39    3746.26   0.951    0.344  
## gmat_qpc        724.79     517.91   1.399    0.165  
## gmat_vpc        508.37     515.89   0.985    0.327  
## gmat_tot         26.48     185.10   0.143    0.887  
## gmat_tpc      -1397.38     744.64  -1.877    0.064 .
## s_avg         -2203.55    8656.50  -0.255    0.800  
## f_avg         -1761.46    4040.47  -0.436    0.664  
## quarter2      -4501.19    5228.76  -0.861    0.392  
## quarter3      -8555.83    6985.38  -1.225    0.224  
## quarter4      -6913.29    8477.17  -0.816    0.417  
## work_yrs        737.61    1167.26   0.632    0.529  
## frstlangOther  8151.67    7550.06   1.080    0.283  
## satis4          827.83   23405.78   0.035    0.972  
## satis5         5217.70   17062.76   0.306    0.761  
## satis6         4987.12   17224.96   0.290    0.773  
## satis7         1932.31   17443.97   0.111    0.912  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15750 on 85 degrees of freedom
## Multiple R-squared:  0.3529, Adjusted R-squared:  0.2234 
## F-statistic: 2.726 on 17 and 85 DF,  p-value: 0.00126

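-The multiple R-squared values are shown above: 0.3422 (adjusted 0.2545) for the first model and 0.3529 (adjusted 0.2234) for the second.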

-The second model is considered the more accurate one, since the most influential factors have larger beta coefficients in this model.

-Hence, the second model is the best fit

–>Creating subset of those who did not get a job

# Build `nojob` from the rows of mba.df at the 171 smallest positions of
# order(salary), then open it in the data viewer.
# NOTE(review): `salary` is not qualified with mba.df$ — presumably it resolves
# through an earlier attach(mba.df); confirm, because the indices returned by
# order() are only meaningful if `salary` is mba.df's own column.
# NOTE(review): the cutoff 171 is hard-coded, and the satis table printed for
# this subset contains 46 rows with the value 998, so the subset appears to
# hold more than respondents without a job — TODO confirm the intended filter.
nojob <- mba.df[order(salary)[1:171], ]
View(nojob)

–>Contingency table

-One-way contingency tables

# One-way frequency table of satis over the rows of nojob.
mytable <- table(satis = nojob$satis)
mytable

## satis
##   1   2   3   4   5   6   7 998 
##   1   1   4  16  45  47  11  46

# One-way frequency table of sex over the rows of nojob.
mytable1 <- table(sex = nojob$sex)
mytable1

## sex
##   1   2 
## 134  37

# One-way frequency table of quarter over the rows of nojob.
mytable2 <- table(quarter = nojob$quarter)
mytable2

## quarter
##  1  2  3  4 
## 34 45 46 46

# One-way frequency table of frstlang over the rows of nojob.
mytable3 <- table(frstlang = nojob$frstlang)
mytable3

## frstlang
##   1   2 
## 146  25

-Three-way Contingency tables

# Three-way cross-tabulation (sex x quarter x satis) of nojob, printed as a
# flat table with satis across the columns.
mytable21 <- xtabs(
  ~ sex + quarter + satis,
  data = nojob
)
ftable(mytable21)

##             satis  1  2  3  4  5  6  7 998
## sex quarter                               
## 1   1              0  0  0  2  4  7  1   7
##     2              1  0  1  2 14  9  1   8
##     3              0  0  0  0  9 12  2  14
##     4              0  0  3  7  9 11  2   8
## 2   1              0  0  0  1  3  3  2   4
##     2              0  0  0  1  2  2  1   3
##     3              0  1  0  2  1  3  1   1
##     4              0  0  0  1  3  0  1   1

# Three-way cross-tabulation (frstlang x sex x satis) of nojob, printed as a
# flat table with satis across the columns.
mytable22 <- xtabs(
  ~ frstlang + sex + satis,
  data = nojob
)
ftable(mytable22)

##              satis  1  2  3  4  5  6  7 998
## frstlang sex                               
## 1        1          1  0  2  6 30 38  6  31
##          2          0  0  0  3  9  8  5   7
## 2        1          0  0  2  5  6  1  0   6
##          2          0  1  0  2  0  0  0   2

–>Chi-squared test

# Two-way contingency table of satis (rows) by sex (columns) for nojob.
mytable11 <- xtabs(~ satis + sex, data = nojob)
mytable11

##      sex
## satis  1  2
##   1    1  0
##   2    0  1
##   3    4  0
##   4   11  5
##   5   36  9
##   6   39  8
##   7    6  5
##   998 37  9

# Pearson chi-squared test of independence on the satis x sex table.
# R warns that the chi-squared approximation may be incorrect here; the table
# has several cells with 0 or 1 observations, which is the likely cause —
# TODO confirm against the expected counts.
chisq.test(mytable11)

## Warning in chisq.test(mytable11): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable11
## X-squared = 10.333, df = 7, p-value = 0.1705

# Two-way contingency table of satis (rows) by quarter (columns) for nojob.
mytable12 <- xtabs(~ satis + quarter, data = nojob)
mytable12

##      quarter
## satis  1  2  3  4
##   1    0  1  0  0
##   2    0  0  1  0
##   3    0  1  0  3
##   4    3  3  2  8
##   5    7 16 10 12
##   6   10 11 15 11
##   7    3  2  3  3
##   998 11 11 15  9

# Pearson chi-squared test of independence on the satis x quarter table.
# R warns that the chi-squared approximation may be incorrect here; many cells
# of the table hold 0-3 observations, which is the likely cause — TODO confirm.
chisq.test(mytable12)

## Warning in chisq.test(mytable12): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable12
## X-squared = 21.232, df = 21, p-value = 0.4449

# Two-way contingency table of satis (rows) by frstlang (columns) for nojob.
mytable13 <- xtabs(~ satis + frstlang, data = nojob)
mytable13

##      frstlang
## satis  1  2
##   1    1  0
##   2    0  1
##   3    2  2
##   4    9  7
##   5   39  6
##   6   46  1
##   7   11  0
##   998 38  8

# Pearson chi-squared test of independence on the satis x frstlang table.
# This is the only pairing with p-value < 0.05 in the recorded output, but R
# also warns that the approximation may be incorrect for this table (several
# cells hold 0-2 observations), so read the result with care.
chisq.test(mytable13)

## Warning in chisq.test(mytable13): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable13
## X-squared = 29.002, df = 7, p-value = 0.0001446

# Two-way contingency table of sex (rows) by frstlang (columns) for nojob.
mytable14 <- xtabs(~ sex + frstlang, data = nojob)
mytable14

##    frstlang
## sex   1   2
##   1 114  20
##   2  32   5

# Chi-squared test of independence on the 2 x 2 sex x frstlang table; the
# recorded output shows it was run with Yates' continuity correction.
chisq.test(mytable14)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable14
## X-squared = 2.3563e-30, df = 1, p-value = 1

# Two-way contingency table of sex (rows) by quarter (columns) for nojob.
mytable15 <- xtabs(~ sex + quarter, data = nojob)
mytable15

##    quarter
## sex  1  2  3  4
##   1 21 36 37 40
##   2 13  9  9  6

# Pearson chi-squared test of independence on the sex x quarter table.
# The recorded p-value (0.05227) is only just above the 0.05 threshold.
chisq.test(mytable15)

## 
##  Pearson's Chi-squared test
## 
## data:  mytable15
## X-squared = 7.7155, df = 3, p-value = 0.05227

# Two-way contingency table of quarter (rows) by frstlang (columns) for nojob.
mytable16 <- xtabs(~ quarter + frstlang, data = nojob)
mytable16

##        frstlang
## quarter  1  2
##       1 31  3
##       2 39  6
##       3 37  9
##       4 39  7

# Pearson chi-squared test of independence on the quarter x frstlang table.
# R warns that the chi-squared approximation may be incorrect for this table.
chisq.test(mytable16)

## Warning in chisq.test(mytable16): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable16
## X-squared = 1.8892, df = 3, p-value = 0.5957

-From the above Pearson’s chi-squared tests, there does not appear to be any relationship between any two of the categorical variables, since the p-value is > 0.05 in each of the possible cases except the one between degree of satisfaction and first language.

-Hence, we fail to reject the null hypothesis of independence for every pair of variables except degree of satisfaction and first language, which are dependent on each other.

-Hence, the first language of the students affects the degree of satisfaction according to the survey.

–>Running appropriate t-tests

# Welch two-sample t-test comparing mean satis between the two sex groups of
# nojob.
# NOTE(review): the satis table for nojob shows 46 rows with the value 998,
# which looks like a placeholder code rather than a rating — TODO confirm and
# exclude it before comparing means.
t.test(satis ~ sex, data = nojob)

## 
##  Welch Two Sample t-test
## 
## data:  satis by sex
## t = 0.40356, df = 58.898, p-value = 0.688
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -128.9860  194.1554
## sample estimates:
## mean in group 1 mean in group 2 
##        279.3955        246.8108

# Welch two-sample t-test comparing mean satis between the two frstlang groups
# of nojob.
# NOTE(review): satis in nojob includes the value 998 (46 rows in the one-way
# table), which dominates the group means — TODO confirm whether it should be
# excluded.
t.test(satis ~ frstlang, data = nojob)

## 
##  Welch Two Sample t-test
## 
## data:  satis by frstlang
## t = -0.57688, df = 31.413, p-value = 0.5681
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -264.9345  148.0573
## sample estimates:
## mean in group 1 mean in group 2 
##        263.8014        322.2400

# Welch two-sample t-test comparing mean quarter between the two sex groups of
# nojob. quarter takes the values 1-4 here and is treated as numeric.
t.test(quarter ~ sex, data = nojob)

## 
##  Welch Two Sample t-test
## 
## data:  quarter by sex
## t = 2.4522, df = 55.471, p-value = 0.01737
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.09148911 0.90891428
## sample estimates:
## mean in group 1 mean in group 2 
##        2.716418        2.216216

# Welch two-sample t-test of sex between the two frstlang groups of nojob.
# sex is stored as the codes 1/2, so the reported group "means" are averages
# of those codes.
t.test(sex ~ frstlang, data = nojob)

## 
##  Welch Two Sample t-test
## 
## data:  sex by frstlang
## t = 0.2165, df = 33.079, p-value = 0.8299
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1610290  0.1993852
## sample estimates:
## mean in group 1 mean in group 2 
##        1.219178        1.200000

# Welch two-sample t-test of frstlang between the two sex groups of nojob.
# frstlang is stored as the codes 1/2, so the reported group "means" are
# averages of those codes.
t.test(frstlang ~ sex, data = nojob)

## 
##  Welch Two Sample t-test
## 
## data:  frstlang by sex
## t = 0.21782, df = 58.908, p-value = 0.8283
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1155836  0.1438208
## sample estimates:
## mean in group 1 mean in group 2 
##        1.149254        1.135135

# Welch two-sample t-test comparing mean quarter between the two frstlang
# groups of nojob. quarter takes the values 1-4 here and is treated as numeric.
t.test(quarter ~ frstlang, data = nojob)

## 
##  Welch Two Sample t-test
## 
## data:  quarter by frstlang
## t = -1.0222, df = 34.743, p-value = 0.3137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6709289  0.2216138
## sample estimates:
## mean in group 1 mean in group 2 
##        2.575342        2.800000

-From the above t-tests, the group means do not differ significantly (p-value > 0.05) for any pair except quarter by sex, where the difference is significant (p-value = 0.01737).

Apart from that pair, none of the variables affects the other.

EXECUTIVE SUMMARY

-The starting salary of an individual MBA student depends critically on the student’s first language and degree of satisfaction, as estimated through various boxplots and scatterplots.

-Even from the correlogram and the correlation matrices, it is quite clear that the starting salaries are strongly correlated with the first language.

-From the chi-squared tests and the t-tests between the people who got a job and those who did not get a job, it can be inferred that there is a significant relationship between the starting salaries, the degree of satisfaction with the MBA program and the first language of the people.

-The regression model, i.e. the best-fit model (here the second model), helps us conclude that the salary is affected more or less positively by: age, sex, GMAT score, GMAT quantitative percentile, GMAT verbal percentile, years of work experience and degree of satisfaction; and negatively by: first language, quarter, spring average, fall average and GMAT overall percentile.

MBA starting salaries

Pooja Gundu

December 26, 2017