df <- read.csv(paste("MBAStartingSalariesData.csv",sep=""))
View(df)

library('psych')
describe(df)

##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

head(df)

##   age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1  23   2      620       77       87       87   3.4  3.00       1        2
## 2  24   1      610       90       71       87   3.5  4.00       1        2
## 3  24   1      670       99       78       95   3.3  3.25       1        2
## 4  24   1      570       56       81       75   3.3  2.67       1        1
## 5  24   2      710       93       98       98   3.6  3.75       1        2
## 6  24   1      640       82       89       91   3.9  3.75       1        2
##   frstlang salary satis
## 1        1      0     7
## 2        1      0     6
## 3        1      0     6
## 4        1      0     7
## 5        1    999     5
## 6        1      0     6

tail(df)

##     age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 269  26   1      630       96       71       91   2.6  2.75       4
## 270  31   1      530       75       45       62   2.4  2.75       4
## 271  23   1      580       64       81       78   2.2  2.00       4
## 272  25   1      540       79       45       65   2.6  2.50       4
## 273  26   1      550       72       58       69   2.6  2.75       4
## 274  40   2      500       60       45       51   2.5  2.75       4
##     work_yrs frstlang salary satis
## 269        3        1 101600     6
## 270        4        2 104000     6
## 271        2        1 105000     6
## 272        3        1 115000     5
## 273        3        1 126710     6
## 274       15        2 220000     6

str(df)

## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

for(i in 1:nrow(df)){
  if(df$satis[i] == 999 | df$satis[i]==998){
    df$satis[i] <- 0
  }
}

library('ggplot2')

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

df$sex <-factor(df$sex)
gg<-ggplot(df,aes(x=gmat_tot,y=work_yrs))+
     geom_point(aes(col=sex,size=salary))+
     labs(title="GMAT total vs Work Years",
            y= "Work Experience",
            x="GMAT Scores",
            caption="Data Source : HBR.ORG")
plot(gg)

Let’s Create a categorical variable called level satisfaction.

df$levelsatisfaction <- ifelse(df$satis >= 4,1,0)

df[c(20:30),c(1:14)]

##    age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 20  27   1      600       91       58       83  3.40  3.25       1
## 21  27   2      570       65       82       77  3.30  3.25       1
## 22  27   1      740       99       96       99  3.50  3.50       1
## 23  27   1      750       99       98       99  3.40  3.50       1
## 24  28   2      540       75       50       65  3.60  4.00       1
## 25  29   1      580       56       87       78  3.64  3.33       1
## 26  30   1      620       82       84       87  3.40  2.80       1
## 27  31   2      560       60       78       72  3.30  3.75       1
## 28  32   1      760       99       99       99  3.40  3.00       1
## 29  32   1      640       79       91       91  3.60  3.75       1
## 30  32   1      570       71       71        0  3.50  3.50       1
##    work_yrs frstlang salary satis levelsatisfaction
## 20        4        1    998     0                 0
## 21        4        1    999     4                 1
## 22        3        1      0     6                 1
## 23        1        2      0     5                 1
## 24        5        1      0     5                 1
## 25        3        1      0     5                 1
## 26        5        1    999     6                 1
## 27       10        1      0     7                 1
## 28        5        1      0     5                 1
## 29        7        1      0     6                 1
## 30        4        1    999     4                 1

gg1<-ggplot(df,aes(x=work_yrs,y=salary))+
     geom_point(aes(col=satis,size=levelsatisfaction))+
     labs(title="Experience vs Salary",
            x= "Work Experience",
            y="Salary",
            caption="Data Source : HBR.ORG")
plot(gg1)

df$mba_avg <- (df$s_avg+df$f_avg)/2
df[20:30,1:15]

##    age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 20  27   1      600       91       58       83  3.40  3.25       1
## 21  27   2      570       65       82       77  3.30  3.25       1
## 22  27   1      740       99       96       99  3.50  3.50       1
## 23  27   1      750       99       98       99  3.40  3.50       1
## 24  28   2      540       75       50       65  3.60  4.00       1
## 25  29   1      580       56       87       78  3.64  3.33       1
## 26  30   1      620       82       84       87  3.40  2.80       1
## 27  31   2      560       60       78       72  3.30  3.75       1
## 28  32   1      760       99       99       99  3.40  3.00       1
## 29  32   1      640       79       91       91  3.60  3.75       1
## 30  32   1      570       71       71        0  3.50  3.50       1
##    work_yrs frstlang salary satis levelsatisfaction mba_avg
## 20        4        1    998     0                 0   3.325
## 21        4        1    999     4                 1   3.275
## 22        3        1      0     6                 1   3.500
## 23        1        2      0     5                 1   3.450
## 24        5        1      0     5                 1   3.800
## 25        3        1      0     5                 1   3.485
## 26        5        1    999     6                 1   3.100
## 27       10        1      0     7                 1   3.525
## 28        5        1      0     5                 1   3.200
## 29        7        1      0     6                 1   3.675
## 30        4        1    999     4                 1   3.500

summary(df$mba_avg)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.825   3.050   3.043   3.295   4.000

str(df)

## 'data.frame':    274 obs. of  15 variables:
##  $ age              : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex              : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot         : int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc         : int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc         : int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc         : int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg            : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg            : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs         : int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang         : int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary           : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis            : num  7 6 6 7 5 6 5 6 4 0 ...
##  $ levelsatisfaction: num  1 1 1 1 1 1 1 1 1 0 ...
##  $ mba_avg          : num  3.2 3.75 3.27 2.98 3.67 ...

df$frstlang <- factor(df$frstlang)
df$sex <-factor(df$sex)
ggplot(df,aes(x=sex,fill=frstlang))+
       theme_bw()+
       geom_bar()+
       labs(y="count",
            x="Gender",
            title ="Gender Distribution",
            caption ="Source : hbr.org")

ggplot(df,aes(x=sex,y=gmat_tot))+
      geom_boxplot(fill= "skyblue")+
      stat_summary(fun.y = "mean",geom = "point",shape=8,size=2,color="blue")+
      theme_bw()+
      labs(x="Gender",
           y="GMAT Scores",
           title ="Distribution of GMAT scores based on gender",
           caption ="Source:hbr.org")

for(i in 1:nrow(df)){
  if(df$salary[i] == 999){
    df$salary[i] <- 0
  }
}

str(df)

## 'data.frame':    274 obs. of  15 variables:
##  $ age              : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex              : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot         : int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc         : int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc         : int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc         : int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg            : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg            : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs         : int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang         : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary           : num  0 0 0 0 0 0 0 0 0 998 ...
##  $ satis            : num  7 6 6 7 5 6 5 6 4 0 ...
##  $ levelsatisfaction: num  1 1 1 1 1 1 1 1 1 0 ...
##  $ mba_avg          : num  3.2 3.75 3.27 2.98 3.67 ...

ggplot(df,aes(x=sex,y=work_yrs))+
      geom_boxplot(fill= "lightblue")+
      stat_summary(fun.y = "mean",geom = "point",shape=8,size=2,color="blue")+
      theme_bw()+
      labs(x="Gender",
           y="Experience",
           title ="Distribution of Experience  based on gender",
           caption ="Source:hbr.org")

ggplot(df,aes(x=sex,y=age))+
      geom_boxplot(fill= "magenta")+
      stat_summary(fun.y = "mean",geom = "point",shape=8,size=2,color="blue")+
      theme_bw()+
      labs(x="Gender",
           y="Age",
           title ="Distribution of Age based on Gender",
           caption ="Source:hbr.org")

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

scatterplotMatrix(formula = ~ gmat_tot + gmat_qpc + gmat_vpc + mba_avg, data = df)

library(corrgram)
corrgram(df, order=TRUE, lower.panel=panel.shade,
        upper.panel=panel.pie, text.panel=panel.txt,
        main="Corrgram of correlations between variables")

boxplot(df$age,
        main = "Age Distribution",
        col= "Blue",
        horizontal = TRUE)

boxplot(df$gmat_tot,
        main = "Gmat Total Score",
        col= "RED",
        horizontal = TRUE)

boxplot(df$mba_avg,
        main = "Average Marks in MBA",
        col= "magenta",
        horizontal = TRUE)

cor(df[,c(1,3:10,12:15)])

##                           age    gmat_tot    gmat_qpc     gmat_vpc
## age                1.00000000 -0.14593840 -0.21616985 -0.044175472
## gmat_tot          -0.14593840  1.00000000  0.72473781  0.748391870
## gmat_qpc          -0.21616985  0.72473781  1.00000000  0.152180142
## gmat_vpc          -0.04417547  0.74839187  0.15218014  1.000000000
## gmat_tpc          -0.16990307  0.84779965  0.65137754  0.666216035
## s_avg              0.14970402  0.11311702 -0.02984873  0.204453647
## f_avg             -0.01744806  0.10442409  0.07370455  0.075922253
## quarter           -0.04967221 -0.09223903  0.03636638 -0.174607355
## work_yrs           0.85829810 -0.18235434 -0.23660827 -0.066390490
## salary            -0.06254173 -0.05527795 -0.04470904 -0.005829072
## satis              0.09234402 -0.06443910 -0.09312075  0.009719018
## levelsatisfaction  0.11214958 -0.06580207 -0.07281406 -0.017008495
## mba_avg            0.05980183  0.12229941  0.03412182  0.147077265
##                       gmat_tpc       s_avg        f_avg     quarter
## age               -0.169903066  0.14970402 -0.017448057 -0.04967221
## gmat_tot           0.847799647  0.11311702  0.104424092 -0.09223903
## gmat_qpc           0.651377538 -0.02984873  0.073704552  0.03636638
## gmat_vpc           0.666216035  0.20445365  0.075922253 -0.17460736
## gmat_tpc           1.000000000  0.11736245  0.079732099 -0.08303535
## s_avg              0.117362449  1.00000000  0.550621386 -0.76211664
## f_avg              0.079732099  0.55062139  1.000000000 -0.44750637
## quarter           -0.083035351 -0.76211664 -0.447506366  1.00000000
## work_yrs          -0.173361859  0.12929271 -0.039056921 -0.08602641
## salary             0.004895486  0.14671494  0.029892973 -0.16510933
## satis             -0.054042239  0.05186620 -0.024905812 -0.01267787
## levelsatisfaction -0.081568271  0.05237872 -0.001388558 -0.02218761
## mba_avg            0.108129767  0.83691669  0.917710988 -0.65610099
##                      work_yrs       salary        satis levelsatisfaction
## age                0.85829810 -0.062541728  0.092344022       0.112149576
## gmat_tot          -0.18235434 -0.055277947 -0.064439102      -0.065802070
## gmat_qpc          -0.23660827 -0.044709042 -0.093120750      -0.072814057
## gmat_vpc          -0.06639049 -0.005829072  0.009719018      -0.017008495
## gmat_tpc          -0.17336186  0.004895486 -0.054042239      -0.081568271
## s_avg              0.12929271  0.146714937  0.051866203       0.052378720
## f_avg             -0.03905692  0.029892973 -0.024905812      -0.001388558
## quarter           -0.08602641 -0.165109333 -0.012677865      -0.022187606
## work_yrs           1.00000000  0.009195480  0.111489429       0.106615008
## salary             0.00919548  1.000000000  0.411933827       0.348286941
## satis              0.11148943  0.411933827  1.000000000       0.928549252
## levelsatisfaction  0.10661501  0.348286941  0.928549252       1.000000000
## mba_avg            0.03591991  0.089419818  0.008352243       0.024015881
##                        mba_avg
## age                0.059801826
## gmat_tot           0.122299412
## gmat_qpc           0.034121821
## gmat_vpc           0.147077265
## gmat_tpc           0.108129767
## s_avg              0.836916691
## f_avg              0.917710988
## quarter           -0.656100987
## work_yrs           0.035919910
## salary             0.089419818
## satis              0.008352243
## levelsatisfaction  0.024015881
## mba_avg            1.000000000

cov(df[,c(1,3:10,12:15)])

##                             age      gmat_tot      gmat_qpc      gmat_vpc
## age                1.376904e+01 -3.115879e+01 -1.192655e+01    -2.7636427
## gmat_tot          -3.115879e+01  3.310688e+03  6.200233e+02   726.0006417
## gmat_qpc          -1.192655e+01  6.200233e+02  2.210731e+02    38.1482581
## gmat_vpc          -2.763643e+00  7.260006e+02  3.814826e+01   284.2481217
## gmat_tpc          -8.839978e+00  6.839911e+02  1.357997e+02   157.4932488
## s_avg              2.116874e-01  2.480257e+00 -1.691233e-01     1.3135702
## f_avg             -3.399348e-02  3.154688e+00  5.753854e-01     0.6720700
## quarter           -2.045935e-01 -5.891153e+00  6.001979e-01    -3.2676666
## work_yrs           1.029494e+01 -3.391634e+01 -1.137186e+01    -3.6181653
## salary            -1.184682e+04 -1.623645e+05 -3.393465e+04 -5016.8170905
## satis              7.770113e-01 -8.407663e+00 -3.139649e+00     0.3715676
## levelsatisfaction  1.646747e-01 -1.498222e+00 -4.284110e-01    -0.1134729
## mba_avg            8.884696e-02  2.817473e+00  2.031311e-01     0.9928201
##                       gmat_tpc         s_avg         f_avg       quarter
## age                 -8.8399775  2.116874e-01  -0.033993476 -2.045935e-01
## gmat_tot           683.9910698  2.480257e+00   3.154688377 -5.891153e+00
## gmat_qpc           135.7996845 -1.691233e-01   0.575385418  6.001979e-01
## gmat_vpc           157.4932488  1.313570e+00   0.672069998 -3.267667e+00
## gmat_tpc           196.6057057  6.271001e-01   0.586986177 -1.292372e+00
## s_avg                0.6271001  1.452176e-01   0.110168979 -3.223721e-01
## f_avg                0.5869862  1.101690e-01   0.275672367 -2.608088e-01
## quarter             -1.2923719 -3.223721e-01  -0.260808802  1.232119e+00
## work_yrs            -7.8575172  1.592639e-01  -0.066286998 -3.086682e-01
## salary            3504.0793562  2.854066e+03 801.208887463 -9.355738e+03
## satis               -1.7182963  4.481886e-02  -0.029652683 -3.191091e-02
## levelsatisfaction   -0.4525815  7.898452e-03  -0.000288495 -9.745729e-03
## mba_avg              0.6070431  1.276933e-01   0.192920673 -2.915905e-01
##                        work_yrs        salary         satis
## age                 10.29493864 -1.184682e+04  7.770113e-01
## gmat_tot           -33.91633914 -1.623645e+05 -8.407663e+00
## gmat_qpc           -11.37186171 -3.393465e+04 -3.139649e+00
## gmat_vpc            -3.61816529 -5.016817e+03  3.715676e-01
## gmat_tpc            -7.85751718  3.504079e+03 -1.718296e+00
## s_avg                0.15926392  2.854066e+03  4.481886e-02
## f_avg               -0.06628700  8.012089e+02 -2.965268e-02
## quarter             -0.30866822 -9.355738e+03 -3.191091e-02
## work_yrs            10.44882490  1.517358e+03  8.172108e-01
## salary            1517.35827919  2.605914e+09  4.768416e+04
## satis                0.81721077  4.768416e+04  5.142015e+00
## levelsatisfaction    0.13637336  7.035495e+03  8.331996e-01
## mba_avg              0.04648846  1.827637e+03  7.583086e-03
##                   levelsatisfaction       mba_avg
## age                    1.646747e-01  8.884696e-02
## gmat_tot              -1.498222e+00  2.817473e+00
## gmat_qpc              -4.284110e-01  2.031311e-01
## gmat_vpc              -1.134729e-01  9.928201e-01
## gmat_tpc              -4.525815e-01  6.070431e-01
## s_avg                  7.898452e-03  1.276933e-01
## f_avg                 -2.884950e-04  1.929207e-01
## quarter               -9.745729e-03 -2.915905e-01
## work_yrs               1.363734e-01  4.648846e-02
## salary                 7.035495e+03  1.827637e+03
## satis                  8.331996e-01  7.583086e-03
## levelsatisfaction      1.565867e-01  3.804978e-03
## mba_avg                3.804978e-03  1.603070e-01

ttable <-xtabs(~sex+levelsatisfaction,data=df)
addmargins(ttable)

##      levelsatisfaction
## sex     0   1 Sum
##   1    42 164 206
##   2    11  57  68
##   Sum  53 221 274

It appears like most of people are satisfied and gender has no role in deteremining level of satisfaction

table1 <-xtabs(~quarter+levelsatisfaction,data=df)
addmargins(table1)

##        levelsatisfaction
## quarter   0   1 Sum
##     1    12  57  69
##     2    13  57  70
##     3    16  54  70
##     4    12  53  65
##     Sum  53 221 274

It is apparent that quartile ranking doesn’t determine whether candidate or student likes MBA program

for(i in 1:nrow(df)){
  if(df$salary[i] == 999 | df$salary[i]==998){
    df$salary[i] <- 0
  }
}

Let’s create a categorical variable is_good_salary

df$is_good_salary <- ifelse(df$salary>mean(df$salary),1,0)

table2 <-xtabs(~levelsatisfaction+is_good_salary,data = df)
addmargins(table2)

##                  is_good_salary
## levelsatisfaction   0   1 Sum
##               0    52   1  53
##               1   119 102 221
##               Sum 171 103 274

It can be observed that those with more than average salary are more likely to like the MBA program

Assuming that people who didn’t answer or disclose their aren’t placed let’s create a variable called is_placed where 1->placed while 0-> not placed

df$is_placed <-ifelse(df$salary == 0,0,1)

Let’s have a null hypothesis that students who are placed are more likely to like the program.To verify if it’s true let’s chi-squared test.

chisq.test(df$is_placed,df$levelsatisfaction)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df$is_placed and df$levelsatisfaction
## X-squared = 33.844, df = 1, p-value = 5.97e-09

It’s more likely that students who are placed like the Program

chisq.test(df$levelsatisfaction,df$sex)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df$levelsatisfaction and df$sex
## X-squared = 0.34269, df = 1, p-value = 0.5583

Therefore we infer that Gender can’t determine if the candidate likes the program

t.test(satis~sex, data=df)

## 
##  Welch Two Sample t-test
## 
## data:  satis by sex
## t = -0.89694, df = 121.46, p-value = 0.3715
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8823793  0.3321223
## sample estimates:
## mean in group 1 mean in group 2 
##        4.563107        4.838235

t.test(quarter~is_good_salary,data=df)

## 
##  Welch Two Sample t-test
## 
## data:  quarter by is_good_salary
## t = 2.5051, df = 209.98, p-value = 0.013
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.07373829 0.61836413
## sample estimates:
## mean in group 0 mean in group 1 
##        2.608187        2.262136

t.test(mba_avg~is_placed,data=df)

## 
##  Welch Two Sample t-test
## 
## data:  mba_avg by is_placed
## t = -1.5967, df = 235.19, p-value = 0.1117
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.17246954  0.01805746
## sample estimates:
## mean in group 0 mean in group 1 
##        3.014444        3.091650

t.test(quarter~is_placed,data=df)

## 
##  Welch Two Sample t-test
## 
## data:  quarter by is_placed
## t = 2.5051, df = 209.98, p-value = 0.013
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.07373829 0.61836413
## sample estimates:
## mean in group 0 mean in group 1 
##        2.608187        2.262136

Building regression model

regressor<-lm(salary~age+sex+gmat_tot+quarter+ work_yrs+frstlang+levelsatisfaction+mba_avg, data=df)
summary(regressor)

## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + quarter + work_yrs + 
##     frstlang + levelsatisfaction + mba_avg, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71823 -42333  -5484  44188 192105 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       145632.97   56145.80   2.594    0.010 *  
## age                -3722.03    1535.70  -2.424    0.016 *  
## sex2                2915.33    6693.33   0.436    0.664    
## gmat_tot             -46.44      51.66  -0.899    0.369    
## quarter            -7630.09    3453.08  -2.210    0.028 *  
## work_yrs            2857.07    1773.08   1.611    0.108    
## frstlang2          -1634.18    9228.36  -0.177    0.860    
## levelsatisfaction  46021.37    7363.47   6.250 1.63e-09 ***
## mba_avg            -2008.87    9562.48  -0.210    0.834    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47160 on 265 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1509 
## F-statistic: 7.065 on 8 and 265 DF,  p-value: 1.838e-08

regressor$coefficients

##       (Intercept)               age              sex2          gmat_tot 
##      145632.97001       -3722.02575        2915.33393         -46.44401 
##           quarter          work_yrs         frstlang2 levelsatisfaction 
##       -7630.08650        2857.07456       -1634.17678       46021.36544 
##           mba_avg 
##       -2008.87235

Building a regressor with only significant variables and other variables that may influence salary

regressor1<-lm(salary~age+sex+quarter+ work_yrs+levelsatisfaction+mba_avg, data=df)

summary(regressor1)

## 
## Call:
## lm(formula = salary ~ age + sex + quarter + work_yrs + levelsatisfaction + 
##     mba_avg, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71704 -43138  -6254  44129 193786 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         118782      47246   2.514   0.0125 *  
## age                  -3791       1510  -2.511   0.0126 *  
## sex2                  3326       6659   0.499   0.6178    
## quarter              -7529       3441  -2.188   0.0295 *  
## work_yrs              3080       1735   1.775   0.0770 .  
## levelsatisfaction    46528       7253   6.415 6.36e-10 ***
## mba_avg              -2616       9503  -0.275   0.7833    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47050 on 267 degrees of freedom
## Multiple R-squared:  0.1733, Adjusted R-squared:  0.1547 
## F-statistic: 9.327 on 6 and 267 DF,  p-value: 2.728e-09

regressor1$coefficients

##       (Intercept)               age              sex2           quarter 
##        118781.983         -3790.899          3326.249         -7529.168 
##          work_yrs levelsatisfaction           mba_avg 
##          3079.910         46528.335         -2616.527

Building a logistic regression model to determine if the candidate has liked mba Program

library(caTools)#Library to split data into training and test set to test the model
split <- sample.split(df$levelsatisfaction,SplitRatio = 0.7)
training_set <- subset(df,split==TRUE)
test_set <- subset(df,split==FALSE)

Building Model

classifier <-glm(levelsatisfaction~age+
                   gmat_tot+
                   quarter+
                   work_yrs+
                   salary+
                   satis+
                   mba_avg+
                   is_good_salary,
                  family = binomial(),
                  data = training_set)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(classifier)

## 
## Call:
## glm(formula = levelsatisfaction ~ age + gmat_tot + quarter + 
##     work_yrs + salary + satis + mba_avg + is_good_salary, family = binomial(), 
##     data = training_set)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -5.048e-05   2.100e-08   2.100e-08   2.100e-08   5.666e-05  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -1.500e+02  4.175e+05   0.000    1.000
## age             7.645e-01  1.433e+04   0.000    1.000
## gmat_tot       -3.513e-03  2.164e+02   0.000    1.000
## quarter        -7.311e-01  1.673e+04   0.000    1.000
## work_yrs       -1.506e+00  2.092e+04   0.000    1.000
## salary         -5.697e-04  1.119e+00  -0.001    1.000
## satis           4.265e+01  1.794e+04   0.002    0.998
## mba_avg        -3.187e+00  7.998e+04   0.000    1.000
## is_good_salary  4.995e+01  1.317e+05   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.8821e+02  on 191  degrees of freedom
## Residual deviance: 1.2795e-08  on 183  degrees of freedom
## AIC: 18
## 
## Number of Fisher Scoring iterations: 25

prob_pred = predict(classifier,type="response",newdata=test_set[,c(1,3,9,10,12,13,15,16)])

pred_results = ifelse(prob_pred>0.5,1,0)

comparison = table(test_set[,14],pred_results)
comparison

##    pred_results
##      0  1
##   0 16  0
##   1  0 66

that’s great we got 100% accurate results because diagnol elements add up to 82!!!

Similarly let’s build a model find if student is placed or not

classifier1 <-glm(is_placed~age+
                   gmat_tot+
                   quarter+
                   work_yrs+
                   salary+
                   satis+
                   mba_avg+
                   is_good_salary,
                  family = binomial(),
                  data = training_set)

## Warning: glm.fit: algorithm did not converge

summary(classifier1)

## 
## Call:
## glm(formula = is_placed ~ age + gmat_tot + quarter + work_yrs + 
##     salary + satis + mba_avg + is_good_salary, family = binomial(), 
##     data = training_set)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.409e-06  -2.409e-06  -2.409e-06   2.409e-06   2.409e-06  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -2.657e+01  5.496e+05       0        1
## age            -8.143e-13  1.355e+04       0        1
## gmat_tot       -4.166e-15  4.490e+02       0        1
## quarter        -3.970e-12  3.365e+04       0        1
## work_yrs        8.045e-13  1.644e+04       0        1
## salary          7.688e-14  2.190e+00       0        1
## satis           5.302e-14  1.292e+04       0        1
## mba_avg         4.908e-13  1.005e+05       0        1
## is_good_salary  5.313e+01  2.372e+05       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.5938e+02  on 191  degrees of freedom
## Residual deviance: 1.1139e-09  on 183  degrees of freedom
## AIC: 18
## 
## Number of Fisher Scoring iterations: 25

prob_pred1 = predict(classifier,type="response",newdata=test_set[,c(1,3,9,10,12,13,15,16)])

pred_results1 = ifelse(prob_pred1>0.3,1,0)

comparison1 = table(test_set[,17],pred_results1)
comparison1

##    pred_results1
##      0  1
##   0 16 41
##   1  0 25

MBA Starting Salaries Analysis

Dheeraj L S Tommandru

December 25, 2017

Let’s Create a categorical variable called level satisfaction.

It appears like most of people are satisfied and gender has no role in deteremining level of satisfaction

It is apparent that quartile ranking doesn’t determine whether candidate or student likes MBA program

Let’s create a categorical variable is_good_salary

It can be observed that those with more than average salary are more likely to like the MBA program

Assuming that people who didn’t answer or disclose their aren’t placed let’s create a variable called is_placed where 1->placed while 0-> not placed

Let’s have a null hypothesis that students who are placed are more likely to like the program.To verify if it’s true let’s chi-squared test.

It’s more likely that students who are placed like the Program

Therefore we infer that Gender can’t determine if the candidate likes the program

Building regression model

Building a regressor with only significant variables and other variables that may influence salary

Building a logistic regression model to determine if the candidate has liked mba Program

Building Model

that’s great we got 100% accurate results because diagnol elements add up to 82!!!

Similarly let’s build a model find if student is placed or not

Model in this case predicts only with 53% accuracy so we need better classifiaction algorithm