Reading & Viewing the Dataset

# Load the MBA starting-salaries survey and take a first look at it.
# NOTE(review): setwd() ties this script to one machine's directory layout;
# consider an RStudio project or a path built with file.path() instead.
setwd("~/Desktop/Data Analytics Internship/MBA salary")
# The file name is passed directly: wrapping a single string in
# paste(..., sep = "new") returned the same string, because `sep` is only
# inserted between two or more arguments.
mba.df <- read.csv("MBA Starting Salaries Data.csv")
# Spreadsheet-style viewer (needs an interactive session), followed by a
# per-column five-number summary plus mean.
View(mba.df)
summary(mba.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# psych supplies describe(), which reports one row of descriptive statistics
# per column (n, mean, sd, median, trimmed mean, mad, min, max, range, skew,
# kurtosis, se).
# NOTE(review): salary and satis still contain coded values such as 998/999
# at this point (see the maxima/medians reported), so their means and sds
# are not meaningful here -- confirm the coding against the data dictionary.
library(psych)
describe(mba.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Recode sex from its numeric codes (1 = Male, 2 = Female) to a labelled
# factor in a single step. Doing it with two subscripted assignments first
# turned the integer column into character and then relied on R silently
# coercing 2 to "2" for the second comparison.
# `levels = c(2, 1)` keeps the level order Female, Male, i.e. the same
# alphabetical order factor() produced before, so downstream models keep the
# same reference level.
# The is.factor() guard makes re-running this chunk a no-op instead of
# turning already-recoded values into NA.
if (!is.factor(mba.df$sex)) {
  mba.df$sex <- factor(
    mba.df$sex,
    levels = c(2, 1),
    labels = c("Female", "Male")
  )
}
str(mba.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

MBAs who got placed and disclosed their salaries

# Keep only the rows whose salary is above the 999 code, i.e. respondents
# with an actual reported salary figure. subset() treats NA conditions as
# "do not keep", matching the which()-based row selection it replaces.
placed.df <- subset(mba.df, salary > 999)
View(placed.df)
describe(placed.df)
##          vars   n      mean       sd   median   trimmed     mad     min
## age         1 103     26.78     3.27 2.60e+01     26.30    2.97    22.0
## sex*        2 103      1.70     0.46 2.00e+00      1.75    0.00     1.0
## gmat_tot    3 103    616.02    50.69 6.20e+02    615.90   59.30   500.0
## gmat_qpc    4 103     79.73    13.39 8.20e+01     81.05   13.34    39.0
## gmat_vpc    5 103     78.56    16.14 8.10e+01     80.33   16.31    30.0
## gmat_tpc    6 103     84.52    11.01 8.70e+01     85.60   11.86    51.0
## s_avg       7 103      3.09     0.38 3.10e+00      3.10    0.44     2.2
## f_avg       8 103      3.09     0.49 3.25e+00      3.13    0.37     0.0
## quarter     9 103      2.26     1.12 2.00e+00      2.20    1.48     1.0
## work_yrs   10 103      3.68     3.01 3.00e+00      3.11    1.48     0.0
## frstlang   11 103      1.07     0.25 1.00e+00      1.00    0.00     1.0
## salary     12 103 103030.74 17868.80 1.00e+05 101065.06 7413.00 64000.0
## satis      13 103      5.88     0.78 6.00e+00      5.89    1.48     3.0
##             max    range  skew kurtosis      se
## age          40     18.0  1.92     4.90    0.32
## sex*          2      1.0 -0.86    -1.28    0.05
## gmat_tot    720    220.0  0.01    -0.69    4.99
## gmat_qpc     99     60.0 -0.81     0.17    1.32
## gmat_vpc     99     69.0 -0.87     0.21    1.59
## gmat_tpc     99     48.0 -0.84     0.19    1.08
## s_avg         4      1.8 -0.13    -0.61    0.04
## f_avg         4      4.0 -2.52    13.86    0.05
## quarter       4      3.0  0.27    -1.34    0.11
## work_yrs     16     16.0  2.48     6.83    0.30
## frstlang      2      1.0  3.38     9.54    0.02
## salary   220000 156000.0  3.18    17.16 1760.67
## satis         7      4.0 -0.40     0.44    0.08

MBAs who got placed but did not disclose their salaries

# Rows where salary carries the code 999 (placed, salary not disclosed, per
# the section heading). subset() drops NA comparisons just as which() did.
notDisclosedSalary.df <- subset(mba.df, salary == 999)
View(notDisclosedSalary.df)
describe(notDisclosedSalary.df)
##          vars  n   mean    sd median trimmed   mad    min    max range
## age         1 35  27.49  2.24   27.0   27.38  2.97  24.00  32.00   8.0
## sex*        2 35   1.86  0.36    2.0    1.93  0.00   1.00   2.00   1.0
## gmat_tot    3 35 628.86 60.72  620.0  625.52 59.30 500.00 790.00 290.0
## gmat_qpc    4 35  85.14 13.58   87.0   87.14 11.86  46.00  99.00  53.0
## gmat_vpc    5 35  76.31 19.80   84.0   77.52 19.27  41.00  99.00  58.0
## gmat_tpc    6 35  84.34 18.06   89.0   87.38 11.86   0.00  99.00  99.0
## s_avg       7 35   2.85  0.35    2.8    2.83  0.44   2.30   3.60   1.3
## f_avg       8 35   2.96  0.33    3.0    2.95  0.37   2.25   3.75   1.5
## quarter     9 35   2.94  1.11    3.0    3.03  1.48   1.00   4.00   3.0
## work_yrs   10 35   3.63  1.83    4.0    3.52  1.48   0.00   9.00   9.0
## frstlang   11 35   1.26  0.44    1.0    1.21  0.00   1.00   2.00   1.0
## salary     12 35 999.00  0.00  999.0  999.00  0.00 999.00 999.00   0.0
## satis      13 35   4.49  1.27    4.0    4.55  1.48   1.00   7.00   6.0
##           skew kurtosis    se
## age       0.36    -0.93  0.38
## sex*     -1.95     1.88  0.06
## gmat_tot  0.45     0.02 10.26
## gmat_qpc -1.26     0.87  2.30
## gmat_vpc -0.60    -1.19  3.35
## gmat_tpc -2.99    11.01  3.05
## s_avg     0.39    -0.95  0.06
## f_avg     0.20    -0.39  0.06
## quarter  -0.52    -1.20  0.19
## work_yrs  0.71     0.73  0.31
## frstlang  1.06    -0.89  0.07
## salary     NaN      NaN  0.00
## satis    -0.43     0.12  0.21

MBAs who were not placed

# Rows with a salary of exactly 0 (not placed, per the section heading).
# Every row in this subset has salary == 0, so salary is a constant column
# in notPlaced.df.
notPlaced.df <- subset(mba.df, salary == 0)
View(notPlaced.df)
describe(notPlaced.df)
##          vars  n   mean    sd median trimmed   mad min   max range  skew
## age         1 90  28.51  4.95   27.0   27.72  2.97  22  48.0  26.0  1.61
## sex*        2 90   1.74  0.44    2.0    1.81  0.00   1   2.0   1.0 -1.10
## gmat_tot    3 90 614.33 62.85  610.0  612.78 59.30 450 760.0 310.0  0.14
## gmat_qpc    4 90  78.91 17.00   82.0   80.75 17.79  28  99.0  71.0 -0.85
## gmat_vpc    5 90  77.63 16.13   81.0   79.29 14.83  22  99.0  77.0 -0.93
## gmat_tpc    6 90  82.29 15.91   86.0   84.39 14.08   0  99.0  99.0 -2.02
## s_avg       7 90   3.03  0.38    3.0    3.05  0.40   2   3.9   1.9 -0.43
## f_avg       8 90   3.06  0.56    3.0    3.09  0.37   0   4.0   4.0 -1.83
## quarter     9 90   2.54  1.07    2.5    2.56  0.74   1   4.0   3.0 -0.01
## work_yrs   10 90   4.59  4.30    3.0    3.72  2.22   0  22.0  22.0  2.19
## frstlang   11 90   1.09  0.29    1.0    1.00  0.00   1   2.0   1.0  2.84
## salary     12 90   0.00  0.00    0.0    0.00  0.00   0   0.0   0.0   NaN
## satis      13 90   5.62  0.74    6.0    5.58  1.48   4   7.0   3.0  0.07
##          kurtosis   se
## age          2.74 0.52
## sex*        -0.79 0.05
## gmat_tot    -0.34 6.63
## gmat_qpc    -0.08 1.79
## gmat_vpc     0.45 1.70
## gmat_tpc     6.82 1.68
## s_avg        0.21 0.04
## f_avg        8.48 0.06
## quarter     -1.28 0.11
## work_yrs     5.19 0.45
## frstlang     6.14 0.03
## salary        NaN 0.00
## satis       -0.45 0.08

Distribution of Salaries of Placed Students

library(lattice)
# Lattice histogram of the disclosed starting salaries (placed.df only).
histogram(
  ~ salary,
  data = placed.df,
  col = "orange",
  main = "Distribution of Starting Salaries",
  xlab = "Starting Salaries of Placed Students"
)

ScatterPlot of Placed Students

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Starting salary against years of prior work experience for placed students.
# The former `horizontal = TRUE` argument was removed: scatterplot() has no
# such argument, so it was forwarded to the base plotting calls and only
# produced the '"horizontal" is not a graphical parameter' warnings recorded
# from the earlier run. The plot itself is unchanged.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "ScatterPlot of Placed Students",
  xlab = "Work Experience of Placed Students",
  ylab = "MBAs Starting Salaries"
)
## Warning in plot.window(...): "horizontal" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "horizontal" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in box(...): "horizontal" is not a graphical parameter
## Warning in title(...): "horizontal" is not a graphical parameter

Corrgram of Salaries of Placed Students

library(corrgram)
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Correlation matrix plot for placed students: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal, with the
# variables reordered (order = TRUE).
corrgram(
  placed.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Salaries of Placed Students"
)

Logistic Regression Model Challenge

# Re-create the factor on the placed subset and confirm its type before
# using it as the response.
placed.df[["sex"]] <- factor(placed.df[["sex"]])
is.factor(placed.df[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of placed.df. With a
# factor response, glm() treats the first level as "failure" and the
# remaining level as "success".
fit1 <- glm(
  sex ~ .,
  family = binomial(link = "logit"),
  data = placed.df
)
summary(fit1)
## 
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = placed.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3292  -0.7626   0.5805   0.7894   1.4863  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.1064455  8.2384884  -0.013   0.9897  
## age          0.3643742  0.1889782   1.928   0.0538 .
## gmat_tot    -0.0162521  0.0269925  -0.602   0.5471  
## gmat_qpc     0.0435054  0.0770321   0.565   0.5722  
## gmat_vpc    -0.0084836  0.0780797  -0.109   0.9135  
## gmat_tpc     0.0561304  0.1181993   0.475   0.6349  
## s_avg       -0.1751868  1.5508906  -0.113   0.9101  
## f_avg       -1.5943945  1.0429927  -1.529   0.1263  
## quarter     -0.2901630  0.4253040  -0.682   0.4951  
## work_yrs    -0.2410914  0.1783851  -1.352   0.1765  
## frstlang    -2.4111026  1.0665299  -2.261   0.0238 *
## salary       0.0000184  0.0000191   0.963   0.3353  
## satis        0.2638553  0.3332759   0.792   0.4285  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 126.01  on 102  degrees of freedom
## Residual deviance: 107.49  on  90  degrees of freedom
## AIC: 133.49
## 
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table for fit1 with chi-square tests. Terms are added
# sequentially in formula order, so each p-value is conditional on the terms
# listed above it and depends on that ordering.
anova(fit1, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                       102     126.01           
## age       1   2.3856       101     123.62  0.12245  
## gmat_tot  1   0.0744       100     123.55  0.78507  
## gmat_qpc  1   4.1847        99     119.36  0.04079 *
## gmat_vpc  1   1.8543        98     117.51  0.17329  
## gmat_tpc  1   0.0823        97     117.43  0.77423  
## s_avg     1   0.4155        96     117.01  0.51919  
## f_avg     1   2.1057        95     114.90  0.14675  
## quarter   1   0.4742        94     114.43  0.49107  
## work_yrs  1   0.5956        93     113.83  0.44026  
## frstlang  1   4.6687        92     109.17  0.03072 *
## salary    1   1.0389        91     108.13  0.30808  
## satis     1   0.6359        90     107.49  0.42521  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit1.
# predict() is called without `newdata`, so it returns the fitted
# probabilities for the rows the model was trained on. The earlier
# `data = placed.df` argument is not an argument of predict.glm() and was
# ignored, so the probabilities themselves are unchanged.
fitted.results <- predict(fit1, type = "response")
# For a factor response the fitted probability refers to the SECOND factor
# level (the first level is the "failure" class). Map the 0.5 cut-off back to
# the level labels so predictions are comparable with placed.df$sex.
# Comparing 0/1 against the labels "Female"/"Male" never matched, which is
# why earlier runs always printed "Accuracy 0".
sex_levels <- levels(placed.df$sex)
fitted.results <- ifelse(fitted.results > 0.5, sex_levels[2], sex_levels[1])
misClasificError <- mean(fitted.results != placed.df$sex)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0"
# Re-create the factor on the not-placed subset and confirm its type before
# using it as the response.
notPlaced.df$sex <- factor(notPlaced.df$sex)
is.factor(notPlaced.df$sex)
## [1] TRUE
# Logistic regression of sex on the other columns of notPlaced.df.
# salary is excluded explicitly: it is 0 for every row of this subset, so it
# is collinear with the intercept and glm() could only report it as NA
# ("1 not defined because of singularities"). Dropping it leaves the other
# estimates and the fitted values unchanged; output recorded from earlier
# runs with the `sex ~ .` formula still shows that NA row.
fit2 <- glm(
  sex ~ . - salary,
  family = binomial(link = "logit"),
  data = notPlaced.df
)
summary(fit2)
## 
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = notPlaced.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1976  -0.6019   0.4838   0.7582   1.5451  
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -13.39699    7.84772  -1.707   0.0878 .
## age          -0.05353    0.12071  -0.443   0.6574  
## gmat_tot      0.03439    0.02183   1.576   0.1151  
## gmat_qpc     -0.02944    0.06260  -0.470   0.6381  
## gmat_vpc     -0.10328    0.06711  -1.539   0.1238  
## gmat_tpc     -0.03205    0.06128  -0.523   0.6010  
## s_avg         0.47864    1.17187   0.408   0.6830  
## f_avg         0.58170    0.57645   1.009   0.3129  
## quarter       0.49321    0.36673   1.345   0.1787  
## work_yrs      0.08643    0.14181   0.609   0.5422  
## frstlang      0.31776    1.29059   0.246   0.8055  
## salary             NA         NA      NA       NA  
## satis         0.51118    0.40913   1.249   0.2115  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 102.304  on 89  degrees of freedom
## Residual deviance:  86.742  on 78  degrees of freedom
## AIC: 110.74
## 
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table for fit2 with chi-square tests. Terms are added
# sequentially in formula order, so each p-value is conditional on the terms
# listed above it and depends on that ordering.
anova(fit2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                        89    102.304           
## age       1   0.4712        88    101.833  0.49244  
## gmat_tot  1   0.3130        87    101.520  0.57585  
## gmat_qpc  1   4.3705        86     97.150  0.03657 *
## gmat_vpc  1   5.0395        85     92.110  0.02478 *
## gmat_tpc  1   0.6560        84     91.454  0.41798  
## s_avg     1   0.0490        83     91.405  0.82487  
## f_avg     1   0.5497        82     90.855  0.45844  
## quarter   1   1.6354        81     89.220  0.20096  
## work_yrs  1   0.8609        80     88.359  0.35348  
## frstlang  1   0.0078        79     88.351  0.92960  
## salary    0   0.0000        79     88.351           
## satis     1   1.6093        78     86.742  0.20459  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit2.
# predict() is called without `newdata`, so it returns the fitted
# probabilities for the rows the model was trained on. The earlier
# `data = notPlaced.df` argument is not an argument of predict.glm() and was
# ignored, so the probabilities themselves are unchanged.
fitted.results <- predict(fit2, type = "response")
# For a factor response the fitted probability refers to the SECOND factor
# level (the first level is the "failure" class). Map the 0.5 cut-off back to
# the level labels so predictions are comparable with notPlaced.df$sex.
# Comparing 0/1 against the labels "Female"/"Male" never matched, which is
# why earlier runs always printed "Accuracy 0".
sex_levels <- levels(notPlaced.df$sex)
fitted.results <- ifelse(fitted.results > 0.5, sex_levels[2], sex_levels[1])
misClasificError <- mean(fitted.results != notPlaced.df$sex)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0"