Reading & Viewing the Dataset
# Directory holding the survey export. It is kept in a variable and joined
# with file.path() so the script reads the file without changing the
# session's working directory.
data_dir <- "~/Desktop/Data Analytics Internship/MBA salary"

# Load the MBA starting-salary survey. The file name is passed directly:
# wrapping a single string in paste(..., sep = "new") returned it unchanged.
mba.df <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))

# Open the data in the viewer, then print per-column summaries.
View(mba.df)
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# psych provides describe(), used throughout for per-column statistics
# (n, mean, sd, median, trimmed mean, mad, min, max, range, skew, ...).
library(psych)

# Statistics over the full survey.
# NOTE(review): salary and satis contain coded values such as 998/999 (see
# the str() sample and the maxima reported below), so their means here are
# not meaningful as real salaries / satisfaction scores.
describe(x = mba.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Recode sex from its numeric codes (1 = Male, 2 = Female) into a labelled
# factor in a single step. Assigning strings into the integer column one
# code at a time silently converts the column to character, and the second
# comparison then only works through implicit coercion.
# Levels are listed as 2, 1 so the resulting level order stays
# "Female", "Male", which the logistic models further down depend on.
# The is.factor() guard makes the step safe to re-run interactively.
if (!is.factor(mba.df$sex)) {
  mba.df$sex <- factor(
    mba.df$sex,
    levels = c(2, 1),
    labels = c("Female", "Male")
  )
}
str(mba.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
MBAs who got placed and disclosed their salaries
# Students who were placed and reported a salary: keep rows whose salary is
# above 999, which excludes the 0 / 998 / 999 coded values.
# subset() keeps only rows where the condition is TRUE (NA rows are dropped).
placed.df <- subset(mba.df, salary > 999)
View(placed.df)
describe(placed.df)
## vars n mean sd median trimmed mad min
## age 1 103 26.78 3.27 2.60e+01 26.30 2.97 22.0
## sex* 2 103 1.70 0.46 2.00e+00 1.75 0.00 1.0
## gmat_tot 3 103 616.02 50.69 6.20e+02 615.90 59.30 500.0
## gmat_qpc 4 103 79.73 13.39 8.20e+01 81.05 13.34 39.0
## gmat_vpc 5 103 78.56 16.14 8.10e+01 80.33 16.31 30.0
## gmat_tpc 6 103 84.52 11.01 8.70e+01 85.60 11.86 51.0
## s_avg 7 103 3.09 0.38 3.10e+00 3.10 0.44 2.2
## f_avg 8 103 3.09 0.49 3.25e+00 3.13 0.37 0.0
## quarter 9 103 2.26 1.12 2.00e+00 2.20 1.48 1.0
## work_yrs 10 103 3.68 3.01 3.00e+00 3.11 1.48 0.0
## frstlang 11 103 1.07 0.25 1.00e+00 1.00 0.00 1.0
## salary 12 103 103030.74 17868.80 1.00e+05 101065.06 7413.00 64000.0
## satis 13 103 5.88 0.78 6.00e+00 5.89 1.48 3.0
## max range skew kurtosis se
## age 40 18.0 1.92 4.90 0.32
## sex* 2 1.0 -0.86 -1.28 0.05
## gmat_tot 720 220.0 0.01 -0.69 4.99
## gmat_qpc 99 60.0 -0.81 0.17 1.32
## gmat_vpc 99 69.0 -0.87 0.21 1.59
## gmat_tpc 99 48.0 -0.84 0.19 1.08
## s_avg 4 1.8 -0.13 -0.61 0.04
## f_avg 4 4.0 -2.52 13.86 0.05
## quarter 4 3.0 0.27 -1.34 0.11
## work_yrs 16 16.0 2.48 6.83 0.30
## frstlang 2 1.0 3.38 9.54 0.02
## salary 220000 156000.0 3.18 17.16 1760.67
## satis 7 4.0 -0.40 0.44 0.08
MBAs who got placed but did not disclose their salaries
# Students coded with salary 999 (placed, salary not disclosed).
# subset() keeps only rows where the condition is TRUE (NA rows are dropped).
notDisclosedSalary.df <- subset(mba.df, salary == 999)
View(notDisclosedSalary.df)
describe(notDisclosedSalary.df)
## vars n mean sd median trimmed mad min max range
## age 1 35 27.49 2.24 27.0 27.38 2.97 24.00 32.00 8.0
## sex* 2 35 1.86 0.36 2.0 1.93 0.00 1.00 2.00 1.0
## gmat_tot 3 35 628.86 60.72 620.0 625.52 59.30 500.00 790.00 290.0
## gmat_qpc 4 35 85.14 13.58 87.0 87.14 11.86 46.00 99.00 53.0
## gmat_vpc 5 35 76.31 19.80 84.0 77.52 19.27 41.00 99.00 58.0
## gmat_tpc 6 35 84.34 18.06 89.0 87.38 11.86 0.00 99.00 99.0
## s_avg 7 35 2.85 0.35 2.8 2.83 0.44 2.30 3.60 1.3
## f_avg 8 35 2.96 0.33 3.0 2.95 0.37 2.25 3.75 1.5
## quarter 9 35 2.94 1.11 3.0 3.03 1.48 1.00 4.00 3.0
## work_yrs 10 35 3.63 1.83 4.0 3.52 1.48 0.00 9.00 9.0
## frstlang 11 35 1.26 0.44 1.0 1.21 0.00 1.00 2.00 1.0
## salary 12 35 999.00 0.00 999.0 999.00 0.00 999.00 999.00 0.0
## satis 13 35 4.49 1.27 4.0 4.55 1.48 1.00 7.00 6.0
## skew kurtosis se
## age 0.36 -0.93 0.38
## sex* -1.95 1.88 0.06
## gmat_tot 0.45 0.02 10.26
## gmat_qpc -1.26 0.87 2.30
## gmat_vpc -0.60 -1.19 3.35
## gmat_tpc -2.99 11.01 3.05
## s_avg 0.39 -0.95 0.06
## f_avg 0.20 -0.39 0.06
## quarter -0.52 -1.20 0.19
## work_yrs 0.71 0.73 0.31
## frstlang 1.06 -0.89 0.07
## salary NaN NaN 0.00
## satis -0.43 0.12 0.21
MBAs who were not placed
# Students coded with salary 0 (not placed).
# subset() keeps only rows where the condition is TRUE (NA rows are dropped).
notPlaced.df <- subset(mba.df, salary == 0)
View(notPlaced.df)
describe(notPlaced.df)
## vars n mean sd median trimmed mad min max range skew
## age 1 90 28.51 4.95 27.0 27.72 2.97 22 48.0 26.0 1.61
## sex* 2 90 1.74 0.44 2.0 1.81 0.00 1 2.0 1.0 -1.10
## gmat_tot 3 90 614.33 62.85 610.0 612.78 59.30 450 760.0 310.0 0.14
## gmat_qpc 4 90 78.91 17.00 82.0 80.75 17.79 28 99.0 71.0 -0.85
## gmat_vpc 5 90 77.63 16.13 81.0 79.29 14.83 22 99.0 77.0 -0.93
## gmat_tpc 6 90 82.29 15.91 86.0 84.39 14.08 0 99.0 99.0 -2.02
## s_avg 7 90 3.03 0.38 3.0 3.05 0.40 2 3.9 1.9 -0.43
## f_avg 8 90 3.06 0.56 3.0 3.09 0.37 0 4.0 4.0 -1.83
## quarter 9 90 2.54 1.07 2.5 2.56 0.74 1 4.0 3.0 -0.01
## work_yrs 10 90 4.59 4.30 3.0 3.72 2.22 0 22.0 22.0 2.19
## frstlang 11 90 1.09 0.29 1.0 1.00 0.00 1 2.0 1.0 2.84
## salary 12 90 0.00 0.00 0.0 0.00 0.00 0 0.0 0.0 NaN
## satis 13 90 5.62 0.74 6.0 5.58 1.48 4 7.0 3.0 0.07
## kurtosis se
## age 2.74 0.52
## sex* -0.79 0.05
## gmat_tot -0.34 6.63
## gmat_qpc -0.08 1.79
## gmat_vpc 0.45 1.70
## gmat_tpc 6.82 1.68
## s_avg 0.21 0.04
## f_avg 8.48 0.06
## quarter -1.28 0.11
## work_yrs 5.19 0.45
## frstlang 6.14 0.03
## salary NaN 0.00
## satis -0.45 0.08
Distribution of Salaries of Placed Students
# lattice provides histogram().
library(lattice)

# Histogram of the salary column for placed students (one-sided formula:
# only the variable being binned is given).
histogram(
  ~ salary,
  data = placed.df,
  main = "Distribution of Starting Salaries",
  xlab = "Starting Salaries of Placed Students",
  col = "orange"
)

ScatterPlot of Placed Students
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Starting salary against years of work experience for placed students.
# The former `horizontal = TRUE` argument was removed: it was forwarded to
# the base plotting calls (plot.window, plot.xy, axis, box, title), each of
# which warned that "horizontal" is not a graphical parameter.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "ScatterPlot of Placed Students",
  xlab = "Work Experience of Placed Students",
  ylab = "MBAs Starting Salaries"
)

Corrgram of Salaries of Placed Students
library(corrgram)
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Correlogram of the placed-student data: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal. order = TRUE lets
# corrgram reorder the variables.
corrgram(
  placed.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Salaries of Placed Students"
)

Logistic Regression Model Challenge
# Rebuild the sex factor on the placed subset and confirm its type.
placed.df[["sex"]] <- factor(placed.df[["sex"]])
is.factor(placed.df[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of placed.df.
fit1 <- glm(
  sex ~ .,
  family = binomial(link = "logit"),
  data = placed.df
)
summary(fit1)
##
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = placed.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3292 -0.7626 0.5805 0.7894 1.4863
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1064455 8.2384884 -0.013 0.9897
## age 0.3643742 0.1889782 1.928 0.0538 .
## gmat_tot -0.0162521 0.0269925 -0.602 0.5471
## gmat_qpc 0.0435054 0.0770321 0.565 0.5722
## gmat_vpc -0.0084836 0.0780797 -0.109 0.9135
## gmat_tpc 0.0561304 0.1181993 0.475 0.6349
## s_avg -0.1751868 1.5508906 -0.113 0.9101
## f_avg -1.5943945 1.0429927 -1.529 0.1263
## quarter -0.2901630 0.4253040 -0.682 0.4951
## work_yrs -0.2410914 0.1783851 -1.352 0.1765
## frstlang -2.4111026 1.0665299 -2.261 0.0238 *
## salary 0.0000184 0.0000191 0.963 0.3353
## satis 0.2638553 0.3332759 0.792 0.4285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 126.01 on 102 degrees of freedom
## Residual deviance: 107.49 on 90 degrees of freedom
## AIC: 133.49
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis-of-deviance table for fit1: terms are added one at a
# time in formula order and each is tested with a chi-squared test.
stats::anova(fit1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 102 126.01
## age 1 2.3856 101 123.62 0.12245
## gmat_tot 1 0.0744 100 123.55 0.78507
## gmat_qpc 1 4.1847 99 119.36 0.04079 *
## gmat_vpc 1 1.8543 98 117.51 0.17329
## gmat_tpc 1 0.0823 97 117.43 0.77423
## s_avg 1 0.4155 96 117.01 0.51919
## f_avg 1 2.1057 95 114.90 0.14675
## quarter 1 0.4742 94 114.43 0.49107
## work_yrs 1 0.5956 93 113.83 0.44026
## frstlang 1 4.6687 92 109.17 0.03072 *
## salary 1 1.0389 91 108.13 0.30808
## satis 1 0.6359 90 107.49 0.42521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit1.
#
# predict() for a glm has no `data` argument (the argument is `newdata`), so
# the old `data = placed.df` was silently ignored. Without newdata, predict()
# returns the fitted probabilities for the rows the model was trained on,
# which is what is needed here.
fitted.prob <- predict(fit1, type = "response")

# For a binomial glm with a factor response, the first level is treated as
# "failure" and the second as "success", so the fitted probability refers to
# the second level. Map probabilities to those labels; comparing 0/1 codes
# against the "Female"/"Male" labels can never match and always produced an
# accuracy of 0.
sex.levels <- levels(placed.df$sex)
fitted.results <- ifelse(fitted.prob > 0.5, sex.levels[2], sex.levels[1])
misClasificError <- mean(fitted.results != placed.df$sex)
print(paste('Accuracy', 1 - misClasificError))
## (The previously recorded output, "Accuracy 0", was an artefact of the
## label mismatch described above; re-run to obtain the actual accuracy.)
# Rebuild the sex factor on the not-placed subset and confirm its type.
notPlaced.df[["sex"]] <- factor(notPlaced.df[["sex"]])
is.factor(notPlaced.df[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of notPlaced.df.
# NOTE(review): salary is 0 for every row of this subset, so it carries no
# information; glm reports its coefficient as NA ("not defined because of
# singularities") in the summary below.
fit2 <- glm(
  sex ~ .,
  family = binomial(link = "logit"),
  data = notPlaced.df
)
summary(fit2)
##
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = notPlaced.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1976 -0.6019 0.4838 0.7582 1.5451
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.39699 7.84772 -1.707 0.0878 .
## age -0.05353 0.12071 -0.443 0.6574
## gmat_tot 0.03439 0.02183 1.576 0.1151
## gmat_qpc -0.02944 0.06260 -0.470 0.6381
## gmat_vpc -0.10328 0.06711 -1.539 0.1238
## gmat_tpc -0.03205 0.06128 -0.523 0.6010
## s_avg 0.47864 1.17187 0.408 0.6830
## f_avg 0.58170 0.57645 1.009 0.3129
## quarter 0.49321 0.36673 1.345 0.1787
## work_yrs 0.08643 0.14181 0.609 0.5422
## frstlang 0.31776 1.29059 0.246 0.8055
## salary NA NA NA NA
## satis 0.51118 0.40913 1.249 0.2115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 102.304 on 89 degrees of freedom
## Residual deviance: 86.742 on 78 degrees of freedom
## AIC: 110.74
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis-of-deviance table for fit2: terms are added one at a
# time in formula order and each is tested with a chi-squared test.
stats::anova(fit2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 89 102.304
## age 1 0.4712 88 101.833 0.49244
## gmat_tot 1 0.3130 87 101.520 0.57585
## gmat_qpc 1 4.3705 86 97.150 0.03657 *
## gmat_vpc 1 5.0395 85 92.110 0.02478 *
## gmat_tpc 1 0.6560 84 91.454 0.41798
## s_avg 1 0.0490 83 91.405 0.82487
## f_avg 1 0.5497 82 90.855 0.45844
## quarter 1 1.6354 81 89.220 0.20096
## work_yrs 1 0.8609 80 88.359 0.35348
## frstlang 1 0.0078 79 88.351 0.92960
## salary 0 0.0000 79 88.351
## satis 1 1.6093 78 86.742 0.20459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit2.
#
# predict() for a glm has no `data` argument (the argument is `newdata`), so
# the old `data = notPlaced.df` was silently ignored. Without newdata,
# predict() returns the fitted probabilities for the rows the model was
# trained on, which is what is needed here.
fitted.prob <- predict(fit2, type = "response")

# For a binomial glm with a factor response, the first level is treated as
# "failure" and the second as "success", so the fitted probability refers to
# the second level. Map probabilities to those labels; comparing 0/1 codes
# against the "Female"/"Male" labels can never match and always produced an
# accuracy of 0.
sex.levels <- levels(notPlaced.df$sex)
fitted.results <- ifelse(fitted.prob > 0.5, sex.levels[2], sex.levels[1])
misClasificError <- mean(fitted.results != notPlaced.df$sex)
print(paste('Accuracy', 1 - misClasificError))
## (The previously recorded output, "Accuracy 0", was an artefact of the
## label mismatch described above; re-run to obtain the actual accuracy.)