# Load the MBA starting-salaries survey from the working directory.
# (The former paste(..., sep = "") around a single literal was a no-op.)
salary <- read.csv("MBA Starting Salaries Data.csv")
# View(salary)
# Print a sample of rows as a quick sanity check of the import.
some(salary)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 8 25 2 650 88 89 92 3.3 3.75 1
## 42 25 2 560 52 81 72 3.3 3.50 1
## 84 25 1 690 96 89 97 3.0 3.00 2
## 156 26 1 660 88 93 94 2.9 2.75 3
## 166 27 1 730 95 99 99 2.9 3.33 3
## 181 29 1 560 57 74 73 2.8 3.00 3
## 184 34 1 610 82 78 86 2.7 3.00 3
## 212 25 1 600 53 95 84 2.5 3.00 4
## 218 25 1 700 99 87 98 2.0 2.00 4
## 260 26 2 630 85 81 90 2.9 3.25 4
## work_yrs frstlang salary satis
## 8 2 1 0 6
## 42 1 1 95000 5
## 84 3 1 998 998
## 156 3 2 998 998
## 166 0 1 999 5
## 181 4 1 999 5
## 184 12 1 0 5
## 212 2 1 999 4
## 218 1 1 0 7
## 260 3 1 86000 5
# Inspect the type and first values of every column in the raw data.
str(salary)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Min / quartiles / mean / max per column; the 998 and 999 sentinel codes are
# still present here and inflate the salary and satis figures.
summary(salary)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Descriptive statistics (n, mean, sd, skew, kurtosis, ...) per column.
# Qualified with psych:: because Hmisc, attached later in this script, masks
# describe() and would change the output on a re-run in the same session.
psych::describe(salary)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Per-column count of the sentinel value 999. vapply() pins the result to one
# integer per column, and na.rm = TRUE keeps a stray NA from turning a whole
# column's count into NA.
vapply(salary, function(x) sum(x == 999, na.rm = TRUE), integer(1))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 0 0 0 0 0 0 0 0
## quarter work_yrs frstlang salary satis
## 0 0 0 35 0
So 35 people did not disclose their salary (coded 999).
# Per-column count of the sentinel value 998. vapply() pins the result to one
# integer per column, and na.rm = TRUE keeps a stray NA from turning a whole
# column's count into NA.
vapply(salary, function(x) sum(x == 998, na.rm = TRUE), integer(1))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 0 0 0 0 0 0 0 0
## quarter work_yrs frstlang salary satis
## 0 0 0 46 46
And 46 people did not respond to the survey (both salary and satis are coded 998 for them).
# Keep only respondents whose salary is a real value, i.e. drop every row
# carrying one of the two sentinel codes (998 or 999).
is_sentinel <- (salary$salary == 999) | (salary$salary == 998)
sal <- salary[which(!is_sentinel), ]
# View(sal)
# plot.new()
# Put the columns of `sal` on the search path. Note that `salary` itself stays
# masked by the global data frame of the same name.
attach(sal)
## The following object is masked _by_ .GlobalEnv:
##
## salary
# Share of male vs. female respondents. Columns are taken from `sal`
# explicitly instead of relying on attach().
histogram(~sex, data = sal,
          main = "Comparison of No. of respondent males vs females",
          xlab = "sex(1 = male 2 = female)")
# Salary distribution by sex, drawn horizontally with a hand-labelled axis.
# NOTE(review): `sal` still contains respondents with salary == 0 (not
# placed), so the boxes mix placement rate with pay level.
boxplot(salary ~ sex, data = sal, horizontal = TRUE,
        col = c("lightblue4", "pink"), yaxt = "n",
        main = "Comparison of salary of males and females",
        xlab = "Salary", ylab = "Sex")
axis(side = 2, at = c(1, 2), labels = c("Male", "Female"))
So, there isn’t much difference between salaries of males and females.
Variation of GMAT Scores
# Distribution of total GMAT scores; the column is read from `sal` explicitly
# rather than through attach().
hist(sal$gmat_tot, col = "grey", main = "Variation in total GMAT scores",
     xlab = "Score(out of 800)")
GMAT Percentiles
# Three-panel figure: total percentile across the full top row, quantitative
# and verbal percentiles side by side underneath.
layout(matrix(c(1, 1, 2, 3), nrow = 2, ncol = 2, byrow = TRUE))
hist(sal$gmat_tpc, col = "peachpuff", main = "GMAT Percentile(Total)",
     xlab = "Percentile")
hist(sal$gmat_qpc, col = "khaki", main = "GMAT Percentile(Quantitative)",
     xlab = "Percentile")
hist(sal$gmat_vpc, col = "lightblue1", main = "GMAT Percentile(Verbal)",
     xlab = "Percentile")
# Go back to a single panel so later plots are not squeezed into this grid.
layout(1)
# Verbal percentile by native language. boxplot() opens its own frame, so the
# former plot.new() only produced an empty plot and has been dropped.
boxplot(gmat_vpc ~ frstlang, data = sal, horizontal = TRUE,
        col = c("turquoise", "lightgreen"), yaxt = "n",
        main = "Variation of Verbal score with native language",
        xlab = "Score", ylab = "Native language")
axis(side = 2, at = c(1, 2), labels = c("English", "Non-English"))
# Quantitative percentile by native language. boxplot() opens its own frame,
# so the former plot.new() only produced an empty plot and has been dropped.
boxplot(gmat_qpc ~ frstlang, data = sal, horizontal = TRUE,
        col = c("turquoise", "lightgreen"), yaxt = "n",
        main = "Variation of Quantitative score with native language",
        xlab = "Score", ylab = "Native language")
axis(side = 2, at = c(1, 2), labels = c("English", "Non-English"))
As expected, people having English as their mother-tongue score better in verbal whereas the scores in Quantitative section are independent of mother tongue.
# Spring and fall averages against class quartile, using the full survey data.
lattice::xyplot(s_avg ~ quarter, data = salary)
lattice::xyplot(f_avg ~ quarter, data = salary)
**We see that there is not much variation in the fall average, so it is the performance in spring that separates people.**
Creating a data frame of only the placed people
# Respondents who reported an actual salary: everything above the 1000
# threshold, which leaves out the zero-salary rows.
has_salary <- sal$salary > 1000
placed <- sal[which(has_salary), ]
# View(placed)
summary(placed)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :500 Min. :39.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580 1st Qu.:72.00
## Median :26.00 Median :1.000 Median :620 Median :82.00
## Mean :26.78 Mean :1.301 Mean :616 Mean :79.73
## 3rd Qu.:28.00 3rd Qu.:2.000 3rd Qu.:655 3rd Qu.:89.00
## Max. :40.00 Max. :2.000 Max. :720 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :30.00 Min. :51.00 Min. :2.200 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.00 1st Qu.:2.850 1st Qu.:2.915
## Median :81.00 Median :87.00 Median :3.100 Median :3.250
## Mean :78.56 Mean :84.52 Mean :3.092 Mean :3.091
## 3rd Qu.:92.00 3rd Qu.:93.50 3rd Qu.:3.400 3rd Qu.:3.415
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.00 Min. :1.000 Min. : 64000
## 1st Qu.:1.000 1st Qu.: 2.00 1st Qu.:1.000 1st Qu.: 95000
## Median :2.000 Median : 3.00 Median :1.000 Median :100000
## Mean :2.262 Mean : 3.68 Mean :1.068 Mean :103031
## 3rd Qu.:3.000 3rd Qu.: 4.00 3rd Qu.:1.000 3rd Qu.:106000
## Max. :4.000 Max. :16.00 Max. :2.000 Max. :220000
## satis
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.883
## 3rd Qu.:6.000
## Max. :7.000
Salary vs English
# Salary of placed respondents by native language. boxplot() opens its own
# frame, so the former plot.new() only produced an empty plot and is dropped.
boxplot(placed$salary ~ placed$frstlang, horizontal = TRUE,
        col = c("turquoise", "lightgreen"), yaxt = "n",
        main = "Variation of salary with native language",
        xlab = "Salary", ylab = "Native language")
axis(side = 2, at = c(1, 2), labels = c("English", "Non-English"))
How satisfied are the rich?
# Salary of placed respondents for each satisfaction score. The axis label
# previously read "& being the highest" (a shift-7 typo).
boxplot(placed$salary ~ placed$satis, horizontal = TRUE, xlab = "Salary",
        ylab = "Satisfaction (7 being the highest)")
Will salary grow as we age?
# Salary against age for placed respondents.
scatterplot(placed$salary ~ placed$age)
# Pairwise scatterplots: salary against the academic measures ...
scatterplotMatrix(
  placed[, c("gmat_qpc", "gmat_vpc", "gmat_tpc", "s_avg", "f_avg", "salary")]
)
# ... and against the demographic / background variables.
scatterplotMatrix(
  placed[, c("age", "sex", "work_yrs", "frstlang", "salary", "satis")]
)
Correlation of Salary with Educational and Other factors
# Hmisc provides rcorr(). The hard-coded lib.loc pointed at one user's
# R 3.4 Windows library; the default library paths are used instead.
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Salary alongside the academic measures.
edu <- placed[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
                  "s_avg", "f_avg", "quarter", "salary")]
# Salary alongside the demographic / background variables.
other <- placed[, c("sex", "gmat_tot", "work_yrs", "frstlang",
                    "salary", "satis")]
# Correlation matrices with their p-values for both groups of variables.
c1 <- rcorr(as.matrix(edu))
c2 <- rcorr(as.matrix(other))
c1
## gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter salary
## gmat_tot 1.00 0.67 0.78 0.97 0.17 0.12 -0.11 -0.09
## gmat_qpc 0.67 1.00 0.09 0.66 0.02 0.10 0.01 0.01
## gmat_vpc 0.78 0.09 1.00 0.78 0.16 0.02 -0.13 -0.14
## gmat_tpc 0.97 0.66 0.78 1.00 0.14 0.07 -0.10 -0.13
## s_avg 0.17 0.02 0.16 0.14 1.00 0.45 -0.84 0.10
## f_avg 0.12 0.10 0.02 0.07 0.45 1.00 -0.43 -0.11
## quarter -0.11 0.01 -0.13 -0.10 -0.84 -0.43 1.00 -0.13
## salary -0.09 0.01 -0.14 -0.13 0.10 -0.11 -0.13 1.00
##
## n= 103
##
##
## P
## gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter salary
## gmat_tot 0.0000 0.0000 0.0000 0.0824 0.2178 0.2875 0.3624
## gmat_qpc 0.0000 0.3415 0.0000 0.8767 0.3226 0.8991 0.8873
## gmat_vpc 0.0000 0.3415 0.0000 0.1095 0.8184 0.1954 0.1663
## gmat_tpc 0.0000 0.0000 0.0000 0.1603 0.4791 0.3171 0.1837
## s_avg 0.0824 0.8767 0.1095 0.1603 0.0000 0.0000 0.3065
## f_avg 0.2178 0.3226 0.8184 0.4791 0.0000 0.0000 0.2864
## quarter 0.2875 0.8991 0.1954 0.3171 0.0000 0.0000 0.1959
## salary 0.3624 0.8873 0.1663 0.1837 0.3065 0.2864 0.1959
# Print the correlations and p-values for the demographic / background set.
c2
## sex gmat_tot work_yrs frstlang salary satis
## sex 1.00 -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.02 1.00 -0.12 -0.13 -0.09 0.06
## work_yrs -0.09 -0.12 1.00 0.20 0.45 0.06
## frstlang 0.08 -0.13 0.20 1.00 0.27 0.09
## salary -0.17 -0.09 0.45 0.27 1.00 -0.04
## satis -0.09 0.06 0.06 0.09 -0.04 1.00
##
## n= 103
##
##
## P
## sex gmat_tot work_yrs frstlang salary satis
## sex 0.8446 0.3536 0.4508 0.0932 0.3554
## gmat_tot 0.8446 0.2165 0.1850 0.3624 0.5159
## work_yrs 0.3536 0.2165 0.0469 0.0000 0.5273
## frstlang 0.4508 0.1850 0.0469 0.0064 0.3668
## salary 0.0932 0.3624 0.0000 0.0064 0.6879
## satis 0.3554 0.5159 0.5273 0.3668 0.6879
# Linear model of salary on every available predictor, placed respondents only.
# gmat_vpc used to be listed twice; R collapses duplicate terms, so the fit is
# unchanged by removing the repeat.
fit1 <- lm(
  salary ~ gmat_tot + frstlang + s_avg + f_avg + work_yrs + sex +
    gmat_vpc + gmat_qpc + gmat_tpc + age + satis + quarter,
  data = placed
)
summary(fit1)
##
## Call:
## lm(formula = salary ~ gmat_tot + frstlang + s_avg + f_avg + work_yrs +
## sex + gmat_vpc + gmat_vpc + gmat_qpc + gmat_tpc + age + satis +
## quarter, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## gmat_tot 16.19 178.85 0.090 0.9281
## frstlang 7719.42 7373.27 1.047 0.2979
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## work_yrs 749.66 1135.90 0.660 0.5110
## sex -3584.07 3595.85 -0.997 0.3216
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## age 1750.65 1130.92 1.548 0.1251
## satis -1086.54 2157.76 -0.504 0.6158
## quarter -2336.56 2721.89 -0.858 0.3929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
# leaps provides regsubsets(). The hard-coded lib.loc pointed at one user's
# R 3.4 Windows library; the default library paths are used instead.
library(leaps)
# Best-subset search over the same predictors as fit1, keeping the single best
# model of each size. The response is now named through `data = placed`
# (instead of placed$salary) and the duplicated gmat_vpc term is gone.
# NOTE(review): regsubsets() limits subset size through `nvmax` (default 8);
# confirm that cap is intended with 12 candidate predictors.
leap1 <- regsubsets(
  salary ~ gmat_tot + frstlang + s_avg + f_avg + work_yrs + sex +
    gmat_vpc + gmat_qpc + gmat_tpc + age + satis + quarter,
  nbest = 1,
  data = placed
)
# Rank the candidate models by adjusted R-squared.
plot(leap1, scale = "adjr2")
So, the best-fit model excludes the `gmat_tot`, `s_avg`, `f_avg` and `satis` variables.
A best fit regression model
# Reduced linear model without gmat_tot, s_avg, f_avg and satis.
# gmat_vpc used to be listed twice; R collapses duplicate terms, so the fit is
# unchanged by removing the repeat.
fit2 <- lm(
  salary ~ frstlang + work_yrs + sex + gmat_vpc + gmat_qpc + gmat_tpc +
    age + quarter,
  data = placed
)
summary(fit2)
##
## Call:
## lm(formula = salary ~ frstlang + work_yrs + sex + gmat_vpc +
## gmat_vpc + gmat_qpc + gmat_tpc + age + quarter, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26192 -8279 -497 5867 70294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66157.6 28278.0 2.340 0.0214 *
## frstlang 7385.1 7051.4 1.047 0.2976
## work_yrs 809.9 1112.7 0.728 0.4685
## sex -3740.1 3436.2 -1.088 0.2792
## gmat_vpc 572.0 354.1 1.616 0.1095
## gmat_qpc 824.8 352.1 2.342 0.0213 *
## gmat_tpc -1451.7 685.9 -2.117 0.0369 *
## age 1755.4 1099.8 1.596 0.1138
## quarter -1824.8 1381.1 -1.321 0.1896
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15160 on 94 degrees of freedom
## Multiple R-squared: 0.337, Adjusted R-squared: 0.2806
## F-statistic: 5.974 on 8 and 94 DF, p-value: 3.463e-06
# coefplot provides coefplot(). The hard-coded lib.loc pointed at one user's
# R 3.4 Windows library; the default library paths are used instead.
library(coefplot)
# Coefficient estimates of fit2 with their intervals, intercept left out.
coefplot(fit2, intercept = FALSE)
## Warning: Ignoring unknown aesthetics: xmin, xmax
Results
Creating a data frame of the people who were not placed
# Respondents who reported a salary of 0, i.e. were not placed.
notplaced <- sal[which(sal$salary == 0), ]
# View() needs an interactive session, so it is commented out like the other
# View() calls in this script.
# View(notplaced)
# Welch two-sample t-test: class quartile, placed vs. not placed.
t.test(placed$quarter, notplaced$quarter)
##
## Welch Two Sample t-test
##
## data: placed$quarter and notplaced$quarter
## t = -1.7872, df = 189.39, p-value = 0.0755
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.59388972 0.02927267
## sample estimates:
## mean of x mean of y
## 2.262136 2.544444
# Welch two-sample t-test: total GMAT score, placed vs. not placed.
t.test(placed$gmat_tot , notplaced$gmat_tot)
##
## Welch Two Sample t-test
##
## data: placed$gmat_tot and notplaced$gmat_tot
## t = 0.20321, df = 170.77, p-value = 0.8392
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -14.69189 18.06406
## sample estimates:
## mean of x mean of y
## 616.0194 614.3333
# Welch two-sample t-test: overall GMAT percentile, placed vs. not placed.
t.test(placed$gmat_tpc , notplaced$gmat_tpc)
##
## Welch Two Sample t-test
##
## data: placed$gmat_tpc and notplaced$gmat_tpc
## t = 1.119, df = 155.27, p-value = 0.2649
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.710571 6.181337
## sample estimates:
## mean of x mean of y
## 84.52427 82.28889
# Welch two-sample t-test on the sum of the fall and spring averages,
# placed vs. not placed.
t.test((placed$f_avg + placed$s_avg) , (notplaced$f_avg + notplaced$s_avg))
##
## Welch Two Sample t-test
##
## data: (placed$f_avg + placed$s_avg) and (notplaced$f_avg + notplaced$s_avg)
## t = 0.78225, df = 178.77, p-value = 0.4351
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1364781 0.3157467
## sample estimates:
## mean of x mean of y
## 6.183301 6.093667
# Welch two-sample t-test: years of work experience, placed vs. not placed.
t.test((placed$work_yrs) , (notplaced$work_yrs))
##
## Welch Two Sample t-test
##
## data: (placed$work_yrs) and (notplaced$work_yrs)
## t = -1.6778, df = 156.44, p-value = 0.09538
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.9797552 0.1612007
## sample estimates:
## mean of x mean of y
## 3.679612 4.588889
# Welch two-sample t-test: age, placed vs. not placed.
t.test((placed$age) , (notplaced$age))
##
## Welch Two Sample t-test
##
## data: (placed$age) and (notplaced$age)
## t = -2.8289, df = 150.8, p-value = 0.005307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.9457989 -0.5230252
## sample estimates:
## mean of x mean of y
## 26.77670 28.51111
# Welch two-sample t-test: satisfaction score, placed vs. not placed.
t.test((placed$satis) , (notplaced$satis))
##
## Welch Two Sample t-test
##
## data: (placed$satis) and (notplaced$satis)
## t = 2.3757, df = 189.69, p-value = 0.01851
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.04433331 0.47821254
## sample estimates:
## mean of x mean of y
## 5.883495 5.622222
# Welch two-sample t-test on the first-language code, placed vs. not placed.
# NOTE(review): frstlang is a 1/2 category code, so this compares mean codes;
# a test of proportions may be the better fit — confirm.
t.test((placed$frstlang) , (notplaced$frstlang))
##
## Welch Two Sample t-test
##
## data: (placed$frstlang) and (notplaced$frstlang)
## t = -0.53486, df = 179.13, p-value = 0.5934
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09813827 0.05628282
## sample estimates:
## mean of x mean of y
## 1.067961 1.088889
Based on the t-tests, Age, Satisfaction and Quarter are the parameters that differ most between placed and not-placed respondents, so these are the ones we carry into a logistic regression.
We create a new column telling whether a person was placed or not
# Placement indicator: 1 when a non-zero salary was reported, 0 otherwise.
# A logical converts straight to 0/1, so the intermediate TRUE/FALSE
# replacement steps are not needed.
sal$pl <- as.integer(sal$salary != 0)
The Logit Model using the significant looking factors
# Logistic regression of placement on the predictors singled out above.
# The formula names every variable it uses, so `sal` is passed as-is; the
# former positional subset c(1:11, 13, 14) only worked while `pl` happened to
# be the 14th column.
model <- glm(
  formula = pl ~ age + quarter + satis + work_yrs + gmat_tpc,
  family = binomial(link = "logit"),
  data = sal
)
summary(model)
##
## Call:
## glm(formula = pl ~ age + quarter + satis + work_yrs + gmat_tpc,
## family = binomial(link = "logit"), data = sal[, c(1:11, 13,
## 14)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9510 -1.1631 0.7745 1.0619 1.8749
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.87063 2.46195 1.166 0.2436
## age -0.19327 0.08117 -2.381 0.0173 *
## quarter -0.28599 0.14398 -1.986 0.0470 *
## satis 0.43376 0.20494 2.116 0.0343 *
## work_yrs 0.10858 0.09051 1.200 0.2303
## gmat_tpc 0.00401 0.01187 0.338 0.7354
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.68 on 192 degrees of freedom
## Residual deviance: 246.92 on 187 degrees of freedom
## AIC: 258.92
##
## Number of Fisher Scoring iterations: 4
The lower the AIC and the larger the drop from the null deviance to the residual deviance, the better the model.
Determining correlation b/w placement and various other factors
# Analysis of deviance with chi-square tests; terms are added sequentially,
# first to last, so each row is conditional on the rows above it.
anova(model , test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: pl
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 192 266.68
## age 1 8.4714 191 258.21 0.003608 **
## quarter 1 4.1662 190 254.04 0.041238 *
## satis 1 5.6202 189 248.42 0.017755 *
## work_yrs 1 1.3901 188 247.03 0.238381
## gmat_tpc 1 0.1151 187 246.91 0.734391
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To find out how well your model fits, see the McFadden R2
# pscl provides pR2(). The hard-coded lib.loc pointed at one user's
# R 3.4 Windows library; the default library paths are used instead.
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Log-likelihoods and pseudo-R2 measures (including McFadden) for `model`.
pR2(model)
## llh llhNull G2 McFadden r2ML
## -123.45771948 -133.33925034 19.76306171 0.07410819 0.09733094
## r2CU
## 0.12997175
Since the fit is not so good, we try to fit it using all the factors available
The Logit Model using all the factors
# Logistic regression of placement on every remaining column. `salary` is
# excluded by name because `pl` is derived from it; selecting by name rather
# than by position c(1:11, 13, 14) keeps `pl ~ .` correct if columns move.
model2 <- glm(
  formula = pl ~ .,
  family = binomial(link = "logit"),
  data = sal[, setdiff(names(sal), "salary")]
)
summary(model2)
##
## Call:
## glm(formula = pl ~ ., family = binomial(link = "logit"), data = sal[,
## c(1:11, 13, 14)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0419 -1.1439 0.7517 1.0238 1.9665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.56015 4.57567 0.997 0.3190
## age -0.19812 0.08562 -2.314 0.0207 *
## sex 0.14396 0.36089 0.399 0.6900
## gmat_tot -0.01027 0.01289 -0.797 0.4257
## gmat_qpc -0.02177 0.04606 -0.473 0.6364
## gmat_vpc -0.02058 0.04401 -0.468 0.6401
## gmat_tpc 0.08833 0.06492 1.361 0.1736
## s_avg 0.24369 0.67258 0.362 0.7171
## f_avg -0.09871 0.36617 -0.270 0.7875
## quarter -0.23624 0.21132 -1.118 0.2636
## work_yrs 0.10479 0.09386 1.116 0.2642
## frstlang 0.31939 0.64213 0.497 0.6189
## satis 0.42640 0.21308 2.001 0.0454 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.68 on 192 degrees of freedom
## Residual deviance: 242.92 on 180 degrees of freedom
## AIC: 268.92
##
## Number of Fisher Scoring iterations: 5
Determining correlation b/w placement and all the other factors
# Analysis of deviance with chi-square tests; terms are added sequentially,
# first to last, so each row is conditional on the rows above it.
anova(model2 , test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: pl
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 192 266.68
## age 1 8.4714 191 258.21 0.003608 **
## sex 1 0.3867 190 257.82 0.534052
## gmat_tot 1 0.0156 189 257.81 0.900742
## gmat_qpc 1 0.0186 188 257.79 0.891625
## gmat_vpc 1 1.3430 187 256.44 0.246503
## gmat_tpc 1 4.0117 186 252.43 0.045185 *
## s_avg 1 2.0591 185 250.37 0.151303
## f_avg 1 0.2676 184 250.10 0.604976
## quarter 1 1.2924 183 248.81 0.255602
## work_yrs 1 1.6279 182 247.19 0.201987
## frstlang 1 0.1575 181 247.03 0.691446
## satis 1 4.1033 180 242.92 0.042799 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To find out how well your model fits, see the McFadden R2
# pscl provides pR2(). The hard-coded lib.loc pointed at one user's
# R 3.4 Windows library; the default library paths are used instead.
library(pscl)
# Log-likelihoods and pseudo-R2 measures (including McFadden) for `model2`.
pR2(model2)
## llh llhNull G2 McFadden r2ML
## -121.4618563 -133.3392503 23.7547880 0.0890765 0.1158087
## r2CU
## 0.1546461
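We see that:
AIC increases (258.92 to 268.92), which by the criterion stated earlier counts against this model
Difference b/w Null and Residual Deviances increases (19.76 to 23.75)
McFadden R2 increases (0.074 to 0.089)
So, this model fits slightly better, but only at the cost of a higher AIC; to conclude I will say that out of the given factors - using Age, Satisfaction and Quarter (MBA performance) - we can determine whether a person is placed or not.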