Let’s create a binary variable called levelsatisfaction that is 1 when satis is at least 4 and 0 otherwise.
# Flag rows whose satis score is at least 4 (1 = yes, 0 = no).
df$levelsatisfaction <- as.numeric(df$satis >= 4)
# Show rows 20 to 30 of the first 14 columns.
df[20:30, 1:14]
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 20 27 1 600 91 58 83 3.40 3.25 1
## 21 27 2 570 65 82 77 3.30 3.25 1
## 22 27 1 740 99 96 99 3.50 3.50 1
## 23 27 1 750 99 98 99 3.40 3.50 1
## 24 28 2 540 75 50 65 3.60 4.00 1
## 25 29 1 580 56 87 78 3.64 3.33 1
## 26 30 1 620 82 84 87 3.40 2.80 1
## 27 31 2 560 60 78 72 3.30 3.75 1
## 28 32 1 760 99 99 99 3.40 3.00 1
## 29 32 1 640 79 91 91 3.60 3.75 1
## 30 32 1 570 71 71 0 3.50 3.50 1
## work_yrs frstlang salary satis levelsatisfaction
## 20 4 1 998 0 0
## 21 4 1 999 4 1
## 22 3 1 0 6 1
## 23 1 2 0 5 1
## 24 5 1 0 5 1
## 25 3 1 0 5 1
## 26 5 1 999 6 1
## 27 10 1 0 7 1
## 28 5 1 0 5 1
## 29 7 1 0 6 1
## 30 4 1 999 4 1
# Scatter plot of salary against work experience; point colour follows
# `satis` and point size follows `levelsatisfaction`.
gg1 <- ggplot(data = df, mapping = aes(x = work_yrs, y = salary)) +
  geom_point(mapping = aes(colour = satis, size = levelsatisfaction)) +
  labs(
    title = "Experience vs Salary",
    x = "Work Experience",
    y = "Salary",
    caption = "Data Source : HBR.ORG"
  )
plot(gg1)

# Per-row mean of the s_avg and f_avg columns.
df$mba_avg <- with(df, (s_avg + f_avg) / 2)
# Show rows 20 to 30 of the first 15 columns (now including mba_avg).
df[20:30, 1:15]
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 20 27 1 600 91 58 83 3.40 3.25 1
## 21 27 2 570 65 82 77 3.30 3.25 1
## 22 27 1 740 99 96 99 3.50 3.50 1
## 23 27 1 750 99 98 99 3.40 3.50 1
## 24 28 2 540 75 50 65 3.60 4.00 1
## 25 29 1 580 56 87 78 3.64 3.33 1
## 26 30 1 620 82 84 87 3.40 2.80 1
## 27 31 2 560 60 78 72 3.30 3.75 1
## 28 32 1 760 99 99 99 3.40 3.00 1
## 29 32 1 640 79 91 91 3.60 3.75 1
## 30 32 1 570 71 71 0 3.50 3.50 1
## work_yrs frstlang salary satis levelsatisfaction mba_avg
## 20 4 1 998 0 0 3.325
## 21 4 1 999 4 1 3.275
## 22 3 1 0 6 1 3.500
## 23 1 2 0 5 1 3.450
## 24 5 1 0 5 1 3.800
## 25 3 1 0 5 1 3.485
## 26 5 1 999 6 1 3.100
## 27 10 1 0 7 1 3.525
## 28 5 1 0 5 1 3.200
## 29 7 1 0 6 1 3.675
## 30 4 1 999 4 1 3.500
# Summary statistics (min, quartiles, mean, max) of the mba_avg column.
summary(df$mba_avg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.825 3.050 3.043 3.295 4.000
# Inspect the column types of df after adding the derived columns.
str(df)
## 'data.frame': 274 obs. of 15 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot : int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc : int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc : int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc : int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang : int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : num 7 6 6 7 5 6 5 6 4 0 ...
## $ levelsatisfaction: num 1 1 1 1 1 1 1 1 1 0 ...
## $ mba_avg : num 3.2 3.75 3.27 2.98 3.67 ...
# Treat first language and sex as categorical variables before plotting.
categorical_cols <- c("frstlang", "sex")
df[categorical_cols] <- lapply(df[categorical_cols], factor)

# Stacked bar chart: number of students per sex, filled by first language.
ggplot(data = df, mapping = aes(x = sex, fill = frstlang)) +
  geom_bar() +
  theme_bw() +
  labs(
    title = "Gender Distribution",
    x = "Gender",
    y = "count",
    caption = "Source : hbr.org"
  )

# Box plot of GMAT total score by sex, with the group mean overlaid as a
# blue star (shape 8).
ggplot(df, aes(x = sex, y = gmat_tot)) +
  geom_boxplot(fill = "skyblue") +
  # `fun` replaces the `fun.y` argument, which is deprecated in
  # ggplot2 >= 3.3.0.
  stat_summary(
    fun = "mean", geom = "point",
    shape = 8, size = 2, color = "blue"
  ) +
  theme_bw() +
  labs(
    x = "Gender",
    y = "GMAT Scores",
    title = "Distribution of GMAT scores based on gender",
    caption = "Source:hbr.org"
  )

# Recode the salary code 999 to 0 in a single vectorised step.
# `%in%` never returns NA, so missing salaries are left untouched and an
# empty data frame is a no-op; the previous `for (i in 1:nrow(df))` loop with
# a scalar `if` would stop with an error in both situations.
df$salary[df$salary %in% 999] <- 0
# Re-inspect the column types after the recode.
str(df)
## 'data.frame': 274 obs. of 15 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot : int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc : int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc : int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc : int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : num 0 0 0 0 0 0 0 0 0 998 ...
## $ satis : num 7 6 6 7 5 6 5 6 4 0 ...
## $ levelsatisfaction: num 1 1 1 1 1 1 1 1 1 0 ...
## $ mba_avg : num 3.2 3.75 3.27 2.98 3.67 ...
# Box plot of work experience by sex, with the group mean overlaid as a
# blue star (shape 8).
ggplot(df, aes(x = sex, y = work_yrs)) +
  geom_boxplot(fill = "lightblue") +
  # `fun` replaces the `fun.y` argument, which is deprecated in
  # ggplot2 >= 3.3.0.
  stat_summary(
    fun = "mean", geom = "point",
    shape = 8, size = 2, color = "blue"
  ) +
  theme_bw() +
  labs(
    x = "Gender",
    y = "Experience",
    title = "Distribution of Experience based on gender",
    caption = "Source:hbr.org"
  )

# Box plot of age by sex, with the group mean overlaid as a blue star
# (shape 8).
ggplot(df, aes(x = sex, y = age)) +
  geom_boxplot(fill = "magenta") +
  # `fun` replaces the `fun.y` argument, which is deprecated in
  # ggplot2 >= 3.3.0.
  stat_summary(
    fun = "mean", geom = "point",
    shape = 8, size = 2, color = "blue"
  ) +
  theme_bw() +
  labs(
    x = "Gender",
    y = "Age",
    title = "Distribution of Age based on Gender",
    caption = "Source:hbr.org"
  )

library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatter plots of the three GMAT scores and mba_avg.
scatterplotMatrix(
  ~ gmat_tot + gmat_qpc + gmat_vpc + mba_avg,
  data = df
)

library(corrgram)
# Correlogram of df: shaded cells below the diagonal, pie glyphs above it,
# and variable names on the diagonal.
corrgram(
  df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of correlations between variables"
)

# Horizontal box plot of age.
boxplot(df$age, horizontal = TRUE, col = "Blue", main = "Age Distribution")

# Horizontal box plot of the GMAT total score.
boxplot(df$gmat_tot, horizontal = TRUE, col = "RED", main = "Gmat Total Score")

# Horizontal box plot of mba_avg.
boxplot(
  df$mba_avg,
  horizontal = TRUE,
  col = "magenta",
  main = "Average Marks in MBA"
)

# Pairwise correlations for columns 1, 3-10 and 12-15, i.e. every column
# except 2 (sex) and 11 (frstlang), which are factors at this point.
cor(df[,c(1,3:10,12:15)])
## age gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.14593840 -0.21616985 -0.044175472
## gmat_tot -0.14593840 1.00000000 0.72473781 0.748391870
## gmat_qpc -0.21616985 0.72473781 1.00000000 0.152180142
## gmat_vpc -0.04417547 0.74839187 0.15218014 1.000000000
## gmat_tpc -0.16990307 0.84779965 0.65137754 0.666216035
## s_avg 0.14970402 0.11311702 -0.02984873 0.204453647
## f_avg -0.01744806 0.10442409 0.07370455 0.075922253
## quarter -0.04967221 -0.09223903 0.03636638 -0.174607355
## work_yrs 0.85829810 -0.18235434 -0.23660827 -0.066390490
## salary -0.06254173 -0.05527795 -0.04470904 -0.005829072
## satis 0.09234402 -0.06443910 -0.09312075 0.009719018
## levelsatisfaction 0.11214958 -0.06580207 -0.07281406 -0.017008495
## mba_avg 0.05980183 0.12229941 0.03412182 0.147077265
## gmat_tpc s_avg f_avg quarter
## age -0.169903066 0.14970402 -0.017448057 -0.04967221
## gmat_tot 0.847799647 0.11311702 0.104424092 -0.09223903
## gmat_qpc 0.651377538 -0.02984873 0.073704552 0.03636638
## gmat_vpc 0.666216035 0.20445365 0.075922253 -0.17460736
## gmat_tpc 1.000000000 0.11736245 0.079732099 -0.08303535
## s_avg 0.117362449 1.00000000 0.550621386 -0.76211664
## f_avg 0.079732099 0.55062139 1.000000000 -0.44750637
## quarter -0.083035351 -0.76211664 -0.447506366 1.00000000
## work_yrs -0.173361859 0.12929271 -0.039056921 -0.08602641
## salary 0.004895486 0.14671494 0.029892973 -0.16510933
## satis -0.054042239 0.05186620 -0.024905812 -0.01267787
## levelsatisfaction -0.081568271 0.05237872 -0.001388558 -0.02218761
## mba_avg 0.108129767 0.83691669 0.917710988 -0.65610099
## work_yrs salary satis levelsatisfaction
## age 0.85829810 -0.062541728 0.092344022 0.112149576
## gmat_tot -0.18235434 -0.055277947 -0.064439102 -0.065802070
## gmat_qpc -0.23660827 -0.044709042 -0.093120750 -0.072814057
## gmat_vpc -0.06639049 -0.005829072 0.009719018 -0.017008495
## gmat_tpc -0.17336186 0.004895486 -0.054042239 -0.081568271
## s_avg 0.12929271 0.146714937 0.051866203 0.052378720
## f_avg -0.03905692 0.029892973 -0.024905812 -0.001388558
## quarter -0.08602641 -0.165109333 -0.012677865 -0.022187606
## work_yrs 1.00000000 0.009195480 0.111489429 0.106615008
## salary 0.00919548 1.000000000 0.411933827 0.348286941
## satis 0.11148943 0.411933827 1.000000000 0.928549252
## levelsatisfaction 0.10661501 0.348286941 0.928549252 1.000000000
## mba_avg 0.03591991 0.089419818 0.008352243 0.024015881
## mba_avg
## age 0.059801826
## gmat_tot 0.122299412
## gmat_qpc 0.034121821
## gmat_vpc 0.147077265
## gmat_tpc 0.108129767
## s_avg 0.836916691
## f_avg 0.917710988
## quarter -0.656100987
## work_yrs 0.035919910
## salary 0.089419818
## satis 0.008352243
## levelsatisfaction 0.024015881
## mba_avg 1.000000000
# Pairwise covariances for columns 1, 3-10 and 12-15, i.e. every column
# except 2 (sex) and 11 (frstlang), which are factors at this point.
cov(df[,c(1,3:10,12:15)])
## age gmat_tot gmat_qpc gmat_vpc
## age 1.376904e+01 -3.115879e+01 -1.192655e+01 -2.7636427
## gmat_tot -3.115879e+01 3.310688e+03 6.200233e+02 726.0006417
## gmat_qpc -1.192655e+01 6.200233e+02 2.210731e+02 38.1482581
## gmat_vpc -2.763643e+00 7.260006e+02 3.814826e+01 284.2481217
## gmat_tpc -8.839978e+00 6.839911e+02 1.357997e+02 157.4932488
## s_avg 2.116874e-01 2.480257e+00 -1.691233e-01 1.3135702
## f_avg -3.399348e-02 3.154688e+00 5.753854e-01 0.6720700
## quarter -2.045935e-01 -5.891153e+00 6.001979e-01 -3.2676666
## work_yrs 1.029494e+01 -3.391634e+01 -1.137186e+01 -3.6181653
## salary -1.184682e+04 -1.623645e+05 -3.393465e+04 -5016.8170905
## satis 7.770113e-01 -8.407663e+00 -3.139649e+00 0.3715676
## levelsatisfaction 1.646747e-01 -1.498222e+00 -4.284110e-01 -0.1134729
## mba_avg 8.884696e-02 2.817473e+00 2.031311e-01 0.9928201
## gmat_tpc s_avg f_avg quarter
## age -8.8399775 2.116874e-01 -0.033993476 -2.045935e-01
## gmat_tot 683.9910698 2.480257e+00 3.154688377 -5.891153e+00
## gmat_qpc 135.7996845 -1.691233e-01 0.575385418 6.001979e-01
## gmat_vpc 157.4932488 1.313570e+00 0.672069998 -3.267667e+00
## gmat_tpc 196.6057057 6.271001e-01 0.586986177 -1.292372e+00
## s_avg 0.6271001 1.452176e-01 0.110168979 -3.223721e-01
## f_avg 0.5869862 1.101690e-01 0.275672367 -2.608088e-01
## quarter -1.2923719 -3.223721e-01 -0.260808802 1.232119e+00
## work_yrs -7.8575172 1.592639e-01 -0.066286998 -3.086682e-01
## salary 3504.0793562 2.854066e+03 801.208887463 -9.355738e+03
## satis -1.7182963 4.481886e-02 -0.029652683 -3.191091e-02
## levelsatisfaction -0.4525815 7.898452e-03 -0.000288495 -9.745729e-03
## mba_avg 0.6070431 1.276933e-01 0.192920673 -2.915905e-01
## work_yrs salary satis
## age 10.29493864 -1.184682e+04 7.770113e-01
## gmat_tot -33.91633914 -1.623645e+05 -8.407663e+00
## gmat_qpc -11.37186171 -3.393465e+04 -3.139649e+00
## gmat_vpc -3.61816529 -5.016817e+03 3.715676e-01
## gmat_tpc -7.85751718 3.504079e+03 -1.718296e+00
## s_avg 0.15926392 2.854066e+03 4.481886e-02
## f_avg -0.06628700 8.012089e+02 -2.965268e-02
## quarter -0.30866822 -9.355738e+03 -3.191091e-02
## work_yrs 10.44882490 1.517358e+03 8.172108e-01
## salary 1517.35827919 2.605914e+09 4.768416e+04
## satis 0.81721077 4.768416e+04 5.142015e+00
## levelsatisfaction 0.13637336 7.035495e+03 8.331996e-01
## mba_avg 0.04648846 1.827637e+03 7.583086e-03
## levelsatisfaction mba_avg
## age 1.646747e-01 8.884696e-02
## gmat_tot -1.498222e+00 2.817473e+00
## gmat_qpc -4.284110e-01 2.031311e-01
## gmat_vpc -1.134729e-01 9.928201e-01
## gmat_tpc -4.525815e-01 6.070431e-01
## s_avg 7.898452e-03 1.276933e-01
## f_avg -2.884950e-04 1.929207e-01
## quarter -9.745729e-03 -2.915905e-01
## work_yrs 1.363734e-01 4.648846e-02
## salary 7.035495e+03 1.827637e+03
## satis 8.331996e-01 7.583086e-03
## levelsatisfaction 1.565867e-01 3.804978e-03
## mba_avg 3.804978e-03 1.603070e-01
# Cross-tabulate sex against levelsatisfaction and append row/column totals.
ttable <- xtabs(~ sex + levelsatisfaction, data = df)
addmargins(ttable)
## levelsatisfaction
## sex 0 1 Sum
## 1 42 164 206
## 2 11 57 68
## Sum 53 221 274
It appears that most people are satisfied (164 of 206 in group 1 and 57 of 68 in group 2), and gender seems to play no role in determining the level of satisfaction.
# Cross-tabulate quarter against levelsatisfaction and append totals.
table1 <- xtabs(~ quarter + levelsatisfaction, data = df)
addmargins(table1)
## levelsatisfaction
## quarter 0 1 Sum
## 1 12 57 69
## 2 13 57 70
## 3 16 54 70
## 4 12 53 65
## Sum 53 221 274
It is apparent that the quartile ranking doesn’t determine whether a student likes the MBA program.
# Recode the salary codes 998 and 999 to 0 in a single vectorised step.
# `%in%` never returns NA, so missing salaries are left untouched and an
# empty data frame is a no-op; the previous row-by-row loop used the
# elementwise `|` inside a scalar `if` and would stop with an error on NA
# salaries or on a zero-row data frame.
df$salary[df$salary %in% c(998, 999)] <- 0
Let’s create a binary variable is_good_salary that is 1 when the salary is above the mean salary and 0 otherwise.
# 1 when the salary is strictly above the mean salary, 0 otherwise.
salary_mean <- mean(df$salary)
df$is_good_salary <- as.numeric(df$salary > salary_mean)
# Cross-tabulate levelsatisfaction against is_good_salary and append totals.
table2 <- xtabs(~ levelsatisfaction + is_good_salary, data = df)
addmargins(table2)
## is_good_salary
## levelsatisfaction 0 1 Sum
## 0 52 1 53
## 1 119 102 221
## Sum 171 103 274
It can be observed that those with more than average salary are more likely to like the MBA program
Assuming that people who didn’t answer or didn’t disclose their salary aren’t placed, let’s create a variable called is_placed, where 1 means placed and 0 means not placed.
# 0 when the salary is 0, 1 for any other salary.
df$is_placed <- as.numeric(df$salary != 0)
Let’s test whether students who are placed are more likely to like the program. The null hypothesis is that placement and satisfaction are independent; to check it, let’s run a chi-squared test.
# Chi-squared test of independence between is_placed and levelsatisfaction.
chisq.test(df$is_placed,df$levelsatisfaction)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$is_placed and df$levelsatisfaction
## X-squared = 33.844, df = 1, p-value = 5.97e-09
The p-value is far below 0.05, so we reject independence: students who are placed are more likely to like the program.
# Chi-squared test of independence between levelsatisfaction and sex.
chisq.test(df$levelsatisfaction,df$sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$levelsatisfaction and df$sex
## X-squared = 0.34269, df = 1, p-value = 0.5583
With p = 0.56 we cannot reject independence, so there is no evidence that gender determines whether a candidate likes the program.
# Welch two-sample t-test: does mean satis differ between the two sexes?
t.test(
  satis ~ sex,
  data = df
)
##
## Welch Two Sample t-test
##
## data: satis by sex
## t = -0.89694, df = 121.46, p-value = 0.3715
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8823793 0.3321223
## sample estimates:
## mean in group 1 mean in group 2
## 4.563107 4.838235
# Welch two-sample t-test: does mean quarter differ by is_good_salary?
t.test(
  quarter ~ is_good_salary,
  data = df
)
##
## Welch Two Sample t-test
##
## data: quarter by is_good_salary
## t = 2.5051, df = 209.98, p-value = 0.013
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07373829 0.61836413
## sample estimates:
## mean in group 0 mean in group 1
## 2.608187 2.262136
# Welch two-sample t-test: does mean mba_avg differ by is_placed?
t.test(
  mba_avg ~ is_placed,
  data = df
)
##
## Welch Two Sample t-test
##
## data: mba_avg by is_placed
## t = -1.5967, df = 235.19, p-value = 0.1117
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.17246954 0.01805746
## sample estimates:
## mean in group 0 mean in group 1
## 3.014444 3.091650
# Welch two-sample t-test: does mean quarter differ by is_placed?
t.test(
  quarter ~ is_placed,
  data = df
)
##
## Welch Two Sample t-test
##
## data: quarter by is_placed
## t = 2.5051, df = 209.98, p-value = 0.013
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07373829 0.61836413
## sample estimates:
## mean in group 0 mean in group 1
## 2.608187 2.262136
Building regression model
# Linear model of salary on demographic, academic and satisfaction variables.
regressor <- lm(
  formula = salary ~ age + sex + gmat_tot + quarter + work_yrs + frstlang +
    levelsatisfaction + mba_avg,
  data = df
)
summary(regressor)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + quarter + work_yrs +
## frstlang + levelsatisfaction + mba_avg, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71823 -42333 -5484 44188 192105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 145632.97 56145.80 2.594 0.010 *
## age -3722.03 1535.70 -2.424 0.016 *
## sex2 2915.33 6693.33 0.436 0.664
## gmat_tot -46.44 51.66 -0.899 0.369
## quarter -7630.09 3453.08 -2.210 0.028 *
## work_yrs 2857.07 1773.08 1.611 0.108
## frstlang2 -1634.18 9228.36 -0.177 0.860
## levelsatisfaction 46021.37 7363.47 6.250 1.63e-09 ***
## mba_avg -2008.87 9562.48 -0.210 0.834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47160 on 265 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1509
## F-statistic: 7.065 on 8 and 265 DF, p-value: 1.838e-08
# Fitted coefficients of the full salary model.
coef(regressor)
## (Intercept) age sex2 gmat_tot
## 145632.97001 -3722.02575 2915.33393 -46.44401
## quarter work_yrs frstlang2 levelsatisfaction
## -7630.08650 2857.07456 -1634.17678 46021.36544
## mba_avg
## -2008.87235
Building a second regression model without gmat_tot and frstlang, keeping the significant variables plus other variables that may influence salary
# Reduced linear model of salary: same as the full model but without
# gmat_tot and frstlang.
regressor1 <- lm(
  formula = salary ~ age + sex + quarter + work_yrs + levelsatisfaction +
    mba_avg,
  data = df
)
summary(regressor1)
##
## Call:
## lm(formula = salary ~ age + sex + quarter + work_yrs + levelsatisfaction +
## mba_avg, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71704 -43138 -6254 44129 193786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118782 47246 2.514 0.0125 *
## age -3791 1510 -2.511 0.0126 *
## sex2 3326 6659 0.499 0.6178
## quarter -7529 3441 -2.188 0.0295 *
## work_yrs 3080 1735 1.775 0.0770 .
## levelsatisfaction 46528 7253 6.415 6.36e-10 ***
## mba_avg -2616 9503 -0.275 0.7833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47050 on 267 degrees of freedom
## Multiple R-squared: 0.1733, Adjusted R-squared: 0.1547
## F-statistic: 9.327 on 6 and 267 DF, p-value: 2.728e-09
# Fitted coefficients of the reduced salary model.
coef(regressor1)
## (Intercept) age sex2 quarter
## 118781.983 -3790.899 3326.249 -7529.168
## work_yrs levelsatisfaction mba_avg
## 3079.910 46528.335 -2616.527
Building a logistic regression model to predict whether a candidate liked the MBA program
library(caTools)#Library to split data into training and test set to test the model
# Fix the RNG seed so the random 70/30 split (and every result derived from
# it) is reproducible between runs.
set.seed(123)
# Logical vector: TRUE marks rows assigned to the training set.
split <- sample.split(df$levelsatisfaction, SplitRatio = 0.7)
training_set <- df[split, ]
test_set <- df[!split, ]
Building Model
# Logistic regression for levelsatisfaction.
# `satis` is deliberately NOT a predictor: levelsatisfaction is defined as
# `satis >= 4`, so including it leaks the target, separates the classes
# perfectly and prevents glm from converging.
classifier <- glm(
  levelsatisfaction ~ age +
    gmat_tot +
    quarter +
    work_yrs +
    salary +
    mba_avg +
    is_good_salary,
  family = binomial(),
  data = training_set
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics of the satisfaction classifier.
summary(classifier)
##
## Call:
## glm(formula = levelsatisfaction ~ age + gmat_tot + quarter +
## work_yrs + salary + satis + mba_avg + is_good_salary, family = binomial(),
## data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.048e-05 2.100e-08 2.100e-08 2.100e-08 5.666e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.500e+02 4.175e+05 0.000 1.000
## age 7.645e-01 1.433e+04 0.000 1.000
## gmat_tot -3.513e-03 2.164e+02 0.000 1.000
## quarter -7.311e-01 1.673e+04 0.000 1.000
## work_yrs -1.506e+00 2.092e+04 0.000 1.000
## salary -5.697e-04 1.119e+00 -0.001 1.000
## satis 4.265e+01 1.794e+04 0.002 0.998
## mba_avg -3.187e+00 7.998e+04 0.000 1.000
## is_good_salary 4.995e+01 1.317e+05 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.8821e+02 on 191 degrees of freedom
## Residual deviance: 1.2795e-08 on 183 degrees of freedom
## AIC: 18
##
## Number of Fisher Scoring iterations: 25
# Predictor columns handed to predict(): age, gmat_tot, quarter, work_yrs,
# salary, satis, mba_avg and is_good_salary (selected by position).
predictor_cols <- c(1, 3, 9, 10, 12, 13, 15, 16)
prob_pred <- predict(
  classifier,
  newdata = test_set[, predictor_cols],
  type = "response"
)
# Classify as satisfied when the predicted probability exceeds 0.5.
pred_results <- ifelse(prob_pred > 0.5, 1, 0)
# Confusion table: rows are column 14 of test_set (levelsatisfaction),
# columns are the predicted classes.
comparison <- table(test_set[, 14], pred_results)
comparison
## pred_results
## 0 1
## 0 16 0
## 1 0 66
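That’s great: we got 100% accurate results, because the diagonal elements add up to 82! Such a perfect result should be treated with caution, though: levelsatisfaction is derived directly from satis, so any model that uses satis as a predictor classifies it trivially.
Similarly, let’s build a model to predict whether a student is placed or not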
# Logistic regression for is_placed.
# `salary` and `is_good_salary` are deliberately NOT predictors: is_placed is
# defined as `salary != 0`, so both leak the target, separate the classes
# perfectly and prevent glm from converging.
classifier1 <- glm(
  is_placed ~ age +
    gmat_tot +
    quarter +
    work_yrs +
    satis +
    mba_avg,
  family = binomial(),
  data = training_set
)
## Warning: glm.fit: algorithm did not converge
# Coefficient table and fit statistics of the placement classifier.
summary(classifier1)
##
## Call:
## glm(formula = is_placed ~ age + gmat_tot + quarter + work_yrs +
## salary + satis + mba_avg + is_good_salary, family = binomial(),
## data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.409e-06 -2.409e-06 -2.409e-06 2.409e-06 2.409e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.657e+01 5.496e+05 0 1
## age -8.143e-13 1.355e+04 0 1
## gmat_tot -4.166e-15 4.490e+02 0 1
## quarter -3.970e-12 3.365e+04 0 1
## work_yrs 8.045e-13 1.644e+04 0 1
## salary 7.688e-14 2.190e+00 0 1
## satis 5.302e-14 1.292e+04 0 1
## mba_avg 4.908e-13 1.005e+05 0 1
## is_good_salary 5.313e+01 2.372e+05 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.5938e+02 on 191 degrees of freedom
## Residual deviance: 1.1139e-09 on 183 degrees of freedom
## AIC: 18
##
## Number of Fisher Scoring iterations: 25
# Score the held-out set with the placement model. The previous call passed
# `classifier` (the satisfaction model), so is_placed was being compared with
# predictions for a different outcome.
prob_pred1 <- predict(classifier1, type = "response", newdata = test_set)
# Classify as placed when the predicted probability exceeds 0.3.
pred_results1 <- ifelse(prob_pred1 > 0.3, 1, 0)
# Confusion table: rows are the actual is_placed values, columns the
# predicted classes.
comparison1 <- table(test_set$is_placed, pred_results1)
comparison1
## pred_results1
## 0 1
## 0 16 41
## 1 0 25
The model in this case classifies only (16 + 25) / 82 = 50% of the test cases correctly, so we need a better classification algorithm