#Reading the dataset and creating a data frame
mba.df<-read.csv("MBA Starting Salaries Data.csv")
#Viewing the data frame mba.df
View(mba.df)
#Analyzing the summary of the data and describing the variables
library(psych)
describe(mba.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
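#Note: salary and satis appear to use placeholder codes (0 for those not placed
#and values around 998/999 for non-response), which inflate their means and
#standard deviations above. A quick optional check of these codes (a sketch,
#assuming no real salary falls below 1000 and no real rating exceeds 7):
table(mba.df$salary[mba.df$salary < 1000])
table(mba.df$satis[mba.df$satis > 7])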
#Creating a subset of those who have given their salary details and taken part in the survey
mbasalary.df<-mba.df[which(mba.df$salary>999),]
View(mbasalary.df)
#Visualizing the distribution of each variable with box plots / bar plots
##BOX PLOTS##
#GMAT Score
boxplot(mba.df$gmat_tot,horizontal = TRUE, xlab="GMAT Score",main="BoxPlot Presentation Of GMAT Score")
#GMAT Percentages
par(mfrow=c(1,3))
with(mba.df, boxplot(gmat_qpc,main="GMAT percentage quantitative",ylab="Percentage %"))
with(mba.df, boxplot(gmat_vpc,main="GMAT percentage verbal",ylab="Percentage %"))
with(mba.df, boxplot(gmat_tpc,main="GMAT percentage total",ylab="Percentage %"))
par(mfrow=c(1,1))
#Spring avg and Fall avg
par(mfrow=c(1,2))
with(mba.df, boxplot(s_avg,main="Spring MBA Average",ylab="Average"))
with(mba.df, boxplot(f_avg,main="Fall MBA Average",ylab="Average"))
par(mfrow=c(1,1))
#Working years
boxplot(mba.df$work_yrs,horizontal = TRUE, xlab="Working Years",main="Work experience in years")
#Starting Salaries
table(mba.df$salary>999)
##
## FALSE TRUE
## 171 103
salarygiven<-mba.df$salary[mba.df$salary>999]
##Excluding those who were not placed or did not disclose their salary, so the plot is not distorted by the placeholder codes.
boxplot(salarygiven,horizontal = TRUE, xlab="Salary",main="Boxplot presentation of Starting salary ")
##BAR PLOTS##
#Age
count<-table(mba.df$age)
barplot(count, main = "Barplot for age", xlab = "Age in Years")
#Sex
count1<-table(mba.df$sex)
barplot(count1, main = "Barplot for sex of the people", xlab = "Gender", names.arg = c("Male", "Female"))
#Quartile Ranking
count2<-table(mba.df$quarter)
barplot(count2, main = "Barplot for quartile ranking")
#First Language
count3<-table(mba.df$frstlang)
barplot(count3, main = "Barplot for first language", xlab = "First language", names.arg = c("English", "Others"))
#Degree of satisfaction
count4<-table(mba.df$satis[mba.df$satis<998])
barplot(count4, main = "Barplot for Degree of satisfaction", xlab = "Rating")
#Scatter plots to understand how the variables are correlated pair-wise
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(mbasalary.df$salary,mbasalary.df$age,main="Salary of MBAs with Age",ylab = "Age in years", xlab="Salary",cex=1.1,pch=19)
scatterplot(mbasalary.df$salary,mbasalary.df$work_yrs,main="Salary of MBAs with Work experience",ylab = "Work experience in years", xlab="Salary",cex=1.1,pch=19)
scatterplot(mbasalary.df$salary,mbasalary.df$gmat_tpc,main="Salary of MBAs with GMAT Percentile",ylab = "GMAT Percentile %", xlab="Salary",cex=1.1,pch=19)
#Jittered plots of salary against categorical variables
plot(jitter(mbasalary.df$sex),jitter(mbasalary.df$salary),main="Salary of MBAs with Sex",xlab = "Gender Male(1) Female(2)", ylab="Salary",cex=1.1)
plot(jitter(mbasalary.df$frstlang),jitter(mbasalary.df$salary),main="Salary of MBAs with First Language",xlab = "First Language English(1) Other(2)", ylab="Salary",cex=1.1)
plot(jitter(mbasalary.df$satis),jitter(mbasalary.df$salary),main="Salary of MBAs with degree of satisfaction",xlab = "Degree of Satisfaction out of 7", ylab="Salary",cex=1.1)
#Scatterplot matrix
scatterplotMatrix(
mbasalary.df[
,c("salary","work_yrs","gmat_tpc")],
spread=FALSE, smoother.args=list(lty=2),
main="Scatter Plot Matrix", diagonal = "histogram")
##Correlation tests to find relationships between the variables
#Correlation matrix, covariance matrix, corrgram
corr.test(mbasalary.df, use = "complete")
## Call:corr.test(x = mbasalary.df, use = "complete")
## Correlation matrix
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.14 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## sex -0.14 1.00 -0.02 -0.15 0.05 -0.05 0.08 0.17
## gmat_tot -0.08 -0.02 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.15 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 0.05 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.05 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.08 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 0.17 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.02 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## work_yrs 0.88 -0.09 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## frstlang 0.35 0.08 -0.13 0.01 -0.22 -0.16 -0.14 -0.05
## salary 0.50 -0.17 -0.09 0.01 -0.14 -0.13 0.10 -0.11
## satis 0.11 -0.09 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter work_yrs frstlang salary satis
## age -0.13 0.88 0.35 0.50 0.11
## sex -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.11 -0.12 -0.13 -0.09 0.06
## gmat_qpc 0.01 -0.18 0.01 0.01 0.00
## gmat_vpc -0.13 -0.03 -0.22 -0.14 0.15
## gmat_tpc -0.10 -0.13 -0.16 -0.13 0.12
## s_avg -0.84 0.16 -0.14 0.10 -0.14
## f_avg -0.43 -0.22 -0.05 -0.11 -0.12
## quarter 1.00 -0.13 0.11 -0.13 0.23
## work_yrs -0.13 1.00 0.20 0.45 0.06
## frstlang 0.11 0.20 1.00 0.27 0.09
## salary -0.13 0.45 0.27 1.00 -0.04
## satis 0.23 0.06 0.09 -0.04 1.00
## Sample Size
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## sex 0.15 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## gmat_tot 0.43 0.84 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## gmat_qpc 0.10 0.14 0.00 0.00 1.00 0.00 1.00 1.00 1.00
## gmat_vpc 0.86 0.59 0.00 0.34 0.00 0.00 1.00 1.00 1.00
## gmat_tpc 0.33 0.64 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## s_avg 0.11 0.42 0.08 0.88 0.11 0.16 0.00 0.00 0.00
## f_avg 0.03 0.09 0.22 0.32 0.82 0.48 0.00 0.00 0.00
## quarter 0.21 0.83 0.29 0.90 0.20 0.32 0.00 0.00 0.00
## work_yrs 0.00 0.35 0.22 0.06 0.78 0.18 0.10 0.03 0.19
## frstlang 0.00 0.45 0.19 0.89 0.03 0.10 0.16 0.61 0.27
## salary 0.00 0.09 0.36 0.89 0.17 0.18 0.31 0.29 0.20
## satis 0.28 0.36 0.52 0.97 0.13 0.24 0.15 0.24 0.02
## work_yrs frstlang salary satis
## age 0.00 0.02 0.00 1
## sex 1.00 1.00 1.00 1
## gmat_tot 1.00 1.00 1.00 1
## gmat_qpc 1.00 1.00 1.00 1
## gmat_vpc 1.00 1.00 1.00 1
## gmat_tpc 1.00 1.00 1.00 1
## s_avg 1.00 1.00 1.00 1
## f_avg 1.00 1.00 1.00 1
## quarter 1.00 1.00 1.00 1
## work_yrs 0.00 1.00 0.00 1
## frstlang 0.05 0.00 0.42 1
## salary 0.00 0.01 0.00 1
## satis 0.53 0.37 0.69 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
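#As the note above suggests, confidence intervals for these correlations can be
#displayed by printing the corr.test() result with short = FALSE (optional; this
#simply re-runs the same test):
print(corr.test(mbasalary.df, use = "complete"), short = FALSE)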
x<-mbasalary.df[,c("age","sex","gmat_tot","gmat_qpc","gmat_vpc","gmat_tpc","s_avg","f_avg","quarter", "work_yrs", "frstlang", "salary","satis")]
y<-mbasalary.df[,c("salary","gmat_tpc","work_yrs","satis","age")]
cor(x,y)
## salary gmat_tpc work_yrs satis age
## age 0.49964284 -0.09609156 0.88052470 0.108323083 1.00000000
## sex -0.16628869 -0.04686981 -0.09233003 -0.091995338 -0.14352927
## gmat_tot -0.09067141 0.96680810 -0.12280018 0.064742057 -0.07871678
## gmat_qpc 0.01414130 0.65865003 -0.18270126 -0.003984632 -0.16503906
## gmat_vpc -0.13743230 0.78443167 -0.02812182 0.148634805 0.01799420
## gmat_tpc -0.13201783 1.00000000 -0.13246963 0.116308417 -0.09609156
## s_avg 0.10173175 0.13938500 0.16328236 -0.143565573 0.15654954
## f_avg -0.10603897 0.07051391 -0.21633018 -0.117733043 -0.21699191
## quarter -0.12848526 -0.09955033 -0.12896722 0.225119851 -0.12568145
## work_yrs 0.45466634 -0.13246963 1.00000000 0.062999256 0.88052470
## frstlang 0.26701953 -0.16437561 0.19627277 0.089834769 0.35026743
## salary 1.00000000 -0.13201783 0.45466634 -0.040050600 0.49964284
## satis -0.04005060 0.11630842 0.06299926 1.000000000 0.10832308
cov(x,y)
## salary gmat_tpc work_yrs satis
## age 2.921052e+04 -3.460213e+00 8.6728536 0.27765087
## sex -1.369577e+03 -2.377689e-01 -0.1281173 -0.03321911
## gmat_tot -8.212449e+04 5.393623e+02 -18.7388159 2.57091186
## gmat_qpc 3.382438e+03 9.703607e+01 -7.3624595 -0.04178565
## gmat_vpc -3.964803e+04 1.393882e+02 -1.3668380 1.87997335
## gmat_tpc -2.596339e+04 1.211342e+02 -4.3892062 1.00285551
## s_avg 6.880204e+02 5.806292e-01 0.1860480 -0.04256901
## f_avg -9.241129e+02 3.785056e-01 -0.3176271 -0.04498382
## quarter -2.571117e+03 -1.227013e+00 -0.4347992 0.19750619
## work_yrs 2.445820e+04 -4.389206e+00 9.0630116 0.14858176
## frstlang 1.206714e+03 -4.575481e-01 0.1494384 0.01779935
## salary 3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis -5.606583e+02 1.002856e+00 0.1485818 0.61374453
## age
## age 10.7045498
## sex -0.2164477
## gmat_tot -13.0544451
## gmat_qpc -7.2279650
## gmat_vpc 0.9505045
## gmat_tpc -3.4602132
## s_avg 0.1938587
## f_avg -0.3462517
## quarter -0.4604988
## work_yrs 8.6728536
## frstlang 0.2898344
## salary 29210.5193223
## satis 0.2776509
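#Note: with two data-frame arguments var(x, y) returns the same cross-covariance
#matrix as cov(x, y), so the var() output below simply repeats the table above.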
var(x,y)
## salary gmat_tpc work_yrs satis
## age 2.921052e+04 -3.460213e+00 8.6728536 0.27765087
## sex -1.369577e+03 -2.377689e-01 -0.1281173 -0.03321911
## gmat_tot -8.212449e+04 5.393623e+02 -18.7388159 2.57091186
## gmat_qpc 3.382438e+03 9.703607e+01 -7.3624595 -0.04178565
## gmat_vpc -3.964803e+04 1.393882e+02 -1.3668380 1.87997335
## gmat_tpc -2.596339e+04 1.211342e+02 -4.3892062 1.00285551
## s_avg 6.880204e+02 5.806292e-01 0.1860480 -0.04256901
## f_avg -9.241129e+02 3.785056e-01 -0.3176271 -0.04498382
## quarter -2.571117e+03 -1.227013e+00 -0.4347992 0.19750619
## work_yrs 2.445820e+04 -4.389206e+00 9.0630116 0.14858176
## frstlang 1.206714e+03 -4.575481e-01 0.1494384 0.01779935
## salary 3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis -5.606583e+02 1.002856e+00 0.1485818 0.61374453
## age
## age 10.7045498
## sex -0.2164477
## gmat_tot -13.0544451
## gmat_qpc -7.2279650
## gmat_vpc 0.9505045
## gmat_tpc -3.4602132
## s_avg 0.1938587
## f_avg -0.3462517
## quarter -0.4604988
## work_yrs 8.6728536
## frstlang 0.2898344
## salary 29210.5193223
## satis 0.2776509
#Visualizing the relationships with corrplot
library(corrplot)
## corrplot 0.84 loaded
corrplot(corr=cor(mbasalary.df[,c(1:13)],use = "complete.obs"), method = "ellipse")
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
#Visualizing with a corrgram
library(corrgram)
corrgram(mbasalary.df, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of MBA Salaries")
#Generating Contingency table and performing chi-Square Test
mytable<-xtabs(~sex+work_yrs, data = mbasalary.df)
addmargins(mytable)
## work_yrs
## sex 0 1 2 3 4 5 6 7 8 10 15 16 Sum
## 1 1 4 24 16 10 4 5 1 3 1 1 2 72
## 2 0 4 14 5 1 3 2 0 1 0 1 0 31
## Sum 1 8 38 21 11 7 7 1 4 1 2 2 103
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 8.1579, df = 11, p-value = 0.6991
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis of independence:
##there is no evidence that sex and work_yrs are associated.
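#The warning above reflects small expected counts in several cells. A Monte-Carlo
#p-value avoids the large-sample approximation and can serve as an optional check
#(a sketch using chisq.test's simulate.p.value argument):
chisq.test(mytable, simulate.p.value = TRUE, B = 10000)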
mytable1<-xtabs(~work_yrs+satis, data = mbasalary.df)
addmargins(mytable1)
## satis
## work_yrs 3 4 5 6 7 Sum
## 0 0 1 0 0 0 1
## 1 0 0 5 1 2 8
## 2 0 0 8 19 11 38
## 3 1 0 6 12 2 21
## 4 0 0 3 5 3 11
## 5 0 0 3 3 1 7
## 6 0 0 2 5 0 7
## 7 0 0 1 0 0 1
## 8 0 0 0 3 1 4
## 10 0 0 0 0 1 1
## 15 0 0 0 2 0 2
## 16 0 0 1 0 1 2
## Sum 1 1 29 50 22 103
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 131.13, df = 44, p-value = 1.35e-10
##Because the p-value is less than 0.05 we reject the null hypothesis:
##work_yrs and satis are not independent.
mytable2<-xtabs(~sex+frstlang, data = mbasalary.df)
addmargins(mytable2)
## frstlang
## sex 1 2 Sum
## 1 68 4 72
## 2 28 3 31
## Sum 96 7 103
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable2
## X-squared = 0.11264, df = 1, p-value = 0.7372
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis:
##there is no evidence that sex and frstlang are associated.
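#For a 2x2 table with a sparse cell like this one, Fisher's exact test is a
#natural cross-check (optional sketch):
fisher.test(mytable2)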
mytable3<-xtabs(~work_yrs+frstlang, data = mbasalary.df)
addmargins(mytable3)
## frstlang
## work_yrs 1 2 Sum
## 0 1 0 1
## 1 8 0 8
## 2 36 2 38
## 3 20 1 21
## 4 10 1 11
## 5 6 1 7
## 6 7 0 7
## 7 1 0 1
## 8 4 0 4
## 10 0 1 1
## 15 1 1 2
## 16 2 0 2
## Sum 96 7 103
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 22.274, df = 11, p-value = 0.02233
##Because the p-value is less than 0.05 we reject the null hypothesis:
##work_yrs and frstlang are not independent, although the sparse cells behind the warning call for caution.
#T-Tests
#1. H1: the average salary of males is greater than the average salary of females
t.test(salary~sex,alternative="greater",data=mbasalary.df)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.09047
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -1527.96 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis of equal means.
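#Because the salary distribution is skewed, a rank-based test of the same
#one-sided comparison is a useful robustness check (optional sketch):
wilcox.test(salary ~ sex, alternative = "greater", data = mbasalary.df)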
#2. H1: the average salary of those whose first language is English is greater than
#the average salary of those who speak another first language
t.test(salary~frstlang, alternative="greater", data = mbasalary.df)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.8476
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -51508.45 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis of equal means.
#3. H1: the average GMAT percentile of males is greater than the average GMAT percentile of females
t.test(gmat_tpc~sex, alternative="greater", data = mbasalary.df)
##
## Welch Two Sample t-test
##
## data: gmat_tpc by sex
## t = 0.43873, df = 48.83, p-value = 0.3314
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -3.157889 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 84.86111 83.74194
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis of equal means.
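#Beyond the p-value, a standardized effect size puts the male-female GMAT gap in
#context. A minimal base-R sketch of Cohen's d using a pooled standard deviation:
with(mbasalary.df, {
  x <- gmat_tpc[sex == 1]
  y <- gmat_tpc[sex == 2]
  pooled.sd <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
                      (length(x) + length(y) - 2))
  (mean(x) - mean(y)) / pooled.sd
})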
#Generating multiple linear regression models for MBA salaries
#1.
model1<-lm(salary~work_yrs+gmat_tot-1, data = mbasalary.df)
summary(model1)
##
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot - 1, data = mbasalary.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30428 -9691 -624 8110 97678
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## work_yrs 3264.289 579.553 5.632 1.61e-07 ***
## gmat_tot 146.716 4.449 32.976 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17920 on 101 degrees of freedom
## Multiple R-squared: 0.9712, Adjusted R-squared: 0.9706
## F-statistic: 1702 on 2 and 101 DF, p-value: < 2.2e-16
#Coefficients of the model
model1$coefficients
## work_yrs gmat_tot
## 3264.2887 146.7158
#Residuals of the model
residuals(model1)
## 35 36 37 38 39 40
## -15096.6876 -24229.6066 -20295.2915 -17158.1075 -18696.7642 -14285.2123
## 41 42 43 44 45 46
## -3299.5565 9574.8879 7447.7832 -2629.5301 -3619.4509 -1893.8188
## 47 48 49 50 51 52
## 11310.5992 -9082.3435 3106.1812 14513.4681 11579.1529 13376.2841
## 53 54 55 56 57 58
## 649.1028 1786.2868 -11078.0785 -13352.4464 5276.2076 10177.6803
## 59 60 61 62 63 64
## -3483.8162 -28258.0732 18314.8642 7567.5246 -5606.6559 22177.6803
## 65 66 67 68 69 115
## 977.5272 15649.1028 -4413.8663 4604.7429 56034.6821 -19563.8452
## 116 117 118 119 120 121
## -5492.3461 -12340.8180 -9828.1339 -4289.4773 7310.5992 16613.5446
## 122 123 124 125 126 127
## 4909.1265 7843.4417 4579.1529 -2416.5821 8376.2841 4782.0218
## 128 129 130 131 132 133
## 3507.6539 2243.3651 8782.0218 -4559.5802 4243.3651 -27922.2854
## 134 135 136 137 138 139
## 12441.9690 -5885.2888 20771.9426 9511.9189 21577.6037 48307.6539
## 186 187 188 189 190 191
## -1300.4818 -12256.6349 -1623.7159 -8959.5037 -9553.7660 -3482.2670
## 192 193 194 195 196 197
## -7020.9236 2111.9954 -6828.1339 9310.5992 6376.2841 -8295.2915
## 198 199 200 201 202 203
## -4553.7660 -29018.2078 -13961.0529 1040.4963 6441.9690 -21846.7087
## 204 205 206 207 208 209
## -15385.2888 13376.2841 -13352.4464 3939.0236 11314.8642 5704.7085
## 256 257 258 259 260 261
## -24689.4008 -26360.9764 -30428.2105 -12492.3461 -16223.7924 -21686.6850
## 262 263 264 265 266 267
## -11360.9764 -1025.1886 1441.9690 7843.4417 5441.9690 -1893.8188
## 268 269 270 271 272 273
## -1823.7924 -623.7924 13183.4944 13376.2841 25980.6256 36223.4681
## 274
## 97677.7912
#Fitted values of the model
fitted(model1)
## 35 36 37 38 39 40 41
## 100096.69 109229.61 106295.29 105158.11 110696.76 107285.21 98299.56
## 42 43 44 45 46 47 48
## 85425.11 87552.22 98629.53 99619.45 101893.82 88689.40 109082.34
## 49 50 51 52 53 54 55
## 101893.82 90486.53 93420.85 91623.72 104350.90 103213.71 117078.08
## 56 57 58 59 60 61 62
## 119352.45 102223.79 97822.32 113483.82 140258.07 96685.14 107432.48
## 63 64 65 66 67 68 69
## 123606.66 97822.32 119022.47 104350.90 124413.87 141395.26 105965.32
## 115 116 117 118 119 120 121
## 101563.85 97492.35 105340.82 104828.13 99289.48 88689.40 79886.46
## 122 123 124 125 126 127 128
## 93090.87 90156.56 93420.85 101416.58 91623.72 95217.98 97492.35
## 129 130 131 132 133 134 135
## 100756.63 95217.98 109559.58 100756.63 132922.29 94558.03 117885.29
## 136 137 138 139 186 187 188
## 94228.06 105488.08 108422.40 97492.35 79556.48 100756.63 91623.72
## 189 190 191 192 193 194 195
## 98959.50 102553.77 98482.27 104020.92 94888.00 104828.13 88689.40
## 196 197 198 199 200 201 202
## 91623.72 106295.29 102553.77 127018.21 113961.05 98959.50 94558.03
## 203 204 205 206 207 208 209
## 122946.71 117885.29 91623.72 119352.45 103360.98 96685.14 106295.29
## 256 257 258 259 260 261 262
## 88689.40 103360.98 115428.21 97492.35 102223.79 111686.68 103360.98
## 263 264 265 266 267 268 269
## 96025.19 94558.03 90156.56 94558.03 101893.82 102223.79 102223.79
## 270 271 272 273 274
## 90816.51 91623.72 89019.37 90486.53 122322.21
###Model1: salary = b1*work_yrs + b2*gmat_tot (the -1 in the formula removes the intercept, so there is no b0)
# b1 = 3264.2887, b2 = 146.7158
# Model: salary = 3264.2887*work_yrs + 146.7158*gmat_tot
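#Standard residual diagnostics help judge whether this no-intercept fit is
#reasonable (optional sketch):
par(mfrow = c(2, 2))
plot(model1)
par(mfrow = c(1, 1))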
model2<-lm(salary~work_yrs+age*frstlang+gmat_tot+sex-1, data = mbasalary.df)
summary(model2)
##
## Call:
## lm(formula = salary ~ work_yrs + age * frstlang + gmat_tot +
## sex - 1, data = mbasalary.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28406 -9496 -820 6174 69521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## work_yrs -958.87 1091.08 -0.879 0.381667
## age 2905.80 830.35 3.499 0.000706 ***
## frstlang -15290.15 25367.28 -0.603 0.548081
## gmat_tot 40.36 31.33 1.288 0.200705
## sex -2260.92 3407.03 -0.664 0.508518
## age:frstlang 794.41 797.85 0.996 0.321878
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15790 on 97 degrees of freedom
## Multiple R-squared: 0.9785, Adjusted R-squared: 0.9772
## F-statistic: 736.6 on 6 and 97 DF, p-value: < 2.2e-16
#Coefficients of the model
model2$coefficients
## work_yrs age frstlang gmat_tot sex
## -958.86681 2905.80137 -15290.15225 40.35853 -2260.92128
## age:frstlang
## 794.41173
#Fitted values of the model
model2$fitted.values
## 35 36 37 38 39 40 41
## 87270.46 106427.00 98219.40 96049.78 109091.50 104021.93 98294.25
## 42 43 44 45 46 47 48
## 94335.24 91206.75 100228.43 96369.61 89608.22 91937.08 101220.18
## 49 50 51 52 53 54 55
## 99269.56 94274.84 98782.23 100144.68 112876.10 114406.69 110806.67
## 56 57 58 59 60 61 62
## 112885.00 104903.95 96292.77 103870.23 135630.49 101523.57 110050.37
## 63 64 65 66 67 68 69
## 130915.01 103693.20 109511.53 112876.10 110563.68 141500.32 102246.36
## 115 116 117 118 119 120 121
## 102474.89 95797.89 111278.20 100076.73 96696.36 91937.08 89515.57
## 122 123 124 125 126 127 128
## 89447.63 92340.67 101885.86 109360.96 90483.33 115920.84 95797.89
## 129 130 131 132 133 134 135
## 104500.37 104820.20 103221.67 104500.37 115111.49 100951.85 111042.12
## 136 137 138 139 186 187 188
## 95317.46 103945.09 101052.04 94358.59 85320.47 104500.37 96444.47
## 189 190 191 192 193 194 195
## 98462.39 95737.49 105300.63 103541.50 106586.24 94115.60 93376.38
## 196 197 198 199 200 201 202
## 94183.55 114039.71 106838.13 111697.59 97032.00 96201.47 91290.50
## 203 204 205 206 207 208 209
## 108720.80 114742.33 92744.25 111445.70 135705.66 105223.79 96780.11
## 256 257 258 259 260 261 262
## 89676.16 92272.72 98874.87 101759.02 98942.82 103793.39 99673.15
## 263 264 265 266 267 268 269
## 97655.22 100951.85 92340.67 91290.50 100708.86 112304.38 101203.74
## 270 271 272 273 274
## 124046.70 89044.04 93871.26 97975.06 150479.11
#Residuals of the model
model2$residuals
## 35 36 37 38 39 40
## -2270.4554 -21426.9952 -12219.3985 -8049.7758 -17091.5018 -11021.9256
## 41 42 43 44 45 46
## -3294.2550 664.7582 3793.2476 -4228.4307 -369.6104 10391.7836
## 47 48 49 50 51 52
## 8062.9168 -1220.1817 5730.4361 10725.1558 6217.7721 4855.3200
## 53 54 55 56 57 58
## -7876.1025 -9406.6929 -4806.6727 -6884.9967 2596.0473 11707.2293
## 59 60 61 62 63 64
## 6129.7706 -23630.4855 13476.4258 4949.6314 -12915.0130 16306.8032
## 65 66 67 68 69 115
## 10488.4708 7123.8975 9436.3222 4499.6787 59753.6429 -20474.8931
## 116 117 118 119 120 121
## -3797.8868 -18278.2035 -5076.7345 -1696.3559 4062.9168 6984.4285
## 122 123 124 125 126 127
## 8552.3740 5659.3315 -3885.8592 -10360.9605 9516.6675 -15920.8413
## 128 129 130 131 132 133
## 5202.1132 -1500.3674 -820.2020 1778.3340 499.6326 -10111.4936
## 134 135 136 137 138 139
## 6048.1494 957.8804 19682.5383 11054.9141 28947.9566 51441.4051
## 186 187 188 189 190 191
## -7064.4745 -16000.3674 -6444.4669 -8462.3933 -2737.4891 -10300.6270
## 192 193 194 195 196 197
## -6541.5006 -9586.2394 3884.3999 4623.6250 3816.4544 -16039.7103
## 198 199 200 201 202 203
## -8838.1284 -13697.5940 2968.0043 3798.5280 9709.4969 -7620.8007
## 204 205 206 207 208 209
## -12242.3327 12255.7462 -5445.7049 -28405.6638 2776.2127 15219.8933
## 256 257 258 259 260 261
## -25676.1619 -15272.7230 -13874.8728 -16759.0211 -12942.8183 -13793.3897
## 262 263 264 265 266 267
## -7673.1492 -2655.2227 -4951.8506 5659.3315 8709.4969 -708.8557
## 268 269 270 271 272 273
## -11904.3789 396.2604 -20046.6967 15955.9593 21128.7411 28734.9427
## 274
## 69520.8920
###Model2: salary = b1*work_yrs + b2*age + b3*frstlang + b4*gmat_tot + b5*sex + b6*age*frstlang
# (the -1 in the formula removes the intercept, so there is no b0)
# b1 = -958.86681, b2 = 2905.80137, b3 = -15290.15225, b4 = 40.35853, b5 = -2260.92128, b6 = 794.41173
# Model: salary = -958.86681*work_yrs + 2905.80137*age - 15290.15225*frstlang
# + 40.35853*gmat_tot - 2260.92128*sex + 794.41173*age*frstlang
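#Note that R-squared for the intercept-free models 1 and 2 is computed against
#zero rather than the mean, so it is not comparable with R-squared from models
#that include an intercept (models 3 and 4 below). AIC puts the two fits so far
#on a common footing (optional sketch):
AIC(model1, model2)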
#3.
model3<-lm(salary~work_yrs+age, data = mbasalary.df)
summary(model3)
##
## Call:
## lm(formula = salary ~ work_yrs + age, data = mbasalary.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31675 -8099 -2108 4411 80650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36967.5 23323.8 1.585 0.1161
## work_yrs 388.8 1084.0 0.359 0.7206
## age 2413.8 997.4 2.420 0.0173 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15620 on 100 degrees of freedom
## Multiple R-squared: 0.2506, Adjusted R-squared: 0.2356
## F-statistic: 16.72 on 2 and 100 DF, p-value: 5.438e-07
#Coefficients of the model
model3$coefficients
## (Intercept) work_yrs age
## 36967.4546 388.8347 2413.7599
#Fitted values of the model
model3$fitted.values
## 35 36 37 38 39 40 41
## 90459.01 102916.64 98089.12 98477.96 102916.64 106496.91 94897.69
## 42 43 44 45 46 47 48
## 97700.29 98477.96 97700.29 101280.55 93261.60 95675.36 104471.98
## 49 50 51 52 53 54 55
## 98089.12 98477.96 100891.72 100502.88 111324.43 114127.02 112490.93
## 56 57 58 59 60 61 62
## 111713.26 103305.48 98477.96 106885.74 137325.45 103694.31 102527.81
## 63 64 65 66 67 68 69
## 120509.88 103305.48 106496.91 111324.43 112490.93 139350.37 97700.29
## 115 116 117 118 119 120 121
## 100114.05 98089.12 114904.69 98089.12 98477.96 95675.36 95675.36
## 122 123 124 125 126 127 128
## 93261.60 95675.36 100891.72 106496.91 95675.36 113349.35 98089.12
## 129 130 131 132 133 134 135
## 103305.48 106108.07 100891.72 103305.48 125256.65 100502.88 109299.50
## 136 137 138 139 186 187 188
## 95286.53 103694.31 101280.55 95675.36 92872.77 103305.48 98089.12
## 189 190 191 192 193 194 195
## 98089.12 98866.79 108910.67 103694.31 105719.24 95675.36 98089.12
## 196 197 198 199 200 201 202
## 98089.12 102916.64 106108.07 109688.33 96064.20 98089.12 95675.36
## 203 204 205 206 207 208 209
## 110077.17 111713.26 95675.36 109299.50 114985.44 106108.07 95675.36
## 256 257 258 259 260 261 262
## 95675.36 93261.60 98477.96 100502.88 100891.72 104083.15 98089.12
## 263 264 265 266 267 268 269
## 98089.12 100502.88 95675.36 95675.36 100502.88 108133.00 100891.72
## 270 271 272 273 274
## 113349.35 93261.60 98477.96 100891.72 139350.37
#Residuals of the model
model3$residuals
## 35 36 37 38 39
## -5459.00732 -17916.64155 -12089.12173 -10477.95640 -10916.64155
## 40 41 42 43 44
## -13496.90548 102.30753 -2700.28706 -3477.95640 -1700.28706
## 45 46 47 48 49
## -5280.55098 6738.39810 4324.63818 -4471.98024 6910.87827
## 50 51 52 53 54
## 6522.04360 4108.28369 4497.11836 -6324.42530 -9127.01989
## 55 56 57 58 59
## -6490.92932 -5713.25997 4194.52378 9522.04360 3114.25985
## 60 61 62 63 64
## -25325.44590 11305.68911 12472.19312 -2509.87840 16694.52378
## 65 66 67 68 69
## 13503.09452 8675.57470 7509.07068 6649.62886 64299.71294
## 115 116 117 118 119
## -18114.04697 -6089.12173 -21904.68923 -3089.12173 -3477.95640
## 120 121 122 123 124
## 324.63818 824.63818 4738.39810 2324.63818 -2891.71631
## 125 126 127 128 129
## -7496.90548 4324.63818 -13349.35054 2910.87827 -305.47622
## 130 131 132 133 134
## -2108.07081 4108.28369 1694.52378 -20256.64634 6497.11836
## 135 136 137 138 139
## 2700.49994 19713.47286 11305.68911 28719.44902 50124.63818
## 186 187 188 189 190
## -14616.76723 -14805.47622 -8089.12173 -8089.12173 -5866.79107
## 191 192 193 194 195
## -13910.66539 -6694.31089 -8719.23613 2324.63818 -89.12173
## 196 197 198 199 200
## -89.12173 -4916.64155 -8108.07081 -11688.33473 3935.80351
## 201 202 203 204 205
## 1910.87827 5324.63818 -8977.16941 -9213.25997 9324.63818
## 206 207 208 209 256
## -3299.50006 -7685.44111 1891.92919 16324.63818 -31675.36182
## 257 258 259 260 261
## -16261.60190 -13477.95640 -15502.88164 -14891.71631 -14083.14557
## 262 263 264 265 266
## -6089.12173 -3089.12173 -4502.88164 2324.63818 4324.63818
## 267 268 269 270 271
## -502.88164 -7732.99605 708.28369 -9349.35054 11738.39810
## 272 273 274
## 16522.04360 25818.28369 80649.62886
###. Model3: salary = b0 + b1*Work_yrs + b2*age
# b0 = 36967.5, b1 = 388.8, b2= 2413.8
# Model: salary = 36967.5 + 388.8*work_yrs + 2413.8*age
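#work_yrs and age are strongly correlated (r = 0.88 in the correlation matrix
#above), so their separate coefficients are hard to estimate. Variance inflation
#factors from the already-loaded car package quantify this (optional sketch):
vif(model3)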
#4.
model4<-lm(salary~work_yrs+sex, data = mbasalary.df)
summary(model4)
##
## Call:
## lm(formula = salary ~ work_yrs + sex, data = mbasalary.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31896 -8086 -2076 4789 90595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99676.9 5267.7 18.922 < 2e-16 ***
## work_yrs 2630.0 525.7 5.003 2.42e-06 ***
## sex -4860.6 3433.4 -1.416 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15910 on 100 degrees of freedom
## Multiple R-squared: 0.2223, Adjusted R-squared: 0.2068
## F-statistic: 14.29 on 2 and 100 DF, p-value: 3.471e-06
#Coefficients of the model
model4$coefficients
## (Intercept) work_yrs sex
## 99676.944 2629.973 -4860.589
#Fitted values of the model
model4$fitted.values
## 35 36 37 38 39 40 41
## 92585.74 95215.71 95215.71 97845.68 100076.30 103105.63 94816.35
## 42 43 44 45 46 47 48
## 92585.74 97845.68 97446.33 100475.66 95215.71 100076.30 110596.19
## 49 50 51 52 53 54 55
## 100076.30 102706.27 102706.27 100076.30 107966.22 110596.19 115856.14
## 56 57 58 59 60 61 62
## 105735.60 102706.27 102706.27 105735.60 136895.92 105336.25 97446.33
## 63 64 65 66 67 68 69
## 121116.08 102706.27 107966.22 107966.22 110995.55 134265.95 97446.33
## 115 116 117 118 119 120 121
## 92585.74 95215.71 115856.14 100076.30 102706.27 100076.30 100076.30
## 122 123 124 125 126 127 128
## 100076.30 100076.30 97845.68 103105.63 95215.71 105336.25 95215.71
## 129 130 131 132 133 134 135
## 102706.27 105336.25 102706.27 102706.27 136895.92 100076.30 110596.19
## 136 137 138 139 186 187 188
## 97446.33 105336.25 105336.25 100076.30 92585.74 102706.27 100076.30
## 189 190 191 192 193 194 195
## 100076.30 105336.25 103105.63 105336.25 102706.27 95215.71 95215.71
## 196 197 198 199 200 201 202
## 95215.71 100076.30 105336.25 113226.16 102706.27 95215.71 95215.71
## 203 204 205 206 207 208 209
## 115856.14 110596.19 100076.30 110596.19 100076.30 105336.25 100076.30
## 256 257 258 259 260 261 262
## 95215.71 100076.30 97845.68 100076.30 97845.68 107966.22 100076.30
## 263 264 265 266 267 268 269
## 100076.30 100076.30 100076.30 95215.71 95215.71 102706.27 102706.27
## 270 271 272 273 274
## 105336.25 100076.30 102706.27 102706.27 129405.36
#Residuals of the model
model4$residuals
## 35 36 37 38 39 40
## -7585.7388 -10215.7116 -9215.7116 -9845.6844 -8076.3006 -10105.6301
## 41 42 43 44 45 46
## 183.6450 2414.2612 -2845.6844 -1446.3278 -4475.6572 4784.2884
## 47 48 49 50 51 52
## -76.3006 -10596.1918 4923.6994 2293.7266 2293.7266 4923.6994
## 53 54 55 56 57 58
## -2966.2190 -5596.1918 -9856.1375 264.3971 4793.7266 5293.7266
## 59 60 61 62 63 64
## 4264.3971 -24895.9200 9663.7538 17553.6722 -3116.0831 17293.7266
## 65 66 67 68 69 115
## 12033.7810 12033.7810 9004.4515 11734.0528 64553.6722 -10585.7388
## 116 117 118 119 120 121
## -3215.7116 -22856.1375 -5076.3006 -7706.2734 -4076.3006 -3576.3006
## 122 123 124 125 126 127
## -2076.3006 -2076.3006 154.3156 -4105.6301 4784.2884 -5336.2462
## 128 129 130 131 132 133
## 5784.2884 293.7266 -1336.2462 2293.7266 2293.7266 -31895.9200
## 134 135 136 137 138 139
## 6923.6994 1403.8082 17553.6722 9663.7538 24663.7538 45723.6994
## 186 187 188 189 190 191
## -14329.7388 -14206.2734 -10076.3006 -10076.3006 -12336.2462 -8105.6301
## 192 193 194 195 196 197
## -8336.2462 -5706.2734 2784.2884 2784.2884 2784.2884 -2076.3006
## 198 199 200 201 202 203
## -7336.2462 -15226.1647 -2706.2734 4784.2884 5784.2884 -14756.1375
## 204 205 206 207 208 209
## -8096.1918 4923.6994 -4596.1918 7223.6994 2663.7538 11923.6994
## 256 257 258 259 260 261
## -31215.7116 -23076.3006 -12845.6844 -15076.3006 -11845.6844 -17966.2190
## 262 263 264 265 266 267
## -8076.3006 -5076.3006 -4076.3006 -2076.3006 4784.2884 4784.2884
## 268 269 270 271 272 273
## -2306.2734 -1106.2734 -1336.2462 4923.6994 12293.7266 24003.7266
## 274
## 90594.6418
###. Model4: salary = b0 + b1*Work_yrs + b2*sex
# b0 = 99676.944 , b1 = 2629.973, b2= -4860.589
# Model: salary = 99676.944 + 2629.973*work_yrs - 4860.589*sex
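#A worked example of what Model4 implies: the predicted starting salary for a
#male (sex = 1) with 3 years of work experience, values chosen purely for
#illustration (by hand: 99676.944 + 2629.973*3 - 4860.589 = 102706.27):
predict(model4, newdata = data.frame(work_yrs = 3, sex = 1))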
#Creating a subset of those who were not placed
mbasalary0.df<-mba.df[which(mba.df$salary<998),]
View(mbasalary0.df)
#Generating Contingency table and performing chi-Square Test
mytable<-xtabs(~sex+work_yrs, data = mbasalary0.df)
addmargins(mytable)
## work_yrs
## sex 0 1 2 3 4 5 6 7 8 9 10 11 12 13 16 18 22 Sum
## 1 1 12 16 9 8 7 2 3 2 0 0 1 2 0 1 1 2 67
## 2 0 0 6 5 1 5 0 2 0 1 1 1 0 1 0 0 0 23
## Sum 1 12 22 14 9 12 2 5 2 1 1 2 2 1 1 1 2 90
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 21.229, df = 16, p-value = 0.1699
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis:
##there is no evidence that sex and work_yrs are associated in this group.
mytable2<-xtabs(~sex+frstlang, data = mbasalary0.df)
addmargins(mytable2)
## frstlang
## sex 1 2 Sum
## 1 60 7 67
## 2 22 1 23
## Sum 82 8 90
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable2
## X-squared = 0.21376, df = 1, p-value = 0.6438
##Because the p-value is greater than 0.05 we cannot reject the null hypothesis:
##there is no evidence that sex and frstlang are associated in this group.
#CHALLENGE ACCEPTED#
#Generating a logistic regression model of sex for those who were placed
mbasalary.df$sex <- factor(mbasalary.df$sex)
is.factor(mbasalary.df$sex)
## [1] TRUE
fit1 <- glm(sex~., family = binomial(link = 'logit'), data = mbasalary.df)
summary(fit1)
##
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = mbasalary.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4863 -0.7894 -0.5805 0.7626 2.3292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1064455 8.2384884 0.013 0.9897
## age -0.3643742 0.1889782 -1.928 0.0538 .
## gmat_tot 0.0162521 0.0269925 0.602 0.5471
## gmat_qpc -0.0435054 0.0770321 -0.565 0.5722
## gmat_vpc 0.0084836 0.0780797 0.109 0.9135
## gmat_tpc -0.0561304 0.1181993 -0.475 0.6349
## s_avg 0.1751868 1.5508906 0.113 0.9101
## f_avg 1.5943945 1.0429927 1.529 0.1263
## quarter 0.2901630 0.4253040 0.682 0.4951
## work_yrs 0.2410914 0.1783851 1.352 0.1765
## frstlang 2.4111026 1.0665299 2.261 0.0238 *
## salary -0.0000184 0.0000191 -0.963 0.3353
## satis -0.2638553 0.3332759 -0.792 0.4285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 126.01 on 102 degrees of freedom
## Residual deviance: 107.49 on 90 degrees of freedom
## AIC: 133.49
##
## Number of Fisher Scoring iterations: 5
##We can now interpret the fitted model and run anova() on it to examine the
##analysis-of-deviance table.
anova(fit1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 102 126.01
## age 1 2.3856 101 123.62 0.12245
## gmat_tot 1 0.0744 100 123.55 0.78507
## gmat_qpc 1 4.1847 99 119.36 0.04079 *
## gmat_vpc 1 1.8543 98 117.51 0.17329
## gmat_tpc 1 0.0823 97 117.43 0.77423
## s_avg 1 0.4155 96 117.01 0.51919
## f_avg 1 2.1057 95 114.90 0.14675
## quarter 1 0.4742 94 114.43 0.49107
## work_yrs 1 0.5956 93 113.83 0.44026
## frstlang 1 4.6687 92 109.17 0.03072 *
## salary 1 1.0389 91 108.13 0.30808
## satis 1 0.6359 90 107.49 0.42521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitted.results <- predict(fit1, newdata = mbasalary.df, type = 'response')
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
#Note: these predictions are coded 0/1 while sex is coded 1/2, so the "accuracy"
#printed below mostly reflects that coding mismatch rather than model performance.
misClasificError <- mean(fitted.results != mbasalary.df$sex)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.0485436893203883"
#Generating a logistic regression model of sex for those who were not placed
mbasalary0.df$sex <- factor(mbasalary0.df$sex)
is.factor(mbasalary0.df$sex)
## [1] TRUE
fit2 <- glm(sex~., family = binomial(link = 'logit'), data = mbasalary0.df)
summary(fit2)
##
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = mbasalary0.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5451 -0.7582 -0.4838 0.6019 2.1976
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.39699 7.84772 1.707 0.0878 .
## age 0.05353 0.12071 0.443 0.6574
## gmat_tot -0.03439 0.02183 -1.576 0.1151
## gmat_qpc 0.02944 0.06260 0.470 0.6381
## gmat_vpc 0.10328 0.06711 1.539 0.1238
## gmat_tpc 0.03205 0.06128 0.523 0.6010
## s_avg -0.47864 1.17187 -0.408 0.6830
## f_avg -0.58170 0.57645 -1.009 0.3129
## quarter -0.49321 0.36673 -1.345 0.1787
## work_yrs -0.08643 0.14181 -0.609 0.5422
## frstlang -0.31776 1.29059 -0.246 0.8055
## salary NA NA NA NA
## satis -0.51118 0.40913 -1.249 0.2115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 102.304 on 89 degrees of freedom
## Residual deviance: 86.742 on 78 degrees of freedom
## AIC: 110.74
##
## Number of Fisher Scoring iterations: 5
##salary is reported as NA above because it is constant (0) in this subset, so it is dropped for singularity.
##We can now interpret the fitted model and run anova() on it to examine the
##analysis-of-deviance table.
anova(fit2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 89 102.304
## age 1 0.4712 88 101.833 0.49244
## gmat_tot 1 0.3130 87 101.520 0.57585
## gmat_qpc 1 4.3705 86 97.150 0.03657 *
## gmat_vpc 1 5.0395 85 92.110 0.02478 *
## gmat_tpc 1 0.6560 84 91.454 0.41798
## s_avg 1 0.0490 83 91.405 0.82487
## f_avg 1 0.5497 82 90.855 0.45844
## quarter 1 1.6354 81 89.220 0.20096
## work_yrs 1 0.8609 80 88.359 0.35348
## frstlang 1 0.0078 79 88.351 0.92960
## salary 0 0.0000 79 88.351
## satis 1 1.6093 78 86.742 0.20459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Testing the accuracy
fitted.results <- predict(fit2, newdata = mbasalary0.df, type = 'response')
#Note: the labels below are 0/1 (with 0 for a predicted probability above 0.5),
#while sex is coded 1/2, so the printed accuracy should be read with that in mind.
fitted.results <- ifelse(fitted.results > 0.5, 0, 1)
misClasificError <- mean(fitted.results != mbasalary0.df$sex)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.7"