This is an analysis of the MBA Starting Salaries HBR case study. The data set we are dealing with here is a classification data set about MBA students: their starting salaries, GMAT scores and percentiles, age, sex, etc. Some students answered the survey and reported a salary, some answered but did not reveal their salary amount, and some did not respond at all.
# Read the MBA starting-salaries survey into a tibble; readr guesses the
# column types and reports the specification it used.
library(readr)
mba_csv_path <- "C:/Users/Internship/MBA Starting Salaries Data.csv"
MBA_SS <- read_csv(file = mba_csv_path)
## Parsed with column specification:
## cols(
## age = col_integer(),
## sex = col_integer(),
## gmat_tot = col_integer(),
## gmat_qpc = col_integer(),
## gmat_vpc = col_integer(),
## gmat_tpc = col_integer(),
## s_avg = col_double(),
## f_avg = col_double(),
## quarter = col_integer(),
## work_yrs = col_integer(),
## frstlang = col_integer(),
## salary = col_integer(),
## satis = col_integer()
## )
# View() opens an interactive data viewer, so only call it in an interactive
# session; a batch run (Rscript / knitting) goes straight to the summary.
if (interactive()) {
  View(MBA_SS)
}
# Min / quartiles / mean / max for every column.
# NOTE(review): salary and satis appear to contain survey codes (998/999) —
# see their medians and maxima in the output — so their means are not
# meaningful as printed; confirm against the case-study codebook.
summary(MBA_SS)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
# Extended descriptive statistics (n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis, se) for every survey column.
psych::describe(x = MBA_SS)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Split the respondents by the salary column:
#   placed    - rows whose salary is above 999 (an actual salary was reported)
#   PLACED    - just the salary values of those rows
#   notplaced - rows whose salary is exactly 0
# NOTE(review): 998/999 look like survey codes rather than salaries — confirm.
placed <- MBA_SS[which(MBA_SS$salary > 999), ]
PLACED <- MBA_SS$salary[MBA_SS$salary > 999]
notplaced <- MBA_SS[which(MBA_SS$salary == 0), ]

# The first 13 columns are the original survey variables.
MBA_SS1 <- MBA_SS[, 1:13]
placed1 <- placed[, 1:13]

# View() needs an interactive session; skip the viewers in batch runs.
if (interactive()) {
  View(placed)
  View(PLACED)
  View(notplaced)
  View(MBA_SS1)
  View(placed1)
}

# log(x + 1) puts variables with very different ranges on one comparable
# scale. boxplot() on a data frame draws one vertical box per column, so the
# x axis carries the parameters and the y axis the (log) values.
MBA_SS2 <- log(MBA_SS1 + 1)
boxplot(MBA_SS2, xlab = "Parameters", ylab = "Value",
        main = "Boxplot Presentation of different Parameters")

placed2 <- log(placed1 + 1)
boxplot(placed2, xlab = "Parameters", ylab = "Value",
        main = "Boxplot Presentation of different Parameters")
# Univariate plots for GMAT results, MBA averages and work experience.
boxplot(MBA_SS$gmat_tot, horizontal = TRUE, xlab = "GMAT Score",
        main = "BoxPlot Presentation Of GMAT Score")

# GMAT quantitative / verbal / overall columns side by side.
par(mfrow = c(1, 3))
boxplot(MBA_SS$gmat_qpc, main = "GMAT percentage quantitative",
        ylab = "Percentage %")
boxplot(MBA_SS$gmat_vpc, main = "GMAT percentage verbal",
        ylab = "Percentage %")
boxplot(MBA_SS$gmat_tpc, main = "GMAT percentage total",
        ylab = "Percentage %")

# Spring and fall averages side by side. Both boxes are drawn before the
# layout is changed again, because resetting par(mfrow) starts a new page.
par(mfrow = c(1, 2))
boxplot(MBA_SS$s_avg, main = "Spring MBA Average", ylab = "Average")
boxplot(MBA_SS$f_avg, main = "Fall MBA Average", ylab = "Average")

# Spring average: distribution, box plot and raw values. Only the histogram
# shows frequencies; the box and bar plots show the values themselves.
par(mfrow = c(1, 3))
hist(MBA_SS$s_avg, xlab = "s_avg", ylab = "Frequency",
     main = "spring MBA average", col = c("red", "blue", "green", "yellow"))
boxplot(MBA_SS$s_avg, main = "spring MBA average", ylab = "s_avg")
barplot(MBA_SS$s_avg, main = "spring MBA average",
        xlab = "Student (row order)", ylab = "s_avg")

# Fall average: same three views.
par(mfrow = c(1, 3))
hist(MBA_SS$f_avg, xlab = "f_avg", ylab = "Frequency",
     main = "fall MBA average", col = c("red", "blue", "green", "yellow"))
boxplot(MBA_SS$f_avg, main = "fall MBA average", ylab = "f_avg")
barplot(MBA_SS$f_avg, main = "fall MBA average",
        xlab = "Student (row order)", ylab = "f_avg")

par(mfrow = c(1, 1)) # Working years
boxplot(MBA_SS$work_yrs, horizontal = TRUE, xlab = "Working Years",
        main = " Working experience in years")
# Count respondents with a reported salary (> 999, TRUE) versus the rest.
with(MBA_SS, table(salary > 999))
##
## FALSE TRUE
## 171 103
# Box plot of the starting salary of placed students only. Pass the salary
# column rather than the whole data frame, which would draw one box per
# variable on a single shared scale.
boxplot(placed$salary, horizontal = TRUE, xlab = "Salary",
        main = "Boxplot presentation of Starting salary ")
# Distribution plots for age, sex, quartile ranking, first language and
# satisfaction. Each frequency table is computed once and reused. Bar plots
# labelled "Frequency" are drawn from a table of counts, not from raw values.

#Age
count <- table(MBA_SS$age)
barplot(count, main = "Barplot for age", xlab = "Age in Years")
par(mfrow = c(1, 3))
hist(placed$age, xlab = "Age", ylab = "Frequency", main = "AGE",
     col = c("red", "black", "pink", "yellow"))
boxplot(placed$age, main = "AGE", ylab = "Age")
barplot(table(placed$age), main = "AGE", xlab = "Age", ylab = "Frequency")

#Sex
count1 <- table(MBA_SS$sex)
par(mfrow = c(1, 1))
pie(count1, col = c("yellow", "blue"), main = "Gender Split Up")

#Quartile Ranking
count2 <- table(MBA_SS$quarter)
par(mfrow = c(1, 1))
pie(count2, col = c("red", "yellow", "grey", "black"),
    main = "Quartile Ranking")
par(mfrow = c(1, 3))
hist(placed$quarter, xlab = "quarter", ylab = "Frequency",
     main = "Quartile Ranking", col = c("pink", "blue", "black", "yellow"))
boxplot(placed$quarter, main = "quarter", ylab = "Quartile Ranking")
barplot(table(placed$quarter), main = "quarter", xlab = "quarter",
        ylab = "Frequency")

#First Language
count3 <- table(MBA_SS$frstlang)
par(mfrow = c(1, 1))
pie(count3, col = c("red", "yellow"), main = "First Language")

#Degree of satisfaction
# Values of 998 are excluded from the satisfaction table.
# NOTE(review): 998 looks like a "no answer" code — confirm.
count4 <- table(MBA_SS$satis[MBA_SS$satis < 998])
par(mfrow = c(1, 1))
pie(count4,
    col = c("red", "yellow", "violet", "black", "blue", "green", "orange"),
    main = "Degree of Satisfaction with MBA Program ")
par(mfrow = c(1, 3))
hist(placed$satis, xlab = "satis", ylab = "Frequency",
     main = "Degree of satisfaction",
     col = c("red", "blue", "green", "yellow"))
boxplot(placed$satis, main = "Degree of satisfaction", ylab = "satis")
barplot(table(placed$satis), main = "Degree of satisfaction", xlab = "satis",
        ylab = "Frequency")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# car::scatterplot of salary (x axis) against age, work experience and GMAT
# percentile: first for every respondent, then for placed students only.
with(MBA_SS, {
  scatterplot(salary, age, main = "Salary of MBAs with Age",
              ylab = "Age in years", xlab = "Salary", cex = 1.1, pch = 19)
  scatterplot(salary, work_yrs, main = "Salary of MBAs with Work experience",
              ylab = "Work experience in years", xlab = "Salary", cex = 1.1,
              pch = 19)
  scatterplot(salary, gmat_tpc, main = "Salary of MBAs with GMAT Percentile",
              ylab = "GMAT Percentile %", xlab = "Salary", cex = 1.1,
              pch = 19)
})
library(car)
with(placed, {
  scatterplot(salary, age, main = "Salary of MBAs with Age",
              ylab = "Age in years", xlab = "Salary", cex = 1.1, pch = 19)
  scatterplot(salary, work_yrs, main = "Salary of MBAs with Work experience",
              ylab = "Work experience in years", xlab = "Salary", cex = 1.1,
              pch = 19)
  scatterplot(salary, gmat_tpc, main = "Salary of MBAs with GMAT Percentile",
              ylab = "GMAT Percentile %", xlab = "Salary", cex = 1.1,
              pch = 19)
})
## Creating JitterPlots
# Salary against low-cardinality variables; jitter() spreads overlapping
# points so that individual observations stay visible.

# Placed students only.
plot(jitter(placed$sex), jitter(placed$salary),
     main = "Salary of MBAs with Sex",
     xlab = "Gender Male(1) Female(2)", ylab = "Salary", cex = 1.1)
plot(jitter(placed$frstlang), jitter(placed$salary),
     main = "Salary of MBAs with First Language",
     xlab = "First Language English(1) Other(2)", ylab = "Salary", cex = 1.1)
plot(jitter(placed$satis), jitter(placed$salary),
     main = "Salary of MBAs with degree of satisfaction",
     xlab = "Degree of Satisfaction out of 7", ylab = "Salary", cex = 1.1)

# All respondents.
plot(jitter(MBA_SS$sex), jitter(MBA_SS$salary),
     main = "Salary of MBAs with Sex",
     xlab = "Gender Male(1) Female(2)", ylab = "Salary", cex = 1.1)
plot(jitter(MBA_SS$frstlang), jitter(MBA_SS$salary),
     main = "Salary of MBAs with First Language",
     xlab = "First Language English(1) Other(2)", ylab = "Salary", cex = 1.1)
# The axis is a 1-7 scale, so rows carrying the 998 value are left out here,
# in the same way the satisfaction pie chart filters them.
# NOTE(review): 998 looks like a "no answer" code — confirm.
with(MBA_SS[which(MBA_SS$satis < 998), ],
     plot(jitter(satis), jitter(salary),
          main = "Salary of MBAs with degree of satisfaction",
          xlab = "Degree of Satisfaction out of 7", ylab = "Salary",
          cex = 1.1))
##Correlation tests to find relationship between different parameters
# Pairwise correlations among all columns of the placed students, with
# p-values (raw below the diagonal, multiplicity-adjusted above it).
corr.test(x = placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.14 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## sex -0.14 1.00 -0.02 -0.15 0.05 -0.05 0.08 0.17
## gmat_tot -0.08 -0.02 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.15 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 0.05 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.05 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.08 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 0.17 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.02 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## work_yrs 0.88 -0.09 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## frstlang 0.35 0.08 -0.13 0.01 -0.22 -0.16 -0.14 -0.05
## salary 0.50 -0.17 -0.09 0.01 -0.14 -0.13 0.10 -0.11
## satis 0.11 -0.09 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter work_yrs frstlang salary satis
## age -0.13 0.88 0.35 0.50 0.11
## sex -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.11 -0.12 -0.13 -0.09 0.06
## gmat_qpc 0.01 -0.18 0.01 0.01 0.00
## gmat_vpc -0.13 -0.03 -0.22 -0.14 0.15
## gmat_tpc -0.10 -0.13 -0.16 -0.13 0.12
## s_avg -0.84 0.16 -0.14 0.10 -0.14
## f_avg -0.43 -0.22 -0.05 -0.11 -0.12
## quarter 1.00 -0.13 0.11 -0.13 0.23
## work_yrs -0.13 1.00 0.20 0.45 0.06
## frstlang 0.11 0.20 1.00 0.27 0.09
## salary -0.13 0.45 0.27 1.00 -0.04
## satis 0.23 0.06 0.09 -0.04 1.00
## Sample Size
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## sex 0.15 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## gmat_tot 0.43 0.84 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## gmat_qpc 0.10 0.14 0.00 0.00 1.00 0.00 1.00 1.00 1.00
## gmat_vpc 0.86 0.59 0.00 0.34 0.00 0.00 1.00 1.00 1.00
## gmat_tpc 0.33 0.64 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## s_avg 0.11 0.42 0.08 0.88 0.11 0.16 0.00 0.00 0.00
## f_avg 0.03 0.09 0.22 0.32 0.82 0.48 0.00 0.00 0.00
## quarter 0.21 0.83 0.29 0.90 0.20 0.32 0.00 0.00 0.00
## work_yrs 0.00 0.35 0.22 0.06 0.78 0.18 0.10 0.03 0.19
## frstlang 0.00 0.45 0.19 0.89 0.03 0.10 0.16 0.61 0.27
## salary 0.00 0.09 0.36 0.89 0.17 0.18 0.31 0.29 0.20
## satis 0.28 0.36 0.52 0.97 0.13 0.24 0.15 0.24 0.02
## work_yrs frstlang salary satis
## age 0.00 0.02 0.00 1
## sex 1.00 1.00 1.00 1
## gmat_tot 1.00 1.00 1.00 1
## gmat_qpc 1.00 1.00 1.00 1
## gmat_vpc 1.00 1.00 1.00 1
## gmat_tpc 1.00 1.00 1.00 1
## s_avg 1.00 1.00 1.00 1
## f_avg 1.00 1.00 1.00 1
## quarter 1.00 1.00 1.00 1
## work_yrs 0.00 1.00 0.00 1
## frstlang 0.05 0.00 0.42 1
## salary 0.00 0.01 0.00 1
## satis 0.53 0.37 0.69 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
# Cross-correlations for placed students: every survey variable (rows, x)
# against salary, GMAT percentile, work experience and satisfaction
# (columns, y).
x <- placed[, c(
  "age", "sex", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "frstlang", "salary", "satis"
)]
y <- placed[, c("salary", "gmat_tpc", "work_yrs", "satis")]
cor(x = x, y = y)
## salary gmat_tpc work_yrs satis
## age 0.49964284 -0.09609156 0.88052470 0.108323083
## sex -0.16628869 -0.04686981 -0.09233003 -0.091995338
## gmat_tot -0.09067141 0.96680810 -0.12280018 0.064742057
## gmat_qpc 0.01414130 0.65865003 -0.18270126 -0.003984632
## gmat_vpc -0.13743230 0.78443167 -0.02812182 0.148634805
## gmat_tpc -0.13201783 1.00000000 -0.13246963 0.116308417
## s_avg 0.10173175 0.13938500 0.16328236 -0.143565573
## f_avg -0.10603897 0.07051391 -0.21633018 -0.117733043
## quarter -0.12848526 -0.09955033 -0.12896722 0.225119851
## work_yrs 0.45466634 -0.13246963 1.00000000 0.062999256
## frstlang 0.26701953 -0.16437561 0.19627277 0.089834769
## salary 1.00000000 -0.13201783 0.45466634 -0.040050600
## satis -0.04005060 0.11630842 0.06299926 1.000000000
# Covariances between the columns of x (rows) and the columns of y (columns).
cov(x = x, y = y)
## salary gmat_tpc work_yrs satis
## age 2.921052e+04 -3.460213e+00 8.6728536 0.27765087
## sex -1.369577e+03 -2.377689e-01 -0.1281173 -0.03321911
## gmat_tot -8.212449e+04 5.393623e+02 -18.7388159 2.57091186
## gmat_qpc 3.382438e+03 9.703607e+01 -7.3624595 -0.04178565
## gmat_vpc -3.964803e+04 1.393882e+02 -1.3668380 1.87997335
## gmat_tpc -2.596339e+04 1.211342e+02 -4.3892062 1.00285551
## s_avg 6.880204e+02 5.806292e-01 0.1860480 -0.04256901
## f_avg -9.241129e+02 3.785056e-01 -0.3176271 -0.04498382
## quarter -2.571117e+03 -1.227013e+00 -0.4347992 0.19750619
## work_yrs 2.445820e+04 -4.389206e+00 9.0630116 0.14858176
## frstlang 1.206714e+03 -4.575481e-01 0.1494384 0.01779935
## salary 3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis -5.606583e+02 1.002856e+00 0.1485818 0.61374453
# With two arguments var() returns the cross-covariance matrix, so this
# prints the same numbers as the cov() call on the same x and y.
var(x = x, y = y)
## salary gmat_tpc work_yrs satis
## age 2.921052e+04 -3.460213e+00 8.6728536 0.27765087
## sex -1.369577e+03 -2.377689e-01 -0.1281173 -0.03321911
## gmat_tot -8.212449e+04 5.393623e+02 -18.7388159 2.57091186
## gmat_qpc 3.382438e+03 9.703607e+01 -7.3624595 -0.04178565
## gmat_vpc -3.964803e+04 1.393882e+02 -1.3668380 1.87997335
## gmat_tpc -2.596339e+04 1.211342e+02 -4.3892062 1.00285551
## s_avg 6.880204e+02 5.806292e-01 0.1860480 -0.04256901
## f_avg -9.241129e+02 3.785056e-01 -0.3176271 -0.04498382
## quarter -2.571117e+03 -1.227013e+00 -0.4347992 0.19750619
## work_yrs 2.445820e+04 -4.389206e+00 9.0630116 0.14858176
## frstlang 1.206714e+03 -4.575481e-01 0.1494384 0.01779935
## salary 3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis -5.606583e+02 1.002856e+00 0.1485818 0.61374453
# Repeats the earlier corr.test() call on the placed students: correlation
# matrix, sample size and p-values.
corr.test(x = placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.14 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## sex -0.14 1.00 -0.02 -0.15 0.05 -0.05 0.08 0.17
## gmat_tot -0.08 -0.02 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.15 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 0.05 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.05 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.08 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 0.17 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.02 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## work_yrs 0.88 -0.09 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## frstlang 0.35 0.08 -0.13 0.01 -0.22 -0.16 -0.14 -0.05
## salary 0.50 -0.17 -0.09 0.01 -0.14 -0.13 0.10 -0.11
## satis 0.11 -0.09 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter work_yrs frstlang salary satis
## age -0.13 0.88 0.35 0.50 0.11
## sex -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.11 -0.12 -0.13 -0.09 0.06
## gmat_qpc 0.01 -0.18 0.01 0.01 0.00
## gmat_vpc -0.13 -0.03 -0.22 -0.14 0.15
## gmat_tpc -0.10 -0.13 -0.16 -0.13 0.12
## s_avg -0.84 0.16 -0.14 0.10 -0.14
## f_avg -0.43 -0.22 -0.05 -0.11 -0.12
## quarter 1.00 -0.13 0.11 -0.13 0.23
## work_yrs -0.13 1.00 0.20 0.45 0.06
## frstlang 0.11 0.20 1.00 0.27 0.09
## salary -0.13 0.45 0.27 1.00 -0.04
## satis 0.23 0.06 0.09 -0.04 1.00
## Sample Size
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## sex 0.15 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## gmat_tot 0.43 0.84 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## gmat_qpc 0.10 0.14 0.00 0.00 1.00 0.00 1.00 1.00 1.00
## gmat_vpc 0.86 0.59 0.00 0.34 0.00 0.00 1.00 1.00 1.00
## gmat_tpc 0.33 0.64 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## s_avg 0.11 0.42 0.08 0.88 0.11 0.16 0.00 0.00 0.00
## f_avg 0.03 0.09 0.22 0.32 0.82 0.48 0.00 0.00 0.00
## quarter 0.21 0.83 0.29 0.90 0.20 0.32 0.00 0.00 0.00
## work_yrs 0.00 0.35 0.22 0.06 0.78 0.18 0.10 0.03 0.19
## frstlang 0.00 0.45 0.19 0.89 0.03 0.10 0.16 0.61 0.27
## salary 0.00 0.09 0.36 0.89 0.17 0.18 0.31 0.29 0.20
## satis 0.28 0.36 0.52 0.97 0.13 0.24 0.15 0.24 0.02
## work_yrs frstlang salary satis
## age 0.00 0.02 0.00 1
## sex 1.00 1.00 1.00 1
## gmat_tot 1.00 1.00 1.00 1
## gmat_qpc 1.00 1.00 1.00 1
## gmat_vpc 1.00 1.00 1.00 1
## gmat_tpc 1.00 1.00 1.00 1
## s_avg 1.00 1.00 1.00 1
## f_avg 1.00 1.00 1.00 1
## quarter 1.00 1.00 1.00 1
## work_yrs 0.00 1.00 0.00 1
## frstlang 0.05 0.00 0.42 1
## salary 0.00 0.01 0.00 1
## satis 0.53 0.37 0.69 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix of the 13 survey variables for placed students,
# computed once and drawn in two corrplot styles plus a corrgram.
placed_cor <- cor(placed[, 1:13], use = "complete.obs")
corrplot(corr = placed_cor, method = "ellipse")
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Numbers in the lower triangle, circles in the upper one; variable names on
# the diagonal ("d") and no glyphs on the diagonal ("n").
corrplot.mixed(
  corr = placed_cor,
  lower = "number",
  upper = "circle",
  tl.pos = "d",
  diag = "n",
  bg = "white",
  addgrid.col = "grey",
  lower.col = NULL,
  upper.col = NULL
)
library(corrgram)
corrgram(
  placed,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)
# Correlation plots for all respondents.
# salary and satis contain the values 998/999, which the rest of this
# analysis filters out (salary > 999, satis < 998). Correlating them as if
# they were real measurements distorts every coefficient involving those
# columns, so they are recoded to NA first.
# NOTE(review): the 998/999 meanings are presumed to be "no answer" /
# "salary not disclosed" — confirm against the case-study codebook.
library(corrplot)
mba_clean <- MBA_SS[, 1:13]
mba_clean$salary[mba_clean$salary %in% c(998, 999)] <- NA
mba_clean$satis[mba_clean$satis %in% c(998, 999)] <- NA
mba_cor <- cor(mba_clean, use = "complete.obs")

corrplot(corr = mba_cor, method = "ellipse")
library(gplots)
# Numbers in the lower triangle, circles in the upper one; variable names on
# the diagonal ("d") and no glyphs on the diagonal ("n").
corrplot.mixed(corr = mba_cor, lower = "number", upper = "circle",
               tl.pos = "d", diag = "n", bg = "white",
               addgrid.col = "grey", lower.col = NULL, upper.col = NULL)
library(corrgram)
corrgram(mba_clean, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Corrgram of MBA Salaries")
## Visualising using ggvis
# Scatter plots of salary (x) against one variable each (y), with the point
# fill mapped to satisfaction. The same seven plots are drawn for the placed
# students and then for all respondents.
library(ggvis)
placed %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
# Was a second gmat_vpc plot; gmat_tot matches the all-respondents series.
placed %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()

MBA_SS %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
MBA_SS %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
MBA_SS %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
MBA_SS %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
MBA_SS %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
MBA_SS %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
MBA_SS %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()
# Contingency table of sex (rows) by years of work experience (columns) for
# placed students, printed with row and column totals.
mytable <- xtabs(~ sex + work_yrs, data = placed)
addmargins(A = mytable)
## work_yrs
## sex 0 1 2 3 4 5 6 7 8 10 15 16 Sum
## 1 1 4 24 16 10 4 5 1 3 1 1 2 72
## 2 0 4 14 5 1 3 2 0 1 0 1 0 31
## Sum 1 8 38 21 11 7 7 1 4 1 2 2 103
# Pearson chi-squared test of independence between sex and work_yrs.
# NOTE(review): several cells of this table hold 0 or 1 observations, which
# is why R warns that the chi-squared approximation may be incorrect;
# chisq.test(mytable, simulate.p.value = TRUE) or fisher.test() may be more
# appropriate — confirm before relying on the p-value.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 8.1579, df = 11, p-value = 0.6991
Since the p-value = 0.6991 (> 0.05), we cannot reject the null hypothesis that work_yrs and sex are independent. In other words, the data give no evidence that we can predict the sex of an MBA student from their work experience.
# Contingency table of work experience (rows) by degree of satisfaction
# (columns) for placed students, printed with row and column totals.
mytable1 <- xtabs(~ work_yrs + satis, data = placed)
addmargins(A = mytable1)
## satis
## work_yrs 3 4 5 6 7 Sum
## 0 0 1 0 0 0 1
## 1 0 0 5 1 2 8
## 2 0 0 8 19 11 38
## 3 1 0 6 12 2 21
## 4 0 0 3 5 3 11
## 5 0 0 3 3 1 7
## 6 0 0 2 5 0 7
## 7 0 0 1 0 0 1
## 8 0 0 0 3 1 4
## 10 0 0 0 0 1 1
## 15 0 0 0 2 0 2
## 16 0 0 1 0 1 2
## Sum 1 1 29 50 22 103
# Pearson chi-squared test of independence between work_yrs and satis.
# NOTE(review): most cells of this table are 0 or 1, which is why R warns
# that the chi-squared approximation may be incorrect; the very small p-value
# should be checked with chisq.test(mytable1, simulate.p.value = TRUE) or
# after merging sparse categories — confirm.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 131.13, df = 44, p-value = 1.35e-10
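Since the p-value = 1.35e-10 (< 0.05), we can reject the null hypothesis: the parameters work_yrs and satis are not independent, i.e. a student's work experience is associated with his or her degree of satisfaction. Note that many cells of this table are nearly empty, so, as the warning says, the chi-squared approximation should be treated with caution.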
# Contingency table of first language (rows) by sex (columns) for placed
# students, printed with row and column totals. A one-sided formula takes a
# single leading "~"; the original "~frstlang+~sex" had a stray second tilde.
mytable2 <- xtabs(~ frstlang + sex, data = placed)
addmargins(mytable2)
## sex
## frstlang 1 2 Sum
## 1 68 28 96
## 2 4 3 7
## Sum 72 31 103
# Chi-squared test of independence between frstlang and sex (2 x 2 table, so
# R applies Yates' continuity correction).
# NOTE(review): only 7 students fall in frstlang group 2, hence the warning
# about the approximation; fisher.test(mytable2) may be preferable — confirm.
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable2
## X-squared = 0.11264, df = 1, p-value = 0.7372
Since the p-value = 0.7372 (> 0.05), we cannot reject the null hypothesis that the parameters sex and frstlang are independent. We cannot predict a student's sex from his or her first language.
# Contingency table of work experience (rows) by first language (columns)
# for placed students, printed with row and column totals.
mytable3 <- xtabs(~ work_yrs + frstlang, data = placed)
addmargins(A = mytable3)
## frstlang
## work_yrs 1 2 Sum
## 0 1 0 1
## 1 8 0 8
## 2 36 2 38
## 3 20 1 21
## 4 10 1 11
## 5 6 1 7
## 6 7 0 7
## 7 1 0 1
## 8 4 0 4
## 10 0 1 1
## 15 1 1 2
## 16 2 0 2
## Sum 96 7 103
# Pearson chi-squared test of independence between work_yrs and frstlang.
# NOTE(review): the frstlang = 2 column holds only 7 students spread over 12
# rows, which is why R warns that the approximation may be incorrect;
# consider chisq.test(mytable3, simulate.p.value = TRUE) — confirm.
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 22.274, df = 11, p-value = 0.02233
Since the p-value = 0.02233 (< 0.05), we can reject the null hypothesis: the parameters work_yrs and frstlang are not independent. That is, a student's work experience is associated with his or her first language.
## T-Tests for Hypotheses
# One-sided Welch t-test on placed students.
# H1: the average salary of males (sex = 1) is greater than the average
# salary of females (sex = 2); H0: it is not greater.
t.test(
  salary ~ sex,
  data = placed,
  alternative = "greater"
)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.09047
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -1527.96 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
# One-sided Welch t-test on placed students.
# H1: the average salary of students whose first language is English
# (frstlang = 1) is greater than that of students with another first
# language (frstlang = 2); H0: it is not greater.
t.test(
  salary ~ frstlang,
  data = placed,
  alternative = "greater"
)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.8476
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -51508.45 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
# One-sided Welch t-test on placed students.
# H1: the average GMAT percentile of males (sex = 1) is greater than the
# average GMAT percentile of females (sex = 2); H0: it is not greater.
t.test(
  gmat_tpc ~ sex,
  data = placed,
  alternative = "greater"
)
##
## Welch Two Sample t-test
##
## data: gmat_tpc by sex
## t = 0.43873, df = 48.83, p-value = 0.3314
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -3.157889 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 84.86111 83.74194
Since the p-value = 0.09047 (> 0.05), we cannot reject the null hypothesis that the average salary of males is not greater than the average salary of females. So we can't say that the average salary of males is greater than that of females.
Since the p-value = 0.8476 (> 0.05), we cannot reject the null hypothesis that the average salary of students who have English as their first language is not greater than the average salary of those who speak another language. So we can't say that the average salary of English-speaking students is greater than that of students who speak another language.
Since the p-value = 0.3314 (> 0.05), we cannot reject the null hypothesis that the average GMAT percentile of males is not greater than the average GMAT percentile of females. So we can't say that the average GMAT percentile of males is more than that of females.
## The Model (Submodels)
# Overall MBA average: mean of the spring and fall averages.
placed$mb_avg <- with(placed, (s_avg + f_avg) / 2)

# Starting model with every candidate predictor. mb_avg is an exact linear
# combination of s_avg and f_avg, so lm() cannot estimate all three and
# reports the f_avg coefficient as NA ("not defined because of
# singularities").
model1 <- lm(
  formula = salary ~ age + work_yrs + mb_avg + gmat_tot + sex + satis +
    gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg,
  data = placed
)
summary(model1)
##
## Call:
## lm(formula = salary ~ age + work_yrs + mb_avg + gmat_tot + sex +
## satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27974 -7315 -119 5041 72348
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57744.917 50742.159 1.138 0.2581
## age 2314.479 1013.453 2.284 0.0247 *
## work_yrs 407.556 1091.395 0.373 0.7097
## mb_avg -2606.422 7640.220 -0.341 0.7338
## gmat_tot -2.674 170.717 -0.016 0.9875
## sex -3128.430 3495.714 -0.895 0.3732
## satis -1362.352 2070.020 -0.658 0.5121
## gmat_vpc 551.916 489.787 1.127 0.2627
## gmat_qpc 836.303 491.191 1.703 0.0920 .
## gmat_tpc -1434.011 712.038 -2.014 0.0469 *
## s_avg 4777.674 7525.889 0.635 0.5271
## f_avg NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 92 degrees of freedom
## Multiple R-squared: 0.3276, Adjusted R-squared: 0.2546
## F-statistic: 4.483 on 10 and 92 DF, p-value: 3.734e-05
# Named vector of the fitted coefficients (NA for the aliased f_avg term).
coef(model1)
## (Intercept) age work_yrs mb_avg gmat_tot sex
## 57744.91736 2314.47880 407.55619 -2606.42188 -2.67356 -3128.42951
## satis gmat_vpc gmat_qpc gmat_tpc s_avg f_avg
## -1362.35189 551.91609 836.30336 -1434.01111 4777.67360 NA
Initially, a model for salary is fitted using as many variables as possible, leaving out a few based on our previous hypothesis testing. So the first model is created with the predictor variables age + work_yrs + mb_avg + gmat_tot + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg.
Its adjusted R-squared of 0.2546 is noted. I then removed one variable at a time in subsequent steps and kept refitting the regression until the maximum adjusted R-squared was obtained, which is taken as the best fit.
## Fitting Multiple Regression With y = f(x1, x2, x3, ..., x9)
## Multi-variable Regression Line
# (Re)compute the overall MBA average used by the models below.
placed$mb_avg <- with(placed, (s_avg + f_avg) / 2)

# model1 without gmat_tot and without the aliased f_avg term.
modelA <- lm(
  formula = salary ~ age + work_yrs + mb_avg + sex + satis + gmat_vpc +
    gmat_qpc + gmat_tpc + s_avg,
  data = placed
)
summary(modelA)
##
## Call:
## lm(formula = salary ~ age + work_yrs + mb_avg + sex + satis +
## gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27980 -7307 -92 5027 72329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57125.6 31624.0 1.806 0.0741 .
## age 2313.4 1005.5 2.301 0.0236 *
## work_yrs 407.9 1085.3 0.376 0.7079
## mb_avg -2625.6 7500.5 -0.350 0.7271
## sex -3135.1 3451.2 -0.908 0.3660
## satis -1356.3 2022.4 -0.671 0.5041
## gmat_vpc 546.7 358.6 1.524 0.1308
## gmat_qpc 831.1 358.6 2.318 0.0227 *
## gmat_tpc -1435.8 699.3 -2.053 0.0429 *
## s_avg 4779.9 7484.0 0.639 0.5246
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15340 on 93 degrees of freedom
## Multiple R-squared: 0.3276, Adjusted R-squared: 0.2626
## F-statistic: 5.036 on 9 and 93 DF, p-value: 1.573e-05
# modelA without mb_avg.
modelB <- lm(
  formula = salary ~ age + work_yrs + sex + satis + gmat_vpc + gmat_qpc +
    gmat_tpc + s_avg,
  data = placed
)
summary(modelB)
##
## Call:
## lm(formula = salary ~ age + work_yrs + sex + satis + gmat_vpc +
## gmat_qpc + gmat_tpc + s_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27658 -7463 -293 4791 71831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55014.6 30898.5 1.780 0.0782 .
## age 2336.0 998.7 2.339 0.0215 *
## work_yrs 446.0 1074.8 0.415 0.6791
## sex -3281.6 3409.7 -0.962 0.3383
## satis -1357.9 2012.9 -0.675 0.5016
## gmat_vpc 539.6 356.4 1.514 0.1334
## gmat_qpc 818.4 355.1 2.305 0.0234 *
## gmat_tpc -1415.7 693.7 -2.041 0.0441 *
## s_avg 2620.6 4218.4 0.621 0.5359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15270 on 94 degrees of freedom
## Multiple R-squared: 0.3268, Adjusted R-squared: 0.2695
## F-statistic: 5.703 on 8 and 94 DF, p-value: 6.544e-06
# modelB without work_yrs.
modelC <- lm(
  formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc +
    s_avg,
  data = placed
)
summary(modelC)
##
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc +
## gmat_tpc + s_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27509 -7439 -100 4348 72056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47208.3 24404.0 1.934 0.0560 .
## age 2697.3 486.8 5.541 2.68e-07 ***
## sex -3200.4 3389.2 -0.944 0.3474
## satis -1394.6 2002.2 -0.697 0.4878
## gmat_vpc 526.8 353.5 1.490 0.1394
## gmat_qpc 806.5 352.4 2.289 0.0243 *
## gmat_tpc -1397.3 689.2 -2.027 0.0454 *
## s_avg 2710.2 4194.4 0.646 0.5198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15210 on 95 degrees of freedom
## Multiple R-squared: 0.3255, Adjusted R-squared: 0.2758
## F-statistic: 6.55 on 7 and 95 DF, p-value: 2.59e-06
# modelC without s_avg.
modelD <- lm(
  formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelD)
##
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc +
## gmat_tpc, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25949 -7801 -299 5037 70273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53662.6 22198.3 2.417 0.0175 *
## age 2759.8 475.6 5.803 8.38e-08 ***
## sex -3002.5 3365.1 -0.892 0.3745
## satis -1640.2 1959.8 -0.837 0.4047
## gmat_vpc 516.1 352.0 1.466 0.1459
## gmat_qpc 788.6 350.2 2.252 0.0266 *
## gmat_tpc -1353.4 683.8 -1.979 0.0507 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15160 on 96 degrees of freedom
## Multiple R-squared: 0.3226, Adjusted R-squared: 0.2802
## F-statistic: 7.618 on 6 and 96 DF, p-value: 1.057e-06
# modelD without satis.
modelE <- lm(
  formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelE)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28067 -8243 10 5811 69769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45736.1 20045.4 2.282 0.0247 *
## age 2723.1 472.9 5.759 9.95e-08 ***
## sex -2753.2 3346.7 -0.823 0.4127
## gmat_vpc 520.6 351.4 1.481 0.1418
## gmat_qpc 806.5 349.0 2.311 0.0230 *
## gmat_tpc -1387.1 681.6 -2.035 0.0446 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15140 on 97 degrees of freedom
## Multiple R-squared: 0.3176, Adjusted R-squared: 0.2824
## F-statistic: 9.03 on 5 and 97 DF, p-value: 4.494e-07
# modelE without sex: age plus the three GMAT percentile columns.
modelF <- lm(
  formula = salary ~ age + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelF)
##
## Call:
## lm(formula = salary ~ age + gmat_vpc + gmat_qpc + gmat_tpc, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29373 -8011 280 5705 67116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39383.0 18467.6 2.133 0.0355 *
## age 2791.1 464.8 6.005 3.25e-08 ***
## gmat_vpc 513.2 350.7 1.463 0.1467
## gmat_qpc 822.3 347.9 2.363 0.0201 *
## gmat_tpc -1383.8 680.4 -2.034 0.0447 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15110 on 98 degrees of freedom
## Multiple R-squared: 0.3129, Adjusted R-squared: 0.2848
## F-statistic: 11.16 on 4 and 98 DF, p-value: 1.691e-07
# Alternative specification: age, work experience, GMAT total and overall
# percentile, sex and the combined MBA average.
modelH <- lm(
  formula = salary ~ age + work_yrs + gmat_tot + sex + gmat_tpc + mb_avg,
  data = placed
)
summary(modelH)
##
## Call:
## lm(formula = salary ~ age + work_yrs + gmat_tot + sex + gmat_tpc +
## mb_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31851 -7752 -1585 6885 75610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16870.1 38025.6 0.444 0.6583
## age 2160.3 1008.4 2.142 0.0347 *
## work_yrs 482.3 1091.0 0.442 0.6594
## gmat_tot 180.1 122.9 1.465 0.1461
## sex -4703.9 3446.0 -1.365 0.1754
## gmat_tpc -948.7 562.8 -1.686 0.0951 .
## mb_avg 618.0 4352.6 0.142 0.8874
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15570 on 96 degrees of freedom
## Multiple R-squared: 0.2853, Adjusted R-squared: 0.2407
## F-statistic: 6.389 on 6 and 96 DF, p-value: 1.096e-05
# Model I: same predictors as model H, without gmat_tot.
modelI <- lm(
  formula = salary ~ age + work_yrs + sex + gmat_tpc + mb_avg,
  data = placed
)
summary(modelI)
##
## Call:
## lm(formula = salary ~ age + work_yrs + sex + gmat_tpc + mb_avg,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31606 -7795 -2434 5953 81096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52297.7 29522.8 1.771 0.0796 .
## age 2304.0 1009.5 2.282 0.0247 *
## work_yrs 374.3 1094.9 0.342 0.7332
## sex -4276.7 3453.9 -1.238 0.2186
## gmat_tpc -151.0 143.6 -1.052 0.2955
## mb_avg 1937.7 4283.5 0.452 0.6520
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15660 on 97 degrees of freedom
## Multiple R-squared: 0.2694, Adjusted R-squared: 0.2317
## F-statistic: 7.152 on 5 and 97 DF, p-value: 9.811e-06
# Model J: salary regressed on age, work experience, GMAT total and mb_avg.
modelJ <- lm(
  formula = salary ~ age + work_yrs + gmat_tot + mb_avg,
  data = placed
)
summary(modelJ)
##
## Call:
## lm(formula = salary ~ age + work_yrs + gmat_tot + mb_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33021 -7914 -1891 4615 79250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44101.55 31514.13 1.399 0.1648
## age 2457.29 1008.22 2.437 0.0166 *
## work_yrs 316.81 1100.09 0.288 0.7740
## gmat_tot -18.51 31.50 -0.588 0.5581
## mb_avg 1089.07 4289.73 0.254 0.8001
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15750 on 98 degrees of freedom
## Multiple R-squared: 0.2534, Adjusted R-squared: 0.223
## F-statistic: 8.317 on 4 and 98 DF, p-value: 8.093e-06
# Model 2: model J with work_yrs dropped.
model2 <- lm(
  formula = salary ~ age + gmat_tot + mb_avg,
  data = placed
)
summary(model2)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + mb_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32905 -8104 -1887 4804 79345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39016.57 25981.91 1.502 0.136
## age 2712.82 476.56 5.692 1.28e-07 ***
## gmat_tot -19.54 31.15 -0.627 0.532
## mb_avg 1102.87 4269.55 0.258 0.797
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15680 on 99 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2302
## F-statistic: 11.16 on 3 and 99 DF, p-value: 2.248e-06
# Print the fitted coefficients of model 2 as a named vector.
model2[["coefficients"]]
## (Intercept) age gmat_tot mb_avg
## 39016.56653 2712.81802 -19.53809 1102.87444
# Diagnostic plots for previously fitted linear models.
plot(modelA)
# NOTE(review): modelC is plotted three times in a row, which redraws the same
# diagnostics; possibly other models were intended here -- confirm.
plot(modelC)
plot(modelC)
plot(modelC)
plot(model2)

# Model 3: salary on work experience and GMAT total. The trailing `- 1`
# removes the intercept, so the fit is forced through the origin.
model3 <- lm(
  formula = salary ~ work_yrs + gmat_tot - 1,
  data = placed
)
summary(model3)
##
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot - 1, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30428 -9691 -624 8110 97678
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## work_yrs 3264.289 579.553 5.632 1.61e-07 ***
## gmat_tot 146.716 4.449 32.976 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17920 on 101 degrees of freedom
## Multiple R-squared: 0.9712, Adjusted R-squared: 0.9706
## F-statistic: 1702 on 2 and 101 DF, p-value: < 2.2e-16
# Diagnostic plots for model 3, followed by its fitted coefficients.
plot(model3)
model3[["coefficients"]]
## work_yrs gmat_tot
## 3264.2887 146.7158
# Model 4: salary on work experience, GMAT total, sex, and age crossed with
# first language (`age * frstlang` expands to both main effects plus their
# interaction). The trailing `- 1` removes the intercept.
model4 <- lm(
  formula = salary ~ work_yrs + age * frstlang + gmat_tot + sex - 1,
  data = placed
)
summary(model4)
##
## Call:
## lm(formula = salary ~ work_yrs + age * frstlang + gmat_tot +
## sex - 1, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28406 -9496 -820 6174 69521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## work_yrs -958.87 1091.08 -0.879 0.381667
## age 2905.80 830.35 3.499 0.000706 ***
## frstlang -15290.15 25367.28 -0.603 0.548081
## gmat_tot 40.36 31.33 1.288 0.200705
## sex -2260.92 3407.03 -0.664 0.508518
## age:frstlang 794.41 797.85 0.996 0.321878
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15790 on 97 degrees of freedom
## Multiple R-squared: 0.9785, Adjusted R-squared: 0.9772
## F-statistic: 736.6 on 6 and 97 DF, p-value: < 2.2e-16
# Diagnostic plots for model 4, followed by its fitted coefficients.
plot(model4)
model4[["coefficients"]]
## work_yrs age frstlang gmat_tot sex
## -958.86681 2905.80137 -15290.15225 40.35853 -2260.92128
## age:frstlang
## 794.41173
# Model 5: salary on work experience and age (intercept included).
model5 <- lm(
  formula = salary ~ work_yrs + age,
  data = placed
)
summary(model5)
##
## Call:
## lm(formula = salary ~ work_yrs + age, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31675 -8099 -2108 4411 80650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36967.5 23323.8 1.585 0.1161
## work_yrs 388.8 1084.0 0.359 0.7206
## age 2413.8 997.4 2.420 0.0173 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15620 on 100 degrees of freedom
## Multiple R-squared: 0.2506, Adjusted R-squared: 0.2356
## F-statistic: 16.72 on 2 and 100 DF, p-value: 5.438e-07
# Diagnostic plots for model 5, followed by its fitted coefficients.
plot(model5)
model5[["coefficients"]]
## (Intercept) work_yrs age
## 36967.4546 388.8347 2413.7599
# Model 6: salary on work experience and sex (intercept included).
model6 <- lm(
  formula = salary ~ work_yrs + sex,
  data = placed
)
summary(model6)
##
## Call:
## lm(formula = salary ~ work_yrs + sex, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31896 -8086 -2076 4789 90595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99676.9 5267.7 18.922 < 2e-16 ***
## work_yrs 2630.0 525.7 5.003 2.42e-06 ***
## sex -4860.6 3433.4 -1.416 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15910 on 100 degrees of freedom
## Multiple R-squared: 0.2223, Adjusted R-squared: 0.2068
## F-statistic: 14.29 on 2 and 100 DF, p-value: 3.471e-06
# Diagnostic plots for model 6, followed by its fitted coefficients.
plot(model6)
model6[["coefficients"]]
## (Intercept) work_yrs sex
## 99676.944 2629.973 -4860.589
Multiple Regression Model 3 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and gmat_tot. -> The R^2 value is 0.9712, which looks very good; note, however, that this model is fitted without an intercept (the "- 1" in the formula), and for such models R measures R^2 against zero rather than against the mean salary, so this value is not comparable with the R^2 of the models that include an intercept. -> The model’s p-value (< 2.2e-16) is lower than the statistical significance level of 0.05, which indicates that we can safely reject the null hypothesis that the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
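Multiple regression model 4 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs, gmat_tot, age, frstlang and sex, together with the age:frstlang interaction. -> The R^2 value is 0.9785, which looks very good; note, however, that this model is also fitted without an intercept (the "- 1" in the formula), and for such models R measures R^2 against zero rather than against the mean salary, so this value is not comparable with the R^2 of the models that include an intercept. -> The model’s p-value (< 2.2e-16) is lower than the statistical significance level of 0.05, which indicates that we can safely reject the null hypothesis that the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).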
Multiple regression model 5 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and age. -> The R^2 value is 0.2506, which indicates a rather weak fit. -> The model’s p-value (5.438e-07) is lower than the statistical significance level of 0.05, which indicates that we can safely reject the null hypothesis that the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
Multiple regression model 6 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and sex. -> The R^2 value is 0.2223, which is also a rather weak fit (slightly lower than that of model 5). -> The model’s p-value (3.471e-06) is lower than the statistical significance level of 0.05, which indicates that we can safely reject the null hypothesis that the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
# Logistic model for the students who got placed: the response must be
# categorical, so convert sex to a factor and confirm the conversion.
placed[["sex"]] <- factor(placed[["sex"]])
is.factor(placed[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of `placed`
# (`sex ~ .` uses all remaining columns as predictors).
fit1 <- glm(
  formula = sex ~ .,
  family = binomial(link = "logit"),
  data = placed
)
summary(fit1)
##
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = placed)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4863 -0.7894 -0.5805 0.7626 2.3292
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1064455 8.2384884 0.013 0.9897
## age -0.3643742 0.1889782 -1.928 0.0538 .
## gmat_tot 0.0162521 0.0269925 0.602 0.5471
## gmat_qpc -0.0435054 0.0770321 -0.565 0.5722
## gmat_vpc 0.0084836 0.0780797 0.109 0.9135
## gmat_tpc -0.0561304 0.1181993 -0.475 0.6349
## s_avg 0.1751868 1.5508906 0.113 0.9101
## f_avg 1.5943945 1.0429927 1.529 0.1263
## quarter 0.2901630 0.4253040 0.682 0.4951
## work_yrs 0.2410914 0.1783851 1.352 0.1765
## frstlang 2.4111026 1.0665299 2.261 0.0238 *
## salary -0.0000184 0.0000191 -0.963 0.3353
## satis -0.2638553 0.3332759 -0.792 0.4285
## mb_avg NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 126.01 on 102 degrees of freedom
## Residual deviance: 107.49 on 90 degrees of freedom
## AIC: 133.49
##
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table for fit1: terms are added sequentially and each
# one is assessed with a chi-squared test.
anova(fit1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 102 126.01
## age 1 2.3856 101 123.62 0.12245
## gmat_tot 1 0.0744 100 123.55 0.78507
## gmat_qpc 1 4.1847 99 119.36 0.04079 *
## gmat_vpc 1 1.8543 98 117.51 0.17329
## gmat_tpc 1 0.0823 97 117.43 0.77423
## s_avg 1 0.4155 96 117.01 0.51919
## f_avg 1 2.1057 95 114.90 0.14675
## quarter 1 0.4742 94 114.43 0.49107
## work_yrs 1 0.5956 93 113.83 0.44026
## frstlang 1 4.6687 92 109.17 0.03072 *
## salary 1 1.0389 91 108.13 0.30808
## satis 1 0.6359 90 107.49 0.42521
## mb_avg 0 0.0000 90 107.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit1.
#
# Two fixes relative to the previous version:
#  * predict.glm() has no `data` argument (the name is `newdata`), so
#    `data = placed` was silently ignored. Leaving `newdata` out returns the
#    fitted values for the training rows, which is what is wanted here.
#  * With a factor response, a binomial glm() treats the first level as
#    "failure" and models the probability of the other level. The predicted
#    class must therefore be mapped onto the levels of placed$sex; mapping it
#    onto 0/1 and comparing with levels "1"/"2" gave a meaningless accuracy.
# The recorded output that follows this code predates the fix and has to be
# regenerated by re-running the analysis.
fitted.results <- predict(fit1, type = "response")
sex_levels <- levels(placed$sex)
fitted.results <- ifelse(fitted.results > 0.5, sex_levels[2], sex_levels[1])
misClasificError <- mean(fitted.results != as.character(placed$sex))
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.0485436893203883"
# Logistic model for the students who did not get placed: convert sex to a
# factor and confirm the conversion.
notplaced[["sex"]] <- factor(notplaced[["sex"]])
is.factor(notplaced[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of `notplaced`
# (`sex ~ .` uses all remaining columns as predictors).
fit2 <- glm(
  formula = sex ~ .,
  family = binomial(link = "logit"),
  data = notplaced
)
summary(fit2)
##
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = notplaced)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5451 -0.7582 -0.4838 0.6019 2.1976
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.39699 7.84772 1.707 0.0878 .
## age 0.05353 0.12071 0.443 0.6574
## gmat_tot -0.03439 0.02183 -1.576 0.1151
## gmat_qpc 0.02944 0.06260 0.470 0.6381
## gmat_vpc 0.10328 0.06711 1.539 0.1238
## gmat_tpc 0.03205 0.06128 0.523 0.6010
## s_avg -0.47864 1.17187 -0.408 0.6830
## f_avg -0.58170 0.57645 -1.009 0.3129
## quarter -0.49321 0.36673 -1.345 0.1787
## work_yrs -0.08643 0.14181 -0.609 0.5422
## frstlang -0.31776 1.29059 -0.246 0.8055
## salary NA NA NA NA
## satis -0.51118 0.40913 -1.249 0.2115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 102.304 on 89 degrees of freedom
## Residual deviance: 86.742 on 78 degrees of freedom
## AIC: 110.74
##
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table for fit2: terms are added sequentially and each
# one is assessed with a chi-squared test.
anova(fit2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 89 102.304
## age 1 0.4712 88 101.833 0.49244
## gmat_tot 1 0.3130 87 101.520 0.57585
## gmat_qpc 1 4.3705 86 97.150 0.03657 *
## gmat_vpc 1 5.0395 85 92.110 0.02478 *
## gmat_tpc 1 0.6560 84 91.454 0.41798
## s_avg 1 0.0490 83 91.405 0.82487
## f_avg 1 0.5497 82 90.855 0.45844
## quarter 1 1.6354 81 89.220 0.20096
## work_yrs 1 0.8609 80 88.359 0.35348
## frstlang 1 0.0078 79 88.351 0.92960
## salary 0 0.0000 79 88.351
## satis 1 1.6093 78 86.742 0.20459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit2.
#
# Two fixes relative to the previous version:
#  * predict.glm() has no `data` argument (the name is `newdata`), so
#    `data = notplaced` was silently ignored. Leaving `newdata` out returns
#    the fitted values for the training rows, which is what is wanted here.
#  * With a factor response, a binomial glm() treats the first level as
#    "failure" and models the probability of the other level. The predicted
#    class must therefore be mapped onto the levels of notplaced$sex; mapping
#    it onto 0/1 and comparing with levels "1"/"2" gave a meaningless accuracy.
# The recorded output that follows this code predates the fix and has to be
# regenerated by re-running the analysis.
fitted.results <- predict(fit2, type = "response")
sex_levels <- levels(notplaced$sex)
fitted.results <- ifelse(fitted.results > 0.5, sex_levels[2], sex_levels[1])
misClasificError <- mean(fitted.results != as.character(notplaced$sex))
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.0444444444444444"
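In logistic regression the dependent variable must be categorical, so we chose sex as the dependent variable. -> The model is fitted separately for the MBA students who were placed and for those who were not placed. -> The reported accuracies of the two models are very low: 4.85% and 4.44% respectively. Note that these two figures come from comparing predictions coded 0/1 with sex coded 1/2, so they do not measure the true classification accuracy of the models.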