The data set we are dealing with here is a classification data set about MBA students: their starting salaries, GMAT scores and percentiles, age, sex, etc. Some of the students answered the survey and reported a salary, some answered but did not reveal their salary amount, and a few did not respond to the survey at all.
# Load the MBA starting-salaries survey data and take a first look at it.
# NOTE(review): the working directory is hard-coded to one machine; adjust
# this path (or run from the data directory) when reproducing elsewhere.
setwd("C:\\Users\\PSrikanth\\Documents\\Internship")
# The file name is a single literal, so no paste() is needed to build it.
MBA.df <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a spreadsheet-style viewer, which is only useful when a user
# is present; skip it in batch or knitted runs.
if (interactive()) {
  View(MBA.df)
}
# Five-number summary plus mean for every column.
summary(MBA.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
# Extended per-column descriptive statistics from the psych package.
psych::describe(x = MBA.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Horizontal boxplots of salary, one box per gender.
# `sex` is coded 1 = male, 2 = female (the coding given in the axis titles of
# the gender scatter plots in this file), so group 1 must be labelled "M" and
# group 2 "F"; the two labels were previously swapped.
# NOTE(review): salary here still contains values such as 0 and 999, which the
# rest of the analysis does not treat as real salaries (only salary > 999 is).
boxplot(salary ~ sex, data = MBA.df, horizontal = TRUE, yaxt = "n",
        ylab = "Gender", xlab = "Salary",
        main = "Salaries of Males and Females")
# The default group axis was suppressed with yaxt = "n"; draw it by hand.
axis(side = 2, at = c(1, 2), labels = c("M", "F"))
# Split the respondents using the salary column:
#   placed    - rows whose salary is above 999
#   PLACED    - only the salary values of those rows
#   Notplaced - rows whose salary is exactly 0
placed <- subset(MBA.df, salary > 999)
PLACED <- with(MBA.df, salary[salary > 999])
Notplaced <- subset(MBA.df, salary == 0)
# Boxplots of all 13 columns on a log(x + 1) scale, so that variables with
# very different ranges fit on a single chart.
MBA1 <- MBA.df[, 1:13]
MBA2 <- log(MBA1 + 1)
# The boxes are vertical: the x-axis carries the column names and the y-axis
# the transformed values. The two axis titles were previously swapped.
boxplot(MBA2, xlab = "Parameters", ylab = "Value",
        main = "Boxplot Representation of fields")
# Distribution of the total GMAT score as a single horizontal boxplot.
boxplot(
  MBA.df$gmat_tot,
  main = "BoxPlot Representation Of GMAT Score",
  xlab = "GMAT Score",
  horizontal = TRUE
)

# Quantitative, verbal and overall GMAT percentiles: three panels in one row.
par(mfrow = c(1, 3))
boxplot(MBA.df$gmat_qpc, ylab = "Percentage %",
        main = "GMAT percentage quantitive")
boxplot(MBA.df$gmat_vpc, ylab = "Percentage %",
        main = "GMAT percentage verbal")
boxplot(MBA.df$gmat_tpc, ylab = "Percentage %",
        main = "GMAT percentage total")
# Spring MBA average: histogram, boxplot and per-student bar chart in one row.
# The three-panel layout is set explicitly instead of being inherited from
# whatever par() call happened to run before this point.
par(mfrow = c(1, 3))
hist(MBA.df$s_avg, xlab = "Average", ylab = "Frequency",
     main = "Spring MBA average", col = c("pink", "red", "purple", "navy"))
# The boxplot's value axis shows the average itself, not a frequency.
boxplot(MBA.df$s_avg, main = "Spring MBA average", ylab = "Average")
# barplot() on the raw column draws one bar per student whose height is that
# student's average, so the axes are "Student" and "Average".
barplot(MBA.df$s_avg, main = "Spring MBA average",
        xlab = "Student", ylab = "Average")

# Fall MBA average on its own.
boxplot(MBA.df$f_avg, main = "Fall MBA Average", ylab = "Average")

# Fall MBA average: the same three views side by side.
par(mfrow = c(1, 3))
hist(MBA.df$f_avg, xlab = "Fall Average", ylab = "Frequency",
     main = "Fall MBA average", col = c("maroon", "navy", "purple", "cyan"))
boxplot(MBA.df$f_avg, main = "Fall MBA average", ylab = "Average")
barplot(MBA.df$f_avg, main = "Fall MBA average",
        xlab = "Student", ylab = "Average")
# Working years
par(mfrow = c(1, 1))
# Axis title typo fixed ("Working Yeats" -> "Working Years").
boxplot(MBA.df$work_yrs, horizontal = TRUE, xlab = "Working Years",
        main = " Working experience in years")
# Number of respondents with salary > 999 (TRUE) versus the rest (FALSE).
table(MBA.df$salary > 999)
##
## FALSE TRUE
## 171 103
# Starting salary of the placed students.
# Plot the salary column only: passing the whole `placed` data frame drew one
# box per column on a shared axis, which is not a salary plot.
boxplot(placed$salary, horizontal = TRUE, xlab = "Salary",
        main = "Boxplot presentation of starting salary ")

# Number of respondents at each age (all respondents).
count <- table(MBA.df$age)
barplot(count, main = "Barplot for age", xlab = "Age in Years")

# Age of the placed students: histogram, boxplot and per-student bars.
par(mfrow = c(1, 3))
hist(placed$age, xlab = "Age", ylab = "Frequency", main = "AGE",
     col = c("red", "black", "pink", "yellow"))
# The boxplot's value axis is age, not a frequency.
boxplot(placed$age, main = "AGE", ylab = "Age")
# One bar per student, bar height = that student's age.
barplot(placed$age, main = "AGE", xlab = "Student", ylab = "Age")
# Share of each gender among all respondents.
# Each frequency table is computed once and reused for its pie chart instead
# of being tabulated a second time inside the pie() call.
count1 <- table(MBA.df$sex)
par(mfrow = c(1, 1))
pie(count1, col = c("red", "blue"), main = "Gender Split Up")

# Share of each quartile ranking among all respondents.
count2 <- table(MBA.df$quarter)
par(mfrow = c(1, 1))
pie(count2, col = c("pink", "purple", "black", "cyan"),
    main = "Quartile Ranking")

# Quartile ranking of the placed students: histogram, boxplot, per-student bars.
par(mfrow = c(1, 3))
hist(placed$quarter, xlab = "quarter", ylab = "Frequency",
     main = "Quartile Ranking", col = c("pink", "blue", "black", "yellow"))
boxplot(placed$quarter, main = "quarter", xlab = "quarter",
        ylab = "Quartile Ranking")
# One bar per student, so the x-axis is the student, not the quarter.
barplot(placed$quarter, main = "quarter", xlab = "Student",
        ylab = "Quartile Ranking")

# Share of each first-language code among all respondents.
count3 <- table(MBA.df$frstlang)
par(mfrow = c(1, 1))
pie(count3, col = c("maroon", "yellow"), main = "First Language")

# Satisfaction ratings, leaving out rows whose satis value is 998 or more.
count4 <- table(MBA.df$satis[MBA.df$satis < 998])
par(mfrow = c(1, 1))
pie(count4,
    col = c("red", "yellow", "violet", "black", "blue", "green", "orange"),
    main = "Degree of Satisfaction with MBA Program ")

# Satisfaction of the placed students: histogram, boxplot, per-student bars.
par(mfrow = c(1, 3))
hist(placed$satis, xlab = "satis", ylab = "Frequency",
     main = "Degree of satisfaction", col = c("red", "blue", "green", "yellow"))
# The value axis of the boxplot and the bar heights are satisfaction scores,
# not frequencies.
boxplot(placed$satis, main = "Degree of satisfaction", ylab = "satis")
barplot(placed$satis, main = "Degree of satisfaction",
        xlab = "Student", ylab = "satis")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Salary against age, work experience and GMAT percentile - all respondents.
scatterplot(MBA.df$salary, MBA.df$age,
            main = "Salary of MBAs with Age",
            xlab = "Salary", ylab = "Age in years", cex = 1.1, pch = 19)
scatterplot(MBA.df$salary, MBA.df$work_yrs,
            main = "Salary of MBAs with Work experience",
            xlab = "Salary", ylab = "Work experience in years",
            cex = 1.1, pch = 19)
scatterplot(MBA.df$salary, MBA.df$gmat_tpc,
            main = "Salary of MBAs with GMAT Percentile",
            xlab = "Salary", ylab = "GMAT Percentile %", cex = 1.1, pch = 19)

# The same three plots restricted to the placed students.
scatterplot(placed$salary, placed$age,
            main = "Salary of MBAs with Age",
            xlab = "Salary", ylab = "Age in years", cex = 1.1, pch = 19)
scatterplot(placed$salary, placed$work_yrs,
            main = "Salary of MBAs with Work experience",
            xlab = "Salary", ylab = "Work experience in years",
            cex = 1.1, pch = 19)
scatterplot(placed$salary, placed$gmat_tpc,
            main = "Salary of MBAs with GMAT Percentile",
            xlab = "Salary", ylab = "GMAT Percentile %", cex = 1.1, pch = 19)

# Salary against the coded variables; jitter() spreads out the points that
# would otherwise sit on top of each other. Placed students first.
plot(jitter(placed$sex), jitter(placed$salary),
     main = "Salary of MBAs with Sex",
     xlab = "Gender Male(1) Female(2)", ylab = "Salary", cex = 1.1)
# Axis title typo fixed ("FIrst" -> "First").
plot(jitter(placed$frstlang), jitter(placed$salary),
     main = "Salary of MBAs with First Language",
     xlab = "First Language English(1) Other(2)", ylab = "Salary", cex = 1.1)
plot(jitter(placed$satis), jitter(placed$salary),
     main = "Salary of MBAs with degree of satisfaction",
     xlab = "Degree of Satisfaction out of 7", ylab = "Salary", cex = 1.1)

# The same three plots for all respondents.
plot(jitter(MBA.df$sex), jitter(MBA.df$salary),
     main = "Salary of MBAs with Sex",
     xlab = "Gender Male(1) Female(2)", ylab = "Salary", cex = 1.1)
plot(jitter(MBA.df$frstlang), jitter(MBA.df$salary),
     main = "Salary of MBAs with First Language",
     xlab = "First Language English(1) Other(2)", ylab = "Salary", cex = 1.1)
# Rows with satis >= 998 are left out, as in the satisfaction pie chart in
# this file; otherwise that value stretches the "out of 7" axis to 998.
# NOTE(review): presumably 998 is a "no answer" code - confirm against the
# data dictionary.
with(MBA.df[MBA.df$satis < 998, ],
     plot(jitter(satis), jitter(salary),
          main = "Salary of MBAs with degree of satisfaction",
          xlab = "Degree of Satisfaction out of 7", ylab = "Salary",
          cex = 1.1))
# Pairwise correlations between all columns of `placed`, with p-values.
corr.test(x = placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.14 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## sex -0.14 1.00 -0.02 -0.15 0.05 -0.05 0.08 0.17
## gmat_tot -0.08 -0.02 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.15 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 0.05 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.05 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.08 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 0.17 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.02 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## work_yrs 0.88 -0.09 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## frstlang 0.35 0.08 -0.13 0.01 -0.22 -0.16 -0.14 -0.05
## salary 0.50 -0.17 -0.09 0.01 -0.14 -0.13 0.10 -0.11
## satis 0.11 -0.09 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter work_yrs frstlang salary satis
## age -0.13 0.88 0.35 0.50 0.11
## sex -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.11 -0.12 -0.13 -0.09 0.06
## gmat_qpc 0.01 -0.18 0.01 0.01 0.00
## gmat_vpc -0.13 -0.03 -0.22 -0.14 0.15
## gmat_tpc -0.10 -0.13 -0.16 -0.13 0.12
## s_avg -0.84 0.16 -0.14 0.10 -0.14
## f_avg -0.43 -0.22 -0.05 -0.11 -0.12
## quarter 1.00 -0.13 0.11 -0.13 0.23
## work_yrs -0.13 1.00 0.20 0.45 0.06
## frstlang 0.11 0.20 1.00 0.27 0.09
## salary -0.13 0.45 0.27 1.00 -0.04
## satis 0.23 0.06 0.09 -0.04 1.00
## Sample Size
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## sex 0.15 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## gmat_tot 0.43 0.84 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## gmat_qpc 0.10 0.14 0.00 0.00 1.00 0.00 1.00 1.00 1.00
## gmat_vpc 0.86 0.59 0.00 0.34 0.00 0.00 1.00 1.00 1.00
## gmat_tpc 0.33 0.64 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## s_avg 0.11 0.42 0.08 0.88 0.11 0.16 0.00 0.00 0.00
## f_avg 0.03 0.09 0.22 0.32 0.82 0.48 0.00 0.00 0.00
## quarter 0.21 0.83 0.29 0.90 0.20 0.32 0.00 0.00 0.00
## work_yrs 0.00 0.35 0.22 0.06 0.78 0.18 0.10 0.03 0.19
## frstlang 0.00 0.45 0.19 0.89 0.03 0.10 0.16 0.61 0.27
## salary 0.00 0.09 0.36 0.89 0.17 0.18 0.31 0.29 0.20
## satis 0.28 0.36 0.52 0.97 0.13 0.24 0.15 0.24 0.02
## work_yrs frstlang salary satis
## age 0.00 0.02 0.00 1
## sex 1.00 1.00 1.00 1
## gmat_tot 1.00 1.00 1.00 1
## gmat_qpc 1.00 1.00 1.00 1
## gmat_vpc 1.00 1.00 1.00 1
## gmat_tpc 1.00 1.00 1.00 1
## s_avg 1.00 1.00 1.00 1
## f_avg 1.00 1.00 1.00 1
## quarter 1.00 1.00 1.00 1
## work_yrs 0.00 1.00 0.00 1
## frstlang 0.05 0.00 0.42 1
## salary 0.00 0.01 0.00 1
## satis 0.53 0.37 0.69 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
# x: all thirteen variables; y: the four variables of interest.
# cor(x, y) then gives one row per variable in x and one column per
# variable in y.
x <- placed[c(
  "age", "sex", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "frstlang", "salary", "satis"
)]
y <- placed[c("salary", "gmat_tpc", "work_yrs", "satis")]
cor(x = x, y = y)
## salary gmat_tpc work_yrs satis
## age 0.49964284 -0.09609156 0.88052470 0.108323083
## sex -0.16628869 -0.04686981 -0.09233003 -0.091995338
## gmat_tot -0.09067141 0.96680810 -0.12280018 0.064742057
## gmat_qpc 0.01414130 0.65865003 -0.18270126 -0.003984632
## gmat_vpc -0.13743230 0.78443167 -0.02812182 0.148634805
## gmat_tpc -0.13201783 1.00000000 -0.13246963 0.116308417
## s_avg 0.10173175 0.13938500 0.16328236 -0.143565573
## f_avg -0.10603897 0.07051391 -0.21633018 -0.117733043
## quarter -0.12848526 -0.09955033 -0.12896722 0.225119851
## work_yrs 0.45466634 -0.13246963 1.00000000 0.062999256
## frstlang 0.26701953 -0.16437561 0.19627277 0.089834769
## salary 1.00000000 -0.13201783 0.45466634 -0.040050600
## satis -0.04005060 0.11630842 0.06299926 1.000000000
# Covariance of every column of x with every column of y.
cov(x = x, y = y)
## salary gmat_tpc work_yrs satis
## age 2.921052e+04 -3.460213e+00 8.6728536 0.27765087
## sex -1.369577e+03 -2.377689e-01 -0.1281173 -0.03321911
## gmat_tot -8.212449e+04 5.393623e+02 -18.7388159 2.57091186
## gmat_qpc 3.382438e+03 9.703607e+01 -7.3624595 -0.04178565
## gmat_vpc -3.964803e+04 1.393882e+02 -1.3668380 1.87997335
## gmat_tpc -2.596339e+04 1.211342e+02 -4.3892062 1.00285551
## s_avg 6.880204e+02 5.806292e-01 0.1860480 -0.04256901
## f_avg -9.241129e+02 3.785056e-01 -0.3176271 -0.04498382
## quarter -2.571117e+03 -1.227013e+00 -0.4347992 0.19750619
## work_yrs 2.445820e+04 -4.389206e+00 9.0630116 0.14858176
## frstlang 1.206714e+03 -4.575481e-01 0.1494384 0.01779935
## salary 3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis -5.606583e+02 1.002856e+00 0.1485818 0.61374453
# With two arguments var() returns the covariance between the columns of x
# and the columns of y.
var(x = x, y = y)
## salary gmat_tpc work_yrs satis
## age 2.921052e+04 -3.460213e+00 8.6728536 0.27765087
## sex -1.369577e+03 -2.377689e-01 -0.1281173 -0.03321911
## gmat_tot -8.212449e+04 5.393623e+02 -18.7388159 2.57091186
## gmat_qpc 3.382438e+03 9.703607e+01 -7.3624595 -0.04178565
## gmat_vpc -3.964803e+04 1.393882e+02 -1.3668380 1.87997335
## gmat_tpc -2.596339e+04 1.211342e+02 -4.3892062 1.00285551
## s_avg 6.880204e+02 5.806292e-01 0.1860480 -0.04256901
## f_avg -9.241129e+02 3.785056e-01 -0.3176271 -0.04498382
## quarter -2.571117e+03 -1.227013e+00 -0.4347992 0.19750619
## work_yrs 2.445820e+04 -4.389206e+00 9.0630116 0.14858176
## frstlang 1.206714e+03 -4.575481e-01 0.1494384 0.01779935
## salary 3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis -5.606583e+02 1.002856e+00 0.1485818 0.61374453
# Same correlation test on `placed` as run earlier in this file, repeated
# ahead of the correlation plots.
corr.test(x = placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.14 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## sex -0.14 1.00 -0.02 -0.15 0.05 -0.05 0.08 0.17
## gmat_tot -0.08 -0.02 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.15 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 0.05 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.05 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.08 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 0.17 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.02 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## work_yrs 0.88 -0.09 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## frstlang 0.35 0.08 -0.13 0.01 -0.22 -0.16 -0.14 -0.05
## salary 0.50 -0.17 -0.09 0.01 -0.14 -0.13 0.10 -0.11
## satis 0.11 -0.09 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter work_yrs frstlang salary satis
## age -0.13 0.88 0.35 0.50 0.11
## sex -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.11 -0.12 -0.13 -0.09 0.06
## gmat_qpc 0.01 -0.18 0.01 0.01 0.00
## gmat_vpc -0.13 -0.03 -0.22 -0.14 0.15
## gmat_tpc -0.10 -0.13 -0.16 -0.13 0.12
## s_avg -0.84 0.16 -0.14 0.10 -0.14
## f_avg -0.43 -0.22 -0.05 -0.11 -0.12
## quarter 1.00 -0.13 0.11 -0.13 0.23
## work_yrs -0.13 1.00 0.20 0.45 0.06
## frstlang 0.11 0.20 1.00 0.27 0.09
## salary -0.13 0.45 0.27 1.00 -0.04
## satis 0.23 0.06 0.09 -0.04 1.00
## Sample Size
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## sex 0.15 0.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## gmat_tot 0.43 0.84 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## gmat_qpc 0.10 0.14 0.00 0.00 1.00 0.00 1.00 1.00 1.00
## gmat_vpc 0.86 0.59 0.00 0.34 0.00 0.00 1.00 1.00 1.00
## gmat_tpc 0.33 0.64 0.00 0.00 0.00 0.00 1.00 1.00 1.00
## s_avg 0.11 0.42 0.08 0.88 0.11 0.16 0.00 0.00 0.00
## f_avg 0.03 0.09 0.22 0.32 0.82 0.48 0.00 0.00 0.00
## quarter 0.21 0.83 0.29 0.90 0.20 0.32 0.00 0.00 0.00
## work_yrs 0.00 0.35 0.22 0.06 0.78 0.18 0.10 0.03 0.19
## frstlang 0.00 0.45 0.19 0.89 0.03 0.10 0.16 0.61 0.27
## salary 0.00 0.09 0.36 0.89 0.17 0.18 0.31 0.29 0.20
## satis 0.28 0.36 0.52 0.97 0.13 0.24 0.15 0.24 0.02
## work_yrs frstlang salary satis
## age 0.00 0.02 0.00 1
## sex 1.00 1.00 1.00 1
## gmat_tot 1.00 1.00 1.00 1
## gmat_qpc 1.00 1.00 1.00 1
## gmat_vpc 1.00 1.00 1.00 1
## gmat_tpc 1.00 1.00 1.00 1
## s_avg 1.00 1.00 1.00 1
## f_avg 1.00 1.00 1.00 1
## quarter 1.00 1.00 1.00 1
## work_yrs 0.00 1.00 0.00 1
## frstlang 0.05 0.00 0.42 1
## salary 0.00 0.01 0.00 1
## satis 0.53 0.37 0.69 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix of the first 13 columns of `placed`, drawn as ellipses.
corrplot(
  corr = cor(placed[, 1:13], use = "complete.obs"),
  method = "ellipse"
)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Mixed correlation plot for `placed`: numbers in the lower triangle, circles
# in the upper triangle.
corrplot.mixed(
  corr = cor(placed[, 1:13], use = "complete.obs"),
  lower = "number",
  upper = "circle",
  tl.pos = c("d", "lt", "n"),
  diag = c("n", "l", "u"),
  bg = "white",
  addgrid.col = "grey",
  lower.col = NULL,
  upper.col = NULL
)
library(corrgram)
# Corrgram for the placed students: shaded cells below the diagonal, pies
# above it.
corrgram(
  placed,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)

# The same three correlation views for the complete data set.
corrplot(
  corr = cor(MBA.df[, 1:13], use = "complete.obs"),
  method = "ellipse"
)
corrplot.mixed(
  corr = cor(MBA.df[, 1:13], use = "complete.obs"),
  lower = "number",
  upper = "circle",
  tl.pos = c("d", "lt", "n"),
  diag = c("n", "l", "u"),
  bg = "white",
  addgrid.col = "grey",
  lower.col = NULL,
  upper.col = NULL
)
corrgram(
  MBA.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)
library(ggvis)
# Interactive scatter plots of salary against one variable at a time, with
# the points filled by satisfaction score. Placed students first.
placed %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
# This line previously repeated the gmat_vpc plot; it now plots gmat_tot so
# that the placed series matches the all-respondents series below.
placed %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()

# The same series for all respondents.
MBA.df %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()
Hypothesis: The factors ‘work_yrs’ and ‘satis’ are independent.
# Cross-tabulate work experience against satisfaction for the placed
# students, then test the two for independence.
mytable1 <- xtabs(formula = ~ work_yrs + satis, data = placed)
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 131.13, df = 44, p-value = 1.35e-10
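Since the p-value is less than 0.05, we can reject the null hypothesis of independence: among placed students, work experience and degree of satisfaction are associated, i.e. a student's work experience is related to his or her degree of satisfaction. Note, however, that R warns that the chi-squared approximation may be incorrect for this table, so the result should be treated with caution.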
# Cross-tabulate work experience against first language for the placed
# students, then test the two for independence.
mytable3 <- xtabs(formula = ~ work_yrs + frstlang, data = placed)
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 22.274, df = 11, p-value = 0.02233
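Since the p-value = 0.02233 (< 0.05), we can reject the null hypothesis: the parameters work_yrs and frstlang are not independent. That is, a student's work experience is related to his or her first language. As above, R warns that the chi-squared approximation may be incorrect for this table, so the result should be treated with caution.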
Hypothesis: There is no significant difference between the average salaries of males and females.
# One-sided Welch t-test on the placed students: is the mean salary of sex
# group 1 greater than that of group 2?
t.test(salary ~ sex, data = placed, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.09047
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -1527.96 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
Since the p-value = 0.09047 (> 0.05), we cannot reject the null hypothesis that there is no difference between the average salaries of males and females. In other words, the data do not give sufficient evidence that the average salary of males is greater than the average salary of females.
Hypothesis: There is no significant difference between the average salary of people who have English as their first language and the average salary of those who speak another language.
# One-sided Welch t-test on the placed students: is the mean salary of
# first-language group 1 greater than that of group 2?
t.test(salary ~ frstlang, data = placed, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.8476
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -51508.45 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
Since the p-value = 0.8476 (> 0.05), we cannot reject the null hypothesis that there is no difference between the average salary of students whose first language is English and the average salary of those who speak another language. In other words, the data do not give sufficient evidence that English-speaking students earn more on average; in this sample their mean salary is in fact the lower of the two.
Hypothesis: There is no significant difference in the average GMAT Percentile of males and the average GMAT percentile of females.
# One-sided Welch t-test on the placed students: is the mean GMAT percentile
# of sex group 1 greater than that of group 2?
t.test(gmat_tpc ~ sex, data = placed, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: gmat_tpc by sex
## t = 0.43873, df = 48.83, p-value = 0.3314
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -3.157889 Inf
## sample estimates:
## mean in group 1 mean in group 2
## 84.86111 83.74194
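Since the p-value = 0.3314 (> 0.05), we cannot reject the null hypothesis that there is no difference between the average GMAT percentile of males and that of females. In other words, the data do not give sufficient evidence that the average GMAT percentile of males is greater than the average GMAT percentile of females.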
# Overall MBA average: the mean of the spring and fall averages.
placed$mb_avg <- with(placed, (s_avg + f_avg) / 2)

# First and widest salary model.
# NOTE(review): mb_avg is an exact linear combination of s_avg and f_avg, so
# the three cannot all be estimated together - the f_avg coefficient is
# reported as NA.
model1 <- lm(
  formula = salary ~ age + work_yrs + mb_avg + gmat_tot + sex + satis +
    gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg,
  data = placed
)
coef(model1)
## (Intercept) age work_yrs mb_avg gmat_tot sex
## 57744.91736 2314.47880 407.55619 -2606.42188 -2.67356 -3128.42951
## satis gmat_vpc gmat_qpc gmat_tpc s_avg f_avg
## -1362.35189 551.91609 836.30336 -1434.01111 4777.67360 NA
Initially, a model for salary is fitted with as many predictor variables as possible, leaving out only a few on the basis of our previous hypothesis testing. So the first model is created with the predictor variables age + work_yrs + mb_avg + gmat_tot + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg.
Its adjusted R-squared of 0.2546 is noted. We then remove variables one at a time in subsequent steps and refit the regression, continuing until the maximum adjusted R-squared is obtained; that model is taken as the best fit for the regression.
# Overall MBA average: the mean of the spring and fall averages.
placed$mb_avg <- with(placed, (s_avg + f_avg) / 2)

# Model A: the first model without gmat_tot and f_avg.
modelA <- lm(
  formula = salary ~ age + work_yrs + mb_avg + sex + satis +
    gmat_vpc + gmat_qpc + gmat_tpc + s_avg,
  data = placed
)
summary(modelA)
##
## Call:
## lm(formula = salary ~ age + work_yrs + mb_avg + sex + satis +
## gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27980 -7307 -92 5027 72329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57125.6 31624.0 1.806 0.0741 .
## age 2313.4 1005.5 2.301 0.0236 *
## work_yrs 407.9 1085.3 0.376 0.7079
## mb_avg -2625.6 7500.5 -0.350 0.7271
## sex -3135.1 3451.2 -0.908 0.3660
## satis -1356.3 2022.4 -0.671 0.5041
## gmat_vpc 546.7 358.6 1.524 0.1308
## gmat_qpc 831.1 358.6 2.318 0.0227 *
## gmat_tpc -1435.8 699.3 -2.053 0.0429 *
## s_avg 4779.9 7484.0 0.639 0.5246
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15340 on 93 degrees of freedom
## Multiple R-squared: 0.3276, Adjusted R-squared: 0.2626
## F-statistic: 5.036 on 9 and 93 DF, p-value: 1.573e-05
# Model B: model A without mb_avg.
modelB <- lm(
  formula = salary ~ age + work_yrs + sex + satis +
    gmat_vpc + gmat_qpc + gmat_tpc + s_avg,
  data = placed
)
summary(modelB)
##
## Call:
## lm(formula = salary ~ age + work_yrs + sex + satis + gmat_vpc +
## gmat_qpc + gmat_tpc + s_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27658 -7463 -293 4791 71831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55014.6 30898.5 1.780 0.0782 .
## age 2336.0 998.7 2.339 0.0215 *
## work_yrs 446.0 1074.8 0.415 0.6791
## sex -3281.6 3409.7 -0.962 0.3383
## satis -1357.9 2012.9 -0.675 0.5016
## gmat_vpc 539.6 356.4 1.514 0.1334
## gmat_qpc 818.4 355.1 2.305 0.0234 *
## gmat_tpc -1415.7 693.7 -2.041 0.0441 *
## s_avg 2620.6 4218.4 0.621 0.5359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15270 on 94 degrees of freedom
## Multiple R-squared: 0.3268, Adjusted R-squared: 0.2695
## F-statistic: 5.703 on 8 and 94 DF, p-value: 6.544e-06
# Model C: model B without work_yrs.
modelC <- lm(
  formula = salary ~ age + sex + satis +
    gmat_vpc + gmat_qpc + gmat_tpc + s_avg,
  data = placed
)
summary(modelC)
##
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc +
## gmat_tpc + s_avg, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27509 -7439 -100 4348 72056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47208.3 24404.0 1.934 0.0560 .
## age 2697.3 486.8 5.541 2.68e-07 ***
## sex -3200.4 3389.2 -0.944 0.3474
## satis -1394.6 2002.2 -0.697 0.4878
## gmat_vpc 526.8 353.5 1.490 0.1394
## gmat_qpc 806.5 352.4 2.289 0.0243 *
## gmat_tpc -1397.3 689.2 -2.027 0.0454 *
## s_avg 2710.2 4194.4 0.646 0.5198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15210 on 95 degrees of freedom
## Multiple R-squared: 0.3255, Adjusted R-squared: 0.2758
## F-statistic: 6.55 on 7 and 95 DF, p-value: 2.59e-06
# Model D: model C without s_avg.
modelD <- lm(
  formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelD)
##
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc +
## gmat_tpc, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25949 -7801 -299 5037 70273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53662.6 22198.3 2.417 0.0175 *
## age 2759.8 475.6 5.803 8.38e-08 ***
## sex -3002.5 3365.1 -0.892 0.3745
## satis -1640.2 1959.8 -0.837 0.4047
## gmat_vpc 516.1 352.0 1.466 0.1459
## gmat_qpc 788.6 350.2 2.252 0.0266 *
## gmat_tpc -1353.4 683.8 -1.979 0.0507 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15160 on 96 degrees of freedom
## Multiple R-squared: 0.3226, Adjusted R-squared: 0.2802
## F-statistic: 7.618 on 6 and 96 DF, p-value: 1.057e-06
# Model E: model D without satis.
modelE <- lm(
  formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelE)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28067 -8243 10 5811 69769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45736.1 20045.4 2.282 0.0247 *
## age 2723.1 472.9 5.759 9.95e-08 ***
## sex -2753.2 3346.7 -0.823 0.4127
## gmat_vpc 520.6 351.4 1.481 0.1418
## gmat_qpc 806.5 349.0 2.311 0.0230 *
## gmat_tpc -1387.1 681.6 -2.035 0.0446 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15140 on 97 degrees of freedom
## Multiple R-squared: 0.3176, Adjusted R-squared: 0.2824
## F-statistic: 9.03 on 5 and 97 DF, p-value: 4.494e-07
# Standard lm diagnostic plots for each of the models A to E, in order.
invisible(lapply(list(modelA, modelB, modelC, modelD, modelE), plot))
# Model 2: salary from age, total GMAT score and overall MBA average.
model2 <- lm(formula = salary ~ age + gmat_tot + mb_avg, data = placed)
coef(model2)
## (Intercept) age gmat_tot mb_avg
## 39016.56653 2712.81802 -19.53809 1102.87444
# Model 3: salary from work experience and total GMAT score.
# The "- 1" in the formula removes the intercept.
model3 <- lm(formula = salary ~ work_yrs + gmat_tot - 1, data = placed)
coef(model3)
## work_yrs gmat_tot
## 3264.2887 146.7158
# Model 4: adds age, first language, their interaction (age * frstlang),
# total GMAT score and sex. The "- 1" removes the intercept.
model4 <- lm(
  formula = salary ~ work_yrs + age * frstlang + gmat_tot + sex - 1,
  data = placed
)
coef(model4)
## work_yrs age frstlang gmat_tot sex
## -958.86681 2905.80137 -15290.15225 40.35853 -2260.92128
## age:frstlang
## 794.41173
# Model 5: salary from work experience and age.
model5 <- lm(formula = salary ~ work_yrs + age, data = placed)
coef(model5)
## (Intercept) work_yrs age
## 36967.4546 388.8347 2413.7599
# Model 6: salary from work experience and sex.
model6 <- lm(formula = salary ~ work_yrs + sex, data = placed)
coef(model6)
## (Intercept) work_yrs sex
## 99676.944 2629.973 -4860.589
Multiple Regression Model 3 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and gmat_tot, fitted without an intercept. -> The R^2 value is 0.9712, which looks like a very good model; note, however, that for a model without an intercept R^2 is measured relative to zero rather than to the mean salary, so it is not comparable with the R^2 of the models that include an intercept. -> The model’s p-value (< 2.2e-16) is lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
Multiple regression model 4 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs, gmat_tot, age, frstlang and sex, together with the age:frstlang interaction, fitted without an intercept. -> The R^2 value is 0.9785, which looks like a very good model; as with model 3, this R^2 comes from a model without an intercept and is therefore not comparable with the R^2 of the models that include one. -> The model’s p-value (< 2.2e-16) is lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
Multiple regression model 5 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and age. -> The R^2 value is 0.2506, which is not such a good model. -> The model’s p-value (5.438e-07) is lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
Multiple regression model 6 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and sex. -> The R^2 value is 0.2223, which, like model 5, is not such a good model. -> The model’s p-value (3.471e-06) is lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).
# Logistic regression on the placed students: sex (as a factor) modelled from
# every other column of `placed`.
placed[["sex"]] <- factor(placed[["sex"]])
fit1 <- glm(
  formula = sex ~ .,
  data = placed,
  family = binomial(link = "logit")
)
# Sequential analysis of deviance: terms are added one at a time, each with a
# chi-squared test.
anova(fit1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 102 126.01
## age 1 2.3856 101 123.62 0.12245
## gmat_tot 1 0.0744 100 123.55 0.78507
## gmat_qpc 1 4.1847 99 119.36 0.04079 *
## gmat_vpc 1 1.8543 98 117.51 0.17329
## gmat_tpc 1 0.0823 97 117.43 0.77423
## s_avg 1 0.4155 96 117.01 0.51919
## f_avg 1 2.1057 95 114.90 0.14675
## quarter 1 0.4742 94 114.43 0.49107
## work_yrs 1 0.5956 93 113.83 0.44026
## frstlang 1 4.6687 92 109.17 0.03072 *
## salary 1 1.0389 91 108.13 0.30808
## satis 1 0.6359 90 107.49 0.42521
## mb_avg 0 0.0000 90 107.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit1.
# predict() has no `data` argument (it was silently ignored); without
# `newdata` it returns the fitted probabilities for the training rows, which
# is what was being used anyway.
fitted.results <- predict(fit1, type = "response")
# For a factor response, binomial glm() treats the first level as "failure",
# so the fitted value is the probability of the SECOND level of placed$sex.
# Predictions must therefore be mapped back to the factor's own labels: the
# previous 0/1 coding could never equal the label "2" and only matched "1" by
# accident.
# NOTE: the accuracy figure recorded below this code (0.0485) was produced by
# that 0/1 comparison and needs to be regenerated.
fitted.results <- ifelse(fitted.results > 0.5,
                         levels(placed$sex)[2],
                         levels(placed$sex)[1])
misClasificError <- mean(fitted.results != as.character(placed$sex))
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.0485436893203883"
# Logistic regression on the students in `Notplaced`: sex (as a factor)
# modelled from every other column.
Notplaced[["sex"]] <- factor(Notplaced[["sex"]])
fit2 <- glm(
  formula = sex ~ .,
  data = Notplaced,
  family = binomial(link = "logit")
)
# Sequential analysis of deviance: terms are added one at a time, each with a
# chi-squared test.
anova(fit2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: sex
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 89 102.304
## age 1 0.4712 88 101.833 0.49244
## gmat_tot 1 0.3130 87 101.520 0.57585
## gmat_qpc 1 4.3705 86 97.150 0.03657 *
## gmat_vpc 1 5.0395 85 92.110 0.02478 *
## gmat_tpc 1 0.6560 84 91.454 0.41798
## s_avg 1 0.0490 83 91.405 0.82487
## f_avg 1 0.5497 82 90.855 0.45844
## quarter 1 1.6354 81 89.220 0.20096
## work_yrs 1 0.8609 80 88.359 0.35348
## frstlang 1 0.0078 79 88.351 0.92960
## salary 0 0.0000 79 88.351
## satis 1 1.6093 78 86.742 0.20459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit2.
# predict() has no `data` argument (it was silently ignored); without
# `newdata` it returns the fitted probabilities for the training rows, which
# is what was being used anyway.
fitted.results <- predict(fit2, type = "response")
# For a factor response, binomial glm() treats the first level as "failure",
# so the fitted value is the probability of the SECOND level of Notplaced$sex.
# Predictions must therefore be mapped back to the factor's own labels: the
# previous 0/1 coding could never equal the label "2" and only matched "1" by
# accident.
# NOTE: the accuracy figure recorded below this code (0.0444) was produced by
# that 0/1 comparison and needs to be regenerated.
fitted.results <- ifelse(fitted.results > 0.5,
                         levels(Notplaced$sex)[2],
                         levels(Notplaced$sex)[1])
misClasificError <- mean(fitted.results != as.character(Notplaced$sex))
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.0444444444444444"
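In logistic regression the dependent variable must be categorical (here, binary), so we choose sex as the dependent variable. -> The model is fitted separately for the MBA students who were placed and for those who were not placed. -> The reported accuracy of both models is very low: 4.85% and 4.44% respectively. Note that these reported figures are unreliable: they were obtained by comparing predictions coded 0/1 with sex labels coded 1/2, so they do not measure the true classification accuracy of the models.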