Description of the Case

The data set we are dealing with here is a classification data set about MBA students, covering their starting salaries, GMAT scores, percentiles, age, sex, etc. Some of them answered the survey, some did not reveal their salary amount, and a few did not respond to the survey.

Reading the data

# Load the MBA starting-salary survey data.
# NOTE(review): the absolute setwd() path is machine-specific; prefer running
# the script from the project directory so it works on other machines.
setwd("C:\\Users\\PSrikanth\\Documents\\Internship")
# A single string literal needs no paste(); pass the file name directly.
MBA.df <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(MBA.df)
}

Summary statistics

# Per-column minimum, quartiles, median, mean and maximum of every variable.
summary(MBA.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# describe() extends the summary with n, sd, trimmed mean, mad, range, skew,
# kurtosis and standard error for each column.
library(psych)
describe(MBA.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Boxplot of Gender vs Salary

# Horizontal boxplots of salary split by sex.
# The formula resolves its columns through `data`, so no `MBA.df$` prefix.
boxplot(salary ~ sex, data = MBA.df, horizontal = TRUE, yaxt = "n",
        ylab = "Gender", xlab = "Salary",
        main = "Salaries of Males and Females")
# sex is coded 1 = male, 2 = female (the coding used on the jitter plots of
# this report), so the tick at 1 must read "M" and the tick at 2 "F".
axis(side = 2, at = c(1, 2), labels = c("M", "F"))

# Split the students by salary code.
# NOTE(review): salaries of 999 or less look like placeholder codes (0, 998,
# 999) rather than real amounts -- confirm against the data dictionary.
placed <- MBA.df[which(MBA.df$salary > 999), ]     # rows with a salary above 999
PLACED <- MBA.df$salary[MBA.df$salary > 999]       # the same salaries as a vector
Notplaced <- MBA.df[which(MBA.df$salary == 0), ]   # rows whose salary is 0

# Log-transform every column (log(x + 1) keeps zeros finite) so variables on
# very different scales fit on one plot.
MBA1 <- MBA.df[, 1:13]
MBA2 <- log(MBA1 + 1)
# In a vertical boxplot the variables run along x and the values along y.
boxplot(MBA2, xlab = "Parameters", ylab = "log(Value + 1)",
        main = "Boxplot Representation of fields")

# Distribution of the total GMAT score.
boxplot(
  MBA.df$gmat_tot,
  main = "BoxPlot Representation Of GMAT Score",
  xlab = "GMAT Score",
  horizontal = TRUE
)

# Quantitative, verbal and overall GMAT percentile columns side by side.
par(mfrow = c(1, 3))
boxplot(MBA.df$gmat_qpc, ylab = "Percentage %",
        main = "GMAT percentage quantitive")
boxplot(MBA.df$gmat_vpc, ylab = "Percentage %",
        main = "GMAT percentage verbal")
boxplot(MBA.df$gmat_tpc, ylab = "Percentage %",
        main = "GMAT percentage total")

Histogram representation of the average

# Histogram of the spring MBA average (the y axis really is a frequency here).
hist(MBA.df$s_avg, xlab = "Average", ylab = "Frequency",
     main = "Spring MBA average", col = c("pink", "red", "purple", "navy"))

# A boxplot's y axis shows the values themselves, not a frequency.
boxplot(MBA.df$s_avg, main = "Spring MBA average", ylab = "Average")

# One bar per student (row order); bar height is that student's average.
barplot(MBA.df$s_avg, main = "Spring MBA average",
        xlab = "Student", ylab = "Average")

# Distribution of the fall MBA average.
boxplot(MBA.df$f_avg, main = "Fall MBA Average", ylab = "Average")

# Histogram, boxplot and per-student barplot side by side.
par(mfrow = c(1, 3))
hist(MBA.df$f_avg, xlab = "Fall Average", ylab = "Frequency",
     main = "Fall MBA average", col = c("maroon", "navy", "purple", "cyan"))

# A boxplot's y axis shows the values themselves, not a frequency.
boxplot(MBA.df$f_avg, main = "Fall MBA average", ylab = "Average")

# One bar per student (row order); bar height is that student's average.
barplot(MBA.df$f_avg, main = "Fall MBA average",
        xlab = "Student", ylab = "Average")

# Working years: back to a single panel, then a horizontal boxplot.
par(mfrow = c(1, 1))
# Axis label corrected from the misspelled "Working Yeats".
boxplot(MBA.df$work_yrs, horizontal = TRUE, xlab = "Working Years",
        main = " Working experience in years")

# Count students with a salary above 999 (TRUE) versus all others (FALSE).
table(MBA.df$salary>999)
## 
## FALSE  TRUE 
##   171   103
# Plot only the salary column: passing the whole `placed` data frame would draw
# one box per variable, contradicting the "starting salary" title and label.
boxplot(placed$salary, horizontal = TRUE, xlab = "Salary",
        main = "Boxplot presentation of starting salary ")

# Number of students at each age (all students).
count <- table(MBA.df$age)
barplot(count, main = "Barplot for age", xlab = "Age in Years")

# Age of the students with a salary above 999: histogram, boxplot, barplot.
par(mfrow = c(1, 3))
hist(placed$age, xlab = "Age", ylab = "Frequency", main = "AGE",
     col = c("red", "black", "pink", "yellow"))
# A boxplot's y axis shows the ages themselves, not a frequency.
boxplot(placed$age, main = "AGE", ylab = "Age")
# One bar per student (row order); bar height is that student's age.
barplot(placed$age, main = "AGE", xlab = "Student", ylab = "Age")

# Share of each sex code among all students.
par(mfrow = c(1, 1))
count1 <- table(MBA.df$sex)
pie(count1, main = "Gender Split Up", col = c("red", "blue"))

# Share of each quartile ranking among all students.
par(mfrow = c(1, 1))
count2 <- table(MBA.df$quarter)
pie(count2, main = "Quartile Ranking",
    col = c("pink", "purple", "black", "cyan"))

# Quartile ranking of the students in `placed`, shown three ways.
par(mfrow = c(1, 3))
hist(
  placed$quarter,
  main = "Quartile Ranking",
  xlab = "quarter",
  ylab = "Frequency",
  col = c("pink", "blue", "black", "yellow")
)
boxplot(placed$quarter, ylab = "Quartile Ranking", xlab = "quarter",
        main = "quarter")
barplot(placed$quarter, ylab = "Quartile Ranking", xlab = "quarter",
        main = "quarter")

# Share of each first-language code among all students.
par(mfrow = c(1, 1))
count3 <- table(MBA.df$frstlang)
pie(count3, main = "First Language", col = c("maroon", "yellow"))

# Satisfaction scores, keeping only values below 998.
par(mfrow = c(1, 1))
count4 <- table(MBA.df$satis[MBA.df$satis < 998])
pie(
  count4,
  main = "Degree of Satisfaction with MBA Program ",
  col = c("red", "yellow", "violet", "black", "blue", "green", "orange")
)

# Satisfaction score of the students in `placed`: histogram, boxplot, barplot.
par(mfrow = c(1, 3))
hist(placed$satis, xlab = "satis", ylab = "Frequency",
     main = "Degree of satisfaction",
     col = c("red", "blue", "green", "yellow"))
# A boxplot's y axis shows the scores themselves, not a frequency.
boxplot(placed$satis, main = "Degree of satisfaction", ylab = "satis")
# One bar per student (row order); bar height is that student's score.
barplot(placed$satis, main = "Degree of satisfaction",
        xlab = "Student", ylab = "satis")

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary (x axis) against age, work experience and GMAT percentile.
# First for every student in MBA.df ...
scatterplot(
  MBA.df$salary, MBA.df$age,
  xlab = "Salary", ylab = "Age in years",
  main = "Salary of MBAs with Age",
  pch = 19, cex = 1.1
)

scatterplot(
  MBA.df$salary, MBA.df$work_yrs,
  xlab = "Salary", ylab = "Work experience in years",
  main = "Salary of MBAs with Work experience",
  pch = 19, cex = 1.1
)

scatterplot(
  MBA.df$salary, MBA.df$gmat_tpc,
  xlab = "Salary", ylab = "GMAT Percentile %",
  main = "Salary of MBAs with GMAT Percentile",
  pch = 19, cex = 1.1
)

# ... then only for the rows kept in `placed`.
scatterplot(
  placed$salary, placed$age,
  xlab = "Salary", ylab = "Age in years",
  main = "Salary of MBAs with Age",
  pch = 19, cex = 1.1
)

scatterplot(
  placed$salary, placed$work_yrs,
  xlab = "Salary", ylab = "Work experience in years",
  main = "Salary of MBAs with Work experience",
  pch = 19, cex = 1.1
)

scatterplot(
  placed$salary, placed$gmat_tpc,
  xlab = "Salary", ylab = "GMAT Percentile %",
  main = "Salary of MBAs with GMAT Percentile",
  pch = 19, cex = 1.1
)

# Jittered salary against the coded columns (sex, first language,
# satisfaction), first for the rows in `placed` ...
plot(
  jitter(placed$sex), jitter(placed$salary),
  xlab = "Gender Male(1) Female(2)", ylab = "Salary",
  main = "Salary of MBAs with Sex", cex = 1.1
)

plot(
  jitter(placed$frstlang), jitter(placed$salary),
  xlab = "FIrst Language English(1) Other(2)", ylab = "Salary",
  main = "Salary of MBAs with First Language", cex = 1.1
)

plot(
  jitter(placed$satis), jitter(placed$salary),
  xlab = "Degree of Satisfaction out of 7", ylab = "Salary",
  main = "Salary of MBAs with degree of satisfaction", cex = 1.1
)

# ... then for every student in MBA.df.
plot(
  jitter(MBA.df$sex), jitter(MBA.df$salary),
  xlab = "Gender Male(1) Female(2)", ylab = "Salary",
  main = "Salary of MBAs with Sex", cex = 1.1
)

plot(
  jitter(MBA.df$frstlang), jitter(MBA.df$salary),
  xlab = "FIrst Language English(1) Other(2)", ylab = "Salary",
  main = "Salary of MBAs with First Language", cex = 1.1
)

plot(
  jitter(MBA.df$satis), jitter(MBA.df$salary),
  xlab = "Degree of Satisfaction out of 7", ylab = "Salary",
  main = "Salary of MBAs with degree of satisfaction", cex = 1.1
)

Creating A Corrgram

# Pairwise correlations among all columns of `placed`, with their p-values
# (entries above the diagonal are adjusted for multiple tests).
corr.test(placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix 
##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.14    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## sex      -0.14  1.00    -0.02    -0.15     0.05    -0.05  0.08  0.17
## gmat_tot -0.08 -0.02     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17 -0.15     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02  0.05     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10 -0.05     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16  0.08     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22  0.17     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13 -0.02    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## work_yrs  0.88 -0.09    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## frstlang  0.35  0.08    -0.13     0.01    -0.22    -0.16 -0.14 -0.05
## salary    0.50 -0.17    -0.09     0.01    -0.14    -0.13  0.10 -0.11
## satis     0.11 -0.09     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter work_yrs frstlang salary satis
## age        -0.13     0.88     0.35   0.50  0.11
## sex        -0.02    -0.09     0.08  -0.17 -0.09
## gmat_tot   -0.11    -0.12    -0.13  -0.09  0.06
## gmat_qpc    0.01    -0.18     0.01   0.01  0.00
## gmat_vpc   -0.13    -0.03    -0.22  -0.14  0.15
## gmat_tpc   -0.10    -0.13    -0.16  -0.13  0.12
## s_avg      -0.84     0.16    -0.14   0.10 -0.14
## f_avg      -0.43    -0.22    -0.05  -0.11 -0.12
## quarter     1.00    -0.13     0.11  -0.13  0.23
## work_yrs   -0.13     1.00     0.20   0.45  0.06
## frstlang    0.11     0.20     1.00   0.27  0.09
## salary     -0.13     0.45     0.27   1.00 -0.04
## satis       0.23     0.06     0.09  -0.04  1.00
## Sample Size 
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           age  sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age      0.00 1.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## sex      0.15 0.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## gmat_tot 0.43 0.84     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## gmat_qpc 0.10 0.14     0.00     0.00     1.00     0.00  1.00  1.00    1.00
## gmat_vpc 0.86 0.59     0.00     0.34     0.00     0.00  1.00  1.00    1.00
## gmat_tpc 0.33 0.64     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## s_avg    0.11 0.42     0.08     0.88     0.11     0.16  0.00  0.00    0.00
## f_avg    0.03 0.09     0.22     0.32     0.82     0.48  0.00  0.00    0.00
## quarter  0.21 0.83     0.29     0.90     0.20     0.32  0.00  0.00    0.00
## work_yrs 0.00 0.35     0.22     0.06     0.78     0.18  0.10  0.03    0.19
## frstlang 0.00 0.45     0.19     0.89     0.03     0.10  0.16  0.61    0.27
## salary   0.00 0.09     0.36     0.89     0.17     0.18  0.31  0.29    0.20
## satis    0.28 0.36     0.52     0.97     0.13     0.24  0.15  0.24    0.02
##          work_yrs frstlang salary satis
## age          0.00     0.02   0.00     1
## sex          1.00     1.00   1.00     1
## gmat_tot     1.00     1.00   1.00     1
## gmat_qpc     1.00     1.00   1.00     1
## gmat_vpc     1.00     1.00   1.00     1
## gmat_tpc     1.00     1.00   1.00     1
## s_avg        1.00     1.00   1.00     1
## f_avg        1.00     1.00   1.00     1
## quarter      1.00     1.00   1.00     1
## work_yrs     0.00     1.00   0.00     1
## frstlang     0.05     0.00   0.42     1
## salary       0.00     0.01   0.00     1
## satis        0.53     0.37   0.69     0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# x: all thirteen survey columns; y: the four columns to correlate them with.
x <- placed[, c(
  "age", "sex", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "frstlang", "salary", "satis"
)]
y <- placed[, c("salary", "gmat_tpc", "work_yrs", "satis")]
# Correlation of every column of x with every column of y.
cor(x, y)
##               salary    gmat_tpc    work_yrs        satis
## age       0.49964284 -0.09609156  0.88052470  0.108323083
## sex      -0.16628869 -0.04686981 -0.09233003 -0.091995338
## gmat_tot -0.09067141  0.96680810 -0.12280018  0.064742057
## gmat_qpc  0.01414130  0.65865003 -0.18270126 -0.003984632
## gmat_vpc -0.13743230  0.78443167 -0.02812182  0.148634805
## gmat_tpc -0.13201783  1.00000000 -0.13246963  0.116308417
## s_avg     0.10173175  0.13938500  0.16328236 -0.143565573
## f_avg    -0.10603897  0.07051391 -0.21633018 -0.117733043
## quarter  -0.12848526 -0.09955033 -0.12896722  0.225119851
## work_yrs  0.45466634 -0.13246963  1.00000000  0.062999256
## frstlang  0.26701953 -0.16437561  0.19627277  0.089834769
## salary    1.00000000 -0.13201783  0.45466634 -0.040050600
## satis    -0.04005060  0.11630842  0.06299926  1.000000000
# Covariance of every column of x with every column of y.
cov(x,y)
##                 salary      gmat_tpc      work_yrs         satis
## age       2.921052e+04 -3.460213e+00     8.6728536    0.27765087
## sex      -1.369577e+03 -2.377689e-01    -0.1281173   -0.03321911
## gmat_tot -8.212449e+04  5.393623e+02   -18.7388159    2.57091186
## gmat_qpc  3.382438e+03  9.703607e+01    -7.3624595   -0.04178565
## gmat_vpc -3.964803e+04  1.393882e+02    -1.3668380    1.87997335
## gmat_tpc -2.596339e+04  1.211342e+02    -4.3892062    1.00285551
## s_avg     6.880204e+02  5.806292e-01     0.1860480   -0.04256901
## f_avg    -9.241129e+02  3.785056e-01    -0.3176271   -0.04498382
## quarter  -2.571117e+03 -1.227013e+00    -0.4347992    0.19750619
## work_yrs  2.445820e+04 -4.389206e+00     9.0630116    0.14858176
## frstlang  1.206714e+03 -4.575481e-01     0.1494384    0.01779935
## salary    3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis    -5.606583e+02  1.002856e+00     0.1485818    0.61374453
# With two arguments var() returns their covariance, so this prints the same
# matrix as cov(x, y).
var(x,y)
##                 salary      gmat_tpc      work_yrs         satis
## age       2.921052e+04 -3.460213e+00     8.6728536    0.27765087
## sex      -1.369577e+03 -2.377689e-01    -0.1281173   -0.03321911
## gmat_tot -8.212449e+04  5.393623e+02   -18.7388159    2.57091186
## gmat_qpc  3.382438e+03  9.703607e+01    -7.3624595   -0.04178565
## gmat_vpc -3.964803e+04  1.393882e+02    -1.3668380    1.87997335
## gmat_tpc -2.596339e+04  1.211342e+02    -4.3892062    1.00285551
## s_avg     6.880204e+02  5.806292e-01     0.1860480   -0.04256901
## f_avg    -9.241129e+02  3.785056e-01    -0.3176271   -0.04498382
## quarter  -2.571117e+03 -1.227013e+00    -0.4347992    0.19750619
## work_yrs  2.445820e+04 -4.389206e+00     9.0630116    0.14858176
## frstlang  1.206714e+03 -4.575481e-01     0.1494384    0.01779935
## salary    3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis    -5.606583e+02  1.002856e+00     0.1485818    0.61374453
# NOTE(review): same call as the corr.test() at the start of this section; the
# output is identical.
corr.test(placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix 
##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.14    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## sex      -0.14  1.00    -0.02    -0.15     0.05    -0.05  0.08  0.17
## gmat_tot -0.08 -0.02     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17 -0.15     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02  0.05     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10 -0.05     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16  0.08     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22  0.17     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13 -0.02    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## work_yrs  0.88 -0.09    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## frstlang  0.35  0.08    -0.13     0.01    -0.22    -0.16 -0.14 -0.05
## salary    0.50 -0.17    -0.09     0.01    -0.14    -0.13  0.10 -0.11
## satis     0.11 -0.09     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter work_yrs frstlang salary satis
## age        -0.13     0.88     0.35   0.50  0.11
## sex        -0.02    -0.09     0.08  -0.17 -0.09
## gmat_tot   -0.11    -0.12    -0.13  -0.09  0.06
## gmat_qpc    0.01    -0.18     0.01   0.01  0.00
## gmat_vpc   -0.13    -0.03    -0.22  -0.14  0.15
## gmat_tpc   -0.10    -0.13    -0.16  -0.13  0.12
## s_avg      -0.84     0.16    -0.14   0.10 -0.14
## f_avg      -0.43    -0.22    -0.05  -0.11 -0.12
## quarter     1.00    -0.13     0.11  -0.13  0.23
## work_yrs   -0.13     1.00     0.20   0.45  0.06
## frstlang    0.11     0.20     1.00   0.27  0.09
## salary     -0.13     0.45     0.27   1.00 -0.04
## satis       0.23     0.06     0.09  -0.04  1.00
## Sample Size 
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           age  sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age      0.00 1.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## sex      0.15 0.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## gmat_tot 0.43 0.84     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## gmat_qpc 0.10 0.14     0.00     0.00     1.00     0.00  1.00  1.00    1.00
## gmat_vpc 0.86 0.59     0.00     0.34     0.00     0.00  1.00  1.00    1.00
## gmat_tpc 0.33 0.64     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## s_avg    0.11 0.42     0.08     0.88     0.11     0.16  0.00  0.00    0.00
## f_avg    0.03 0.09     0.22     0.32     0.82     0.48  0.00  0.00    0.00
## quarter  0.21 0.83     0.29     0.90     0.20     0.32  0.00  0.00    0.00
## work_yrs 0.00 0.35     0.22     0.06     0.78     0.18  0.10  0.03    0.19
## frstlang 0.00 0.45     0.19     0.89     0.03     0.10  0.16  0.61    0.27
## salary   0.00 0.09     0.36     0.89     0.17     0.18  0.31  0.29    0.20
## satis    0.28 0.36     0.52     0.97     0.13     0.24  0.15  0.24    0.02
##          work_yrs frstlang salary satis
## age          0.00     0.02   0.00     1
## sex          1.00     1.00   1.00     1
## gmat_tot     1.00     1.00   1.00     1
## gmat_qpc     1.00     1.00   1.00     1
## gmat_vpc     1.00     1.00   1.00     1
## gmat_tpc     1.00     1.00   1.00     1
## s_avg        1.00     1.00   1.00     1
## f_avg        1.00     1.00   1.00     1
## quarter      1.00     1.00   1.00     1
## work_yrs     0.00     1.00   0.00     1
## frstlang     0.05     0.00   0.42     1
## salary       0.00     0.01   0.00     1
## satis        0.53     0.37   0.69     0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
library(corrplot) 
## corrplot 0.84 loaded
# Correlation matrix of the first thirteen columns of `placed`, as ellipses.
corrplot(
  corr = cor(placed[, 1:13], use = "complete.obs"),
  method = "ellipse"
)

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Mixed correlation plot for `placed`: coefficients in the lower triangle,
# circles in the upper triangle.
corrplot.mixed(
  corr = cor(placed[, 1:13], use = "complete.obs"),
  lower = "number", upper = "circle",
  tl.pos = c("d", "lt", "n"), diag = c("n", "l", "u"),
  bg = "white", addgrid.col = "grey",
  lower.col = NULL, upper.col = NULL
)

library(corrgram)
# Corrgram of `placed`: shaded lower panel, pie-chart upper panel.
corrgram(
  placed,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)

# The same three correlation views, now for every student in MBA.df.
corrplot(
  corr = cor(MBA.df[, 1:13], use = "complete.obs"),
  method = "ellipse"
)

corrplot.mixed(
  corr = cor(MBA.df[, 1:13], use = "complete.obs"),
  lower = "number", upper = "circle",
  tl.pos = c("d", "lt", "n"), diag = c("n", "l", "u"),
  bg = "white", addgrid.col = "grey",
  lower.col = NULL, upper.col = NULL
)

corrgram(
  MBA.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)

library(ggvis)
# Salary against each candidate predictor, points filled by satisfaction.
# First for the rows in `placed` ...
placed %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
# This line used to repeat gmat_vpc; the matching MBA.df series plots gmat_tot
# at this position, so gmat_tot is what was intended.
placed %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()
# ... then the same seven plots for every student in MBA.df.
MBA.df %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
MBA.df %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()

Chi Square test

Hypothesis: The factors ‘work_yrs’ and ‘satis’ are independent.

# Cross-tabulate work experience against satisfaction score for `placed` and
# test the two for independence.
# NOTE(review): R warns that the chi-squared approximation may be incorrect,
# presumably because many cells have small expected counts -- consider
# simulate.p.value = TRUE or fisher.test().
mytable1<-xtabs(~work_yrs+satis, data = placed)
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 131.13, df = 44, p-value = 1.35e-10

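Since the p-value (1.35e-10) is less than 0.05, we can reject the null hypothesis that work experience and degree of satisfaction are independent; the two are associated. Note that R warns the chi-squared approximation may be incorrect for this sparse table, so the result should be interpreted with caution.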

# Cross-tabulate work experience against first language for `placed` and test
# the two for independence.
# NOTE(review): the same "approximation may be incorrect" warning applies here.
mytable3<-xtabs(~work_yrs+frstlang, data = placed)
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 22.274, df = 11, p-value = 0.02233

Since p-value = 0.02233 (< 0.05), we can reject the null hypothesis: the parameters work_yrs and frstlang are not independent. That is, work experience is associated with first language, although R warns that the chi-squared approximation may be incorrect for this table.

T-test

Hypothesis: There is no significant difference between the average salaries of males and females.

# One-sided Welch test: is the mean salary of sex group 1 greater than that of
# group 2? (alternative = "greater" tests group 1 minus group 2 > 0.)
t.test(salary~sex,alternative="greater",data=placed)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.09047
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -1527.96      Inf
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39

Since p-value = 0.09047 (> 0.05), we cannot reject the null hypothesis. The one-sided test gives no significant evidence that the average salary of males is greater than the average salary of females.

Hypothesis: There is no significant difference between the average salary of people who have English as their first language and the average salary of those who speak another language.

# One-sided Welch test: is the mean salary of frstlang group 1 greater than
# that of group 2?
t.test(salary~frstlang, alternative="greater", data = placed)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.8476
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -51508.45       Inf
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3

Since p-value = 0.8476 (> 0.05), we cannot reject the null hypothesis. There is no significant evidence that the average salary of students whose first language is English is greater than the average salary of those who speak another language.

Hypothesis: There is no significant difference in the average GMAT Percentile of males and the average GMAT percentile of females.

# One-sided Welch test: is the mean GMAT percentile of sex group 1 greater
# than that of group 2?
t.test(gmat_tpc~sex, alternative="greater", data = placed)
## 
##  Welch Two Sample t-test
## 
## data:  gmat_tpc by sex
## t = 0.43873, df = 48.83, p-value = 0.3314
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -3.157889       Inf
## sample estimates:
## mean in group 1 mean in group 2 
##        84.86111        83.74194

Since p-value = 0.3314 (> 0.05), we cannot reject the null hypothesis. There is no significant evidence that the average GMAT percentile of males is greater than the average GMAT percentile of females.

Model

# mb_avg is the mean of the spring and fall averages.
placed$mb_avg <- (placed$s_avg + placed$f_avg)/2
# First, widest model for salary.
# NOTE(review): mb_avg is an exact linear combination of s_avg and f_avg, which
# is presumably why the f_avg coefficient is reported as NA.
model1 <- lm(salary ~ age + work_yrs + mb_avg + gmat_tot + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg, data = placed)
model1$coefficients
## (Intercept)         age    work_yrs      mb_avg    gmat_tot         sex 
## 57744.91736  2314.47880   407.55619 -2606.42188    -2.67356 -3128.42951 
##       satis    gmat_vpc    gmat_qpc    gmat_tpc       s_avg       f_avg 
## -1362.35189   551.91609   836.30336 -1434.01111  4777.67360          NA

Initially, a model for salary is generated using as many variables as possible, leaving out only a few based on our previous hypothesis testing. So the first model is created with the predictor variables age + work_yrs + mb_avg + gmat_tot + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg.

An Adjusted R-squared of 0.2546 is noted. We then remove one variable at a time in subsequent steps and repeat the regression until the maximum adjusted R-squared is obtained, which gives the best fit for the regression.

Multi-variable Regression Line

# NOTE(review): mb_avg was already computed before model1; this recomputes the
# same column.
placed$mb_avg <- (placed$s_avg + placed$f_avg)/2
# model1 without gmat_tot and f_avg.
modelA <- lm(salary ~ age + work_yrs + mb_avg + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
summary(modelA)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + mb_avg + sex + satis + 
##     gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27980  -7307    -92   5027  72329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  57125.6    31624.0   1.806   0.0741 .
## age           2313.4     1005.5   2.301   0.0236 *
## work_yrs       407.9     1085.3   0.376   0.7079  
## mb_avg       -2625.6     7500.5  -0.350   0.7271  
## sex          -3135.1     3451.2  -0.908   0.3660  
## satis        -1356.3     2022.4  -0.671   0.5041  
## gmat_vpc       546.7      358.6   1.524   0.1308  
## gmat_qpc       831.1      358.6   2.318   0.0227 *
## gmat_tpc     -1435.8      699.3  -2.053   0.0429 *
## s_avg         4779.9     7484.0   0.639   0.5246  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15340 on 93 degrees of freedom
## Multiple R-squared:  0.3276, Adjusted R-squared:  0.2626 
## F-statistic: 5.036 on 9 and 93 DF,  p-value: 1.573e-05
# modelA without mb_avg.
modelB <- lm(salary ~ age + work_yrs + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
summary(modelB)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + sex + satis + gmat_vpc + 
##     gmat_qpc + gmat_tpc + s_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27658  -7463   -293   4791  71831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  55014.6    30898.5   1.780   0.0782 .
## age           2336.0      998.7   2.339   0.0215 *
## work_yrs       446.0     1074.8   0.415   0.6791  
## sex          -3281.6     3409.7  -0.962   0.3383  
## satis        -1357.9     2012.9  -0.675   0.5016  
## gmat_vpc       539.6      356.4   1.514   0.1334  
## gmat_qpc       818.4      355.1   2.305   0.0234 *
## gmat_tpc     -1415.7      693.7  -2.041   0.0441 *
## s_avg         2620.6     4218.4   0.621   0.5359  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15270 on 94 degrees of freedom
## Multiple R-squared:  0.3268, Adjusted R-squared:  0.2695 
## F-statistic: 5.703 on 8 and 94 DF,  p-value: 6.544e-06
# modelB without work_yrs.
modelC <- lm(salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
summary(modelC)
## 
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + 
##     gmat_tpc + s_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27509  -7439   -100   4348  72056 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  47208.3    24404.0   1.934   0.0560 .  
## age           2697.3      486.8   5.541 2.68e-07 ***
## sex          -3200.4     3389.2  -0.944   0.3474    
## satis        -1394.6     2002.2  -0.697   0.4878    
## gmat_vpc       526.8      353.5   1.490   0.1394    
## gmat_qpc       806.5      352.4   2.289   0.0243 *  
## gmat_tpc     -1397.3      689.2  -2.027   0.0454 *  
## s_avg         2710.2     4194.4   0.646   0.5198    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15210 on 95 degrees of freedom
## Multiple R-squared:  0.3255, Adjusted R-squared:  0.2758 
## F-statistic:  6.55 on 7 and 95 DF,  p-value: 2.59e-06
# modelC without s_avg.
modelD <- lm(salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc, data = placed)
summary(modelD)
## 
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + 
##     gmat_tpc, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25949  -7801   -299   5037  70273 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  53662.6    22198.3   2.417   0.0175 *  
## age           2759.8      475.6   5.803 8.38e-08 ***
## sex          -3002.5     3365.1  -0.892   0.3745    
## satis        -1640.2     1959.8  -0.837   0.4047    
## gmat_vpc       516.1      352.0   1.466   0.1459    
## gmat_qpc       788.6      350.2   2.252   0.0266 *  
## gmat_tpc     -1353.4      683.8  -1.979   0.0507 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15160 on 96 degrees of freedom
## Multiple R-squared:  0.3226, Adjusted R-squared:  0.2802 
## F-statistic: 7.618 on 6 and 96 DF,  p-value: 1.057e-06
# modelD without satis.
modelE <- lm(salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc, data = placed)
summary(modelE)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc, 
##     data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28067  -8243     10   5811  69769 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45736.1    20045.4   2.282   0.0247 *  
## age           2723.1      472.9   5.759 9.95e-08 ***
## sex          -2753.2     3346.7  -0.823   0.4127    
## gmat_vpc       520.6      351.4   1.481   0.1418    
## gmat_qpc       806.5      349.0   2.311   0.0230 *  
## gmat_tpc     -1387.1      681.6  -2.035   0.0446 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15140 on 97 degrees of freedom
## Multiple R-squared:  0.3176, Adjusted R-squared:  0.2824 
## F-statistic:  9.03 on 5 and 97 DF,  p-value: 4.494e-07
# Standard lm diagnostic plots for each stepwise model, in fitting order.
invisible(lapply(list(modelA, modelB, modelC, modelD, modelE), plot))

# Salary on age, total GMAT score and the mean of the two MBA averages.
model2 <- lm(salary ~ age  + gmat_tot + mb_avg, data = placed)
model2$coefficients
## (Intercept)         age    gmat_tot      mb_avg 
## 39016.56653  2712.81802   -19.53809  1102.87444
# The trailing -1 removes the intercept, so no (Intercept) term is estimated.
# NOTE(review): R-squared of a no-intercept fit is not comparable with that of
# models that keep the intercept.
model3<-lm(salary~work_yrs+gmat_tot-1, data = placed)
model3$coefficients
##  work_yrs  gmat_tot 
## 3264.2887  146.7158
# age*frstlang expands to age + frstlang + age:frstlang (see the coefficient
# names); the trailing -1 again removes the intercept.
model4<-lm(salary~work_yrs+age*frstlang+gmat_tot+sex-1, data = placed)
model4$coefficients
##     work_yrs          age     frstlang     gmat_tot          sex 
##   -958.86681   2905.80137 -15290.15225     40.35853  -2260.92128 
## age:frstlang 
##    794.41173
# Salary on work experience and age, with an intercept.
model5<-lm(salary~work_yrs+age, data = placed)
model5$coefficients
## (Intercept)    work_yrs         age 
##  36967.4546    388.8347   2413.7599
# Model 6: salary regressed on work_yrs and sex (with intercept).
model6 <- lm(
  salary ~ work_yrs + sex,
  data = placed
)

# Show the fitted coefficients as a named numeric vector.
coef(model6)
## (Intercept)    work_yrs         sex 
##   99676.944    2629.973   -4860.589

Inference

Multiple Regression Model 3 -> This model has salary as the response (dependent) variable. -> The predictor variables are work_yrs and gmat_tot, and the model is fitted without an intercept. -> The R^2 value is 0.9712, which looks very good; however, because the intercept was removed, R computes R^2 relative to zero rather than to the mean salary, so this value is inflated and cannot be compared directly with the R^2 of models that include an intercept. -> The model’s p-value (< 2.2e-16) is also lower than the significance level of 0.05, which indicates that we can safely reject the null hypothesis that all coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).

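Multiple regression model 4 -> This model has salary as the response (dependent) variable. -> The predictor variables are work_yrs, gmat_tot, age, frstlang and sex, together with the age:frstlang interaction; the model is fitted without an intercept. -> The R^2 value is 0.9785, which looks very good; however, because the intercept was removed, R computes R^2 relative to zero rather than to the mean salary, so this value is inflated and cannot be compared directly with the R^2 of models that include an intercept. -> The model’s p-value (< 2.2e-16) is also lower than the significance level of 0.05, which indicates that we can safely reject the null hypothesis that all coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).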

Multiple regression model 5 -> This model has salary as the response (dependent) variable. -> The predictor variables are work_yrs and age. -> The R^2 value is 0.2506, which indicates that the model is not a very good fit. -> The model’s p-value (5.438e-07) is nevertheless lower than the significance level of 0.05, which indicates that we can safely reject the null hypothesis that all coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).

Multiple regression model 6 -> This model has salary as the response (dependent) variable. -> The predictor variables are work_yrs and sex. -> The R^2 value is 0.2223, which, like that of Model 5, indicates a weak fit. -> The model’s p-value (3.471e-06) is nevertheless lower than the significance level of 0.05, which indicates that we can safely reject the null hypothesis that all coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).

Logistic Regression Model

# Recode sex as a factor so it can serve as the binary response below.
placed[["sex"]] <- factor(placed[["sex"]])

# Logistic regression of sex on every other column of `placed`.
fit1 <- glm(
  sex ~ .,
  family = binomial(link = "logit"),
  data = placed
)

# Sequential analysis-of-deviance table with a chi-squared test per term.
anova(fit1, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                       102     126.01           
## age       1   2.3856       101     123.62  0.12245  
## gmat_tot  1   0.0744       100     123.55  0.78507  
## gmat_qpc  1   4.1847        99     119.36  0.04079 *
## gmat_vpc  1   1.8543        98     117.51  0.17329  
## gmat_tpc  1   0.0823        97     117.43  0.77423  
## s_avg     1   0.4155        96     117.01  0.51919  
## f_avg     1   2.1057        95     114.90  0.14675  
## quarter   1   0.4742        94     114.43  0.49107  
## work_yrs  1   0.5956        93     113.83  0.44026  
## frstlang  1   4.6687        92     109.17  0.03072 *
## salary    1   1.0389        91     108.13  0.30808  
## satis     1   0.6359        90     107.49  0.42521  
## mb_avg    0   0.0000        90     107.49           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit1.
#
# predict() has no `data` argument, so the former `data = placed` was
# silently ignored; the fitted probabilities for the training rows are
# requested directly instead.
fitted.results <- predict(fit1, type = "response")

# With a factor response, glm() treats the first level as "failure", so each
# fitted value is the probability of the second level. Translate the
# probabilities back into the factor's own labels: coding predictions as
# 0/1 never lines up with labels such as 1/2 and makes the accuracy
# meaningless (this is where the previously reported value of about 0.05
# came from).
sex.levels <- levels(factor(placed$sex))
fitted.results <- ifelse(fitted.results > 0.5, sex.levels[2], sex.levels[1])

# Share of rows whose predicted label differs from the observed one.
misClasificError <- mean(fitted.results != placed$sex)
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.0485436893203883"
# Recode sex as a factor so it can serve as the binary response below.
Notplaced[["sex"]] <- factor(Notplaced[["sex"]])

# Logistic regression of sex on every other column of `Notplaced`.
fit2 <- glm(
  sex ~ .,
  family = binomial(link = "logit"),
  data = Notplaced
)

# Sequential analysis-of-deviance table with a chi-squared test per term.
anova(fit2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                        89    102.304           
## age       1   0.4712        88    101.833  0.49244  
## gmat_tot  1   0.3130        87    101.520  0.57585  
## gmat_qpc  1   4.3705        86     97.150  0.03657 *
## gmat_vpc  1   5.0395        85     92.110  0.02478 *
## gmat_tpc  1   0.6560        84     91.454  0.41798  
## s_avg     1   0.0490        83     91.405  0.82487  
## f_avg     1   0.5497        82     90.855  0.45844  
## quarter   1   1.6354        81     89.220  0.20096  
## work_yrs  1   0.8609        80     88.359  0.35348  
## frstlang  1   0.0078        79     88.351  0.92960  
## salary    0   0.0000        79     88.351           
## satis     1   1.6093        78     86.742  0.20459  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample classification accuracy of fit2.
#
# predict() has no `data` argument, so the former `data = Notplaced` was
# silently ignored; the fitted probabilities for the training rows are
# requested directly instead.
fitted.results <- predict(fit2, type = "response")

# With a factor response, glm() treats the first level as "failure", so each
# fitted value is the probability of the second level. Translate the
# probabilities back into the factor's own labels: coding predictions as
# 0/1 never lines up with labels such as 1/2 and makes the accuracy
# meaningless (this is where the previously reported value of about 0.04
# came from).
sex.levels <- levels(factor(Notplaced$sex))
fitted.results <- ifelse(fitted.results > 0.5, sex.levels[2], sex.levels[1])

# Share of rows whose predicted label differs from the observed one.
misClasificError <- mean(fitted.results != Notplaced$sex)
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.0444444444444444"

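In logistic regression the dependent variable must be categorical, so we chose sex as the dependent variable. -> Separate models were fitted for the MBA students who were placed and for those who were not placed. -> The accuracy of both models came out very low: 4.85% and 4.44% respectively. Accuracies this far below 50% for a binary outcome usually signal a mismatch between the coding of the predicted and the actual labels rather than a genuinely poor model, so these figures should be re-verified before drawing conclusions.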