Reading the data into a data frame:

# Load the MBA starting-salaries CSV from the working directory and preview
# the first rows to confirm the columns were parsed as expected.
# The file name is passed directly: wrapping a single literal in
# paste(..., sep = "") returned the same string and added nothing.
mbasal.df <- read.csv("MBA Starting Salaries Data.csv")

head(mbasal.df)
##   age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1  23   2      620       77       87       87   3.4  3.00       1        2
## 2  24   1      610       90       71       87   3.5  4.00       1        2
## 3  24   1      670       99       78       95   3.3  3.25       1        2
## 4  24   1      570       56       81       75   3.3  2.67       1        1
## 5  24   2      710       93       98       98   3.6  3.75       1        2
## 6  24   1      640       82       89       91   3.9  3.75       1        2
##   frstlang salary satis
## 1        1      0     7
## 2        1      0     6
## 3        1      0     6
## 4        1      0     7
## 5        1    999     5
## 6        1      0     6

Data Summary

1. Summary statistics (n, mean, sd, median, min and max of each variable)

library(psych)
# Summary table restricted to the n, mean, sd, median, min and max columns of
# the describe() result, selected by name rather than by position.
describe(mbasal.df)[, c("n", "mean", "sd", "median", "min", "max")]
##            n     mean       sd median min    max
## age      274    27.36     3.71     27  22     48
## sex      274     1.25     0.43      1   1      2
## gmat_tot 274   619.45    57.54    620 450    790
## gmat_qpc 274    80.64    14.87     83  28     99
## gmat_vpc 274    78.32    16.86     81  16     99
## gmat_tpc 274    84.20    14.02     87   0     99
## s_avg    274     3.03     0.38      3   2      4
## f_avg    274     3.06     0.53      3   0      4
## quarter  274     2.48     1.11      2   1      4
## work_yrs 274     3.87     3.23      3   0     22
## frstlang 274     1.12     0.32      1   1      2
## salary   274 39025.69 50951.56    999   0 220000
## satis    274   172.18   371.61      6   1    998

2. Boxplots to analyse the distribution of each variable:

# Draw one box plot per column listed below, in this order.
boxplot_vars <- c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "salary"
)
for (var_name in boxplot_vars) {
  boxplot(mbasal.df[[var_name]])
}

Scatterplot of GMAT total score against salary

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot of gmat_tot (y axis) against salary (x axis).
# Bare column names plus `data =` keep the axis labels readable
# ("salary", "gmat_tot") instead of "mbasal.df$salary" / "mbasal.df$gmat_tot".
scatterplot(gmat_tot ~ salary, data = mbasal.df)

Visualization:

Candidates whose GMAT total score is between 600 and 650 mostly received salaries of about 100,000.

Correlation

library(car)
# Pairwise scatterplot matrix of age, gmat_tot, salary and f_avg.
# Using a one-sided formula of bare column names with `data =` labels each
# panel with the column name instead of the full "mbasal.df$..." expression.
scatterplotMatrix(~ age + gmat_tot + salary + f_avg, data = mbasal.df)

CORRGRAM

# Columns to include in the correlogram.
sal <- c("salary", "satis", "age", "sex", "f_avg", "gmat_tot", "work_yrs")
library(corrgram)
# Correlogram of the selected columns, with a different panel function for the
# lower triangle, upper triangle, diagonal and text cells.
corrgram(
  mbasal.df[, sal],
  lower.panel = panel.pts,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt
)

WHO GOT HOW MUCH SALARY?

# Rows with a strictly positive salary value; this subset is used below as the
# "got a job" group.
# NOTE(review): salary values such as 999 (visible in the head() output) look
# like missing-data codes rather than real salaries - confirm against the data
# dictionary; if so, they should be excluded by this filter as well.
stujob <- subset(mbasal.df, salary > 0)
# Four-way frequency table of gmat_qpc, gmat_tot, gmat_vpc and s_avg for the
# stujob rows.
mytable1 <- xtabs(
  ~ gmat_qpc + gmat_tot + gmat_vpc + s_avg,
  data = stujob
)

T-test

Based on gmat_qpc and gmat_tpc

# Two-sample t-test of the mean of gmat_qpc against the mean of gmat_tpc
# within stujob (default settings, i.e. unpaired with unequal variances).
# NOTE(review): both columns are measured on the same students, so a paired
# test may be more appropriate - confirm intent.
t.test(x = stujob$gmat_qpc, y = stujob$gmat_tpc)
## 
##  Welch Two Sample t-test
## 
## data:  stujob$gmat_qpc and stujob$gmat_tpc
## t = -2.6236, df = 364.89, p-value = 0.009065
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.370583 -0.912026
## sample estimates:
## mean of x mean of y 
##  81.48913  85.13043

As the p-value (0.009) is < .05, we reject the null hypothesis that the two means are equal.

### Based on s_avg and f_avg

# Two-sample t-test of the mean of s_avg against the mean of f_avg within
# stujob (default settings, i.e. unpaired with unequal variances).
# NOTE(review): both columns are measured on the same students, so a paired
# test may be more appropriate - confirm intent.
t.test(x = stujob$s_avg, y = stujob$f_avg)
## 
##  Welch Two Sample t-test
## 
## data:  stujob$s_avg and stujob$f_avg
## t = -0.81877, df = 339.4, p-value = 0.4135
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.13110158  0.05403637
## sample estimates:
## mean of x mean of y 
##  3.022554  3.061087

REGRESSION

Model-1

# Model 1: linear regression of salary on gmat_tot, gmat_qpc, gmat_vpc,
# gmat_tpc, s_avg, f_avg, quarter, work_yrs and satis, fitted on stujob.
model1 <- salary ~
  gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
  s_avg + f_avg + quarter + work_yrs + satis
fit1 <- lm(formula = model1, data = stujob)
summary(fit1)
## 
## Call:
## lm(formula = model1, data = stujob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100841  -12557    9496   21978  138696 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 183580.230  76707.697   2.393   0.0178 *  
## gmat_tot      -459.960    220.074  -2.090   0.0381 *  
## gmat_qpc       402.048    605.388   0.664   0.5075    
## gmat_vpc       583.130    551.533   1.057   0.2918    
## gmat_tpc       810.333    469.157   1.727   0.0859 .  
## s_avg        15542.154  14298.346   1.087   0.2785    
## f_avg        -2633.390   7030.440  -0.375   0.7084    
## quarter      -5478.825   4708.506  -1.164   0.2462    
## work_yrs      1783.934   1229.253   1.451   0.1485    
## satis          -74.136      6.762 -10.963   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39040 on 174 degrees of freedom
## Multiple R-squared:  0.4744, Adjusted R-squared:  0.4472 
## F-statistic: 17.45 on 9 and 174 DF,  p-value: < 2.2e-16

Now we check which we can exclude :

library(leaps)
# Subset search over the model1 predictors, keeping one best model per subset
# size, then plot the candidate subsets on the adjusted R-squared scale.
leap1 <- regsubsets(
  model1,
  data = stujob,
  nbest = 1
)
# summary(leap1)
plot(leap1, scale = "adjr2")

So we can exclude f_avg and gmat_qpc and fit model 2 with the remaining predictors.

Model- 2

# Model 2: model 1 without gmat_qpc and f_avg, i.e. salary regressed on
# gmat_tot, gmat_vpc, gmat_tpc, s_avg, quarter, work_yrs and satis (stujob).
model2 <- salary ~
  gmat_tot + gmat_vpc + gmat_tpc +
  s_avg + quarter + work_yrs + satis
fit2 <- lm(formula = model2, data = stujob)
summary(fit2)
## 
## Call:
## lm(formula = model2, data = stujob)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -99191 -10172   9409  22908 137328 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 153632.849  65347.530   2.351  0.01983 *  
## gmat_tot      -342.682    115.102  -2.977  0.00332 ** 
## gmat_vpc       273.327    263.707   1.036  0.30140    
## gmat_tpc       925.346    442.020   2.093  0.03774 *  
## s_avg        14107.018  13787.180   1.023  0.30762    
## quarter      -5221.040   4648.993  -1.123  0.26295    
## work_yrs      1796.668   1183.512   1.518  0.13079    
## satis          -74.346      6.732 -11.044  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38890 on 176 degrees of freedom
## Multiple R-squared:  0.4724, Adjusted R-squared:  0.4514 
## F-statistic: 22.51 on 7 and 176 DF,  p-value: < 2.2e-16

VISUALIZE THE BETA COEFFICIENTS AND THEIR CONFIDENCE INTERVALS FROM MODEL 2

library(coefplot)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Coefficient plot for model 2 (intercept hidden, outer interval at 1.96
# standard errors), limited to the terms listed in `coefficients`.
# The names must match coefficient names in fit2 exactly; names that do not
# match are not plotted, so "quarter" has to be spelled correctly and only
# terms that are part of model2 are listed.
coefplot(
  fit2,
  intercept = FALSE,
  outerCI = 1.96,
  coefficients = c("gmat_tot", "work_yrs", "s_avg", "quarter", "satis")
)

so model2 fits better according to the previous table

# Adjusted R-squared and AIC of each fitted model, printed for comparison.
summary(fit1)[["adj.r.squared"]]
## [1] 0.4471795
AIC(object = fit1)
## [1] 4424.538
summary(fit2)[["adj.r.squared"]]
## [1] 0.4513714
AIC(object = fit2)
## [1] 4421.241

So we can say that model 2 is the better fit, as its adjusted R-squared is higher (0.4514 vs. 0.4472) and its AIC is lower (4421.24 vs. 4424.54); salary is therefore modelled as a function of gmat_tot, gmat_vpc, gmat_tpc, s_avg, quarter, work_yrs and satis.

COMPARE THOSE WHO GOT A JOB WITH THOSE WHO DID NOT GET A JOB?

IDENTIFY WHY?

# Rows whose salary value is exactly zero, used below as the "no job" group;
# preview the first rows.
stunojob <- subset(mbasal.df, salary == 0)
head(stunojob)
##   age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1  23   2      620       77       87       87   3.4  3.00       1        2
## 2  24   1      610       90       71       87   3.5  4.00       1        2
## 3  24   1      670       99       78       95   3.3  3.25       1        2
## 4  24   1      570       56       81       75   3.3  2.67       1        1
## 6  24   1      640       82       89       91   3.9  3.75       1        2
## 7  25   1      610       89       74       87   3.4  3.50       1        2
##   frstlang salary satis
## 1        1      0     7
## 2        1      0     6
## 3        1      0     6
## 4        1      0     7
## 6        1      0     6
## 7        1      0     5
# Four-way frequency table of gmat_qpc, gmat_tot, gmat_vpc and s_avg for the
# stunojob rows, followed by its independence-test summary.
# NOTE(review): cross-classifying four many-valued score columns gives a very
# sparse table (the summary reports df = 693462 for 90 cases and warns that
# the chi-squared approximation may be incorrect), so the test result should
# not be relied on.
mytable <- xtabs(
  ~ gmat_qpc + gmat_tot + gmat_vpc + s_avg,
  data = stunojob
)


summary(mytable)
## Call: xtabs(formula = ~gmat_qpc + gmat_tot + gmat_vpc + s_avg, data = stunojob)
## Number of cases in table: 90 
## Number of factors: 4 
## Test for independence of all factors:
##  Chisq = 1116089, df = 693462, p-value = 0
##  Chi-squared approximation may be incorrect
# Pearson chi-squared test run on the whole stunojob data frame.
# NOTE(review): chisq.test() treats a two-dimensional input as a contingency
# table of counts, but these rows are raw per-student measurements, so the
# statistic (NaN in the output) does not compare the job / no-job groups.
# Consider testing a cross-tabulation of job status against a categorical
# column instead.
chisq.test(stunojob)
## Warning in chisq.test(stunojob): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  stunojob
## X-squared = NaN, df = 1068, p-value = NA
# Pearson chi-squared test run on the whole stujob data frame.
# NOTE(review): chisq.test() treats a two-dimensional input as a contingency
# table of counts, but these rows are raw per-student measurements, so the
# reported statistic does not compare the job / no-job groups. Consider
# testing a cross-tabulation of job status against a categorical column
# instead.
chisq.test(stujob)
## Warning in chisq.test(stujob): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  stujob
## X-squared = 5482100, df = 2196, p-value < 2.2e-16

Thus I have compared the group of students who got a job with the group who did not.