mbasal.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
head(mbasal.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1 23 2 620 77 87 87 3.4 3.00 1 2
## 2 24 1 610 90 71 87 3.5 4.00 1 2
## 3 24 1 670 99 78 95 3.3 3.25 1 2
## 4 24 1 570 56 81 75 3.3 2.67 1 1
## 5 24 2 710 93 98 98 3.6 3.75 1 2
## 6 24 1 640 82 89 91 3.9 3.75 1 2
## frstlang salary satis
## 1 1 0 7
## 2 1 0 6
## 3 1 0 6
## 4 1 0 7
## 5 1 999 5
## 6 1 0 6
library(psych)
describe(mbasal.df)[,c(2,3,4,5,8,9)]
## n mean sd median min max
## age 274 27.36 3.71 27 22 48
## sex 274 1.25 0.43 1 1 2
## gmat_tot 274 619.45 57.54 620 450 790
## gmat_qpc 274 80.64 14.87 83 28 99
## gmat_vpc 274 78.32 16.86 81 16 99
## gmat_tpc 274 84.20 14.02 87 0 99
## s_avg 274 3.03 0.38 3 2 4
## f_avg 274 3.06 0.53 3 0 4
## quarter 274 2.48 1.11 2 1 4
## work_yrs 274 3.87 3.23 3 0 22
## frstlang 274 1.12 0.32 1 1 2
## salary 274 39025.69 50951.56 999 0 220000
## satis 274 172.18 371.61 6 1 998
boxplot(mbasal.df$age)
boxplot(mbasal.df$gmat_tot)
boxplot(mbasal.df$gmat_qpc)
boxplot(mbasal.df$gmat_vpc)
boxplot(mbasal.df$gmat_tpc)
boxplot(mbasal.df$s_avg)
boxplot(mbasal.df$f_avg)
boxplot(mbasal.df$quarter)
boxplot(mbasal.df$work_yrs)
boxplot(mbasal.df$salary)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(mbasal.df$gmat_tot~mbasal.df$salary)
the candidates who got marks in gmat between 600-650 got salary 100000
library(car)
scatterplotMatrix(~mbasal.df$age+mbasal.df$gmat_tot+mbasal.df$salary+mbasal.df$f_avg)
sal<- c("salary","satis","age","sex","f_avg","gmat_tot","work_yrs")
library(corrgram)
corrgram(mbasal.df[,sal],lower.panel=panel.pts, upper.panel=panel.pie,
diag.panel=panel.minmax, text.panel=panel.txt)
stujob<- mbasal.df[which(mbasal.df$salary>0),]
mytable1 <- xtabs(~gmat_qpc+gmat_tot+gmat_vpc+s_avg, data=stujob)
t.test(stujob$gmat_qpc,stujob$gmat_tpc)
##
## Welch Two Sample t-test
##
## data: stujob$gmat_qpc and stujob$gmat_tpc
## t = -2.6236, df = 364.89, p-value = 0.009065
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.370583 -0.912026
## sample estimates:
## mean of x mean of y
## 81.48913 85.13043
As pvalue is < .05 so we failed to reject Null hypothesis. ### Based on S_avg and f_avg
t.test(stujob$s_avg,stujob$f_avg)
##
## Welch Two Sample t-test
##
## data: stujob$s_avg and stujob$f_avg
## t = -0.81877, df = 339.4, p-value = 0.4135
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.13110158 0.05403637
## sample estimates:
## mean of x mean of y
## 3.022554 3.061087
model1 <- salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + s_avg + f_avg + quarter + work_yrs + satis
fit1<- lm(model1,data = stujob)
summary(fit1)
##
## Call:
## lm(formula = model1, data = stujob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100841 -12557 9496 21978 138696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 183580.230 76707.697 2.393 0.0178 *
## gmat_tot -459.960 220.074 -2.090 0.0381 *
## gmat_qpc 402.048 605.388 0.664 0.5075
## gmat_vpc 583.130 551.533 1.057 0.2918
## gmat_tpc 810.333 469.157 1.727 0.0859 .
## s_avg 15542.154 14298.346 1.087 0.2785
## f_avg -2633.390 7030.440 -0.375 0.7084
## quarter -5478.825 4708.506 -1.164 0.2462
## work_yrs 1783.934 1229.253 1.451 0.1485
## satis -74.136 6.762 -10.963 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39040 on 174 degrees of freedom
## Multiple R-squared: 0.4744, Adjusted R-squared: 0.4472
## F-statistic: 17.45 on 9 and 174 DF, p-value: < 2.2e-16
Now we check which we can exclude :
library(leaps)
leap1 <- regsubsets(model1, data = stujob, nbest=1)
# summary(leap1)
plot(leap1, scale="adjr2")
so we can exclude
f_avg and gmat_qpc and predict the model2
model2 <- salary ~ gmat_tot + gmat_vpc + gmat_tpc + s_avg + quarter + work_yrs + satis
fit2<- lm(model2,data = stujob)
summary(fit2)
##
## Call:
## lm(formula = model2, data = stujob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99191 -10172 9409 22908 137328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 153632.849 65347.530 2.351 0.01983 *
## gmat_tot -342.682 115.102 -2.977 0.00332 **
## gmat_vpc 273.327 263.707 1.036 0.30140
## gmat_tpc 925.346 442.020 2.093 0.03774 *
## s_avg 14107.018 13787.180 1.023 0.30762
## quarter -5221.040 4648.993 -1.123 0.26295
## work_yrs 1796.668 1183.512 1.518 0.13079
## satis -74.346 6.732 -11.044 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38890 on 176 degrees of freedom
## Multiple R-squared: 0.4724, Adjusted R-squared: 0.4514
## F-statistic: 22.51 on 7 and 176 DF, p-value: < 2.2e-16
library(coefplot)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
coefplot(fit2, intercept= FALSE, outerCI=1.96,coefficients=c("gmat_tot" ,"work_yrs" , "s_avg" , "f_avg","quater","satis"))
so model2 fits better according to the previous table
summary(fit1)$adj.r.squared
## [1] 0.4471795
AIC(fit1)
## [1] 4424.538
summary(fit2)$adj.r.squared
## [1] 0.4513714
AIC(fit2)
## [1] 4421.241
So we can say that this model 2 is fitted as r square fitted value is the lowest so we can say that the function is relatied with gmat_tot gmat_vpc gmat_tpc s_avg quarter work_yrs and satis
IDENTIFY WHY?
stunojob<- mbasal.df[which(mbasal.df$salary==0),]
head(stunojob)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1 23 2 620 77 87 87 3.4 3.00 1 2
## 2 24 1 610 90 71 87 3.5 4.00 1 2
## 3 24 1 670 99 78 95 3.3 3.25 1 2
## 4 24 1 570 56 81 75 3.3 2.67 1 1
## 6 24 1 640 82 89 91 3.9 3.75 1 2
## 7 25 1 610 89 74 87 3.4 3.50 1 2
## frstlang salary satis
## 1 1 0 7
## 2 1 0 6
## 3 1 0 6
## 4 1 0 7
## 6 1 0 6
## 7 1 0 5
mytable <- xtabs(~gmat_qpc+gmat_tot+gmat_vpc+s_avg, data=stunojob)
summary(mytable)
## Call: xtabs(formula = ~gmat_qpc + gmat_tot + gmat_vpc + s_avg, data = stunojob)
## Number of cases in table: 90
## Number of factors: 4
## Test for independence of all factors:
## Chisq = 1116089, df = 693462, p-value = 0
## Chi-squared approximation may be incorrect
chisq.test(stunojob)
## Warning in chisq.test(stunojob): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: stunojob
## X-squared = NaN, df = 1068, p-value = NA
chisq.test(stujob)
## Warning in chisq.test(stujob): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: stujob
## X-squared = 5482100, df = 2196, p-value < 2.2e-16
Thus i ahve compared those groups who has got the job and who has not got the job.