# Load the MBA starting-salaries data set, open it in the data viewer, and
# print per-column summary statistics.
salary_csv <- "C:/Program Files/RStudio/files/MBA Starting Salaries Data.csv"
mba_sal <- read.csv(file = salary_csv)
View(mba_sal)
summary(mba_sal)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
==> Different age groups
# Histogram of the `age` column.
hist(
  mba_sal$age,
  main = "Ages",
  xlab = "ages",
  col = "cyan"
)
==> Score Distributions
# GMAT score distributions, drawn as a 2x2 panel of histograms.
# par() returns the settings it replaces; they are restored at the end so that
# plots drawn later in the script are not squeezed into the same 2x2 grid.
old_par <- par(mfrow = c(2, 2))

# One entry per panel: column to plot, panel title, x-axis label.
gmat_panels <- list(
  list(column = "gmat_tot", title = "GMAT total",
       axis_label = "score"),
  list(column = "gmat_qpc", title = "quantitative GMAT percentile",
       axis_label = "percentile"),
  list(column = "gmat_vpc", title = "verbal GMAT percentile",
       axis_label = "percentile"),
  list(column = "gmat_tpc", title = "overall GMAT percentile",
       axis_label = "percentile")
)

for (panel in gmat_panels) {
  hist(
    mba_sal[[panel$column]],
    main = panel$title,
    ylab = "Frequency",
    xlab = panel$axis_label,
    col = "cyan",
    breaks = 10
  )
}

# Back to the layout that was active before this panel.
par(old_par)
==> Fall MBA average
# Histogram of the `f_avg` column.
hist(
  mba_sal$f_avg,
  main = "Fall MBA average",
  xlab = "average",
  col = "cyan"
)
==> Spring MBA average
# Histogram of the `s_avg` column.
hist(
  mba_sal$s_avg,
  main = "Spring MBA average",
  xlab = "average",
  col = "cyan"
)
==> Quartile ranking
# Histogram of the `quarter` column.
hist(
  mba_sal$quarter,
  main = "Quartile ranking",
  xlab = "ranking",
  col = "cyan"
)
==> Years of work experience.
# Histogram of the `work_yrs` column.
hist(
  mba_sal$work_yrs,
  main = "Years of work experience",
  xlab = "years",
  col = "cyan"
)
==> Starting salaries.
# Index plot of the `salary` column: plot() on a single numeric vector puts the
# observation index on the x axis and the values on the y axis, so the salary
# label belongs on the y axis rather than the x axis.
plot(
  mba_sal$salary,
  main = "Starting salaries",
  xlab = "observation index",
  ylab = "salary",
  col = "red"
)
==> Scatterplot of Years Of Work Experience vs Starting salaries
# Scatterplot of `salary` (y) against `work_yrs` (x) for all rows of mba_sal.
plot(
  x = mba_sal$work_yrs,
  y = mba_sal$salary,
  main = "Years Of Work Experience vs Starting Salary",
  xlab = "Years Of Work Experience",
  ylab = "Starting Salary",
  col = "red"
)
==> Scatterplot of spring MBA average vs Starting Salary
# Scatterplot of `salary` (y) against `s_avg` (x) for all rows of mba_sal.
plot(
  x = mba_sal$s_avg,
  y = mba_sal$salary,
  main = "MBA average vs Starting Salary",
  xlab = "Spring MBA Average",
  ylab = "Starting Salary",
  col = "red"
)
==> Corrgram showing the pairwise correlations between the variables of the dataset
# Correlogram of every column in mba_sal: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal.
library(corrgram)
corrgram(
  mba_sal,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Starting salaries corrgram"
)
==> Placed dataset
# Keep only the rows whose salary is above 999.
# NOTE(review): presumably 998/999 are non-response codes and 0 means
# "not placed" - confirm against the data dictionary.
has_real_salary <- mba_sal$salary > 999
placed <- mba_sal[which(has_real_salary), ]
View(placed)
# NOTE(review): attach() is generally discouraged; the code below addresses
# columns through `placed$...`, so this call looks unnecessary.
attach(placed)
==> Regression models y = f(x)
==> GMAT Score
# Simple linear regression of salary on total GMAT score, using the rows of
# `placed`.
# NOTE(review): `lm(salary ~ gmat_tot, data = placed)` would be the more
# idiomatic spelling; the `$` form is kept so the recorded Call stays accurate.
gmat_tot_fit <- lm(formula = placed$salary ~ placed$gmat_tot)
summary(gmat_tot_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$gmat_tot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40821 -8223 -2543 3756 113261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122721.08 21591.40 5.684 1.28e-07 ***
## placed$gmat_tot -31.96 34.93 -0.915 0.362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17880 on 101 degrees of freedom
## Multiple R-squared: 0.008221, Adjusted R-squared: -0.001598
## F-statistic: 0.8372 on 1 and 101 DF, p-value: 0.3624
==> GMAT Quant Percentile
# Simple linear regression of salary on the quantitative GMAT percentile,
# using the rows of `placed`.
gmat_qpc_fit <- lm(formula = placed$salary ~ placed$gmat_qpc)
summary(gmat_qpc_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$gmat_qpc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38564 -7621 -2979 3162 117342
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101525.69 10735.85 9.457 1.41e-15 ***
## placed$gmat_qpc 18.88 132.81 0.142 0.887
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17960 on 101 degrees of freedom
## Multiple R-squared: 0.0002, Adjusted R-squared: -0.009699
## F-statistic: 0.0202 on 1 and 101 DF, p-value: 0.8873
==> GMAT Total percentile
# Simple linear regression of salary on the overall GMAT percentile,
# using the rows of `placed`.
gmat_tpc_fit <- lm(formula = placed$salary ~ placed$gmat_tpc)
summary(gmat_tpc_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$gmat_tpc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41929 -7964 -2071 4107 109784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121147.3 13648.4 8.876 2.66e-14 ***
## placed$gmat_tpc -214.3 160.1 -1.338 0.184
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17800 on 101 degrees of freedom
## Multiple R-squared: 0.01743, Adjusted R-squared: 0.0077
## F-statistic: 1.792 on 1 and 101 DF, p-value: 0.1837
==> GMAT Verbal Percentile
# Simple linear regression of salary on the verbal GMAT percentile,
# using the rows of `placed`.
gmat_vpc_fit <- lm(formula = placed$salary ~ placed$gmat_vpc)
summary(gmat_vpc_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$gmat_vpc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39116 -8268 -2660 4264 111864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 114980.7 8747.3 13.145 <2e-16 ***
## placed$gmat_vpc -152.1 109.1 -1.394 0.166
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17790 on 101 degrees of freedom
## Multiple R-squared: 0.01889, Adjusted R-squared: 0.009174
## F-statistic: 1.944 on 1 and 101 DF, p-value: 0.1663
==> Spring MBA Average
# Simple linear regression of salary on the spring MBA average,
# using the rows of `placed`.
s_avg_fit <- lm(formula = placed$salary ~ placed$s_avg)
summary(s_avg_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$s_avg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40989 -8087 -2068 3682 119814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88179 14558 6.057 2.39e-08 ***
## placed$s_avg 4803 4673 1.028 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17860 on 101 degrees of freedom
## Multiple R-squared: 0.01035, Adjusted R-squared: 0.0005508
## F-statistic: 1.056 on 1 and 101 DF, p-value: 0.3065
==> Fall MBA Average
# Simple linear regression of salary on the fall MBA average,
# using the rows of `placed`.
f_avg_fit <- lm(formula = placed$salary ~ placed$f_avg)
summary(f_avg_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$f_avg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38413 -7413 -3384 4087 115645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115039 11342 10.143 <2e-16 ***
## placed$f_avg -3885 3625 -1.072 0.286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17860 on 101 degrees of freedom
## Multiple R-squared: 0.01124, Adjusted R-squared: 0.001455
## F-statistic: 1.149 on 1 and 101 DF, p-value: 0.2864
==> Years of work experience
# Simple linear regression of salary on years of work experience,
# using the rows of `placed`.
work_yrs_fit <- lm(formula = placed$salary ~ placed$work_yrs)
summary(work_yrs_fit)
##
## Call:
## lm(formula = placed$salary ~ placed$work_yrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34498 -7745 -498 3803 86419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93101 2496 37.30 < 2e-16 ***
## placed$work_yrs 2699 526 5.13 1.4e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15990 on 101 degrees of freedom
## Multiple R-squared: 0.2067, Adjusted R-squared: 0.1989
## F-statistic: 26.32 on 1 and 101 DF, p-value: 1.403e-06
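Years of work experience is the only predictor whose coefficient has a p-value below 0.05 (p = 1.4e-06), so it is the only statistically significant one. With an adjusted R-squared of about 0.20 it is also the best-fitting of these single-predictor models, although it still explains only about a fifth of the variation in starting salary.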
==> Chi Square test
# Pearson chi-squared test on salary and work_yrs, without continuity
# correction.
# NOTE(review): both inputs are numeric, so every distinct value is treated as
# its own category; the recorded "approximation may be incorrect" warning is
# consistent with very sparse cells - confirm this test is really intended.
chisq.test(placed$salary, placed$work_yrs, correct = FALSE)
## Warning in chisq.test(placed$salary, placed$work_yrs, correct = FALSE):
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: placed$salary and placed$work_yrs
## X-squared = 535.23, df = 451, p-value = 0.003809
==> t-Test
# Unpaired, equal-variance two-sample t-test between the salary values and the
# work_yrs values of `placed`.
# NOTE(review): this compares the mean of salary with the mean of years of
# experience, which are in different units - confirm this is what was intended.
t.test(
  placed$salary,
  placed$work_yrs,
  paired = FALSE,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: placed$salary and placed$work_yrs
## t = 58.516, df = 204, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 99555.62 106498.49
## sample estimates:
## mean of x mean of y
## 1.030307e+05 3.679612e+00
==> Not placed dataset
# Rows of mba_sal whose salary is exactly 0.
zero_salary_rows <- which(mba_sal$salary == 0)
notplaced <- mba_sal[zero_salary_rows, ]
View(notplaced)
# Puts the columns of `notplaced` on the search path; this masks the
# same-named columns attached earlier.
attach(notplaced)
## The following objects are masked from placed:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
==> New Subset
# Rows of mba_sal whose salary is either 0 or above 999, i.e. the union of the
# two filters used for `notplaced` and `placed`.
mba_sub <- subset(mba_sal, salary == 0 | salary > 999)
View(mba_sub)
# Puts the columns of `mba_sub` on the search path, masking the earlier
# attached copies.
attach(mba_sub)
## The following objects are masked from notplaced:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
## The following objects are masked from placed:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
# Binary outcome: level "1" when salary is positive, level "0" otherwise.
mba_sub$placed <- factor(as.integer(mba_sub$salary > 0))

# Logistic regression of `placed` on the remaining columns.
# `placed` is computed directly from `salary`, so keeping `salary` among the
# predictors separates the two classes perfectly: the fit fails to converge and
# the standard errors blow up. It is therefore removed from the right-hand side.
# NOTE: any model output recorded after this call was produced with `salary`
# still in the model and has to be regenerated.
x <- glm(
  placed ~ . - salary,
  data = mba_sub,
  family = binomial(link = "logit")
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table and deviance statistics of the fitted model `x`.
summary(x)
##
## Call:
## glm(formula = mba_sub$placed ~ ., family = binomial(link = "logit"),
## data = mba_sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.269e-05 -6.461e-07 2.110e-08 2.110e-08 1.857e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.285e+01 8.710e+05 0.000 1.000
## age -5.727e-01 1.529e+04 0.000 1.000
## sex 3.012e+00 5.274e+04 0.000 1.000
## gmat_tot 1.089e-03 2.893e+03 0.000 1.000
## gmat_qpc -4.586e-02 7.209e+03 0.000 1.000
## gmat_vpc 5.529e-02 7.914e+03 0.000 1.000
## gmat_tpc -4.487e-02 4.255e+03 0.000 1.000
## s_avg 2.796e+00 8.174e+04 0.000 1.000
## f_avg 1.356e+00 8.324e+04 0.000 1.000
## quarter 2.354e+00 2.603e+04 0.000 1.000
## work_yrs -2.840e-01 1.556e+04 0.000 1.000
## frstlang -3.253e+00 2.582e+05 0.000 1.000
## salary 6.110e-04 5.454e-01 0.001 0.999
## satis 7.063e-01 3.452e+04 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.6668e+02 on 192 degrees of freedom
## Residual deviance: 1.9027e-09 on 179 degrees of freedom
## AIC: 28
##
## Number of Fisher Scoring iterations: 25