# Load the MBA starting-salaries data, print summary statistics, build the
# two analysis subsets, and draw one histogram per variable of interest.
#
# NOTE(review): the hard-coded setwd() ties this script to one machine. It is
# kept because the relative CSV path used here (and again when the file is
# re-read further down) depends on it; prefer running from a project root.
setwd("C:/Users/harsh/Desktop/r")
MBA <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
# Per-column summary statistics; the recorded output follows.
describe(MBA)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45

# Columns are referenced explicitly as MBA$<col> rather than through
# attach(MBA): attached columns live on the search path and are silently
# masked by any global variable of the same name.

# group1: rows whose satisfaction value is not the 998 code.
# which() is kept so that any NA comparison drops the row instead of
# producing an all-NA row.
group1 <- MBA[which(MBA$satis != 998), ]
# group2: rows with salary above 999 (the "placed students" used below).
group2 <- MBA[which(MBA$salary > 999), ]

# One histogram per variable. group1/group2 are used for satisfaction and
# salary so the 998/999 codes do not distort those two plots.
hist(MBA$age, breaks = 18, col = "skyblue", xlab = "Age", main = "AGE")
hist(MBA$sex, breaks = 18, col = "lightgreen",
     xlab = "1 : Male\n2 : Female", main = "Sex")
hist(MBA$gmat_tot, breaks = 18, col = "skyblue",
     xlab = "Gmat Total", main = "Gmat Total")
hist(MBA$gmat_qpc, breaks = 18, col = "lightgreen",
     xlab = "Gmat qpc", main = "Quantitative GMAT Percentile")
hist(MBA$gmat_vpc, breaks = 18, col = "gray",
     xlab = "Gmat Vpc", main = "Gmat Verbal Percentile")
hist(MBA$gmat_tpc, breaks = 18, col = "Yellow",
     xlab = "Gmat tpc", main = "Gmat Overall Percentile")
hist(group2$salary, breaks = 18, col = "gray",
     xlab = "Salary", main = "Salary of placed students")
hist(group1$satis, breaks = 18, col = "lightgreen",
     xlab = "Satisfaction", main = "Satisfaction")
hist(MBA$work_yrs, breaks = 18, col = "skyblue",
     xlab = "Work Years", main = "Work Experience in Years")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplots of salary against each predictor for the placed students
# (group2). Every plot shares the same styling; only the predictor, the title
# and the x-axis label vary, so they are listed once and drawn in a loop.
# NOTE(review): `spread` and `smoother.args` look like arguments from an older
# car release - confirm they are still honoured by the installed version.
salary_plots <- list(
  list(var = "age",
       main = "Scatterplot of Salary vs Age",
       xlab = "Age"),
  list(var = "sex",
       main = "Scatterplot of Salary vs Sex",
       xlab = "Sex"),
  list(var = "gmat_tpc",
       main = "Scatterplot of Salary vs Overall GMAT Percentile ",
       xlab = "gmat percentile"),
  list(var = "s_avg",
       main = "Scatterplot of Salary vs Spring MBA AVerage",
       xlab = "Spring MBA AVerage"),
  list(var = "f_avg",
       main = "Scatterplot of Salary vs Fall MBA AVerage",
       xlab = "Fall MBA AVerage"),
  list(var = "work_yrs",
       main = "Scatterplot of Salary vs Work Experience",
       xlab = "Work Experience"),
  list(var = "frstlang",
       main = "Scatterplot of Salary vs First Language",
       xlab = "First Language ( 1= English\n2= Others)")
)
for (spec in salary_plots) {
  # reformulate() builds the formula `salary ~ <var>` from the column name.
  scatterplot(reformulate(spec[["var"]], response = "salary"),
              data = group2,
              spread = FALSE,
              smoother.args = list(lty = 2),
              pch = 16,
              main = spec[["main"]],
              xlab = spec[["xlab"]],
              ylab = "Salary")
}
library(corrgram)
# Correlogram over every column of the full data set.
# NOTE(review): MBA still contains the 998/999 codes in satis and salary
# (they are only turned into NA when the file is re-read later), so the
# salary/satis panels here are presumably distorted by them - confirm.
corrgram(
  MBA,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Variables"
)

# Covariance matrix of the numeric variables for the placed students.
# x and y hold the same columns, so cov(x, y) is the covariance matrix of
# that one column set with itself.
cov_columns <- c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "work_yrs", "salary", "sex", "frstlang"
)
x <- group2[, cov_columns]
y <- x
cov(x, y)
## age gmat_tot gmat_qpc gmat_vpc
## age 10.7045498 -1.305445e+01 -7.22796497 9.505045e-01
## gmat_tot -13.0544451 2.569294e+03 452.14258519 6.386360e+02
## gmat_qpc -7.2279650 4.521426e+02 179.18027794 2.045850e+01
## gmat_vpc 0.9505045 6.386360e+02 20.45849990 2.606602e+02
## gmat_tpc -3.4602132 5.393623e+02 97.03607462 1.393882e+02
## s_avg 0.1938587 3.299562e+00 0.07838473 9.694594e-01
## f_avg -0.3462517 3.027432e+00 0.64252142 1.803303e-01
## work_yrs 8.6728536 -1.873882e+01 -7.36245955 -1.366838e+00
## salary 29210.5193223 -8.212449e+04 3382.43784504 -3.964803e+04
## sex -0.2164477 -4.568818e-01 -0.90757662 3.974872e-01
## frstlang 0.2898344 -1.687607e+00 0.04806777 -8.915858e-01
## gmat_tpc s_avg f_avg work_yrs
## age -3.460213e+00 0.19385875 -3.462517e-01 8.6728536
## gmat_tot 5.393623e+02 3.29956215 3.027432e+00 -18.7388159
## gmat_qpc 9.703607e+01 0.07838473 6.425214e-01 -7.3624595
## gmat_vpc 1.393882e+02 0.96945936 1.803303e-01 -1.3668380
## gmat_tpc 1.211342e+02 0.58062916 3.785056e-01 -4.3892062
## s_avg 5.806292e-01 0.14325138 8.231046e-02 0.1860480
## f_avg 3.785056e-01 0.08231046 2.378638e-01 -0.3176271
## work_yrs -4.389206e+00 0.18604797 -3.176271e-01 9.0630116
## salary -2.596339e+04 688.02042071 -9.241129e+02 24458.1995050
## sex -2.377689e-01 0.01409575 3.725395e-02 -0.1281173
## frstlang -4.575481e-01 -0.01319912 -6.243099e-03 0.1494384
## salary sex frstlang
## age 2.921052e+04 -2.164477e-01 2.898344e-01
## gmat_tot -8.212449e+04 -4.568818e-01 -1.687607e+00
## gmat_qpc 3.382438e+03 -9.075766e-01 4.806777e-02
## gmat_vpc -3.964803e+04 3.974872e-01 -8.915858e-01
## gmat_tpc -2.596339e+04 -2.377689e-01 -4.575481e-01
## s_avg 6.880204e+02 1.409575e-02 -1.319912e-02
## f_avg -9.241129e+02 3.725395e-02 -6.243099e-03
## work_yrs 2.445820e+04 -1.281173e-01 1.494384e-01
## salary 3.192940e+08 -1.369577e+03 1.206714e+03
## sex -1.369577e+03 2.124500e-01 8.756901e-03
## frstlang 1.206714e+03 8.756901e-03 6.396345e-02
# Correlogram restricted to the placed students (group2).
corrgram(
  group2,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Variables"
)

# Two-way contingency tables of each variable against salary, all built from
# group2. They are the inputs to the chi-squared tests that follow.
table1 <- xtabs(~ age + salary, data = group2)
table2 <- xtabs(~ sex + salary, data = group2)
table3 <- xtabs(~ work_yrs + salary, data = group2)
table4 <- xtabs(~ gmat_tpc + salary, data = group2)
table5 <- xtabs(~ frstlang + salary, data = group2)
table6 <- xtabs(~ quarter + salary, data = group2)
table7 <- xtabs(~ s_avg + salary, data = group2)
table8 <- xtabs(~ f_avg + salary, data = group2)
table9 <- xtabs(~ satis + salary, data = group2)
# Pearson chi-squared tests of independence between each variable and salary,
# one per contingency table (table1 .. table9). The recorded console output
# follows each call.
# NOTE(review): every call warns that the chi-squared approximation may be
# incorrect - presumably because salary has many distinct values and the
# tables are sparse. Treat these p-values with caution.
chisq.test(table1)
## Warning in chisq.test(table1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table1
## X-squared = 717.62, df = 574, p-value = 3.929e-05
chisq.test(table2)
## Warning in chisq.test(table2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table2
## X-squared = 52.681, df = 41, p-value = 0.1045
chisq.test(table3)
## Warning in chisq.test(table3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table3
## X-squared = 535.23, df = 451, p-value = 0.003809
chisq.test(table4)
## Warning in chisq.test(table4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table4
## X-squared = 1422.2, df = 1230, p-value = 0.0001065
chisq.test(table5)
## Warning in chisq.test(table5): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table5
## X-squared = 69.847, df = 41, p-value = 0.003296
chisq.test(table6)
## Warning in chisq.test(table6): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table6
## X-squared = 129.85, df = 123, p-value = 0.3186
chisq.test(table7)
## Warning in chisq.test(table7): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table7
## X-squared = 792.97, df = 861, p-value = 0.9524
chisq.test(table8)
## Warning in chisq.test(table8): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table8
## X-squared = 596.28, df = 574, p-value = 0.2518
chisq.test(table9)
## Warning in chisq.test(table9): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table9
## X-squared = 109.1, df = 164, p-value = 0.9997
# Model 1: salary of placed students (group2) regressed on every candidate
# predictor except satis. This is the starting point for the stepwise
# removal of weak predictors below.
model1 <- lm(
  salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc + gmat_tot +
    s_avg + f_avg + quarter + work_yrs + frstlang,
  data = group2
)
summary(model1)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
## gmat_tot + s_avg + f_avg + quarter + work_yrs + frstlang,
## data = group2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25773 -7903 -609 5617 70568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70768.71 50785.86 1.393 0.1669
## age 1718.73 1124.50 1.528 0.1299
## sex -3423.13 3566.90 -0.960 0.3398
## gmat_tpc -1502.62 706.28 -2.127 0.0361 *
## gmat_vpc 506.10 493.54 1.025 0.3079
## gmat_qpc 769.13 491.76 1.564 0.1213
## gmat_tot 39.19 172.21 0.228 0.8205
## s_avg -1518.43 8123.94 -0.187 0.8521
## f_avg -2363.96 3868.50 -0.611 0.5427
## quarter -2702.70 2612.19 -1.035 0.3036
## work_yrs 758.24 1131.11 0.670 0.5043
## frstlang 7303.97 7296.86 1.001 0.3195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15360 on 91 degrees of freedom
## Multiple R-squared: 0.3403, Adjusted R-squared: 0.2606
## F-statistic: 4.268 on 11 and 91 DF, p-value: 4.112e-05
Iteration 1: remove the two variables with p-value > 80% (gmat_tot and s_avg); note that satis is also added to the model at this step.
# Model 2: same data as model1, without gmat_tot and s_avg and with satis
# added as a predictor.
model2 <- lm(
  salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
    f_avg + quarter + work_yrs + frstlang + satis,
  data = group2
)
summary(model2)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
## f_avg + quarter + work_yrs + frstlang + satis, data = group2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26345 -7968 -321 6064 70799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79216.9 32739.0 2.420 0.0175 *
## age 1741.3 1113.0 1.565 0.1211
## sex -3612.3 3514.6 -1.028 0.3067
## gmat_tpc -1448.6 695.3 -2.084 0.0400 *
## gmat_vpc 579.0 357.2 1.621 0.1085
## gmat_qpc 827.8 357.9 2.313 0.0229 *
## f_avg -2228.7 3696.2 -0.603 0.5480
## quarter -2085.0 1641.9 -1.270 0.2073
## work_yrs 749.7 1123.3 0.667 0.5062
## frstlang 7918.3 7129.5 1.111 0.2696
## satis -1150.4 2057.9 -0.559 0.5775
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15260 on 92 degrees of freedom
## Multiple R-squared: 0.3421, Adjusted R-squared: 0.2706
## F-statistic: 4.783 on 10 and 92 DF, p-value: 1.616e-05
Iteration 2: remove two variables with p-value > 50% (work_yrs and satis; f_avg is kept for now), and we get the following results.
# Model 3: model2 without work_yrs and satis.
model3 <- lm(
  salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
    f_avg + quarter + frstlang,
  data = group2
)
summary(model3)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
## f_avg + quarter + frstlang, data = group2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24208 -7290 -967 5190 71732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63049.8 26242.1 2.403 0.0182 *
## age 2356.5 545.7 4.319 3.89e-05 ***
## sex -3109.8 3444.9 -0.903 0.3690
## gmat_tpc -1460.8 687.2 -2.126 0.0361 *
## gmat_vpc 561.9 353.2 1.591 0.1150
## gmat_qpc 834.9 353.5 2.361 0.0203 *
## f_avg -2485.8 3660.2 -0.679 0.4987
## quarter -2347.8 1580.4 -1.486 0.1407
## frstlang 6265.2 6771.5 0.925 0.3572
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15160 on 94 degrees of freedom
## Multiple R-squared: 0.3366, Adjusted R-squared: 0.2801
## F-statistic: 5.961 on 8 and 94 DF, p-value: 3.569e-06
Iteration 3: remove the three variables with p-value > 30% (sex, f_avg and frstlang).
# Model 4: the reduced model - age, the three GMAT percentiles and quarter.
model4 <- lm(
  salary ~ age + gmat_tpc + gmat_vpc + gmat_qpc + quarter,
  data = group2
)
summary(model4)
##
## Call:
## lm(formula = salary ~ age + gmat_tpc + gmat_vpc + gmat_qpc +
## quarter, data = group2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26838 -7493 -245 5380 69938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46396.9 19354.6 2.397 0.0184 *
## age 2721.0 467.6 5.819 7.62e-08 ***
## gmat_tpc -1444.4 680.9 -2.121 0.0365 *
## gmat_vpc 529.1 350.3 1.511 0.1342
## gmat_qpc 852.2 348.1 2.448 0.0162 *
## quarter -1611.9 1358.9 -1.186 0.2384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15080 on 97 degrees of freedom
## Multiple R-squared: 0.3227, Adjusted R-squared: 0.2878
## F-statistic: 9.243 on 5 and 97 DF, p-value: 3.202e-07
We have now arrived at the model with the highest adjusted R-squared of the four fitted models. Thus, a regression model with an adjusted R-squared of 0.2878 (about 29%; multiple R-squared 0.3227) is derived for the purpose of salary prediction.
Next, we consider two different groups from the given data set: students who were placed and students who were not placed.
# Re-read the data, this time converting blank cells and the 998 / 999 codes
# to NA in every column.
mba <- read.csv(
  "MBA Starting Salaries Data.csv",
  header = TRUE,
  na.strings = c("", " ", "999", "998", "NA")
)
# Placed is TRUE when salary exceeds 1000, FALSE otherwise, and NA where the
# salary itself is NA (those rows drop out of the cross-tabulations).
mba$Placed <- mba$salary > 1000
# Joint proportions of placement status by sex, with row/column margins.
mytab <- xtabs(~ Placed + sex, data = mba)
round(ftable(addmargins(prop.table(mytab))), 2)
## sex 1 2 Sum
## Placed
## FALSE 0.35 0.12 0.47
## TRUE 0.37 0.16 0.53
## Sum 0.72 0.28 1.00
# Joint proportions of placement status by first language, with margins.
mytab2 <- xtabs(~ Placed + frstlang, data = mba)
round(
  ftable(
    addmargins(
      prop.table(mytab2)
    )
  ),
  2
)
## frstlang 1 2 Sum
## Placed
## FALSE 0.42 0.04 0.47
## TRUE 0.50 0.04 0.53
## Sum 0.92 0.08 1.00
# Chi-squared test of independence between placement status and sex
# (2 x 2 table, so R applies Yates' continuity correction - see output).
chisq.test(mytab)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytab
## X-squared = 0.29208, df = 1, p-value = 0.5889
# Same test for placement status versus first language.
chisq.test(mytab2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytab2
## X-squared = 0.074127, df = 1, p-value = 0.7854
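So:
> Among placed students, males are in a higher proportion than females (0.37 vs 0.16 of all students).
> Among placed students, those with English as their first language are in a higher proportion than those with another first language (0.50 vs 0.04 of all students).
> However, neither chi-squared test is significant (p = 0.5889 for sex, p = 0.7854 for first language), so the data show no significant association between placement and either variable.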